In [1]:
import seaborn as sns
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import plotly.express as px
import matplotlib as plt
from matplotlib import style

style.use("ggplot")
import re
import nltk

nltk.download("stopwords")
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize  # Import word_tokenize function

nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
data = pd.read_csv(
    "C:\\Users\\ayush\\OneDrive\\Desktop\\NLP\\Final Project\\Train_data.csv"
)

In [3]:
data1 = data.sample(n=100000, random_state=42)


data1

Unnamed: 0,category,rating,sentiment,review
76637,negative,1,-1,After seeing the Altoids advertisement in Amaz...
64372,negative,1,-1,I am so disappointed with these almonds. They...
252387,positive,5,1,Great product. Great packaging.<br /><br />I o...
251573,positive,5,1,"Purchased original can at Walmart. Of course, ..."
318742,positive,5,1,I was very happy with this product. You could...
...,...,...,...,...
188636,neutral,3,0,Everything was good about this jerky except th...
117743,negative,1,-1,I bought this product along with Dr. McDougall...
166855,neutral,3,0,Nothing wrong with the product - I buy it in t...
138409,neutral,3,0,This is a great product-my dog loves these. I ...


In [4]:
data1.sentiment.dtype

dtype('int64')

In [5]:
def data_processing(text):
    text = text.lower()  # Making everthing in lower case
    text = re.sub("<br />", "", text)  # removing breaks
    text = re.sub(
        r"https\S+|www\S+|http\S+", "", text, flags=re.MULTILINE
    )  # removing links
    text = re.sub(r"\@w+|\#", "", text)  # removing @ and #
    text = re.sub(r"[^\w\s]", "", text)  # removing punctuations
    text_tokens = word_tokenize(text)
    filtered_text = [
        w for w in text_tokens if not w in stop_words
    ]  # removing stop words
    return " ".join(filtered_text)

In [6]:
data1.review = data1["review"].apply(data_processing)

In [7]:
duplicated_count = data1.duplicated().sum()
print("Number of duplicate entries: ", duplicated_count)

Number of duplicate entries:  26906


In [8]:
data1 = data.drop_duplicates("review")

In [9]:
stemmer = PorterStemmer()  # Stemming for text normalization


def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data

In [10]:
data1.review = data1["review"].apply(lambda x: stemming(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1.review = data1['review'].apply(lambda x: stemming(x))


In [11]:
from collections import Counter  # counting the most common words

count = Counter()
for text in data1["review"].values:
    for word in text.split():
        count[word] += 1
count.most_common(15)

[('the', 531325),
 ('I', 446809),
 ('and', 367314),
 ('a', 355873),
 ('to', 312102),
 ('of', 250670),
 ('is', 218635),
 ('it', 206933),
 ('in', 158855),
 ('this', 157945),
 ('for', 155384),
 ('that', 129385),
 ('was', 111046),
 ('but', 109453),
 ('my', 107818)]

In [12]:
X = data1.review
y = data1.sentiment

In [13]:
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True, max_features=7000)


tfidf_data = vectorizer.fit_transform(X)

In [14]:
df = pd.DataFrame(tfidf_data.toarray(), columns=vectorizer.get_feature_names_out())

In [15]:
df.head()

Unnamed: 0,00,000,03,05,06,07,08,09,0g,0mg,...,zip,ziploc,ziplock,zipper,zoe,zone,zoom,zucchini,zuke,zukes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
xtrain, xtest, ytrain, ytest = train_test_split(
    tfidf_data, y, test_size=0.3, random_state=2
)

In [17]:
clf = MultinomialNB().fit(
    xtrain, ytrain
)  # using multinomial Naive bayes to train and test

predicted = clf.predict(xtest)

In [18]:
print("Accuracy", metrics.accuracy_score(ytest, predicted))

Accuracy 0.7075567087013755


In [19]:
from sklearn.model_selection import (
    train_test_split,
)  # using Gaussian Naive bayes to train and test
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics


xtrain, xtest, ytrain, ytest = train_test_split(
    tfidf_data.toarray(), y, test_size=0.3, random_state=2
)

# Initialize Gaussian Naive Bayes classifier
gnb = GaussianNB()

# Train the Gaussian Naive Bayes classifier
gnb.fit(xtrain, ytrain)

# Predict on test set
predicted = gnb.predict(xtest)

# Calculate and print accuracy
print("Accuracy:", metrics.accuracy_score(ytest, predicted))

Accuracy: 0.5922506057195185


In [20]:
conf_mat = metrics.confusion_matrix(ytest, predicted)

# Convert confusion matrix array to a DataFrame
confusion_df = pd.DataFrame(conf_mat)
print(confusion_df)

      0     1      2
0  8599  4508   2249
1  2564  4273   2362
2  3592  6098  18172


In [27]:
sentence = "I had a good today"
random_sentence_tfidf = vectorizer.transform([sentence])

predicted_class = clf.predict(random_sentence_tfidf)

In [28]:
predicted_class

array([1], dtype=int64)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

warnings.filterwarnings("ignore")

In [24]:
logreg = LogisticRegression()
logreg.fit(xtrain, ytrain)
logreg_pred = logreg.predict(xtest)
logreg_acc = accuracy_score(logreg_pred, ytest)
print("Test accuracy: {:.2f}%".format(logreg_acc * 100))

Test accuracy: 75.70%
