In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing lib's

In [2]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

import joblib

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading Data

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DataSet/data.csv')
df.head(3)

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1


In [5]:
df.drop(
    columns=['Reviewer Name','Review Title','Place of Review',
             'Up Votes','Down Votes','Month'],
    axis=1,
    inplace=True
)

In [6]:
df.drop(df[df['Ratings']==3].index,inplace=True)
df.head(3)

Unnamed: 0,Review text,Ratings
0,"Nice product, good quality, but price is now r...",4
1,They didn't supplied Yonex Mavis 350. Outside ...,1
2,Worst product. Damaged shuttlecocks packed in ...,1


In [7]:
df['sentiment'] = df['Ratings'].apply(lambda x: 1 if x > 3 else 0)
df.head(3)

Unnamed: 0,Review text,Ratings,sentiment
0,"Nice product, good quality, but price is now r...",4,1
1,They didn't supplied Yonex Mavis 350. Outside ...,1,0
2,Worst product. Damaged shuttlecocks packed in ...,1,0


In [8]:
df.isnull().sum()

Unnamed: 0,0
Review text,8
Ratings,0
sentiment,0


In [9]:
df.dropna(inplace=True)

In [10]:
df.duplicated().sum()

np.int64(3097)

In [11]:
#df.drop_duplicates(inplace=True)

In [12]:
df.shape

(7895, 3)

## Text Cleaning & Normalization

In [13]:
stop_words = set(stopwords.words("english"))
stop_words.remove("not")
lemmatizer = WordNetLemmatizer()

def clean_text(text):

    if pd.isna(text):
        return ""

    text = text.lower()
    text = re.sub(r"[^a-zA-Z]", " ", text)

    words = text.split()

    words = [w for w in words if w not in stop_words and len(w)>2]

    words = [lemmatizer.lemmatize(w) for w in words]

    return " ".join(words)

df["clean_review"] = df["Review text"].apply(clean_text)


In [14]:
df.head(2)

Unnamed: 0,Review text,Ratings,sentiment,clean_review
0,"Nice product, good quality, but price is now r...",4,1,nice product good quality price rising bad sig...
1,They didn't supplied Yonex Mavis 350. Outside ...,1,0,supplied yonex mavis outside cover yonex insid...


## Numerical Feature Extraction

In [15]:
tfidf = TfidfVectorizer(max_features=7000,ngram_range=(1,2))

X = tfidf.fit_transform(df["clean_review"])
y = df['sentiment']

## Advanced Model Training

In [16]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(
    X,y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

## Model1 - Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=200)

lr.fit(X_train,y_train)


## Model2 - Linear SVM

In [18]:
from sklearn.svm import LinearSVC

svm = LinearSVC()

svm.fit(X_train,y_train)

## Model3 - Navie_bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

nb.fit(X_train,y_train)

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(X_train,y_train)

## Evaluation

In [21]:
from sklearn.metrics import f1_score

pred_lr = lr.predict(X_test)
pred_svm = svm.predict(X_test)
pred_nb = nb.predict(X_test)
pred_rf = rf.predict(X_test)


print("NB F1:",f1_score(y_test,pred_nb))
print("LR F1:",f1_score(y_test,pred_lr))
print("SVM F1:",f1_score(y_test,pred_svm))

print("RF F1:",f1_score(y_test,pred_rf))

NB F1: 0.9533898305084746
LR F1: 0.9604841580633677
SVM F1: 0.9657410746483952
RF F1: 0.9626436781609196


## Final Model Selection

- Amoung the all Model 'SVM' geting more F1-Score
- while choseing model select which gives high F1-Score

In [22]:
print(classification_report(y_test,pred_svm))

              precision    recall  f1-score   support

           0       0.85      0.68      0.75       214
           1       0.95      0.98      0.97      1365

    accuracy                           0.94      1579
   macro avg       0.90      0.83      0.86      1579
weighted avg       0.94      0.94      0.94      1579



## Save Model and Vectorizer

In [25]:
joblib.dump(svm,'/content/drive/MyDrive/Colab Notebooks/DataSet/sentiment_model.pkl')
joblib.dump(tfidf,'/content/drive/MyDrive/Colab Notebooks/DataSet/tfidf_vectorizer.pkl')

['/content/drive/MyDrive/Colab Notebooks/DataSet/tfidf_vectorizer.pkl']