In [1]:
import numpy as np
import pandas as pd
import sklearn 
import joblib
from sklearn.model_selection import train_test_split as split

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import StandardScaler


from sklearn.pipeline import Pipeline
from joblib import Memory
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn import metrics
import os

import mlflow

In [2]:
# Load the raw review data from 'data.csv' (path is relative to the notebook's working directory).
df = pd.read_csv('data.csv')

In [3]:
# Preview the first rows to check the columns and the shape of the text.
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [4]:
# Count the missing values in each column.
df.isnull().sum(axis=0)

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

In [5]:
# Rows without review text cannot be used; drop them and renumber the index from 0.
df = df.dropna(subset=['Review text']).reset_index(drop=True)

In [6]:
# Confirm the row count and per-column non-null counts after dropping rows with no review text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8510 entries, 0 to 8509
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8510 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.0+ KB


In [7]:
def func1(a):
    """Map a numeric rating to a sentiment label.

    Returns 'positive' when ``a`` is 2.5 or higher, otherwise 'negative'.
    """
    return 'positive' if a >= 2.5 else 'negative'
# Replace the numeric ratings with 'positive' / 'negative' labels.
# NOTE: this overwrites the column, so re-running the cell on the already
# labelled data would compare strings against 2.5 and fail.
df['Ratings'] = df['Ratings'].apply(func1)

In [8]:
def pre_processing(raw_text):
    """Normalise one review into a string of stemmed, stop-word-free tokens.

    Steps, in order: replace every non-letter character with a space,
    lower-case the text, split on whitespace, drop English stop words,
    Porter-stem the remaining tokens and join them with single spaces.

    Parameters
    ----------
    raw_text : str
        The raw review text.

    Returns
    -------
    str
        The processed tokens separated by single spaces (an empty string
        when no token survives the filtering).
    """
    # Evaluate the stop-word list once per call rather than once per token,
    # and keep it in a set so each membership test is a hash lookup.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # removing special characters, then lower-casing
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()

    # tokenizing and removing stop words
    kept_tokens = [token for token in letters_only.split() if token not in stop_words]

    # stemming
    return ' '.join(stemmer.stem(token) for token in kept_tokens)

# Identify Input and Output

In [9]:
# Model input is the review text; the target is the Ratings column.
x, y = df['Review text'], df['Ratings']

## Text Pre-processing

In [10]:
# Clean every review with pre_processing.
# NOTE: x is overwritten in place, so re-running this cell without first
# re-creating x would pre-process the already processed text a second time.
x = x.apply(pre_processing)

# Split the data into Train and Test 

In [11]:
# Hold out a test set; random_state is fixed so the same split is produced on every run.
x_train, x_test , y_train, y_test = split(x,y, random_state = 1)

In [12]:
# Check which labels are present in the held-out targets.
y_test.unique()

array(['positive', 'negative'], dtype=object)

In [13]:
def func1(a):
    """Encode a sentiment label as an integer: 'negative' -> 0, anything else -> 1.

    NOTE: this rebinds the name ``func1``, which earlier in the notebook
    mapped numeric ratings to labels; after this cell only the label -> int
    version is available.
    """
    return 0 if a == 'negative' else 1

In [14]:
# Select the MLflow experiment by name.
mlflow.set_experiment(experiment_name='Flipkart sentiment analysis')

<Experiment: artifact_location='file:///C:/Users/saile/Desktop/Untitled%20Folder/Innomatics/Projects/Internship/Project%204/mlruns/254744688573527451', creation_time=1712492046451, experiment_id='254744688573527451', last_update_time=1712492046451, lifecycle_stage='active', name='Flipkart sentiment analysis', tags={}>

In [15]:
import warnings

# Suppress every warning from this point on.
# NOTE(review): a blanket filter also hides warnings raised by the model
# fitting and scoring below; consider narrowing it.
warnings.filterwarnings(action='ignore')

# joblib cache on disk (under '.cache') used for intermediate pipeline results
cachedir = '.cache'
memory = Memory(verbose=0, location=cachedir)

# One two-step pipeline per algorithm: a CountVectorizer feeding a classifier.
# Every pipeline shares the same `memory` cache.
_classifiers = {
    'naive_bayes': MultinomialNB(),
    'decision_tree': DecisionTreeClassifier(),
    'logistic_regression': LogisticRegression(),
}

pipelines = {
    name: Pipeline(
        [('vectorization', CountVectorizer()), ('classifier', clf)],
        memory=memory,
    )
    for name, clf in _classifiers.items()
}

# Search space for each algorithm, keyed by the same names as `pipelines`.
# Each key addresses a pipeline step ('vectorization') or one of a step's
# parameters ('<step>__<parameter>').
param_grids = dict(
    naive_bayes=[
        dict(
            vectorization=[CountVectorizer()],
            vectorization__max_features=[1000, 1500, 2000],
            classifier__alpha=[1, 10],
        )
    ],
    decision_tree=[
        dict(
            vectorization=[CountVectorizer(), TfidfVectorizer()],
            vectorization__max_features=[1000, 1500, 2000],
            classifier__max_depth=[None, 5, 10],
        )
    ],
    logistic_regression=[
        dict(
            vectorization=[CountVectorizer(), TfidfVectorizer()],
            vectorization__max_features=[1000, 1500, 2000],
            classifier__C=[0.1, 1, 10],
            classifier__penalty=['elasticnet'],
            classifier__l1_ratio=[0.4, 0.5, 0.6],
            classifier__solver=['saga'],
            classifier__class_weight=['balanced'],
        )
    ],
)

# Perform GridSearchCV for each algorithm
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='f1', 
                               return_train_score=True,
                               verbose=1
                              )
    
    %time grid_search.fit(x_train, y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    print('Score on Test Data: ',   f1_score(y_test.apply(func1),np.vectorize(func1)(grid_search.predict(x_test))))

********** naive_bayes **********
Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: total: 1.17 s
Wall time: 1.77 s
Score on Test Data:  0.9540561827251247
********** decision_tree **********
Fitting 5 folds for each of 18 candidates, totalling 90 fits
CPU times: total: 6.48 s
Wall time: 9.51 s
Score on Test Data:  0.9231601731601732
********** logistic_regression **********
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 2min 20s
Wall time: 3min 10s
Score on Test Data:  0.9333698930627914
