In [1]:
# Load the dataset

In [2]:
import pandas as pd

# Read the review dataset from a CSV located relative to the working
# directory, then preview its first ten rows as the cell output.
DATASET_CSV = 'reviews_badminton.csv'
data = pd.read_csv(DATASET_CSV)
data.head(n=10)

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1
5,Baji Sankar,Mind-blowing purchase,"Certified Buyer, Hyderabad",173.0,45.0,Oct 2018,Good quality product. Delivered on time.READ MORE,5
6,Flipkart Customer,Must buy!,"Certified Buyer, Doom Dooma",403.0,121.0,Jan 2020,BEST PURCHASE It is a good quality and is more...,5
7,Flipkart Customer,Classy product,"Certified Buyer, Panjim",59.0,13.0,Oct 2018,Good quality original shuttles.READ MORE,5
8,Jafar Qureshi,Great product,"Certified Buyer, Sheopur",14.0,1.0,Aug 2018,AwesomeREAD MORE,5
9,Bheemesh,Just wow!,"Certified Buyer, Kurnool",50.0,12.0,May 2018,nice original productsREAD MORE,5


In [3]:
# Print the DataFrame dimensions as a (rows, columns) tuple.
print(data.shape)

(8518, 8)


In [4]:
# Show the full, untruncated review text stored in the row labelled 0.
data.at[0, 'Review text']

'Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE'

In [5]:
# Show the full, untruncated review text stored in the row labelled 1.
data.at[1, 'Review text']

"They didn't supplied Yonex Mavis 350. Outside cover was Yonex Ad inside was a cheapest....  Sad to hear this.READ MORE"

In [6]:
# Print the column labels of the loaded DataFrame.
print(data.columns)

Index(['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes',
       'Down Votes', 'Month', 'Review text', 'Ratings'],
      dtype='object')


In [7]:
# Relative frequency of each value in the 'Ratings' column;
# normalize=True reports proportions rather than raw counts.
data['Ratings'].value_counts(normalize=True)

Ratings
5    0.596384
4    0.204978
1    0.090279
3    0.072200
2    0.036159
Name: proportion, dtype: float64

In [8]:
# Running the Experiment

In [9]:
# Import all the libraries
import pandas as pd
import numpy as np
import re

# For splitting train & test
from sklearn.model_selection import train_test_split 

# Data preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from textblob import TextBlob
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# K-fold cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# pipelines
from sklearn.pipeline import Pipeline

In [10]:
import warnings

# Install a catch-all "ignore" filter so no warning of any category is shown.
# NOTE(review): this also hides warnings raised while the models are fitted
# (for example solver convergence warnings), so problems there go unnoticed.
warnings.simplefilter('ignore')

In [11]:
# Define preprocessing functions
def preprocess_text(text):
    """Normalize one raw review into lowercase, space-separated word tokens.

    Steps, in order:
      1. Replace every non-word character (punctuation, symbols) with a space.
      2. Drop single ASCII letters standing alone between whitespace.
      3. Drop a single ASCII letter at the very start of the text.
      4. Collapse each run of whitespace into one space.
      5. Lowercase the result.

    Parameters
    ----------
    text : Any
        Raw review text. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) are treated as empty text.

    Returns
    -------
    str
        The cleaned text. A single leading or trailing space can remain where
        a removed character bordered the text; the result is not stripped.

    NOTE(review): the sample reviews displayed in this notebook end with a
    'READ MORE' marker. This function does not remove it, so it survives
    cleaning as the tokens 'read more'.
    """
    # Without this guard a missing review is stringified and ends up as the
    # literal token "nan" (or "none") in the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Anchor at the start of the string. The earlier pattern escaped the caret
    # (r'\^'), which looks for a literal '^' character; none can exist after
    # the \W substitution above, so the step never matched anything.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    The ``polarity`` field of TextBlob's sentiment result for ``text``
    (per TextBlob's documentation, a float in [-1.0, 1.0]).
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity

def _label_from_polarity(score):
    """Map a polarity score to 'Positive' (score > 0) or 'Negative' (otherwise)."""
    if score > 0:
        return 'Positive'
    return 'Negative'


# Clean the review text (overwriting the raw column), score the cleaned text,
# and derive the binary target from that score.
# NOTE(review): the target comes from TextBlob's polarity of the same text the
# models are trained on, not from the 'Ratings' column, so the classifiers
# learn to reproduce TextBlob's output. A score of exactly 0 is labelled
# 'Negative'.
data['Review text'] = data['Review text'].apply(preprocess_text)
data['sentiment_score'] = data['Review text'].apply(get_sentiment)
data['Feedback'] = data['sentiment_score'].apply(_label_from_polarity)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['Review text'],
    data['Feedback'],
    test_size=0.2,
    random_state=42,
)


In [12]:
# MLflow records the model-selection runs performed later in this notebook.
import mlflow

# Make this experiment the active one for subsequent runs. The call is the
# last expression in the cell, so the returned Experiment is displayed.
EXPERIMENT_NAME = "sentiment_analysis_reviews"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///C:/Users/hgpav/Innomatics_Internship/flask_backend/ML_Flow_Experiment_Tracking/mlruns/512294728043272040', creation_time=1711725792208, experiment_id='512294728043272040', last_update_time=1711725792208, lifecycle_stage='active', name='sentiment_analysis_reviews', tags={}>

In [13]:
# Define pipelines for each model
def _make_text_pipeline(classifier):
    """Build the shared text pipeline around ``classifier``.

    Every model uses the same three steps: 'vect' (CountVectorizer), 'tfidf'
    (TfidfTransformer) and 'classifier'. The step names are what the
    '<step>__<param>' keys of the parameter grids refer to. Each call creates
    fresh vectorizer and transformer instances, so pipelines share no state.
    """
    return Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('classifier', classifier),
    ])


# Random Forest and Logistic Regression (searched with the 'saga' solver in
# this notebook) involve randomness during fitting. Seeding them, with the same
# value the train/test split uses, makes a rerun reproduce the same models.
pipelines = {
    'Logistic Regression': _make_text_pipeline(LogisticRegression(random_state=42)),
    'Random Forest': _make_text_pipeline(RandomForestClassifier(random_state=42)),
    'Support Vector Machine': _make_text_pipeline(SVC()),
    'Multinomial Naive Bayes': _make_text_pipeline(MultinomialNB()),
}

# Define parameter grids for each model
# Keys follow the '<pipeline step>__<parameter>' convention, so 'vect__...'
# targets the CountVectorizer step and 'classifier__...' targets the estimator.

# Vocabulary-size cap explored for the 'vect' step of every model.
_max_features_grid = [1000, 1500, 2000, 5000]

param_grids = {
    'Logistic Regression': dict(
        classifier__C=[0.1, 1.0, 10.0],
        classifier__penalty=['elasticnet'],
        vect__max_features=list(_max_features_grid),
        classifier__l1_ratio=[0.4, 0.5, 0.6],
        classifier__solver=['saga'],
        classifier__class_weight=['balanced'],
    ),
    'Random Forest': dict(
        classifier__n_estimators=[50, 100, 200],
        vect__max_features=list(_max_features_grid),
        classifier__max_depth=[None, 10, 20],
    ),
    'Support Vector Machine': dict(
        classifier__C=[0.1, 1.0, 10.0],
        vect__max_features=list(_max_features_grid),
        classifier__kernel=['linear', 'rbf'],
    ),
    'Multinomial Naive Bayes': dict(
        classifier__alpha=[0.1, 0.5, 1.0],
        vect__max_features=list(_max_features_grid),
    ),
}

In [14]:
best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='f1_micro',
                               return_train_score=True,
                               verbose=1
                              )
     
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train, y_train)    
         
    print('Train F1 Score: ', grid_search.best_score_)
    y_pred = grid_search.best_estimator_.predict(X_test)
    test_f1_score = f1_score(y_test, y_pred, average='micro')
    print('Test F1 Score: ', test_f1_score)
    best_models[algo] = grid_search.best_estimator_
    print()


********** Logistic Regression **********


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Fitting 5 folds for each of 36 candidates, totalling 180 fits
CPU times: total: 2min 21s
Wall time: 7min 15s
Train F1 Score:  0.9762255670365212
Test F1 Score:  0.9882629107981221

********** Random Forest **********




Fitting 5 folds for each of 36 candidates, totalling 180 fits
CPU times: total: 4min 16s
Wall time: 12min 15s
Train F1 Score:  0.981362482129448
Test F1 Score:  0.9876760563380281

********** Support Vector Machine **********




Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: total: 1min 53s
Wall time: 2min 12s
Train F1 Score:  0.9823898436010227
Test F1 Score:  0.9900234741784038

********** Multinomial Naive Bayes **********




Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: total: 8.84 s
Wall time: 18.6 s
Train F1 Score:  0.9694742421646987
Test F1 Score:  0.9812206572769953



In [15]:
# Switch scikit-learn autologging off again once the experiment loop has
# finished (disable=True), so it is no longer active for later cells.

mlflow.sklearn.autolog(disable=True)