In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from scipy.stats import ttest_rel
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# This cell only works inside Google Colab (google.colab is Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the Amazon unlocked-mobile reviews CSV from the mounted Drive folder
# into a DataFrame. The path is absolute and specific to this Drive layout.
file_path = '/content/drive/MyDrive/ML/Amazon_Unlocked_Mobile.csv'
data = pd.read_csv(file_path)

In [None]:
# Drop every row that has a missing value in any column, so the later
# .apply() calls on 'Reviews' and 'Rating' never see NaN.
# NOTE(review): this rebinds `data`; the raw frame is no longer available
# after this cell runs.
data = data.dropna()

In [None]:
# Regexes are compiled once instead of being looked up on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_RE = re.compile(r'[^\w\s]')

# Lazily filled cache so the NLTK corpus is read once, and only after
# nltk.download('stopwords') has had a chance to run.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise one review string for vectorisation.

    Steps, in order: strip HTML-like tags, remove digit runs, lowercase,
    remove every character that is neither a word character nor whitespace,
    then drop stop words and collapse whitespace to single spaces.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : container of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and cached as a frozenset. The previous implementation
        re-read the list for every single word and searched it linearly.

    Returns
    -------
    str
        The cleaned text, tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = _HTML_TAG_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    text = text.lower()
    # Punctuation is removed before stop-word filtering, as in the original,
    # so contractions such as "don't" are compared as "dont".
    text = _PUNCT_RE.sub('', text)
    return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
# Fetch the NLTK 'stopwords' corpus; clean_text reads it through
# stopwords.words('english'), so this must run before clean_text is called.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Apply clean_text to every review and store the result in a new column;
# the original 'Reviews' column is left untouched.
data['CleanedReviews'] = data['Reviews'].apply(clean_text)

In [None]:
def sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Ratings below 3 are 'negative', a rating equal to 3 is 'neutral', and
    everything else is 'positive'.
    """
    if rating < 3:
        return 'negative'
    return 'neutral' if rating == 3 else 'positive'

In [None]:
# Derive the three-class target label from the 'Rating' column.
data['Sentiment'] = data['Rating'].apply(sentiment)

In [None]:
# Features are the cleaned review strings; the target is the sentiment label.
X = data['CleanedReviews']
y = data['Sentiment']

In [None]:
# Hold out 20% of the reviews as the test set used by the classification reports.
# NOTE(review): the split is not stratified although the classes are clearly
# imbalanced (see the supports in the reports below) — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF over unigrams and bigrams, capped at 5000 features. The vectorizer is
# fitted on the training split only and merely applied to the test split.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Encode the string labels as integers; label_encoder.classes_ is reused later
# as target_names in the classification reports.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# 10-fold cross-validation with a single repeat, shared by every grid search.
# Model selection uses plain accuracy.
# NOTE(review): accuracy can be dominated by the majority class on imbalanced
# data — confirm this is the intended selection metric.
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=42)
scoring = make_scorer(accuracy_score)

In [None]:
from sklearn.decomposition import NMF

In [None]:
# Dimensionality reduction: factorise the TF-IDF matrix into 100 NMF components.
# NMF is fitted on the training split only and then applied to the test split.
nmf = NMF(n_components=100, random_state=42)
X_train_nmf = nmf.fit_transform(X_train_tfidf)
X_test_nmf = nmf.transform(X_test_tfidf)

In [None]:
# Handle imbalanced data: one SMOTE oversampler, passed as the first step of
# each model pipeline below.
smote = SMOTE(random_state=42)

## Naive Bayes

In [None]:
# Multinomial Naive Bayes on the NMF features, with SMOTE as the first
# pipeline step.
naive_bayes_pipeline = make_imb_pipeline(
    smote,
    MultinomialNB()
)

# Smoothing strengths to try; the 'multinomialnb__' prefix addresses the
# classifier step inside the pipeline.
naive_bayes_param_grid = {
    'multinomialnb__alpha': [0.1, 0.5, 1]
}

# Grid search using the shared CV splitter and accuracy scorer, on all cores.
naive_bayes_grid = GridSearchCV(naive_bayes_pipeline, param_grid=naive_bayes_param_grid, cv=cv, n_jobs=-1, scoring=scoring)
naive_bayes_grid.fit(X_train_nmf, y_train_encoded)

# Evaluate the fitted grid search on the held-out test split.
y_pred_nb = naive_bayes_grid.predict(X_test_nmf)
print('Naive Bayes Classification Report:')
print(classification_report(y_test_encoded, y_pred_nb, target_names=label_encoder.classes_))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       0.60      0.76      0.67     15609
     neutral       0.16      0.50      0.25      5228
    positive       0.95      0.64      0.77     46030

    accuracy                           0.66     66867
   macro avg       0.57      0.63      0.56     66867
weighted avg       0.81      0.66      0.70     66867



## Logistic Regression

In [None]:
# Logistic regression on the NMF features, with SMOTE as the first pipeline step.
logistic_regression_pipeline = make_imb_pipeline(
    smote,
    LogisticRegression(max_iter=1000)
)

# Three regularisation strengths crossed with L1/L2 penalties (six candidates);
# the solver is pinned to liblinear for every candidate.
logistic_regression_param_grid = {
    'logisticregression__C': [0.1, 1, 10],
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__solver': ['liblinear']
}

# Grid search using the shared CV splitter and accuracy scorer, on all cores.
logistic_regression_grid = GridSearchCV(logistic_regression_pipeline, param_grid=logistic_regression_param_grid, cv=cv, n_jobs=-1, scoring=scoring)
logistic_regression_grid.fit(X_train_nmf, y_train_encoded)

# Evaluate the fitted grid search on the held-out test split.
y_pred_lr = logistic_regression_grid.predict(X_test_nmf)
print('Logistic Regression Classification Report:')
print(classification_report(y_test_encoded, y_pred_lr, target_names=label_encoder.classes_))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.63      0.74      0.68     15609
     neutral       0.19      0.47      0.27      5228
    positive       0.94      0.73      0.82     46030

    accuracy                           0.71     66867
   macro avg       0.59      0.65      0.59     66867
weighted avg       0.81      0.71      0.75     66867



## SVM

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Linear SVM on the NMF features, with SMOTE as the first pipeline step.
svm_pipeline = make_imb_pipeline(
    smote,
    LinearSVC(max_iter=1000)
)

# Regularisation strengths to try; the 'linearsvc__' prefix addresses the
# classifier step inside the pipeline.
svm_param_grid = {
    'linearsvc__C': [0.1, 1, 10]
}

# Grid search using the shared CV splitter and accuracy scorer, on all cores.
svm_grid = GridSearchCV(svm_pipeline, param_grid=svm_param_grid, cv=cv, n_jobs=-1, scoring=scoring)
svm_grid.fit(X_train_nmf, y_train_encoded)

# Predict with the tuned SVM itself. This line previously called
# logistic_regression_grid.predict, so the recorded "SVM" report was an exact
# copy of the Logistic Regression report; re-run this cell to refresh it.
y_pred_svm = svm_grid.predict(X_test_nmf)
print('SVM Classification Report:')
print(classification_report(y_test_encoded, y_pred_svm, target_names=label_encoder.classes_))


SVM Classification Report:
              precision    recall  f1-score   support

    negative       0.63      0.74      0.68     15609
     neutral       0.19      0.47      0.27      5228
    positive       0.94      0.73      0.82     46030

    accuracy                           0.71     66867
   macro avg       0.59      0.65      0.59     66867
weighted avg       0.81      0.71      0.75     66867



In [None]:
# Show the raw cross-validation results (timings and per-split scores) for
# each Naive Bayes alpha candidate.
naive_bayes_grid.cv_results_

{'mean_fit_time': array([81.48358977, 81.06451187, 80.84771378]),
 'std_fit_time': array([1.571016  , 1.37620287, 2.18979706]),
 'mean_score_time': array([0.02157073, 0.01925163, 0.01694925]),
 'std_score_time': array([0.00460561, 0.00437758, 0.00406248]),
 'param_multinomialnb__alpha': masked_array(data=[0.1, 0.5, 1],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'multinomialnb__alpha': 0.1},
  {'multinomialnb__alpha': 0.5},
  {'multinomialnb__alpha': 1}],
 'split0_test_score': array([0.65786817, 0.65775601, 0.6574943 ]),
 'split1_test_score': array([0.65158709, 0.65136277, 0.65110106]),
 'split2_test_score': array([0.65562493, 0.65528844, 0.65517628]),
 'split3_test_score': array([0.65416682, 0.65409205, 0.65379295]),
 'split4_test_score': array([0.65999925, 0.66007403, 0.66067222]),
 'split5_test_score': array([0.65812988, 0.65820466, 0.65768124]),
 'split6_test_score': array([0.65491457, 0.65476502, 0.6545407 ]),
 'split7

In [None]:
# Show the raw cross-validation results (timings and per-split scores) for
# each Logistic Regression C/penalty candidate.
logistic_regression_grid.cv_results_

{'mean_fit_time': array([108.77438936,  94.7413918 , 114.90401936, 103.76044507,
        116.74946494, 112.88734701]),
 'std_fit_time': array([2.78675874, 1.48611977, 3.27518111, 1.80419136, 4.77597841,
        2.28772701]),
 'mean_score_time': array([0.02471251, 0.01785779, 0.02674751, 0.01927283, 0.01794608,
        0.01806889]),
 'std_score_time': array([0.01049227, 0.00372696, 0.00907771, 0.00641793, 0.00313742,
        0.00393502]),
 'param_logisticregression__C': masked_array(data=[0.1, 0.1, 1, 1, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_logisticregression__penalty': masked_array(data=['l1', 'l2', 'l1', 'l2', 'l1', 'l2'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_logisticregression__solver': masked_array(data=['liblinear', 'liblinear', 'liblinear', 'liblinear',
                    'liblinear', 'liblinear'],
   

In [None]:
# Show the raw cross-validation results (timings and per-split scores) for
# each LinearSVC C candidate.
svm_grid.cv_results_

{'mean_fit_time': array([101.65897124, 138.78668487, 486.85566885]),
 'std_fit_time': array([ 1.78086606,  1.12009002, 42.77678579]),
 'mean_score_time': array([0.02350414, 0.02026355, 0.02046311]),
 'std_score_time': array([0.00761205, 0.00595432, 0.00946834]),
 'param_linearsvc__C': masked_array(data=[0.1, 1, 10],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'linearsvc__C': 0.1}, {'linearsvc__C': 1}, {'linearsvc__C': 10}],
 'split0_test_score': array([0.70007851, 0.70733166, 0.70968707]),
 'split1_test_score': array([0.69753617, 0.70628482, 0.70744383]),
 'split2_test_score': array([0.70164878, 0.70908887, 0.71065914]),
 'split3_test_score': array([0.70138707, 0.70856545, 0.70998617]),
 'split4_test_score': array([0.70299473, 0.71080869, 0.71312671]),
 'split5_test_score': array([0.70179833, 0.7090141 , 0.7116686 ]),
 'split6_test_score': array([0.69791005, 0.70725689, 0.70837851]),
 'split7_test_score': array([0.70318167,