# EDA, Preprocessing & Vectorization

## Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Product Reviews

In [3]:
product_reviews = pd.read_csv('/content/drive/MyDrive/AuthentiFeel/processed_data/product_reviews.csv')

### EDA

In [4]:
product_reviews.head()

Unnamed: 0,unique_id,asin,product_name,product_type,helpful,rating,title,date,reviewer,reviewer_location,review_text,positive_or_not
0,0785758968:one_of_the_best_crichton_novels:jos...,0785758968,Sphere: Books: Michael Crichton,books,0 of 1,5.0,One of the best Crichton novels,"July 1, 2006",Joseph M,"Colorado, USA",Sphere by Michael Crichton is an excellant nov...,1
1,0452279550:the_medicine_of_the_future:wafa_rashed,0452279550,Healing from the Heart: A Leading Surgeon Comb...,books,34 of 41,4.0,The Medicine of the Future,"November 6, 2002",Wafa Rashed,"Jabriya, KUWAIT",Dr. Oz is an accomplished heart surgeon in the...,1
2,"1599620065:beautiful!:sarah_silva_""sar""",1599620065,Mythology: DC Comics Art of Alex Ross 2007 Cal...,books,,5.0,Beautiful!,"June 13, 2006","Sarah Silva ""Sar""","San Diego, CA USA",The most gorgeous artwork in comic books. Cont...,1
3,0743277724:for_lovers_of_robicheaux:g._rousseau,0743277724,Pegasus Descending: A Dave Robicheaux Novel (D...,books,1 of 1,4.0,For lovers of Robicheaux,"November 2, 2006",G. Rousseau,"Finistere, France",This book is for lovers of Robicheaux. His de...,1
4,061318114X:excellent_and_broad_survey_of_the_d...,061318114X,"Guns, Germs, and Steel: The Fates of Human Soc...",books,7 of 9,5.0,Excellent and broad survey of the development ...,"October 6, 2006","Patrick D. Goonan ""www.meaningful-life.us""","Pleasanton, CA",This is going to be a short and sweet review b...,1


In [5]:
product_reviews.describe()

Unnamed: 0,rating,positive_or_not
count,8000.0,8000.0
mean,3.05725,0.5
std,1.727598,0.500031
min,1.0,0.0
25%,1.0,0.0
50%,3.0,0.5
75%,5.0,1.0
max,5.0,1.0


In [6]:
product_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   unique_id          8000 non-null   object 
 1   asin               8000 non-null   object 
 2   product_name       8000 non-null   object 
 3   product_type       8000 non-null   object 
 4   helpful            6701 non-null   object 
 5   rating             8000 non-null   float64
 6   title              8000 non-null   object 
 7   date               7999 non-null   object 
 8   reviewer           7833 non-null   object 
 9   reviewer_location  6674 non-null   object 
 10  review_text        8000 non-null   object 
 11  positive_or_not    8000 non-null   int64  
dtypes: float64(1), int64(1), object(10)
memory usage: 750.1+ KB


In [7]:
sns.heatmap(product_reviews.corr(),annot=True)

ValueError: could not convert string to float: '0785758968:one_of_the_best_crichton_novels:joseph_m'

In [None]:
sns.countplot(x='rating',data=product_reviews)

In [None]:
sns.countplot(x='product_type',data=product_reviews)

In [None]:
sns.countplot(x='positive_or_not',data=product_reviews)

In [None]:
product_reviews['date'] = pd.to_datetime(product_reviews['date'])

In [None]:
product_reviews['year'] = product_reviews['date'] .dt.year

In [None]:
sns.countplot(x='year',data=product_reviews)

In [None]:
product_reviews['review_length'] = product_reviews['review_text'].apply(len)

In [None]:
sns.histplot(product_reviews['review_length'])

In [None]:
all_words = ' '.join(product_reviews['review_text']).split(' ')
most_common_words = Counter(all_words).most_common(20)
most_common_words

### Preprocessing

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import random
from sklearn.preprocessing import normalize

In [9]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
def preprocess_text(doc):
    tokens = nltk.word_tokenize(doc.lower())
    #tokens = [word for word in tokens if word not in stopwords.words('english') and word not in string.punctuation]
    # tokens = [stemmer.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [11]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
data = product_reviews['review_text'].to_list()
target = product_reviews['positive_or_not'].to_list()
preprocessed_data = [preprocess_text(doc) for doc in data]

### Vectorization

In [12]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,ngram_range=(1,3),sublinear_tf=True)
tfidf_data = tfidf_vectorizer.fit_transform(preprocessed_data)

In [13]:
# processed_df = pd.DataFrame(tfidf_data.toarray(),columns = tfidf_vectorizer.get_feature_names_out())

In [14]:
# processed_df['label'] = target

# Hyperparameter Tuning

In [15]:
%%capture
!pip install catboost
!pip install xgboost

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [18]:
seed = 42
X_train, X_test, y_train, y_test=train_test_split(tfidf_data,target,random_state=seed)

## Logistic Regression

In [21]:
logistic_param_grid = { 'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
                        'C' : np.logspace(-4, 4, 10),
                        'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
                        'max_iter' : [100, 200,500]
                        }

logistic_search = RandomizedSearchCV(LogisticRegression(), logistic_param_grid, n_iter=75, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
logistic_search.fit(tfidf_data, target)

135 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

----------------------------

In [22]:
logistic_search.best_params_

{'solver': 'sag', 'penalty': 'l2', 'max_iter': 100, 'C': 10000.0}

In [23]:
logistic_search.best_score_

0.8397499999999999

In [24]:
# log_reg =  LogisticRegression(C=4.2813,max_iter=100,penalty='l2',solver='lbfgs')
# log_reg.fit(X_train,y_train)

In [25]:
# log_preds = log_reg.predict(X_test)
# f1_score(y_test,log_preds)

In [None]:
logistic_param_grid = { 'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
                        'C' : np.logspace(-4, 4, 10),
                        'solver' : ['lbfgs','liblinear','sag','saga'],
                        'max_iter' : [100, 200,500]
                        }

logistic_search = RandomizedSearchCV(LogisticRegression(), logistic_param_grid, n_iter=75, cv=5, scoring='precision', n_jobs=-1, random_state=42)
logistic_search.fit(tfidf_data, target)

In [None]:
logistic_search.best_params_

In [None]:
logistic_search.best_score_

In [None]:
logistic_param_grid = { 'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
                        'C' : np.logspace(-4, 4, 10),
                        'solver' : ['newton-cg'],
                        'max_iter' : [100, 200,500]
                        }

logistic_search = RandomizedSearchCV(LogisticRegression(), logistic_param_grid, n_iter=75, cv=5, scoring='precision', n_jobs=-1, random_state=42)
logistic_search.fit(tfidf_data, target)

In [None]:
logistic_search.best_params_

In [None]:
logistic_search.best_score_

## Multinomial NB

In [26]:
multinomial_param_grid = {
    'alpha': np.linspace(0, 1, 10),
    'fit_prior': [True, False]
}
multinomial_search = RandomizedSearchCV(MultinomialNB(), multinomial_param_grid, n_iter=75, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
multinomial_search.fit(tfidf_data, target)



In [27]:
multinomial_search.best_params_

{'fit_prior': True, 'alpha': 0.3333333333333333}

In [28]:
multinomial_search.best_score_

0.8465

In [29]:
# nb = MultinomialNB(alpha=1,fit_prior=True)
# nb.fit(X_train,y_train)

In [30]:
# nb_preds = nb.predict(X_test)
# f1_score(y_test,nb_preds)

## SVC

In [31]:
param_grid_linear = {
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [100, 200,500]
}

param_grid_rbf = {
    'kernel': ['rbf'],
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1, 10, 100]
}

search_linearsvc = RandomizedSearchCV(LinearSVC(dual=False), param_grid_linear,n_iter=75, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
search_rbfsvc = RandomizedSearchCV(SVC(), param_grid_rbf, n_iter=75, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)

### Linear

In [32]:
search_linearsvc.fit(tfidf_data, target)

150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 1223, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 1062, in _ge

In [33]:
search_linearsvc.best_params_

{'penalty': 'l2', 'max_iter': 100, 'loss': 'squared_hinge', 'C': 1}

In [34]:
search_linearsvc.best_score_

0.838875

### RBF

In [35]:
search_rbfsvc.fit(tfidf_data, target)



In [36]:
search_rbfsvc.best_params_

{'kernel': 'rbf', 'gamma': 0.01, 'C': 100}

In [37]:
search_rbfsvc.best_score_

0.837875

## Ensemble

In [17]:
param_grid_et = {
    'n_estimators': [100, 200,500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

param_grid_xgb = {
    'n_estimators': [100, 200,500],
    'max_depth': [6, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
}

param_grid_cb = {
    'iterations': [100, 200,500],
    'depth': [6, 10, 12],
    'learning_rate': [0.01, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5],
}
param_grid_lbgm = {
    "n_estimators": [100, 200,500],
    "learning_rate": [0.1, 0.07, 0.03],
    "num_leaves": [ 40, 20, 35],
    "max_depth": [-1, 10],
    "boosting_type": ["gbdt", "goss"],
    "min_child_samples": [10, 15]
}

In [18]:
np.random.seed(42)
et_clf = ExtraTreesClassifier(random_state=42)
xgb_clf = XGBClassifier(random_state=42)
cb_clf = CatBoostClassifier(random_state=42, verbose=0)
lbgm_clf= LGBMClassifier(random_state=42)
# Initialize GridSearchCV for each classifier
grid_search_et = RandomizedSearchCV(et_clf, param_grid_et, n_iter=75, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
grid_search_xgb = RandomizedSearchCV(xgb_clf, param_grid_xgb, n_iter=75, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
grid_search_cb = RandomizedSearchCV(cb_clf, param_grid_cb, n_iter=75, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
grid_search_lbgm = RandomizedSearchCV(lbgm_clf, param_grid_cb, n_iter=75, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)

### Extra Trees

In [19]:
grid_search_et.fit(tfidf_data, target)
print("Best parameters for Extra Trees:", grid_search_et.best_params_)



Best parameters for Extra Trees: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}


In [20]:
grid_search_et.best_score_

0.828125

In [21]:
# et_ypreds = grid_search_et.best_estimator_.predict(X_test)
# f1_score(y_test,et_ypreds)

### XGB

In [22]:
grid_search_xgb.fit(tfidf_data, target)
print("Best parameters for Extra Trees:", grid_search_xgb.best_params_)



TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
grid_search_xgb.best_score_

In [None]:
# et_ypreds = grid_search_xgb.best_estimator_.predict(X_test)
# f1_score(np.where(y_test == -1, 0, y_test),et_ypreds)

### CatBoost

In [None]:
grid_search_cb.fit(tfidf_data, target)
print("Best parameters for Extra Trees:", grid_search_cb.best_params_)

In [None]:
grid_search_cb.best_score_

In [None]:
# et_ypreds = grid_search_cb.best_estimator_.predict(X_test)
# f1_score(y_test,et_ypreds)

### LBGM

In [None]:
grid_search_lbgm.fit(tfidf_data, target)
print("Best parameters for LBGM:", grid_search_lbgm.best_params_)

In [None]:
grid_search_lbgm.best_score_

In [None]:
# et_ypreds = grid_search_et.best_estimator_.predict(X_test)
# f1_score(y_test,et_ypreds)