In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sqlite3
import pandas as pd

connection = sqlite3.connect('database.sqlite')

# polarisable_dataset = dataset that contains Score = {1,2,4,5} assuming Score = 3 implies neutral comments and
# Score < 3 implies negative comment and Score > 3 implies positive comment
df = pd.read_sql_query('select * from REVIEWS WHERE Score != 3', connection)
df.shape

(525814, 10)

In [3]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

if dataset includes Score = 3 rows -- <font color='red'>df[df.Score == 3]['Text'].shape ---Output: (42640,)</font>

In [4]:
scores = df['Score']

scores = scores.map(lambda x: 0 if x<3 else 1)

# polarised_scores.head()

df['Score'] = scores

## Cleaning 

In [5]:
df.duplicated(['UserId', 'Time']).sum()

197082

In [8]:
print(df.shape)
df = df.drop_duplicates(subset={'UserId', 'Time'}, keep='first', inplace=False)
print(df.shape)

(525814, 10)
(328732, 10)


In [9]:
# dataset cleaners

import re

def remove_html(sentence):
    html_tag_re_obj = re.compile('<.*>?')
    return re.sub(html_tag_re_obj, ' ', sentence)

def remove_punctuations(sentence):
    cleaned_sentence = re.sub(r'[^a-zA-Z]', r' ', sentence)
    return cleaned_sentence

In [10]:
# call the cleaners

# clean dataset

corpus = df['Text']

cleaned_corpus = []
for doc in corpus.values:
    cleaned_doc = remove_html(doc)
    cleaned_doc = remove_punctuations(cleaned_doc)
    cleaned_corpus.append(cleaned_doc)

df['Text'] = cleaned_corpus

In [17]:
## Removing stopwords

from nltk.corpus import stopwords

In [18]:
print(set(stopwords.words('english')))

{'further', "hasn't", "she's", 'again', 'yours', 'myself', 'itself', 'below', 't', "you'd", 'your', 'are', 'didn', 'with', 'wouldn', 'has', 'our', "shan't", 'will', 'when', 'too', 'my', 'this', 'who', 'in', "doesn't", 'we', 'me', 'how', 'ours', 'more', 'y', "won't", 'can', 'out', 'few', 'what', 'hadn', 'herself', 'she', "that'll", "aren't", "hadn't", 'i', 'because', 'from', 'only', 'some', 'have', 'll', 'shouldn', 'before', 've', 'ain', 'themselves', "you'll", 's', 'm', 're', "haven't", 'as', 'at', 'hasn', "wouldn't", "it's", 'her', 'nor', 'all', 'aren', 'was', 'they', 'o', 'between', 'you', 'through', 'yourself', 'is', 'needn', 'very', 'd', 'does', "couldn't", 'any', 'am', 'by', 'where', 'own', 'ourselves', 'an', 'them', 'did', 'ma', 'now', 'those', 'under', 'theirs', 'each', 'he', 'up', 'doing', "should've", 'these', 'no', "wasn't", 'couldn', 'other', 'weren', 'its', 'doesn', 'hers', "shouldn't", 'it', 'a', 'above', 'having', 'and', 'down', 'that', 'whom', 'for', 'his', 'their', 'had

In [19]:
## Since the negative food reviews are likely to contain words like "don't", "didn't", etc that impart important
## meaning to the review, we check if such words exist in the corpus that we have. If these words are in the corpus,
## then they should not be in the list of stop words that we use for removing the stopwords from our corpus

count = 0
for doc in df['Text']:
    if 'not' in doc:
        count += 1

print(count)

count = 0
for doc in df['Text']:
    if "don't" in doc:
        count += 1

print(count)

count = 0
for doc in df['Text']:
    if "didn't" in doc:
        count += 1

print(count)

104301
0
0


In [21]:
stopwords_list = stopwords.words('english')
print(type(stopwords))

<class 'nltk.corpus.reader.wordlist.WordListCorpusReader'>


In [23]:
## 'not' is present in 104301 docs in the corpus, so we modify the list of stopwords to not contain this word

stopwords_set = set(stopwords_list)

stopwords_set.remove('not')

In [24]:
print(stopwords_set)

{'further', "hasn't", "she's", 'again', 'yours', 'myself', 'itself', 'below', 't', "you'd", 'your', 'are', 'didn', 'with', 'wouldn', 'has', 'our', "shan't", 'will', 'when', 'too', 'my', 'this', 'who', 'in', "doesn't", 'we', 'me', 'how', 'ours', 'more', 'y', "won't", 'can', 'out', 'few', 'what', 'hadn', 'herself', 'she', "that'll", "aren't", "hadn't", 'i', 'because', 'from', 'only', 'some', 'have', 'll', 'shouldn', 'before', 've', 'ain', 'themselves', "you'll", 's', 'm', 're', "haven't", 'as', 'at', 'hasn', "wouldn't", "it's", 'her', 'nor', 'all', 'aren', 'was', 'they', 'o', 'between', 'you', 'through', 'yourself', 'is', 'needn', 'very', 'd', 'does', "couldn't", 'any', 'am', 'by', 'where', 'own', 'ourselves', 'an', 'them', 'did', 'ma', 'now', 'those', 'under', 'theirs', 'each', 'he', 'up', 'doing', "should've", 'these', 'no', "wasn't", 'couldn', 'other', 'weren', 'its', 'doesn', 'hers', "shouldn't", 'it', 'a', 'above', 'having', 'and', 'down', 'that', 'whom', 'for', 'his', 'their', 'had

In [25]:
## lower casing all docs in corpus (deduplicated_dataset['Text'])

lower_cased_docs = [doc.lower() for doc in df['Text']]
df['Text'] = lower_cased_docs

In [27]:
corpus = df['Text'] # corpus contains cleaned docs
print(type(corpus))

<class 'pandas.core.series.Series'>


In [28]:
docs_without_stop_words = []
for i, doc in enumerate(corpus):
    non_stop_words_in_doc = []
    for word in doc.split():
        if word not in stopwords_set:
            non_stop_words_in_doc.append(word)
    docs_without_stop_words.append(' '.join(non_stop_words_in_doc))

In [29]:
from nltk.stem import SnowballStemmer

In [30]:
## stemming 

stemmer = SnowballStemmer('english')

In [40]:
stemmer.stem('proved')

'prove'

In [43]:
# type(docs_without_stop_words)
len(docs_without_stop_words)

328732

In [39]:
stemmed_corpus = [] # docs with stemmed words
for doc in docs_without_stop_words:
    stemmed_words = []
    for word in doc.split():
        stemmed_words.append(stemmer.stem(word))
    stemmed_doc = ' '.join(stemmed_words)
    stemmed_corpus.append(stemmed_doc)

In [41]:
len(stemmed_corpus)

328732

In [None]:
print(stemmed_corpus[:5])

## TF-IDF

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
tfidf_vectorizer = TfidfVectorizer()

In [47]:
tfidf_vectorized_ = tfidf_vectorizer.fit_transform(stemmed_corpus)

In [51]:
len(stemmed_corpus[0].split(' '))

23

In [54]:
# type(tfidf_vectorized_) -- type sparse matrix
tfidf_vectorized_[0].shape

(1, 57059)

In [56]:
tfidf_vectorized_.shape

(328732, 57059)

In [60]:
tfidf_vectorized__subset = tfidf_vectorized_[:5000]

In [61]:
tfidf_vectorized__subset.shape

(5000, 57059)

In [68]:
df_subset = df.iloc[:5000]

In [70]:
df_subset['Score'].value_counts()

1    4191
0     809
Name: Score, dtype: int64

In [65]:
df['Score'].value_counts()

1    275721
0     53011
Name: Score, dtype: int64

### <font color='red'>Comment: </font> The dataset is imbalanced. Even the subset of the dataset is imbalanced

### Handling imbalance in the dataset

### <font color='blue'> 1. Model Selection</font>

#### <font color='green'>First:</font> XGBoost
The model has been chosen because:
1. Dataset is imbalanced: XGBoost comes with a predefined hyperparameter called scale_pos_weight that takes care of the imbalance in the dataset by using over/undersampling.
2. Ensemble learning technique: It is a boosting technique (not a bagging technique)
3. Expected to work better than other alggorithms for smaller datasets as well as high dimensional datasets.

In [None]:
### tune xgboost model using k-fold CV (k=5)

from sklearn.model_selection import GridSearchCV
params = {
  'min_child_weight':[5,6],
  'max_depth': range(3,10,2),
  'n_estimators':[150,200,300,400],
  'scale_pos_weight':[1,2,3,4],
  'colsample_bytree':[0.7,0.8], 
  'subsample':[0.7,0.8],
  'gamma':[0,0.2.0.4]
    
}

In [75]:
from xgboost import XGBClassifier

In [None]:
best_hyperparameter_values = GridSearchCV(estimator = XGBClassifier(objective= 'binary:logistic', nthread=4,
                                                                    random_state=27),
                                          param_grid = params, scoring='f1',n_jobs=4,iid=False, cv=5)

In [None]:
classifier_xgb = XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=4, min_child_weight=7, 
                      gamma=0.4,nthread=4, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic',scale_pos_weight=3,seed=29)
classifer_xg.fit(X_train, y_train)
y_pred = classifier_xg.predict(X_test)