In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pickle
import os
import numpy as np
import pandas as pd

In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from sklearn.metrics import f1_score

### Directory to save pickle files

In [5]:
if not os.path.exists('pickle_files'):
    os.mkdir('pickle_files')

### Function to return np array form of the data passed

In [6]:
def convert_to_np_array(data):
    return np.array(data)

### Function to split data into training and test sets

In [7]:
### X and y, before being passed to this function must be converted to numpy array or must be sparse matrices\
### for consistency throughout the program.

### y will be a numpy vector because y-values originally are stored in a column of the original dataframe (ie, it\
### will be of type pd.Series. When converted into np-array, it will become a numpy 1D array, i.e a column vector)

def train_test_splitter(X, y, test_size = 0.2):
    train_size = 1 - test_size
    
    train_row_upper_index = round(train_size*X.shape[0])
    test_row_lower_index = train_row_upper_index + 1
    
    if(X.ndim == 1):
        X = X.reshape((X.shape[0], 1))
    y = y.reshape((y.shape[0], 1)) # y is 
    
    X_train = X[:train_row_upper_index + 1, :]
    X_test = X[test_row_lower_index:, :]
    
    y_train = y[:train_row_upper_index + 1]
    y_test = y[test_row_lower_index:]
    
    return X_train, y_train, X_test, y_test

### Hyperparameter (k) tuning function

In [122]:
### upper_limit - the max value of k upto which checks have to be made.
### returns the best 'k'.

def tune_k(upper_limit, X_train, y_train, X_cv, y_cv, algorithm, save_name):
    
    best_k = None
    f1score_max = 0
    f1score = None
    
    for i in range(1, upper_limit, 2):
#         print(i)
        knn = KNeighborsClassifier(n_neighbors = i, algorithm = algorithm)
        knn = knn.fit(X_train, y_train)
        
        with open('pickle_files/' + save_name, 'wb') as knn_pickle:
            pickle.dump(knn, knn_pickle)
        
        y_pred = knn.predict(X_cv)
        
        f1score = f1_score(y_cv, y_pred)
#         print('f1' + str(f1score))
        
        if f1score > f1score_max:
            f1score_max = f1score
            print('dd' + str(i))
            best_k = i
            with open('pickle_files/' + save_name, 'wb') as knn_pickle:
                pickle.dump(knn, knn_pickle) ## if file already exists, overwrite it with the knn with best k
            
    return best_k

### Import dataset

In [9]:
import sqlite3

In [10]:
db_connection = sqlite3.connect('database.sqlite')

In [11]:
### read only those rows which have rating score != 3, because for this problem we assume those will be neutral \
### reviews
df = pd.read_sql_query('select * from reviews where Score != 3', db_connection)

In [12]:
### taking only first 10k rows because processing large number of rows on my laptop is not possible
df = df.iloc[:10000, :]

In [13]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

### Replacing the ratings with 0 (for negative reviews) and 1 (for positive reviews).
#### Score of  >3 has been considered as positive and a score of <3 has been taken as negative

In [14]:
type(df['Score'])

pandas.core.series.Series

In [15]:
scores = df['Score']

In [16]:
scores[6:12]

6     5
7     5
8     5
9     5
10    5
11    5
Name: Score, dtype: int64

In [17]:
scores = list(map(lambda x: 0 if x<3 else 1, scores))

In [18]:
scores[6:12]

[1, 1, 1, 1, 1, 1]

In [19]:
df['Score'] = scores

In [20]:
type(df['Score'].head(2))

pandas.core.series.Series

### Data preprocessing

##### 1. Deduplication
If a user id has multiple entries for the same timestamp, then it should be removed because it is likely that multiple entries at the same timestamp were for the same product of different variety which has a different product id than other variants


In [21]:
df.duplicated(subset = ['UserId', 'Time']).sum()

527

In [22]:
deduplicated_df = df.drop_duplicates(subset = ['UserId', 'Time'], inplace = False, keep = 'first')

#### 2. Extracting the data needed (corpus)
#### And removing html and punctuations

In [23]:
corpus = deduplicated_df['Text']

In [24]:
# dataset cleaners

import re

def remove_html(sentence):
    html_tag_re_obj = re.compile('<.*>?')
    return re.sub(html_tag_re_obj, ' ', sentence)

def remove_punctuations(sentence):
    cleaned_sentence = re.sub(r'[^a-zA-Z]', r' ', sentence)
    return cleaned_sentence

In [25]:
cleaned_corpus = []
for doc in corpus:
    cleaned_doc_1 = remove_html(doc)
    cleaned_doc_2 = remove_punctuations(doc)
    cleaned_corpus.append(cleaned_doc_2)

#### 3. Removing stop words

In [26]:
## Since the negative food reviews are likely to contain words like "don't", "didn't", etc that impart important
## meaning to the review, we check if such words exist in the corpus that we have. If these words are in the corpus,
## then they should not be in the list of stop words that we use for removing the stopwords from our corpus

count = 0
for doc in cleaned_corpus:
    if "not" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "don't" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "didn't" in doc:
        count += 1

print(count)

3544
0
0


In [27]:
from nltk.corpus import stopwords

In [28]:
stopwords = stopwords.words('english')

In [29]:
stopwords = set(stopwords)

In [30]:
stopwords.remove('not')

In [31]:
'not' in stopwords

False

In [32]:
a = [1,2,3,0,1,0,5]

In [33]:
# filtered_corpus = corpus with docs having no stop words
# doing with the sexy lambda expression

filtered_corpus = list(map(lambda doc: ' '.join(list(filter(lambda word: True if word not in stopwords else False\
                                                            , doc.split()))), corpus))

In [34]:
len(filtered_corpus)

9473

In [35]:
filtered_corpus[:2]

['I bought several Vitality canned dog food products found good quality. The product looks like stew processed meat smells better. My Labrador finicky appreciates product better most.',
 'Product arrived labeled Jumbo Salted Peanuts...the peanuts actually small sized unsalted. Not sure error vendor intended represent product "Jumbo".']

In [36]:
### classical way of removing the lambda expressions
### verified the output of lambda expression output with the output of following implementation, outputs are same
# docs_without_stop_words = []
# for i, doc in enumerate(corpus):
#     non_stop_words_in_doc = []
#     for word in doc.split():
#         if word not in stopwords:
#             non_stop_words_in_doc.append(word)
            
    
#     docs_without_stop_words.append(' '.join(non_stop_words_in_doc))

#### 4. Stemming the words (SnowballStemmer)

In [37]:
from nltk.stem import SnowballStemmer

In [38]:
stemmer = SnowballStemmer('english')

In [39]:
stemmed_filtered_corpus = list(map(lambda doc: ' '.join(list(map(stemmer.stem, doc.split()))), corpus))

In [40]:
stemmed_filtered_corpus[:3]

['i have bought sever of the vital can dog food product and have found them all to be of good quality. the product look more like a stew than a process meat and it smell better. my labrador is finicki and she appreci this product better than most.',
 'product arriv label as jumbo salt peanuts...th peanut were actual small size unsalted. not sure if this was an error or if the vendor intend to repres the product as "jumbo".',
 'this is a confect that has been around a few centuries. it is a light, pillowi citrus gelatin with nut - in this case filberts. and it is cut into tini squar and then liber coat with powder sugar. and it is a tini mouth of heaven. not too chewy, and veri flavorful. i high recommend this yummi treat. if you are familiar with the stori of c.s. lewi "the lion, the witch, and the wardrobe" - this is the treat that seduc edmund into sell out his brother and sister to the witch.']

## Sorting the dataset according to Time

In [41]:
deduplicated_df['Text'] = stemmed_filtered_corpus

In [42]:
working_df = deduplicated_df

In [43]:
working_df_sorted = working_df.sort_values(by = 'Time')

In [44]:
stemmed_filtered_corpus_sorted = working_df_sorted['Text']

In [45]:
scores = deduplicated_df['Score']

In [46]:
print(stemmed_filtered_corpus_sorted.shape, scores.shape, sep = '\n')

(9473,)
(9473,)


## Review vectorization and training & testing

In [47]:
### making a numpy array for the stemmed_filtered_corpus_sorted and saving it in variable corpus so that writing\
### codes will be easier. converting scores to numpy array as well and saving it in a variable of same name.

corpus = convert_to_np_array(stemmed_filtered_corpus_sorted)
scores = convert_to_np_array(scores)

### Splitting corpus into train, cv, and test sets

In [48]:
X_train_nought, y_train_nought, X_test, y_test = train_test_splitter(corpus, scores, test_size = 0.2)

In [49]:
X_train, y_train, X_cv, y_cv = train_test_splitter(X_train_nought, y_train_nought, test_size = 0.2)

### 1. Bag of Words (CountVectorizer)
##### Note: Vectorization must be done on training set only, not on test set or the cross validation set!

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
if not os.path.exists('pickle_files/bow_pickles'):
    os.mkdir('pickle_files/bow_pickles')
    
if os.path.exists('pickle_files/bow_pickles/document_term_matrix.pkl'):
    with open('pickle_files/bow_pickles/document_term_matrix.pkl', 'rb') as dtm_pickle:
        X_train_bow = pickle.load(dtm_pickle)
        
else:
    count_vectorizer = CountVectorizer()
    
#     fit() method takes 1D array (m,). train_test_splitter() returns (m,n) array. ravel() converts it into (m,)
    count_vectorizer = count_vectorizer.fit(X_train.ravel())
    X_train_bow = count_vectorizer.transform(X_train.ravel()) # document_term_matrix is saved as X_train_bow
    with open('pickle_files/bow_pickles/document_term_matrix.pkl', 'wb') as dtm_pickle:
        pickle.dump(X_train_bow, dtm_pickle)    

### 2. TfIdf (TfIdfVectorizer)

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
if not os.path.exists('pickle_files/tfidf_pickles'):
    os.mkdir('pickle_files/tfidf_pickles')
    
if os.path.exists('pickle_files/tfidf_pickles/document_term_matrix.pkl'):
    with open('pickle_files/tfidf_pickles/document_term_matrix.pkl', 'rb') as dtm_pickle:
        X_train_tfidf = pickle.load(dtm_pickle)
        
else:
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer = tfidf_vectorizer.fit(X_train.ravel())
    X_train_tfidf = tfidf_vectorizer.transform(X_train.ravel())
    with open('pickle_files/tfidf_pickles/document_term_matrix.pkl', 'wb') as dtm_pickle:
        pickle.dump(X_train_tfidf, dtm_pickle)

### 3. Word2Vec

In [55]:
from gensim.models import Word2Vec

In [61]:
if not os.path.exists('pickle_files/word2vec_pickles/'):
    os.mkdir('pickle_files/word2vec_pickles/')

if os.path.exists('pickle_files/word2vec_pickles/w2v_model.pkl'):
    with open('pickle_files/word2vec_pickles/w2v_model.pkl', 'rb') as w2v_pickle:
        w2v = pickle.load(w2v_pickle)
else:
    X_train_for_Word2Vec = []
    for sentence in X_train.ravel():
        X_train_for_Word2Vec.append(sentence.split())
    
    w2v = Word2Vec(X_train_for_Word2Vec, min_count = 1, size = 50, workers = 4)
    with open('pickle_files/word2vec_pickles/w2v_model.pkl', 'wb') as w2v_pickle:
        pickle.dump(w2v, w2v_pickle)

#### 3.1. Avg Word2Vec

In [62]:
if os.path.exists('pickle_files/word2vec_pickles/avgw2v.pkl'):
    with open('pickle_files/word2vec_pickles/avgw2v.pkl', 'rb') as avgw2v_pickle:
        X_train_avgw2v = pickle.load(avgw2v_pickle)

else:
    X_train_avgw2v = []
    for sentence in X_train_for_Word2Vec:
        sum_of_words = 0
        for word in sentence:
            sum_of_words += w2v.wv[word]
        X_train_avgw2v.append(sum_of_words/len(sentence))
        
    with open('pickle_files/word2vec_pickles/avgw2v.pkl', 'wb') as avgw2v_pickle:
        pickle.dump(X_train_avgw2v, avgw2v_pickle)

#### 3.2. TfIdf Word2Vec

In [66]:
if os.path.exists('pickle_files/word2vec_pickles/tfidfw2v.pkl'):
    with open('pickle_files/word2vec_pickles/tfidfw2v.pkl', 'rb') as tfidfw2v_pickle:
        X_train_tfidfw2v = pickle.load(tfidfw2v_pickle)

else:
    X_train_tfidfw2v = []
    for sentence in X_train_for_Word2Vec:
        tfidf_weighted_sum = 0
        for word in sentence:
            if word not in tfidf_vectorizer.vocabulary_ or word not in w2v.wv:
                continue
            tfidf_weighted_sum += tfidf_vectorizer.vocabulary_[word] * w2v.wv[word]
        X_train_tfidfw2v.append(tfidf_weighted_sum)
        with open('pickle_files/word2vec_pickles/tfidfw2v.pkl', 'wb') as tfidfw2v_pickle:
            pickle.dump(X_train_tfidfw2v, tfidfw2v_pickle)

#### Note: At this point, X_train using all the vectorization techniques have been computed. Next is simply training the K-NNmodel.

### K-NN for BoW (CountVectorizer) representation

In [69]:
### transform X_cv using the CountVectorizer fitted on training data
X_cv_bow = count_vectorizer.transform(X_cv.ravel())

In [76]:
### transform X_test using the CountVectorizer fitted on training data
X_test_bow = count_vectorizer.transform(X_test.ravel())

* #### K-NN using brute force algorithm

In [73]:
### get best k (hyperparameter) using brute force algorithm; tune_k also saves the knn object with best k
k = tune_k(upper_limit = 30, X_train = X_train_bow, y_train = y_train, X_cv = X_cv_bow, y_cv = y_cv, \
           algorithm = 'brute', save_name = 'bow_pickles/brute_knn.pkl')

In [75]:
### load the k-nn with best k (for brute force algorithm)
with open('pickle_files/bow_pickles/brute_knn.pkl', 'rb') as knn_pickle:
    knn = pickle.load(knn_pickle)

In [77]:
y_pred = knn.predict(X_test_bow)

In [78]:
f1_score(y_test, y_pred) * 100

90.55186362323029

* #### K-NN using kd-tree algorithm

In [79]:
### get the best k (hyperparameter) using kd-tree algorithm; tune_k also saves the knn object with best k
k = tune_k(upper_limit = 30, X_train = X_train_bow, y_train = y_train, X_cv = X_cv_bow, y_cv = y_cv, \
          algorithm = 'kd_tree', save_name = 'bow_pickles/kd_tree_knn.pkl')

In [80]:
### load the k-nn with best k (for kd_tree algorithm)
with open('pickle_files/bow_pickles/kd_tree_knn.pkl', 'rb') as knn_pickle:
    knn = pickle.load(knn_pickle)

In [81]:
y_pred = knn.predict(X_test_bow)

In [82]:
f1_score(y_test, y_pred) * 100

90.55186362323029

### K-NN for TfIdf (TfidfVectorizer)

In [84]:
### transform X_cv using the TfidfVectorizer trained on training data
X_cv_tfidf = tfidf_vectorizer.transform(X_cv.ravel())

In [85]:
### transform X_test using the TfidfVectorizer trained on training data
X_test_tfidf = tfidf_vectorizer.transform(X_test.ravel())

* #### K-NN using brute force algorithm

In [88]:
k = tune_k(upper_limit = 30, X_train = X_train_tfidf, y_train = y_train, X_cv = X_cv_tfidf, y_cv = y_cv, \
          algorithm = 'brute', save_name = 'tfidf_pickles/knn_brute.pkl')

In [90]:
### load the k-nn with best k (brute force algorithm)
with open('pickle_files/tfidf_pickles/knn_brute.pkl', 'rb') as knn_pickle:
    knn = pickle.load(knn_pickle)

In [92]:
y_pred = knn.predict(X_test_tfidf)

In [93]:
f1_score(y_test, y_pred) * 100

90.55186362323029

* #### K-NN using kd_tree algorithm

In [103]:
k = tune_k(upper_limit = 30, X_train = X_train_tfidf, y_train = y_train, X_cv = X_cv_tfidf, y_cv = y_cv, \
          algorithm = 'kd_tree', save_name = 'tfidf_pickles/knn_kd_tree.pkl')

In [95]:
### load the k-nn with best k (kd_tree algorithm)
with open('pickle_files/tfidf_pickles/knn_kd_tree.pkl', 'rb') as knn_pickle:
    knn = pickle.load(knn_pickle)

In [96]:
y_pred = knn.predict(X_test_tfidf)

In [97]:
f1_score(y_test,y_pred) * 100

90.55186362323029

### K-NN using Word2Vec

1. #### K-NN using Average W2V representation

In [99]:
### compute average w2v represenation for X_cv

X_cv_avgw2v = []
for sentence in X_cv.ravel():
    sum_of_words = 0
    for word in sentence:
        if word not in w2v.wv:
            continue
        sum_of_words += w2v.wv[word]
    X_cv_avgw2v.append(sum_of_words/len(sentence))

In [100]:
### compute average w2v representation for X_test

X_test_avgw2v = []
for sentence in X_test.ravel():
    sum_of_words = 0
    for word in sentence:
        if word not in w2v.wv:
            continue
        sum_of_words += w2v.wv[word]
    X_test_avgw2v.append(sum_of_words/len(sentence))

* #### K-NN using brute force algorithm

In [115]:
k = tune_k(upper_limit = 30, X_train = X_train_avgw2v, y_train = y_train, X_cv = X_cv_avgw2v, y_cv = y_cv, \
          algorithm = 'brute', save_name = 'word2vec_pickles/brute_knn.pkl')

1
dd1
3
5
7
9
11
13
15
17
19
21
23
25
27
29


In [113]:
print(k)

1


In [105]:
### load the knn with best k (brute algo)

with open('pickle_files/word2vec_pickles/brute_knn.pkl', 'rb') as knn_pickle:
    knn = pickle.load(knn_pickle)

In [108]:
y_pred = knn.predict(X_test_avgw2v)

In [109]:
f1_score(y_test, y_pred) * 100

90.55186362323029

* #### K-NN using kd_tree algorithm

In [123]:
k = tune_k(upper_limit = 30, X_train = X_train_avgw2v, y_train = y_train, X_cv = X_cv_avgw2v, y_cv = y_cv, \
          algorithm = 'kd_tree', save_name = 'word2vec_pickles/kd_tree_knn.pkl')

dd1


In [124]:
### load the knn with best k (kd_tree algo)

with open('pickle_files/word2vec_pickles/kd_tree_knn.pkl', 'rb') as knn_pickle:
    knn = pickle.load(knn_pickle)

In [125]:
y_pred = knn.predict(X_test_avgw2v)

In [126]:
f1_score(y_test, y_pred)

0.9055186362323029