In [1]:
import warnings
warnings.filterwarnings('ignore')

## Import the dataset

In [2]:
import sqlite3
import pandas as pd

In [3]:
import os

In [4]:
os.listdir()

['KNN_Amazon_fine_food_dataset-Copy1.ipynb',
 'tutorial_ipynbs',
 'w2v_practice.model',
 '.gitignore.swp',
 '.gitignore',
 'database.sqlite',
 '.ipynb_checkpoints',
 'KNN_Amazon_fine_food_dataset.ipynb']

In [5]:
db_connection = sqlite3.connect('database.sqlite')

In [6]:
polarisable_dataset = pd.read_sql_query('select * from reviews where Score != 3', db_connection)

In [7]:
polarisable_dataset.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [8]:
polarisable_dataset['Time'].head()

0    1303862400
1    1346976000
2    1219017600
3    1307923200
4    1350777600
Name: Time, dtype: int64

In [9]:
polarisable_dataset.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [10]:
df = polarisable_dataset # just to make typing easier

In [11]:
sampled_df = df.sample(n = 5000, axis = 0, random_state = 0) # random sampling of dataset

In [12]:
sampled_df.shape

(5000, 10)

## Replacing the ratings with 0 (for negative reviews) and 1 (for positive reviews).
#### Score of >3 has been considered as positive and a score of <3 has been taken as negative

In [13]:
type(sampled_df['Score'])

pandas.core.series.Series

In [14]:
scores = sampled_df['Score']

In [15]:
scores[6:12]

161937    5
451123    5
401436    5
396416    5
449554    1
379099    1
Name: Score, dtype: int64

In [16]:
scores = list(map(lambda x: 0 if x<3 else 1, scores))

In [17]:
scores[6:12]

[1, 1, 1, 1, 0, 0]

In [18]:
sampled_df['Score'] = scores

In [19]:
type(sampled_df['Score'].head(2))

pandas.core.series.Series

### Data preprocessing

##### 1. Deduplication
If a user id has multiple entries for the same timestamp, then it should be removed because it is likely that multiple entries at the same timestamp were for the same product of different variety which has a different product id than other variants


In [20]:
sampled_df.duplicated(subset = ['UserId', 'Time']).sum()

62

In [21]:
sampled_deduplicated_df = sampled_df.drop_duplicates(subset = ['UserId', 'Time'], inplace = False, keep = 'first')

#### 2. Extracting the data needed (corpus)
#### And removing html and punctuations

In [22]:
corpus = sampled_deduplicated_df['Text']

In [23]:
# dataset cleaners

import re

def remove_html(sentence):
    html_tag_re_obj = re.compile('<.*>?')
    return re.sub(html_tag_re_obj, ' ', sentence)

def remove_punctuations(sentence):
    cleaned_sentence = re.sub(r'[^a-zA-Z]', r' ', sentence)
    return cleaned_sentence

In [24]:
cleaned_corpus = []
for doc in corpus:
    cleaned_doc_1 = remove_html(doc)
    cleaned_doc_2 = remove_punctuations(doc)
    cleaned_corpus.append(cleaned_doc_2)

#### 3. Removing stop words

In [25]:
## Since the negative food reviews are likely to contain words like "don't", "didn't", etc that impart important
## meaning to the review, we check if such words exist in the corpus that we have. If these words are in the corpus,
## then they should not be in the list of stop words that we use for removing the stopwords from our corpus

count = 0
for doc in cleaned_corpus:
    if "not" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "don't" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "didn't" in doc:
        count += 1

print(count)

1890
0
0


In [26]:
from nltk.corpus import stopwords

In [27]:
stopwords = stopwords.words('english')

In [28]:
stopwords = set(stopwords)

In [29]:
stopwords.remove('not')

In [30]:
'not' in stopwords

False

In [31]:
a = [1,2,3,0,1,0,5]

In [32]:
# filtered_corpus = corpus with docs having no stop words
# doing with the sexy lambda expression

filtered_corpus = list(map(lambda doc: ' '.join(list(filter(lambda word: True if word not in stopwords else False\
                                                            , doc.split()))), corpus))

In [33]:
len(filtered_corpus)

4938

In [34]:
filtered_corpus[:2]

["We pretty much given GF pasta restaurant. It's really good disintegrate toss sauce. My celiac husband thrilled! FYI, first ingredient CORN, quinoa second, not QUITE nutritious might think -- cares!",
 'I bought first SproutMaster years ago Dowling Orchard Market passing Banning, California (back sold them).<br /><br />Through bit pricey plastic tray, I cannot complain ease use quality sprouts grown. The divider IS helpful grow different sprouts, BUT helpful grow type sprout two different stages growth. This way, always fresh sprouts handy anytime.<br /><br />They grow fast Sprout Master makes simple care them.']

In [35]:
### classical way of removing the lambda expressions
### verified the output of lambda expression output with the output of following implementation, outputs are same
# docs_without_stop_words = []
# for i, doc in enumerate(corpus):
#     non_stop_words_in_doc = []
#     for word in doc.split():
#         if word not in stopwords:
#             non_stop_words_in_doc.append(word)
            
    
#     docs_without_stop_words.append(' '.join(non_stop_words_in_doc))

#### 4. Stemming the words (SnowballStemmer)

In [36]:
from nltk.stem import SnowballStemmer

In [37]:
stemmer = SnowballStemmer('english')

In [38]:
stemmed_filtered_corpus = list(map(lambda doc: ' '.join(list(map(stemmer.stem, doc.split()))), corpus))

In [39]:
stemmed_filtered_corpus[:3]

["we had pretti much given up on gf pasta until we had this in a restaurant. it realli good and doesn't disintegr when you toss it with sauce. my celiac husband is thrilled! fyi, the first ingredi is corn, and quinoa is second, so it not quit as nutriti as you might think -- but who cares!",
 'i bought my first sproutmast year ago at the dowl orchard market when pass through banning, california (back when they sold them).<br /><br />through a bit pricey for a plastic tray, i cannot complain about the eas of use or the qualiti of the sprout grown. the divid is help to grow differ sprouts, but it is more help to grow the same type of sprout in two differ stage of growth. this way, you can alway have fresh sprout handi at anytime.<br /><br />they do grow fast and the sprout master make it so simpl to care for them.',
 'for bold coffe fan like us, this is absolut delici coffee. it has a complex taste, a hint of someth resembl red wine. i almost hesit to review it becaus it will probabl cau

## Sorting the dataset according to Time

In [40]:
sampled_deduplicated_df['Text'] = stemmed_filtered_corpus

In [41]:
working_df = sampled_deduplicated_df

In [42]:
working_df_sorted = working_df.sort_values(by = 'Time')

In [43]:
stemmed_filtered_corpus_sorted = working_df_sorted['Text']

## Vectorizing the reviews and splitting into train, cv and test sets and TRAINING and TESTING

### 1.1. Bag of Words (CountVectorizer)

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
count_vectorizer = CountVectorizer()

In [46]:
document_term_matrix = count_vectorizer.fit_transform(stemmed_filtered_corpus_sorted)

In [47]:
document_term_matrix.shape

(4938, 13452)

In [48]:
type(document_term_matrix)

scipy.sparse.csr.csr_matrix

In [49]:
X = document_term_matrix

In [50]:
y = working_df_sorted['Score']

### 1.1.1. Splitting into train, cv and test (Simple Cross Validation)

In [51]:
# from sklearn.model_selection import train_test_split

In [52]:
### This will not work because train_test_split() splits data randomly. What we want is a time-based splitting on
### the dataset that we have sorted chronologically
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 

In [53]:
print(X.shape, y.shape)

(4938, 13452) (4938,)


In [54]:
type(y)

pandas.core.series.Series

 * ## Function to split dataset into train and test datasets 

In [55]:
def train_test_splitter(X, y, test_size):
    train_size = 1 - test_size
    train_row_upper_index = round(train_size*X.shape[0])
    test_row_lower_index = train_row_upper_index + 1
    
#     print(train_row_upper_index)
    
    X_train = X[:train_row_upper_index + 1, :]
    X_test = X[test_row_lower_index:, :]
    y_train = y.iloc[:train_row_upper_index + 1]
    y_test = y.iloc[test_row_lower_index:]
    
    return X_train, X_test, y_train, y_test

In [56]:
X_train, X_test, y_train, y_test = train_test_splitter(X, y, test_size = 0.25)

In [57]:
type(X_train)

scipy.sparse.csr.csr_matrix

In [58]:
X_train.shape[0] + X_test.shape[0]

4938

In [59]:
### Splitting the previous X_train and y_train into X_train, X_cv and y_train, y_cv
X_train, X_cv, y_train, y_cv = train_test_splitter(X_train, y_train, 0.25)

### 1.1.3. Training and Testing

In [60]:
from sklearn.neighbors import KNeighborsClassifier

In [146]:
from sklearn.metrics import f1_score

* ## KNN Classifier and cross validator using Simple Cross Validation

In [62]:
def knn_trainer_and_cross_validator(k, X_train, y_train, X_cv, y_cv, algorithm):
    
    # Side note: note that X_train and y_train are sparse matrices, and not numpy arrays
    knn = KNeighborsClassifier(n_neighbors = k, algorithm = algorithm)
    knn.fit(X_train, y_train)
    
    y_pred_cv = knn.predict(X_cv)
    
    f1score = f1_score(y_cv, y_pred_cv) * 100
    
    return f1score

### 1.1.4. Brute force k-NN

In [63]:
f1scores_for_diff_k = []
for i in range(1, 100, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'brute')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 87.56281407035176
F1-score for k = 3 is 90.19843656043295
F1-score for k = 5 is 90.54134443783461
F1-score for k = 7 is 90.57498518079431
F1-score for k = 9 is 90.70455891059798
F1-score for k = 11 is 90.82297217288338
F1-score for k = 13 is 90.76923076923077
F1-score for k = 15 is 90.78014184397163
F1-score for k = 17 is 90.79102715466351
F1-score for k = 19 is 90.79102715466351
F1-score for k = 21 is 90.79102715466351
F1-score for k = 23 is 90.79102715466351
F1-score for k = 25 is 90.79102715466351
F1-score for k = 27 is 90.79102715466351
F1-score for k = 29 is 90.79102715466351
F1-score for k = 31 is 90.79102715466351
F1-score for k = 33 is 90.79102715466351
F1-score for k = 35 is 90.79102715466351
F1-score for k = 37 is 90.79102715466351
F1-score for k = 39 is 90.79102715466351
F1-score for k = 41 is 90.79102715466351
F1-score for k = 43 is 90.79102715466351
F1-score for k = 45 is 90.79102715466351
F1-score for k = 47 is 90.79102715466351
F1-score for k = 49 i

In [64]:
max_f1score = max(f1scores_for_diff_k)

In [65]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

90.82297217288338 5


In [66]:
### f1-score at index 5 is maximum, from the above list of f1-scores for different k values, we can find that index\
### 5 corresponds to k = 11, ie, hyperparameter k has been tuned to 11
knn = KNeighborsClassifier(n_neighbors = 11, algorithm = 'brute')

In [67]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [68]:
### finally testing after tuning the hyperparameter 'k' on the cross validation set
y_pred_test = knn.predict(X_test)

In [69]:
f1_score(y_test, y_pred_test)*100

90.83665338645419

### 1.1.5. Kd-tree

In [70]:
f1scores_for_diff_k = []
for i in range(1, 100, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'kd_tree')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 87.56281407035176
F1-score for k = 3 is 90.19843656043295
F1-score for k = 5 is 90.54134443783461
F1-score for k = 7 is 90.57498518079431
F1-score for k = 9 is 90.70455891059798
F1-score for k = 11 is 90.82297217288338
F1-score for k = 13 is 90.76923076923077
F1-score for k = 15 is 90.78014184397163
F1-score for k = 17 is 90.79102715466351
F1-score for k = 19 is 90.79102715466351
F1-score for k = 21 is 90.79102715466351
F1-score for k = 23 is 90.79102715466351
F1-score for k = 25 is 90.79102715466351
F1-score for k = 27 is 90.79102715466351
F1-score for k = 29 is 90.79102715466351
F1-score for k = 31 is 90.79102715466351
F1-score for k = 33 is 90.79102715466351
F1-score for k = 35 is 90.79102715466351
F1-score for k = 37 is 90.79102715466351
F1-score for k = 39 is 90.79102715466351
F1-score for k = 41 is 90.79102715466351
F1-score for k = 43 is 90.79102715466351
F1-score for k = 45 is 90.79102715466351
F1-score for k = 47 is 90.79102715466351
F1-score for k = 49 i

In [71]:
max_f1score = max(f1scores_for_diff_k)

In [73]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

90.82297217288338 5


In [74]:
### k = 11 at index 5, ie, k value is tuned to 11

knn = KNeighborsClassifier(n_neighbors = 11, algorithm = 'kd_tree')

In [75]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [76]:
### finally testing after tuning the hyperparameter 'k' on the cross validation set
y_pred_test = knn.predict(X_test)

In [77]:
f1_score(y_test, y_pred_test)*100

90.83665338645419

### 1.2. Tf-Idf Vectorization

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,1))

In [80]:
tfidf_vectorizer = tfidf_vectorizer.fit(stemmed_filtered_corpus_sorted)

In [81]:
# tfidf_vectorizer.vocabulary_

In [82]:
document_term_matrix = tfidf_vectorizer.transform(stemmed_filtered_corpus_sorted)

In [83]:
type(document_term_matrix)

scipy.sparse.csr.csr_matrix

In [84]:
document_term_matrix.shape

(4938, 13452)

In [85]:
X = document_term_matrix

In [86]:
y = working_df_sorted['Score']

In [87]:
X_train_first, X_test, y_train_first, y_test = train_test_splitter(X, y, test_size = 0.25)

In [88]:
X_train, X_cv, y_train, y_cv = train_test_splitter(X_train_first, y_train_first, test_size = 0.25)

### 1.2.1. Brute force k-NN

In [89]:
f1scores_for_diff_k = []
for i in range(1, 100, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'brute')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 86.44829800899166
F1-score for k = 3 is 89.93288590604027
F1-score for k = 5 is 90.48192771084338
F1-score for k = 7 is 90.94159713945173
F1-score for k = 9 is 90.79025549613785
F1-score for k = 11 is 90.81209247184351
F1-score for k = 13 is 91.00591715976331
F1-score for k = 15 is 90.89834515366431
F1-score for k = 17 is 90.89834515366431
F1-score for k = 19 is 90.89834515366431
F1-score for k = 21 is 90.84465445953927
F1-score for k = 23 is 90.84465445953927
F1-score for k = 25 is 90.84465445953927
F1-score for k = 27 is 90.84465445953927
F1-score for k = 29 is 90.84465445953927
F1-score for k = 31 is 90.84465445953927
F1-score for k = 33 is 90.89834515366431
F1-score for k = 35 is 90.84465445953927
F1-score for k = 37 is 90.84465445953927
F1-score for k = 39 is 90.84465445953927
F1-score for k = 41 is 90.84465445953927
F1-score for k = 43 is 90.84465445953927
F1-score for k = 45 is 90.79102715466351
F1-score for k = 47 is 90.79102715466351
F1-score for k = 49 i

In [90]:
print(max(f1scores_for_diff_k), f1scores_for_diff_k.index(max(f1scores_for_diff_k)), sep = '\t')

91.00591715976331	6


In [91]:
### index = 6 has the maximum f1_score. This index corresponds to k = 13 (from the above printed list of f1scores)

knn = KNeighborsClassifier(n_neighbors = 13, algorithm = 'brute')

In [92]:
## training the final 13-NN

knn = knn.fit(X_train, y_train)

In [93]:
### testing on test data

y_pred = knn.predict(X_test)

In [94]:
### f1_score score

f1_score(y_test, y_pred) * 100

91.06269453090263

### 1.2.2. kd_tree k-NN

In [164]:
f1scores_for_diff_k = []
for i in range(1, 100, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'kd_tree')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 84.46726572528883
F1-score for k = 3 is 87.23926380368098
F1-score for k = 5 is 89.21036769138037
F1-score for k = 7 is 90.11904761904762
F1-score for k = 9 is 90.32640949554896
F1-score for k = 11 is 90.57498518079431
F1-score for k = 13 is 90.52132701421802
F1-score for k = 15 is 90.71555292726197
F1-score for k = 17 is 90.72652096869463
F1-score for k = 19 is 90.72652096869463
F1-score for k = 21 is 90.79102715466351
F1-score for k = 23 is 90.79102715466351
F1-score for k = 25 is 90.79102715466351
F1-score for k = 27 is 90.79102715466351
F1-score for k = 29 is 90.79102715466351
F1-score for k = 31 is 90.79102715466351
F1-score for k = 33 is 90.79102715466351
F1-score for k = 35 is 90.79102715466351
F1-score for k = 37 is 90.79102715466351
F1-score for k = 39 is 90.79102715466351
F1-score for k = 41 is 90.79102715466351
F1-score for k = 43 is 90.79102715466351
F1-score for k = 45 is 90.79102715466351
F1-score for k = 47 is 90.79102715466351
F1-score for k = 49 i

In [165]:
print(max(f1scores_for_diff_k), f1scores_for_diff_k.index(max(f1scores_for_diff_k)), sep = '\t')

90.79102715466351	10


In [166]:
### index = 10 has the maximum f1_score. This index corresponds to k = 21 (from the above printed list of f1scores)

knn = KNeighborsClassifier(n_neighbors = 21, algorithm = 'kd_tree')

In [167]:
## training the final 13-NN

knn = knn.fit(X_train, y_train)

In [168]:
### testing on test data

y_pred = knn.predict(X_test)

In [169]:
### f1_score score

f1_score(y_test, y_pred) * 100

90.88495575221238

### 1.3. Average W2V

In [95]:
import gensim

### 1.3.1. Tokenizing each document in the corpus
#### gensim w2v requires each document to be tokenized into words. The corpus will be a list of lists of words

In [96]:
## stemmed_filtered_corpus_sorted is a pandas Series object. It should be converted into a list first.
## after that each sentence in the resulted list should be tokenized into words stored in a list.
## all these lists should be stored into another list so as to give a list of lists as required by gensim w2v

stemmed_filtered_corpus_sorted_list = list(stemmed_filtered_corpus_sorted)

In [97]:
stemmed_filtered_sorted_list_of_tokenized_sentences = []

for sentence in stemmed_filtered_corpus_sorted:
    tokenized_sentence = sentence.split()
    stemmed_filtered_sorted_list_of_tokenized_sentences.append(tokenized_sentence)

In [105]:
len(stemmed_filtered_sorted_list_of_tokenized_sentences)

4938

In [98]:
w2v = gensim.models.Word2Vec(stemmed_filtered_sorted_list_of_tokenized_sentences, min_count = 1, size = 50, workers = 4)

In [99]:
type(w2v.wv)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [100]:
w2v.wv['happen']

array([ 0.6763431 , -0.14938074,  0.05023941, -0.1246966 , -0.24975461,
        0.15813228,  0.20640458, -0.5254332 ,  0.02193829,  0.25592807,
        0.19895834,  0.18302375,  0.46257624, -0.3379368 ,  0.22671726,
       -0.8687569 ,  0.13498752,  0.23509061, -0.18519531,  0.325798  ,
       -0.01091082,  0.25493273,  0.15161154,  0.18357141, -0.23946032,
       -0.15996416, -0.10509461, -0.22375233,  0.19674689,  0.04969831,
       -0.02148063,  0.05979318,  0.23409437,  0.2690375 , -0.43541342,
        0.11692525, -0.47766817, -0.2244401 ,  0.4068373 , -0.1372834 ,
        0.43264347,  0.28680465,  0.01365148, -0.07711545,  0.42574415,
       -0.01223365, -0.17536448,  0.24513943, -0.37865487, -0.29269525],
      dtype=float32)

In [101]:
type(w2v.wv['happen'])

numpy.ndarray

In [102]:
w2v.wv['happen'].shape

(50,)

In [103]:
### saving the w2v model for later use

import os

if(not os.path.exists('w2v_practice.model')):
    w2v.save('w2v_practice.model')
    
else:
    w2v = gensim.models.Word2Vec.load('w2v_practice.model')

In [106]:
### computing avg w2v representation for the reviews dataset

avg_w2v = []
for tokenized_sentence in stemmed_filtered_sorted_list_of_tokenized_sentences:
    sum_of_vectors_for_each_word = 0
    for word in tokenized_sentence:
        sum_of_vectors_for_each_word += w2v.wv[word]
    avg_w2v.append(sum_of_vectors_for_each_word / len(tokenized_sentence))

In [107]:
type(avg_w2v[0])

numpy.ndarray

In [108]:
avg_w2v[0].shape

(50,)

In [109]:
avg_w2v[0]

array([ 0.661626  ,  0.92628944, -0.013446  ,  0.62764466,  0.18625233,
       -1.4530652 ,  0.15930279, -0.49002123, -0.5853565 ,  0.8513393 ,
       -0.32445616, -0.17482114, -0.07569546,  1.2116948 ,  0.07239655,
        1.691816  ,  0.03975666, -0.9989167 ,  0.23586877, -0.10961175,
        0.56027454,  0.7376916 , -0.17771463,  0.45288604, -1.0219226 ,
       -0.4630284 ,  0.5353912 , -0.05520993, -0.45619234, -0.51378196,
       -1.5669233 ,  0.05079133, -0.03936822,  0.09876107, -0.44751152,
        0.80088794,  0.17019792,  0.03535123, -0.5714997 ,  0.27900317,
        0.9409237 , -0.13131371, -0.5932932 ,  0.17622764, -0.17409271,
       -0.42300984, -0.04031445,  0.95066714, -0.5954353 , -0.60495836],
      dtype=float32)

In [110]:
X = avg_w2v
## y is the same as before

In [111]:
import numpy as np

In [112]:
### the train_test_splitter() assumes X to be a numpy array
X = np.array(avg_w2v)

In [113]:
X.shape

(4938, 50)

In [114]:
X_train_first, X_test, y_train_first, y_test = train_test_splitter(X, y, test_size = 0.25)

In [115]:
X_train, X_cv, y_train, y_cv = train_test_splitter(X_train_first, y_train_first, test_size = 0.25)

### 1.3.2. Brute force k-NN

In [120]:
f1scores_for_diff_k = []
for i in range(1, 100, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'brute')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 85.8974358974359
F1-score for k = 3 is 88.36923076923077
F1-score for k = 5 is 90.16294508147254
F1-score for k = 7 is 90.03601440576232
F1-score for k = 9 is 90.8982748364069
F1-score for k = 11 is 90.97387173396673
F1-score for k = 13 is 90.74733096085409
F1-score for k = 15 is 90.76923076923077
F1-score for k = 17 is 90.78014184397163
F1-score for k = 19 is 90.78014184397163
F1-score for k = 21 is 90.78014184397163
F1-score for k = 23 is 90.72652096869463
F1-score for k = 25 is 90.79102715466351
F1-score for k = 27 is 90.79102715466351
F1-score for k = 29 is 90.79102715466351
F1-score for k = 31 is 90.79102715466351
F1-score for k = 33 is 90.79102715466351
F1-score for k = 35 is 90.79102715466351
F1-score for k = 37 is 90.79102715466351
F1-score for k = 39 is 90.79102715466351
F1-score for k = 41 is 90.79102715466351
F1-score for k = 43 is 90.79102715466351
F1-score for k = 45 is 90.79102715466351
F1-score for k = 47 is 90.79102715466351
F1-score for k = 49 is 

In [121]:
max_f1score = max(f1scores_for_diff_k)

In [124]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

90.97387173396673 5


In [125]:
### max_f1_score is indexed at 5 which corresponds to k = 11; hypertuned on CV set. Training knn for k = 5

knn = KNeighborsClassifier(n_neighbors = 5, algorithm = 'brute')

In [126]:
knn = knn.fit(X_train, y_train)

In [127]:
y_pred = knn.predict(X_test)

In [128]:
f1_score(y_test, y_pred) * 100

89.13142337426102

### 1.3.3. kd_tree k-NN

In [129]:
f1scores_for_diff_k = []
for i in range(1, 100, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'kd_tree')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 85.8974358974359
F1-score for k = 3 is 88.36923076923077
F1-score for k = 5 is 90.16294508147254
F1-score for k = 7 is 90.03601440576232
F1-score for k = 9 is 90.8982748364069
F1-score for k = 11 is 90.97387173396673
F1-score for k = 13 is 90.74733096085409
F1-score for k = 15 is 90.76923076923077
F1-score for k = 17 is 90.78014184397163
F1-score for k = 19 is 90.78014184397163
F1-score for k = 21 is 90.78014184397163
F1-score for k = 23 is 90.72652096869463
F1-score for k = 25 is 90.79102715466351
F1-score for k = 27 is 90.79102715466351
F1-score for k = 29 is 90.79102715466351
F1-score for k = 31 is 90.79102715466351
F1-score for k = 33 is 90.79102715466351
F1-score for k = 35 is 90.79102715466351
F1-score for k = 37 is 90.79102715466351
F1-score for k = 39 is 90.79102715466351
F1-score for k = 41 is 90.79102715466351
F1-score for k = 43 is 90.79102715466351
F1-score for k = 45 is 90.79102715466351
F1-score for k = 47 is 90.79102715466351
F1-score for k = 49 is 

In [130]:
max_f1score = max(f1scores_for_diff_k)

In [131]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

90.97387173396673 5


In [132]:
### max_f1_score is indexed at 5 which corresponds to k = 11; hypertuned on CV set. Training knn for k = 5

knn = KNeighborsClassifier(n_neighbors = 11, algorithm = 'kd_tree')

In [134]:
knn = knn.fit(X_train, y_train)

In [135]:
y_pred = knn.predict(X_test)

In [136]:
f1_score(y_test, y_pred) * 100

90.18733273862622

### 1.4. TfIdf weighted Word2Vec

In [137]:
type(tfidf_vectorizer.vocabulary_)

dict

In [139]:
tfidf_weighted_w2v = []
for sentence in stemmed_filtered_sorted_list_of_tokenized_sentences:
    tfidf_weighted_sum_of_vectors_for_each_word = 0
    for word in sentence:
        if word not in tfidf_vectorizer.vocabulary_ or word not in w2v.wv:
            continue
        tfidf_weighted_sum_of_vectors_for_each_word += tfidf_vectorizer.vocabulary_[word] * w2v.wv[word]
    tfidf_weighted_w2v.append(tfidf_weighted_sum_of_vectors_for_each_word)

In [140]:
X = np.array(tfidf_weighted_w2v)

In [141]:
X.shape

(4938, 50)

In [142]:
X_train_first, X_test, y_train_first, y_test = train_test_splitter(X, y, test_size = 0.25)

In [143]:
X_train, X_cv, y_train, y_cv = train_test_splitter(X_train_first, y_train_first, test_size = 0.25)

### 1.4.1. Brute force k-NN

In [147]:
f1scores_for_diff_k = []

for i in range(1, 100, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'brute')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 84.46726572528883
F1-score for k = 3 is 87.23926380368098
F1-score for k = 5 is 89.21036769138037
F1-score for k = 7 is 90.11904761904762
F1-score for k = 9 is 90.32640949554896
F1-score for k = 11 is 90.57498518079431
F1-score for k = 13 is 90.52132701421802
F1-score for k = 15 is 90.71555292726197
F1-score for k = 17 is 90.72652096869463
F1-score for k = 19 is 90.72652096869463
F1-score for k = 21 is 90.79102715466351
F1-score for k = 23 is 90.79102715466351
F1-score for k = 25 is 90.79102715466351
F1-score for k = 27 is 90.79102715466351
F1-score for k = 29 is 90.79102715466351
F1-score for k = 31 is 90.79102715466351
F1-score for k = 33 is 90.79102715466351
F1-score for k = 35 is 90.79102715466351
F1-score for k = 37 is 90.79102715466351
F1-score for k = 39 is 90.79102715466351
F1-score for k = 41 is 90.79102715466351
F1-score for k = 43 is 90.79102715466351
F1-score for k = 45 is 90.79102715466351
F1-score for k = 47 is 90.79102715466351
F1-score for k = 49 i

In [148]:
max_f1score = max(f1scores_for_diff_k)

In [149]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

90.79102715466351 10


In [150]:
### index = 10 corresponds to k = 21, ie, hyperparamter k is tuned to 21

knn = KNeighborsClassifier(n_neighbors = 21, algorithm = 'brute')

In [151]:
knn = knn.fit(X_train, y_train)

In [152]:
y_pred = knn.predict(X_test)

In [153]:
f1_score(y_test, y_pred) * 100

90.88495575221238

### 1.4.2. kd_tree k-NN

In [154]:
f1scores_for_diff_k = []

for i in range(1, 100, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'kd_tree')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 84.46726572528883
F1-score for k = 3 is 87.23926380368098
F1-score for k = 5 is 89.21036769138037
F1-score for k = 7 is 90.11904761904762
F1-score for k = 9 is 90.32640949554896
F1-score for k = 11 is 90.57498518079431
F1-score for k = 13 is 90.52132701421802
F1-score for k = 15 is 90.71555292726197
F1-score for k = 17 is 90.72652096869463
F1-score for k = 19 is 90.72652096869463
F1-score for k = 21 is 90.79102715466351
F1-score for k = 23 is 90.79102715466351
F1-score for k = 25 is 90.79102715466351
F1-score for k = 27 is 90.79102715466351
F1-score for k = 29 is 90.79102715466351
F1-score for k = 31 is 90.79102715466351
F1-score for k = 33 is 90.79102715466351
F1-score for k = 35 is 90.79102715466351
F1-score for k = 37 is 90.79102715466351
F1-score for k = 39 is 90.79102715466351
F1-score for k = 41 is 90.79102715466351
F1-score for k = 43 is 90.79102715466351
F1-score for k = 45 is 90.79102715466351
F1-score for k = 47 is 90.79102715466351
F1-score for k = 49 i

In [155]:
max_f1score = max(f1scores_for_diff_k)

In [156]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

90.79102715466351 10


In [157]:
### index = 10 corresponds to k = 21, ie, hyperparamter k is tuned to 21

knn = KNeighborsClassifier(n_neighbors = 21, algorithm = 'kd_tree')

In [158]:
knn = knn.fit(X_train, y_train)

In [159]:
f1_score(y_test, y_pred) * 100

90.88495575221238

# Summary

In [160]:
from prettytable import PrettyTable

In [161]:
pretty_table = PrettyTable()

In [162]:
pretty_table.field_names = ['Vectorizer', 'Model', 'Hyperparameter (k) value', 'F1-score']

In [172]:
pretty_table.add_row(['BoW', 'Brute k-NN', '11', '90.84'])
pretty_table.add_row(['BoW', 'kd_tree k-NN', '11', '90.823'])
pretty_table.add_row(['Tf-Idf', 'Brute k-NN', '13', '91.063'])
pretty_table.add_row(['Tf-Idf', 'kd_tree k-NN', '21', '90.885'])
pretty_table.add_row(['Avg_W2V', 'Brute k-NN', '11', '89.13'])
pretty_table.add_row(['Avg_W2V', 'kd_tree k-NN', '11', '90.187'])
pretty_table.add_row(['Tf-Idf_W2V', 'Brute k-NN', '21', '90.885'])
pretty_table.add_row(['Tf-Idf_W2V', 'kd_tree k-NN', '21', '90.885'])

In [173]:
print(pretty_table)

+------------+--------------+--------------------------+----------+
| Vectorizer |    Model     | Hyperparameter (k) value | F1-score |
+------------+--------------+--------------------------+----------+
|    BoW     |  Brute k-NN  |            11            |  90.84   |
|    BoW     | kd_tree k-NN |            11            |  90.823  |
|   Tf-Idf   |  Brute k-NN  |            13            |  91.063  |
|   Tf-Idf   | kd_tree k-NN |            21            |  90.885  |
|  Avg_W2V   |  Brute k-NN  |            11            |  89.13   |
|  Avg_W2V   | kd_tree k-NN |            11            |  90.187  |
| Tf-Idf_W2V |  Brute k-NN  |            21            |  90.885  |
| Tf-Idf_W2V | kd_tree k-NN |            21            |  90.885  |
|    BoW     |  Brute k-NN  |            11            |  90.84   |
|    BoW     | kd_tree k-NN |            11            |  90.823  |
|   Tf-Idf   |  Brute k-NN  |            13            |  91.063  |
|   Tf-Idf   | kd_tree k-NN |            21     