In [51]:
import warnings
warnings.filterwarnings('ignore')

In [52]:
import pickle
import numpy as np
import os

 * ## Function to split dataset into train and test datasets 

In [132]:
### X may be sparse matrix or pd.DataFrame
### y is pd.Series
def train_test_splitter(X, y, test_size, return_only_training_split = False):
    train_size = 1 - test_size
    train_row_upper_index = round(train_size*X.shape[0])
    test_row_lower_index = train_row_upper_index + 1
    
    print('X     y\t', X.shape, y.shape)
#     print(train_row_upper_index)
#     print(test_row_lower_index)
#     print(type(train_row_upper_index))
#     print(type(test_row_lower_index))
#     print(type(X))

    X_train = X[:train_row_upper_index + 1]
    X_test = X[test_row_lower_index:]
    
#     if y is not None:
#         y_train = y.iloc[:train_row_upper_index + 1]
#         y_test = y.iloc[test_row_lower_index:]
    
    y_train = y.iloc[:train_row_upper_index + 1]
    y_test = y.iloc[test_row_lower_index:]
    
    if return_only_training_split == True:
        return X_train, y_train
    
    print(X_test.shape, y_test.shape)
    
    return X_train, X_test, y_train, y_test

* ## KNN Classifier and cross validator using Simple Cross Validation

In [54]:

def knn_trainer_and_cross_validator(k, X_train, y_train, X_cv, y_cv, algorithm, save_name):
    
    # Side note: note that X_train and y_train are sparse matrices, and not numpy arrays
#     knn = KNeighborsClassifier(n_neighbors = k, algorithm = algorithm)
#     knn.fit(X_train, y_train)
    
    if os.path.exists(save_name + '.pkl'):
        with open(save_name + '.pkl', 'rb') as trained_knn_pkl:
            knn = pickle.load(trained_knn_pkl)
    else:
        knn = KNeighborsClassifier(n_neighbors = k, algorithm = algorithm)
        knn.fit(X_train, y_train)
        
        with open(save_name + '.pkl', 'wb') as trained_knn_pkl:
            pickle.dump(knn, trained_knn_pkl)
        
    
    y_pred_cv = knn.predict(X_cv)
    
    f1score = f1_score(y_cv, y_pred_cv) * 100
    
    return f1score

## Import the dataset

In [55]:
import sqlite3
import pandas as pd

In [56]:
db_connection = sqlite3.connect('database.sqlite')

In [57]:
polarisable_dataset = pd.read_sql_query('select * from reviews where Score != 3', db_connection)

In [58]:
polarisable_dataset.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [59]:
polarisable_dataset['Time'].head()

0    1303862400
1    1346976000
2    1219017600
3    1307923200
4    1350777600
Name: Time, dtype: int64

In [60]:
polarisable_dataset.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [61]:
df = polarisable_dataset # just to make typing easier

In [62]:
### sampled_df contains top 100k examples from the dataset (instead of doing random sampling)

sampled_df = df.iloc[:10000, :]

In [63]:
sampled_df.shape

(10000, 10)

## Replacing the ratings with 0 (for negative reviews) and 1 (for positive reviews).
#### Score of >3 has been considered as positive and a score of <3 has been taken as negative

In [64]:
type(sampled_df['Score'])

pandas.core.series.Series

In [65]:
scores = sampled_df['Score']

In [66]:
scores[6:12]

6     5
7     5
8     5
9     5
10    5
11    5
Name: Score, dtype: int64

In [67]:
scores = list(map(lambda x: 0 if x<3 else 1, scores))

In [68]:
scores[6:12]

[1, 1, 1, 1, 1, 1]

In [69]:
sampled_df['Score'] = scores

In [70]:
type(sampled_df['Score'].head(2))

pandas.core.series.Series

### Data preprocessing

##### 1. Deduplication
If a user id has multiple entries for the same timestamp, then it should be removed because it is likely that multiple entries at the same timestamp were for the same product of different variety which has a different product id than other variants


In [71]:
sampled_df.duplicated(subset = ['UserId', 'Time']).sum()

527

In [72]:
sampled_deduplicated_df = sampled_df.drop_duplicates(subset = ['UserId', 'Time'], inplace = False, keep = 'first')

#### 2. Extracting the data needed (corpus)
#### And removing html and punctuations

In [73]:
corpus = sampled_deduplicated_df['Text']

In [74]:
# dataset cleaners

import re

def remove_html(sentence):
    html_tag_re_obj = re.compile('<.*>?')
    return re.sub(html_tag_re_obj, ' ', sentence)

def remove_punctuations(sentence):
    cleaned_sentence = re.sub(r'[^a-zA-Z]', r' ', sentence)
    return cleaned_sentence

In [75]:
cleaned_corpus = []
for doc in corpus:
    cleaned_doc_1 = remove_html(doc)
    cleaned_doc_2 = remove_punctuations(doc)
    cleaned_corpus.append(cleaned_doc_2)

#### 3. Removing stop words

In [76]:
## Since the negative food reviews are likely to contain words like "don't", "didn't", etc that impart important
## meaning to the review, we check if such words exist in the corpus that we have. If these words are in the corpus,
## then they should not be in the list of stop words that we use for removing the stopwords from our corpus

count = 0
for doc in cleaned_corpus:
    if "not" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "don't" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "didn't" in doc:
        count += 1

print(count)

3544
0
0


In [77]:
from nltk.corpus import stopwords

In [78]:
stopwords = stopwords.words('english')

In [79]:
stopwords = set(stopwords)

In [80]:
stopwords.remove('not')

In [81]:
'not' in stopwords

False

In [82]:
a = [1,2,3,0,1,0,5]

In [83]:
# filtered_corpus = corpus with docs having no stop words
# doing with the sexy lambda expression

filtered_corpus = list(map(lambda doc: ' '.join(list(filter(lambda word: True if word not in stopwords else False\
                                                            , doc.split()))), corpus))

In [84]:
len(filtered_corpus)

9473

In [85]:
filtered_corpus[:2]

['I bought several Vitality canned dog food products found good quality. The product looks like stew processed meat smells better. My Labrador finicky appreciates product better most.',
 'Product arrived labeled Jumbo Salted Peanuts...the peanuts actually small sized unsalted. Not sure error vendor intended represent product "Jumbo".']

In [86]:
### classical way of removing the lambda expressions
### verified the output of lambda expression output with the output of following implementation, outputs are same
# docs_without_stop_words = []
# for i, doc in enumerate(corpus):
#     non_stop_words_in_doc = []
#     for word in doc.split():
#         if word not in stopwords:
#             non_stop_words_in_doc.append(word)
            
    
#     docs_without_stop_words.append(' '.join(non_stop_words_in_doc))

#### 4. Stemming the words (SnowballStemmer)

In [87]:
from nltk.stem import SnowballStemmer

In [88]:
stemmer = SnowballStemmer('english')

In [89]:
stemmed_filtered_corpus = list(map(lambda doc: ' '.join(list(map(stemmer.stem, doc.split()))), corpus))

In [90]:
stemmed_filtered_corpus[:3]

['i have bought sever of the vital can dog food product and have found them all to be of good quality. the product look more like a stew than a process meat and it smell better. my labrador is finicki and she appreci this product better than most.',
 'product arriv label as jumbo salt peanuts...th peanut were actual small size unsalted. not sure if this was an error or if the vendor intend to repres the product as "jumbo".',
 'this is a confect that has been around a few centuries. it is a light, pillowi citrus gelatin with nut - in this case filberts. and it is cut into tini squar and then liber coat with powder sugar. and it is a tini mouth of heaven. not too chewy, and veri flavorful. i high recommend this yummi treat. if you are familiar with the stori of c.s. lewi "the lion, the witch, and the wardrobe" - this is the treat that seduc edmund into sell out his brother and sister to the witch.']

## Sorting the dataset according to Time

In [91]:
sampled_deduplicated_df['Text'] = stemmed_filtered_corpus

In [92]:
working_df = sampled_deduplicated_df

In [93]:
working_df_sorted = working_df.sort_values(by = 'Time')

In [94]:
stemmed_filtered_corpus_sorted = working_df_sorted['Text']

## Vectorizing the reviews and splitting into train, cv and test sets and TRAINING and TESTING

### 1.1. Bag of Words (CountVectorizer)

#### Note: The vectorization should be done on the training set and not the entire dataset, if the entire dataset is used for vectorization it is called as Memory leaks.

In [95]:
from sklearn.feature_extraction.text import CountVectorizer

In [96]:
type(stemmed_filtered_corpus_sorted)

pandas.core.series.Series

In [134]:
### load the vector if a document_term_matrix was already computed earlier, and saved in local dir
### if not saved earlier, then fit a CountVectorizer, and then transform on the dataset to obtain the
### document_term_matrix, and then save the document_term_matrix

if os.path.exists('document_term_matrix_pickle_bow.pkl'):
    with open('document_term_matrix_pickle_bow.pkl', 'rb') as document_term_matrix_pickle:
        document_term_matrix = pickle.load(document_term_matrix_pickle)
else:
    count_vectorizer = CountVectorizer()
    
    ### X_train and y_train will be used throughout this prgram, we shall need it always
    X_train, X_test, y_train, y_test = train_test_splitter(stemmed_filtered_corpus_sorted, working_df_sorted['Score'], test_size = 0.2)
    
    document_term_matrix = count_vectorizer.fit_transform(X_train)
    with open('document_term_matrix_pickle_bow.pkl', 'wb') as document_term_matrix_pickle:
        pickle.dump(document_term_matrix, document_term_matrix_pickle)

In [98]:
document_term_matrix.shape

(7579, 15706)

In [99]:
type(document_term_matrix)

scipy.sparse.csr.csr_matrix

#### We've got the X_train and y_train for now, and also X_test, y_test

In [100]:
X_train_bow_repr = document_term_matrix

In [101]:
y_train_bow_repr = y_train

### 1.1.1. Splitting into train, cv and test (Simple Cross Validation)

In [102]:
# from sklearn.model_selection import train_test_split

In [103]:
### This will not work because train_test_split() splits data randomly. What we want is a time-based splitting on
### the dataset that we have sorted chronologically
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 

In [104]:
# print(X.shape, y.shape)

(7579, 15706) (9473,)


In [105]:
# type(y)

pandas.core.series.Series

In [130]:
# X_train, X_test, y_train, y_test = train_test_splitter(X, y, test_size = 0.2)

X     y
 (7579, 15706) (9473,)
(1515, 15706) (3409,)


In [107]:
# type(X_train)

scipy.sparse.csr.csr_matrix

In [108]:
# X_train.shape[0] + X_test.shape[0]

7579

In [109]:
### Splitting the previous X_train and y_train into X_train, X_cv and y_train, y_cv
X_train_bow_repr, X_cv_bow_repr, y_train_bow_repr, y_cv_repr = train_test_splitter(X_train_bow_, y_train_bow_repr, 0.2)

### 1.1.3. Training and Testing

In [110]:
from sklearn.neighbors import KNeighborsClassifier

In [111]:
from sklearn.metrics import f1_score

### 1.1.4. Brute force k-NN

In [117]:
save_name = 'bow_brute_knn'
f1scores_for_diff_k = []
for i in range(1, 30, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train_bow_repr, y_train_bow_repr, X_cv_repr, y_cv_repr, algorithm = 'brute', save_name = save_name + str(i)))
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

F1-score for k = 1 is 87.88167938931298
F1-score for k = 3 is 90.24839006439743
F1-score for k = 5 is 90.91734786557674
F1-score for k = 7 is 90.9090909090909
F1-score for k = 9 is 90.990990990991
F1-score for k = 11 is 91.08108108108108
F1-score for k = 13 is 91.27697841726618
F1-score for k = 15 is 91.19496855345912
F1-score for k = 17 is 91.24382577458464
F1-score for k = 19 is 91.20287253141831
F1-score for k = 21 is 91.20287253141831
F1-score for k = 23 is 91.20287253141831
F1-score for k = 25 is 91.15401885945218
F1-score for k = 27 is 91.15401885945218
F1-score for k = 29 is 91.15401885945218


In [118]:
max_f1score = max(f1scores_for_diff_k)

In [119]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

91.27697841726618 6


In [120]:
### f1-score at index 6 is maximum, from the above list of f1-scores for different k values, we can find that index\
### 5 corresponds to k = 13, ie, hyperparameter k has been tuned to 11

## since the model has already been trained and saved with 'k' value suffixed to the file name, we directly load it
with open(save_name + str(13) + '.pkl', 'rb') as knn_pkl:
    knn = pickle.load(knn_pkl)
    
# knn = KNeighborsClassifier(n_neighbors = 11, algorithm = 'brute')

In [None]:
### commenting the following code because knn object loaded in the previous cell is already trained
# knn.fit(X_train, y_train)

In [None]:
X_test_bow_repr = count_vectorizer.fit_transform(X_test)

In [121]:
### finally testing after tuning the hyperparameter 'k' on the cross validation set
y_pred_test = knn.predict(X_test_bow_repr)

In [122]:
f1_score(y_test, y_pred_test)*100

ValueError: Found input variables with inconsistent numbers of samples: [3409, 1515]

### 1.1.5. Kd-tree

In [None]:
save_name = 'bow_kd_knn'
f1scores_for_diff_k = []
for i in range(1, 30, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train_bow_repr, y_train_bow_repr, X_cv_bow_repr, y_cv_bow_repr), algorithm = 'kd_tree', save_name = (save_name + str(i)))
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

In [None]:
max_f1score = max(f1scores_for_diff_k)

In [None]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

In [None]:
### k = 11 at index 5, ie, k value is tuned to 11

with open(save_name + str(11) + '.pkl', 'rb') as knn_pkl:
    knn = pickel.load(knn_pkl)

# knn = KNeighborsClassifier(n_neighbors = 11, algorithm = 'kd_tree')

In [None]:
# knn.fit(X_train, y_train)

In [None]:
### finally testing after tuning the hyperparameter 'k' on the cross validation set
y_pred_test = knn.predict(X_test_bow_repr)

In [None]:
f1_score(y_test, y_pred_test)*100

### 1.2. Tf-Idf Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
### load the vector if a document_term_matrix was already computed earlier, and saved in local dir
### if not saved earlier, then fit a TfIdfVectorizer, and then transform on the dataset to obtain the
### document_term_matrix, and then save the document_term_matrix

if os.path.exists('document_term_matrix_pickle_tfidf.pkl'):
    with open('document_term_matrix_pickle_tfidf.pkl', 'rb') as document_term_matrix_pickle:
        document_term_matrix = pickle.load(document_term_matrix_pickle)
else:
    tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,1))
#     X_train, X_test, y_train, y_test = train_test_splitter(stemmed_filtered_corpus_sorted, stemm, test_size = 0.2, return_only_training_split = True)
    tfidf_vectorizer = tfidf_vectorizer.fit(X_train)
    document_term_matrix = tfidf_vectorizer.transform(X_train)
    with open('document_term_matrix_pickle_tfidf.pkl', 'wb') as document_term_matrix_pickle:
        pickle.dump(document_term_matrix, document_term_matrix_pickle)

In [None]:
# tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,1))

In [None]:
# tfidf_vectorizer = tfidf_vectorizer.fit(stemmed_filtered_corpus_sorted)

In [None]:
# tfidf_vectorizer.vocabulary_

In [None]:
# document_term_matrix = tfidf_vectorizer.transform(stemmed_filtered_corpus_sorted)

In [None]:
type(document_term_matrix)

In [None]:
document_term_matrix.shape

In [None]:
X_train_tfidf_repr = document_term_matrix

In [None]:
y_train_tfidf_repr = y_train

In [None]:
X_train_tfidf_repr, X_cv_tfidf_repr, y_train_tfidf_repr, y_cv_tfidf_repr = train_test_splitter(X_train_tfidf_repr, y_train_tfidf_repr, test_size = 0.2)

### 1.2.1. Brute force k-NN

In [None]:
save_name = 'tfidf_brute_knn'
f1scores_for_diff_k = []
for i in range(1, 30, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train_tfidf_repr, y_train_tfidf_repr, X_cv_tfidf_repr, y_cv_tfidf_repr, algorithm = 'brute', save_name = save_name + str(i))
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

In [None]:
print(max(f1scores_for_diff_k), f1scores_for_diff_k.index(max(f1scores_for_diff_k)), sep = '\t')

In [None]:
### index = 6 has the maximum f1_score. This index corresponds to k = 13 (from the above printed list of f1scores)

with open(save_name + str(13) + '.pkl', 'rb') as knn_pkl:
    knn = pickle.load(knn_pkl)

# knn = KNeighborsClassifier(n_neighbors = 13, algorithm = 'brute')

In [None]:
## training the final 13-NN

# knn = knn.fit(X_train, y_train)

In [None]:
X_test_tfidf_repr = tfidf_vectorizer.fit_transform(X_test)

In [None]:
### testing on test data

y_pred = knn.predict(X_test_tfidf_repr)

In [None]:
### f1_score score

f1_score(y_test, y_pred) * 100

### 1.2.2. kd_tree k-NN

In [None]:
save_name = 'tfidf_kd_knn'
f1scores_for_diff_k = []
for i in range(1, 30, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train_tfidf_repr, y_train_repr, X_cv_repr, y_cv_repr, algorithm = 'kd_tree', save_name = save_name + str(i)
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

In [None]:
print(max(f1scores_for_diff_k), f1scores_for_diff_k.index(max(f1scores_for_diff_k)), sep = '\t')

In [None]:
### index = 10 has the maximum f1_score. This index corresponds to k = 21 (from the above printed list of f1scores)

with open(save_name + str(21), '.pkl', 'rb') as knn_pkl:
    knn = pickle.load(knn_pkl)

# knn = KNeighborsClassifier(n_neighbors = 21, algorithm = 'kd_tree')

In [None]:
## training the final 13-NN

# knn = knn.fit(X_train, y_train)

In [None]:
### testing on test data

y_pred = knn.predict(X_test_tffidf_repr)

In [None]:
### f1_score score

f1_score(y_test, y_pred) * 100

### 1.3. Average W2V

In [None]:
import gensim

### 1.3.1. Tokenizing each document in the corpus
#### gensim w2v requires each document to be tokenized into words. The corpus will be a list of lists of words

In [None]:
## stemmed_filtered_corpus_sorted is a pandas Series object. It should be converted into a list first.
## after that each sentence in the resulted list should be tokenized into words stored in a list.
## all these lists should be stored into another list so as to give a list of lists as required by gensim w2v
# X_train, y_train = train_test_splitter(stemmed_filtered_corpus_sorted, test_size = 0.2, return_only_training_split = True)
# stemmed_filtered_corpus_sorted_list = list(stemmed_filtered_corpus_sorted)

In [None]:
X_train = train_test_splitter(stemmed_filtered_corpus_sorted, None, test_size = 0.2, return_only_training_split = True)

stemmed_filtered_sorted_list_of_tokenized_sentences = []

for sentence in X_train:
    tokenized_sentence = sentence.split()
    stemmed_filtered_sorted_list_of_tokenized_sentences.append(tokenized_sentence)

In [None]:
len(stemmed_filtered_sorted_list_of_tokenized_sentences)

In [None]:
### load the vector if a w2v model was already trained earlier, and saved in local dir
### if not saved earlier, then fit a Word2Vec model, on the dataset and then save it

if os.path.exists('word2vec_trained_model.pkl'):
    with open('word2vec_trained_model.pkl', 'rb') as word2vec_trained_model:
        word2vec_trained_model = pickle.load(word2vec_trained_model)
else:
    w2v = gensim.models.Word2Vec(stemmed_filtered_sorted_list_of_tokenized_sentences, min_count = 1, size = 50, workers = 4)
    with open('word2vec_trained_model.pkl', 'wb') as word2vec_trained_model:
        pickle.dump(w2v, word2vec_trained_model)

In [None]:
# w2v = gensim.models.Word2Vec(stemmed_filtered_sorted_list_of_tokenized_sentences, min_count = 1, size = 50, workers = 4)

In [None]:
type(w2v.wv)

In [None]:
w2v.wv['happen']

In [None]:
type(w2v.wv['happen'])

In [None]:
w2v.wv['happen'].shape

In [None]:
# ### saving the w2v model for later use

# import os

# if(not os.path.exists('w2v_practice.model')):
#     w2v.save('w2v_practice.model')
    
# else:
#     w2v = gensim.models.Word2Vec.load('w2v_practice.model')

In [None]:
### load the vector if a avg_w2v representation of each sentence was already computed earlier and saved in local dir
### if not saved earlier, then compute an avg_w2v representation for all the sentences and then save the list

if os.path.exists('avg_word2vec.pkl'):
    with open('avg_word2vec.pkl', 'rb') as avg_w2v_pkl:
        avg_w2v = pickle.load(avg_w2v_pkl)
else:
    avg_w2v = []
    for tokenized_sentence in stemmed_filtered_sorted_list_of_tokenized_sentences:
        sum_of_vectors_for_each_word = 0
        for word in tokenized_sentence:
            sum_of_vectors_for_each_word += w2v.wv[word]
        avg_w2v.append(sum_of_vectors_for_each_word / len(tokenized_sentence))
    with open('avg_word2vec.pkl', 'wb') as avg_w2v_pkl:
        pickle.dump(avg_w2v, avg_w2v_pkl)

In [None]:
# ### computing avg w2v representation for the reviews dataset

# avg_w2v = []
# for tokenized_sentence in stemmed_filtered_sorted_list_of_tokenized_sentences:
#     sum_of_vectors_for_each_word = 0
#     for word in tokenized_sentence:
#         sum_of_vectors_for_each_word += w2v.wv[word]
#     avg_w2v.append(sum_of_vectors_for_each_word / len(tokenized_sentence))

In [None]:
type(avg_w2v[0])

In [None]:
avg_w2v[0].shape

In [None]:
avg_w2v[0]

In [None]:
X = avg_w2v
## y is the same as before

In [None]:
import numpy as np

In [None]:
### the train_test_splitter() assumes X to be a numpy array
X = np.array(avg_w2v)

In [None]:
X.shape

In [None]:
X_train_first, X_test, y_train_first, y_test = train_test_splitter(X, y, test_size = 0.2)

In [None]:
X_train, X_cv, y_train, y_cv = train_test_splitter(X_train_first, y_train_first, test_size = 0.2)

### 1.3.2. Brute force k-NN

In [None]:
f1scores_for_diff_k = []
for i in range(1, 30, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'brute', save_name = 'avgw2v_brute_knn')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

In [None]:
max_f1score = max(f1scores_for_diff_k)

In [None]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

In [None]:
### max_f1_score is indexed at 5 which corresponds to k = 11; hypertuned on CV set. Training knn for k = 5

with open(save_name + str(11) + '.pkl', 'rb') as knn_pkl:
    knn = pickle.load(knn_pkl)

# knn = KNeighborsClassifier(n_neighbors = 5, algorithm = 'brute')

In [None]:
# knn = knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
f1_score(y_test, y_pred) * 100

### 1.3.3. kd_tree k-NN

In [None]:
f1scores_for_diff_k = []
for i in range(1, 30, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'kd_tree', save_name = 'avgw2v_kd_knn')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

In [None]:
max_f1score = max(f1scores_for_diff_k)

In [None]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

In [None]:
### max_f1_score is indexed at 5 which corresponds to k = 11; hypertuned on CV set. Training knn for k = 5

with open(save_name + str(11) + '.pkl', 'rb') as knn_pkl:
    knn = pickle.load(knn_pkl)

# knn = KNeighborsClassifier(n_neighbors = 11, algorithm = 'kd_tree')

In [None]:
knn = knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
f1_score(y_test, y_pred) * 100

### 1.4. TfIdf weighted Word2Vec

In [None]:
### load the vector if a tfidf_weighted_w2v representation of each sentence was already computed earlier and saved
### if not saved earlier, then compute an tfidf_weighted_w2v representation for all the sentences and then save
### the list of representation of each sentence

is os.path.exists('tfidf_weighted_word2vec.pkl'):
    with open('tfidf_weighted_word2vec.pkl', 'rb') as tfidf_weighted_w2v_pkl:
        tfidf_weighted_w2v = pickle.load(tfidf_weighted_w2v_pkl)
else:
    tfidf_weighted_w2v = []
    for sentence in stemmed_filtered_sorted_list_of_tokenized_sentences:
        tfidf_weighted_sum_of_vectors_for_each_word = 0
        for word in sentence:
            if word not in tfidf_vectorizer.vocabulary_ or word not in w2v.wv:
                continue
            tfidf_weighted_sum_of_vectors_for_each_word += tfidf_vectorizer.vocabulary_[word] * w2v.wv[word]
        tfidf_weighted_w2v.append(tfidf_weighted_sum_of_vectors_for_each_word)
    with open('tfidf_weighted_word2vec.pkl', 'wb') as tfidf_weighted_w2v_pkl:
        pickle.dump(tfidf_weighted_w2v, tfidf_weighted_w2v_pkl)

In [None]:
# tfidf_weighted_w2v = []
# for sentence in stemmed_filtered_sorted_list_of_tokenized_sentences:
#     tfidf_weighted_sum_of_vectors_for_each_word = 0
#     for word in sentence:
#         if word not in tfidf_vectorizer.vocabulary_ or word not in w2v.wv:
#             continue
#         tfidf_weighted_sum_of_vectors_for_each_word += tfidf_vectorizer.vocabulary_[word] * w2v.wv[word]
#     tfidf_weighted_w2v.append(tfidf_weighted_sum_of_vectors_for_each_word)

In [None]:
X = np.array(tfidf_weighted_w2v)

In [None]:
X.shape

In [None]:
X_train_first, X_test, y_train_first, y_test = train_test_splitter(X, y, test_size = 0.2)

In [None]:
X_train, X_cv, y_train, y_cv = train_test_splitter(X_train_first, y_train_first, test_size = 0.2)

### 1.4.1. Brute force k-NN

In [None]:
f1scores_for_diff_k = []

for i in range(1, 30, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'brute', save_name = 'tfidf_brute_knn')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

In [None]:
max_f1score = max(f1scores_for_diff_k)

In [None]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

In [None]:
### index = 10 corresponds to k = 21, ie, hyperparamter k is tuned to 21

with open(save_name + str(21) + '.pkl', 'rb') ass knn_pkl:
    knn = pickle.load(knn_pkl)

# knn = KNeighborsClassifier(n_neighbors = 21, algorithm = 'brute')

In [None]:
# knn = knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
f1_score(y_test, y_pred) * 100

### 1.4.2. kd_tree k-NN

In [None]:
f1scores_for_diff_k = []

for i in range(1, 30, 2):
    f1score = knn_trainer_and_cross_validator(i, X_train, y_train, X_cv, y_cv, algorithm = 'kd_tree', save_name = 'tfidf_kd_knn')
    f1scores_for_diff_k.append(f1score)
    print('F1-score for k = ' + str(i) + ' is ' + str(f1score))

In [None]:
max_f1score = max(f1scores_for_diff_k)

In [None]:
print(max_f1score, f1scores_for_diff_k.index(max_f1score))

In [None]:
### index = 10 corresponds to k = 21, ie, hyperparamter k is tuned to 21

with open(save_name + str(21) + '.pkl', 'rb') as knn_pkl:
    knn = pickle.load(knn_pkl)

# knn = KNeighborsClassifier(n_neighbors = 21, algorithm = 'kd_tree')

In [None]:
# knn = knn.fit(X_train, y_train)

In [None]:
f1_score(y_test, y_pred) * 100

# Summary

In [None]:
from prettytable import PrettyTable

In [None]:
pretty_table = PrettyTable()

In [None]:
pretty_table.field_names = ['Vectorizer', 'Model', 'Hyperparameter (k) value', 'F1-score']

In [None]:
pretty_table.add_row(['BoW', 'Brute k-NN', '11', '90.84'])
pretty_table.add_row(['BoW', 'kd_tree k-NN', '11', '90.823'])
pretty_table.add_row(['Tf-Idf', 'Brute k-NN', '13', '91.063'])
pretty_table.add_row(['Tf-Idf', 'kd_tree k-NN', '21', '90.885'])
pretty_table.add_row(['Avg_W2V', 'Brute k-NN', '11', '89.13'])
pretty_table.add_row(['Avg_W2V', 'kd_tree k-NN', '11', '90.187'])
pretty_table.add_row(['Tf-Idf_W2V', 'Brute k-NN', '21', '90.885'])
pretty_table.add_row(['Tf-Idf_W2V', 'kd_tree k-NN', '21', '90.885'])

In [None]:
print(pretty_table)