In [6]:
import warnings
warnings.filterwarnings('ignore')

## Import the dataset

In [7]:
import sqlite3
import pandas as pd

In [8]:
import os

In [9]:
os.listdir()

['tutorial_ipynbs',
 '.gitignore.swp',
 '.gitignore',
 'database.sqlite',
 '.ipynb_checkpoints',
 'KNN_Amazon_fine_food_dataset.ipynb']

In [10]:
db_connection = sqlite3.connect('database.sqlite')

In [11]:
polarisable_dataset = pd.read_sql_query('select * from reviews where Score != 3', db_connection)

In [12]:
polarisable_dataset.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [13]:
polarisable_dataset['Time'].head()

0    1303862400
1    1346976000
2    1219017600
3    1307923200
4    1350777600
Name: Time, dtype: int64

In [14]:
polarisable_dataset.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [15]:
df = polarisable_dataset # just to make typing easier

In [16]:
sampled_df = df.sample(n = 5000, axis = 0, random_state = 0) # random sampling of dataset

In [17]:
sampled_df.shape

(5000, 10)

## Replacing the ratings with 0 (for negative reviews) and 1 (for positive reviews).
#### Score of >3 has been considered as positive and a score of <3 has been taken as negative

In [18]:
type(sampled_df['Score'])

pandas.core.series.Series

In [19]:
scores = sampled_df['Score']

In [20]:
scores[6:12]

161937    5
451123    5
401436    5
396416    5
449554    1
379099    1
Name: Score, dtype: int64

In [21]:
scores = list(map(lambda x: 0 if x<3 else 1, scores))

In [22]:
scores[6:12]

[1, 1, 1, 1, 0, 0]

In [23]:
sampled_df['Score'] = scores

In [24]:
type(sampled_df['Score'].head(2))

pandas.core.series.Series

### Data preprocessing

##### 1. Deduplication
If a user id has multiple entries for the same timestamp, then it should be removed because it is likely that multiple entries at the same timestamp were for the same product of different variety which has a different product id than other variants


In [25]:
sampled_df.duplicated(subset = ['UserId', 'Time']).sum()

62

In [26]:
sampled_deduplicated_df = sampled_df.drop_duplicates(subset = ['UserId', 'Time'], inplace = False, keep = 'first')

#### 2. Extracting the data needed (corpus)
#### And removing html and punctuations

In [27]:
corpus = sampled_deduplicated_df['Text']

In [28]:
# dataset cleaners

import re

def remove_html(sentence):
    html_tag_re_obj = re.compile('<.*>?')
    return re.sub(html_tag_re_obj, ' ', sentence)

def remove_punctuations(sentence):
    cleaned_sentence = re.sub(r'[^a-zA-Z]', r' ', sentence)
    return cleaned_sentence

In [29]:
cleaned_corpus = []
for doc in corpus:
    cleaned_doc_1 = remove_html(doc)
    cleaned_doc_2 = remove_punctuations(doc)
    cleaned_corpus.append(cleaned_doc_2)

#### 3. Removing stop words

In [30]:
## Since the negative food reviews are likely to contain words like "don't", "didn't", etc that impart important
## meaning to the review, we check if such words exist in the corpus that we have. If these words are in the corpus,
## then they should not be in the list of stop words that we use for removing the stopwords from our corpus

count = 0
for doc in cleaned_corpus:
    if "not" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "don't" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "didn't" in doc:
        count += 1

print(count)

1890
0
0


In [31]:
from nltk.corpus import stopwords

In [32]:
stopwords = stopwords.words('english')

In [33]:
stopwords = set(stopwords)

In [34]:
stopwords.remove('not')

In [35]:
'not' in stopwords

False

In [36]:
a = [1,2,3,0,1,0,5]

In [37]:
# filtered_corpus = corpus with docs having no stop words
# doing with the sexy lambda expression

filtered_corpus = list(map(lambda doc: ' '.join(list(filter(lambda word: True if word not in stopwords else False\
                                                            , doc.split()))), corpus))

In [38]:
len(filtered_corpus)

4938

In [39]:
filtered_corpus[:2]

["We pretty much given GF pasta restaurant. It's really good disintegrate toss sauce. My celiac husband thrilled! FYI, first ingredient CORN, quinoa second, not QUITE nutritious might think -- cares!",
 'I bought first SproutMaster years ago Dowling Orchard Market passing Banning, California (back sold them).<br /><br />Through bit pricey plastic tray, I cannot complain ease use quality sprouts grown. The divider IS helpful grow different sprouts, BUT helpful grow type sprout two different stages growth. This way, always fresh sprouts handy anytime.<br /><br />They grow fast Sprout Master makes simple care them.']

In [40]:
### classical way of removing the lambda expressions
### verified the output of lambda expression output with the output of following implementation, outputs are same
# docs_without_stop_words = []
# for i, doc in enumerate(corpus):
#     non_stop_words_in_doc = []
#     for word in doc.split():
#         if word not in stopwords:
#             non_stop_words_in_doc.append(word)
            
    
#     docs_without_stop_words.append(' '.join(non_stop_words_in_doc))

#### 4. Stemming the words (SnowballStemmer)

In [41]:
from nltk.stem import SnowballStemmer

In [42]:
stemmer = SnowballStemmer('english')

In [43]:
stemmed_filtered_corpus = list(map(lambda doc: ' '.join(list(map(stemmer.stem, doc.split()))), corpus))

In [44]:
stemmed_filtered_corpus[:3]

["we had pretti much given up on gf pasta until we had this in a restaurant. it realli good and doesn't disintegr when you toss it with sauce. my celiac husband is thrilled! fyi, the first ingredi is corn, and quinoa is second, so it not quit as nutriti as you might think -- but who cares!",
 'i bought my first sproutmast year ago at the dowl orchard market when pass through banning, california (back when they sold them).<br /><br />through a bit pricey for a plastic tray, i cannot complain about the eas of use or the qualiti of the sprout grown. the divid is help to grow differ sprouts, but it is more help to grow the same type of sprout in two differ stage of growth. this way, you can alway have fresh sprout handi at anytime.<br /><br />they do grow fast and the sprout master make it so simpl to care for them.',
 'for bold coffe fan like us, this is absolut delici coffee. it has a complex taste, a hint of someth resembl red wine. i almost hesit to review it becaus it will probabl cau

## Sorting the dataset according to Time

In [45]:
sampled_deduplicated_df['Text'] = stemmed_filtered_corpus

In [46]:
working_df = sampled_deduplicated_df

In [47]:
working_df_sorted = working_df.sort_values(by = 'Time')

In [48]:
stemmed_filtered_corpus_sorted = working_df_sorted['Text']

## Vectorizing the reviews and splitting into train, cv and test sets and TRAINING and TESTING

### 1.1. Bag of Words (CountVectorizer)

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

In [50]:
count_vectorizer = CountVectorizer()

In [51]:
document_term_matrix = count_vectorizer.fit_transform(stemmed_filtered_corpus_sorted)

In [52]:
document_term_matrix.shape

(4938, 13452)

In [53]:
type(document_term_matrix)

scipy.sparse.csr.csr_matrix

In [54]:
X = document_term_matrix

In [55]:
y = working_df_sorted['Score']

### 1.1.1. Splitting into train, cv and test (Simple Cross Validation)

In [56]:
# from sklearn.model_selection import train_test_split

In [57]:
### This will not work because train_test_split() splits data randomly. What we want is a time-based splitting on
### the dataset that we have sorted chronologically
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 

In [58]:
print(X.shape, y.shape)

(4938, 13452) (4938,)


In [59]:
type(y)

pandas.core.series.Series

 * ## Function to split dataset into train and test datasets 

In [60]:
def train_test_splitter(X, y, test_size):
    train_size = 1 - test_size
    train_row_upper_index = round(train_size*X.shape[0])
    test_row_lower_index = train_row_upper_index + 1
    
#     print(train_row_upper_index)
    
    X_train = X[:train_row_upper_index + 1, :]
    X_test = X[test_row_lower_index:, :]
    y_train = y.iloc[:train_row_upper_index + 1]
    y_test = y.iloc[test_row_lower_index:]
    
    return X_train, X_test, y_train, y_test

In [61]:
X_train, X_test, y_train, y_test = train_test_splitter(X, y, test_size = 0.25)

In [62]:
type(X_train)

scipy.sparse.csr.csr_matrix

In [63]:
X_train.shape[0] + X_test.shape[0]

4938

In [64]:
### Splitting the previous X_train and y_train into X_train, X_cv and y_train, y_cv
X_train, X_cv, y_train, y_cv = train_test_splitter(X_train, y_train, 0.25)

### 1.1.3. Training and Testing

In [65]:
from sklearn.neighbors import KNeighborsClassifier

In [66]:
from sklearn.metrics import accuracy_score

* ## KNN Classifier and cross validator using Simple Cross Validation

In [67]:
def knn_trainer_and_cross_validator(k):
    
    # Side note: note that X_train and y_train are sparse matrices, and not numpy arrays
    knn = KNeighborsClassifier(k)
    knn.fit(X_train, y_train)
    
    y_pred_cv = knn.predict(X_cv)
    
    accuracy = accuracy_score(y_cv, y_pred_cv) * 100
    
    return accuracy

In [68]:
## tried for k = 1 till 30 and found that for odd k values in this range, accuracy always came below 86%.
## accuracy for k = 31 to 41 (inclusive) was same, 86.8876% and from 43 to 59 it was 86.7435%.
## accuracy for k = 61 to 90 is clearly less than the accuracy obtained for k = 31 to 41.
## we shall choose one of the k values from k = 31 to 41

for i in range(61, 90, 2):
    accuracy = knn_trainer_and_cross_validator(i)
    print('Accuracy for k = ' + str(i) + ' is: ' + str(accuracy))

Accuracy for k = 61 is: 83.13513513513513
Accuracy for k = 63 is: 83.13513513513513
Accuracy for k = 65 is: 83.13513513513513
Accuracy for k = 67 is: 83.13513513513513
Accuracy for k = 69 is: 83.13513513513513
Accuracy for k = 71 is: 83.13513513513513
Accuracy for k = 73 is: 83.13513513513513
Accuracy for k = 75 is: 83.13513513513513
Accuracy for k = 77 is: 83.13513513513513
Accuracy for k = 79 is: 83.13513513513513
Accuracy for k = 81 is: 83.13513513513513
Accuracy for k = 83 is: 83.13513513513513
Accuracy for k = 85 is: 83.13513513513513
Accuracy for k = 87 is: 83.13513513513513
Accuracy for k = 89 is: 83.13513513513513


In [69]:
##### Since the cross-validation accuracy obtained for k = 31, 33, 35, 37, 39 and 41 are same and max, we choose
##### k = 33.

In [70]:
knn = KNeighborsClassifier(n_neighbors = 33)

In [71]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=33, p=2,
           weights='uniform')

In [72]:
### finally testing after tuning the hyperparameter 'k' on the cross validation set
y_pred_test = knn.predict(X_test)

In [73]:
accuracy_score(y_test, y_pred_test)*100

83.29278183292782

### 1.2. Tf-Idf Vectorization

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [75]:
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,1))

In [77]:
tfidf_vectorizer = tfidf_vectorizer.fit(stemmed_filtered_corpus_sorted)

In [79]:
# tfidf_vectorizer.vocabulary_

In [80]:
document_term_matrix = tfidf_vectorizer.transform(stemmed_filtered_corpus_sorted)

In [81]:
type(document_term_matrix)

scipy.sparse.csr.csr_matrix

In [82]:
document_term_matrix.shape

(4938, 13452)

In [83]:
X = document_term_matrix

In [84]:
y = working_df_sorted['Score']

In [86]:
X_train_first, X_test, y_train_first, y_test = train_test_splitter(X, y, test_size = 0.25)

In [87]:
X_train, X_cv, y_train, y_cv = train_test_splitter(X_train_first, y_train_first, test_size = 0.25)

### 1.2.1. Training and Testing

In [93]:
accuracies_for_diff_k = []
for i in range(1, 100, 2):
    accuracy = knn_trainer_and_cross_validator(i)
    accuracies_for_diff_k.append(accuracy)
    print('Accuracy for k = ' + str(i) + ' is ' + str(accuracy))

Accuracy for k = 1 is 77.1891891891892
Accuracy for k = 3 is 82.16216216216216
Accuracy for k = 5 is 82.91891891891892
Accuracy for k = 7 is 83.56756756756756
Accuracy for k = 9 is 83.24324324324324
Accuracy for k = 11 is 83.24324324324324
Accuracy for k = 13 is 83.56756756756756
Accuracy for k = 15 is 83.35135135135135
Accuracy for k = 17 is 83.35135135135135
Accuracy for k = 19 is 83.35135135135135
Accuracy for k = 21 is 83.24324324324324
Accuracy for k = 23 is 83.24324324324324
Accuracy for k = 25 is 83.24324324324324
Accuracy for k = 27 is 83.24324324324324
Accuracy for k = 29 is 83.24324324324324
Accuracy for k = 31 is 83.24324324324324
Accuracy for k = 33 is 83.35135135135135
Accuracy for k = 35 is 83.24324324324324
Accuracy for k = 37 is 83.24324324324324
Accuracy for k = 39 is 83.24324324324324
Accuracy for k = 41 is 83.24324324324324
Accuracy for k = 43 is 83.24324324324324
Accuracy for k = 45 is 83.13513513513513
Accuracy for k = 47 is 83.13513513513513
Accuracy for k = 49 is

In [95]:
print(max(accuracies_for_diff_k), accuracies_for_diff_k.index(max(accuracies_for_diff_k)), sep = '\t')

83.56756756756756	3


In [97]:
### k = 3 had the best accuracy on cross validation set, so our hyperparameter k is tuned to be taken as 3

knn = KNeighborsClassifier(n_neighbors = 3)

In [98]:
## training the final 3-NN

knn = knn.fit(X_train, y_train)

In [99]:
### testing on test data

y_pred = knn.predict(X_test)

In [100]:
### accuracy score

accuracy_score(y_test, y_pred) * 100

81.58961881589619

### 1.3. Average W2V

In [102]:
import gensim

### 1.3.1. Tokenizing each document in the corpus
#### gensim w2v requires each document to be tokenized into words. The corpus will be a list of lists of words

In [108]:
## stemmed_filtered_corpus_sorted is a pandas Series object. It should be converted into a list first.
## after that each sentence in the resulted list should be tokenized into words stored in a list.
## all these lists should be stored into another list so as to give a list of lists as required by gensim w2v

stemmed_filtered_corpus_sorted_list = list(stemmed_filtered_corpus_sorted)

In [110]:
stemmed_filtered_sorted_list_of_tokenized_sentences = []

for sentence in stemmed_filtered_corpus_sorted:
    tokenized_sentence = sentence.split()
    stemmed_filtered_sorted_list_of_tokenized_sentences.append(tokenized_sentence)

In [112]:
w2v = gensim.models.Word2Vec(stemmed_filtered_sorted_list_of_tokenized_sentences, min_count = 1, size = 50, workers = 4)

In [118]:
type(w2v.wv)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [114]:
w2v.wv['happen']

array([ 0.23448707,  0.3602921 , -0.04205541,  0.24961829,  0.06703568,
       -0.56954414, -0.04139027, -0.10449755, -0.24704807,  0.3226019 ,
       -0.16563085, -0.06909975,  0.12574764,  0.7040844 ,  0.03780265,
        0.80083275, -0.12604529, -0.5191217 ,  0.08033808, -0.0251145 ,
        0.39306724,  0.41053078,  0.04081402,  0.20275305, -0.55352265,
       -0.14907712,  0.25555494, -0.18186875, -0.30055696, -0.39865583,
       -0.58898985,  0.06406677, -0.00765801, -0.01547717, -0.13172501,
        0.2741572 ,  0.08689801, -0.06458356, -0.21395421,  0.06347103,
        0.31692132, -0.01164706, -0.21336176,  0.19902211, -0.01482673,
       -0.14805146, -0.04206163,  0.53220314, -0.2595097 , -0.19508953],
      dtype=float32)

In [119]:
### saving the w2v model for later use

import os

if(not os.path.exists('w2v_practice.model')):
    w2v.save('w2v_practice.model')
    
else:
    w2v = gensim.models.Word2Vec.load('w2v_practice.model')