In [128]:
import warnings
warnings.filterwarnings('ignore')

## Import the dataset

In [129]:
import sqlite3
import pandas as pd

In [130]:
import os

In [131]:
os.listdir()

['tutorial_ipynbs',
 '.gitignore.swp',
 '.gitignore',
 'database.sqlite',
 '.ipynb_checkpoints',
 'KNN_Amazon_fine_food_dataset.ipynb']

In [132]:
db_connection = sqlite3.connect('database.sqlite')

In [133]:
polarisable_dataset = pd.read_sql_query('select * from reviews where Score != 3', db_connection)

In [134]:
polarisable_dataset.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [135]:
polarisable_dataset['Time'].head()

0    1303862400
1    1346976000
2    1219017600
3    1307923200
4    1350777600
Name: Time, dtype: int64

In [136]:
polarisable_dataset.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [137]:
df = polarisable_dataset # just to make typing easier

In [138]:
sampled_df = df.sample(n = 5000, axis = 0, random_state = 0) # random sampling of dataset

In [139]:
sampled_df.shape

(5000, 10)

## Replacing the ratings with 0 (for negative reviews) and 1 (for positive reviews).
#### Score of >3 has been considered as positive and a score of <3 has been taken as negative

In [140]:
type(sampled_df['Score'])

pandas.core.series.Series

In [141]:
scores = sampled_df['Score']

In [142]:
scores[6:12]

161937    5
451123    5
401436    5
396416    5
449554    1
379099    1
Name: Score, dtype: int64

In [143]:
scores = list(map(lambda x: 0 if x<3 else 1, scores))

In [144]:
scores[6:12]

[1, 1, 1, 1, 0, 0]

In [145]:
sampled_df['Score'] = scores

In [146]:
type(sampled_df['Score'].head(2))

pandas.core.series.Series

### Data preprocessing

##### 1. Deduplication
If a user id has multiple entries for the same timestamp, then it should be removed because it is likely that multiple entries at the same timestamp were for the same product of different variety which has a different product id than other variants


In [147]:
sampled_df.duplicated(subset = ['UserId', 'Time']).sum()

62

In [148]:
sampled_deduplicated_df = sampled_df.drop_duplicates(subset = ['UserId', 'Time'], inplace = False, keep = 'first')

#### 2. Extracting the data needed (corpus)
#### And removing html and punctuations

In [149]:
corpus = sampled_deduplicated_df['Text']

In [150]:
# dataset cleaners

import re

def remove_html(sentence):
    html_tag_re_obj = re.compile('<.*>?')
    return re.sub(html_tag_re_obj, ' ', sentence)

def remove_punctuations(sentence):
    cleaned_sentence = re.sub(r'[^a-zA-Z]', r' ', sentence)
    return cleaned_sentence

In [151]:
cleaned_corpus = []
for doc in corpus:
    cleaned_doc_1 = remove_html(doc)
    cleaned_doc_2 = remove_punctuations(doc)
    cleaned_corpus.append(cleaned_doc_2)

#### 3. Removing stop words

In [152]:
## Since the negative food reviews are likely to contain words like "don't", "didn't", etc that impart important
## meaning to the review, we check if such words exist in the corpus that we have. If these words are in the corpus,
## then they should not be in the list of stop words that we use for removing the stopwords from our corpus

count = 0
for doc in cleaned_corpus:
    if "not" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "don't" in doc:
        count += 1

print(count)

count = 0
for doc in cleaned_corpus:
    if "didn't" in doc:
        count += 1

print(count)

1890
0
0


In [153]:
from nltk.corpus import stopwords

In [154]:
stopwords = stopwords.words('english')

In [155]:
stopwords = set(stopwords)

In [156]:
stopwords.remove('not')

In [157]:
'not' in stopwords

False

In [158]:
a = [1,2,3,0,1,0,5]

In [159]:
# filtered_corpus = corpus with docs having no stop words
# doing with the sexy lambda expression

filtered_corpus = list(map(lambda doc: ' '.join(list(filter(lambda word: True if word not in stopwords else False\
                                                            , doc.split()))), corpus))

In [160]:
len(filtered_corpus)

4938

In [161]:
filtered_corpus[:2]

["We pretty much given GF pasta restaurant. It's really good disintegrate toss sauce. My celiac husband thrilled! FYI, first ingredient CORN, quinoa second, not QUITE nutritious might think -- cares!",
 'I bought first SproutMaster years ago Dowling Orchard Market passing Banning, California (back sold them).<br /><br />Through bit pricey plastic tray, I cannot complain ease use quality sprouts grown. The divider IS helpful grow different sprouts, BUT helpful grow type sprout two different stages growth. This way, always fresh sprouts handy anytime.<br /><br />They grow fast Sprout Master makes simple care them.']

In [162]:
### classical way of removing the lambda expressions
### verified the output of lambda expression output with the output of following implementation, outputs are same
# docs_without_stop_words = []
# for i, doc in enumerate(corpus):
#     non_stop_words_in_doc = []
#     for word in doc.split():
#         if word not in stopwords:
#             non_stop_words_in_doc.append(word)
            
    
#     docs_without_stop_words.append(' '.join(non_stop_words_in_doc))

#### 4. Stemming the words (SnowballStemmer)

In [163]:
from nltk.stem import SnowballStemmer

In [164]:
stemmer = SnowballStemmer('english')

In [165]:
stemmed_filtered_corpus = list(map(lambda doc: ' '.join(list(map(stemmer.stem, doc.split()))), corpus))

In [166]:
stemmed_filtered_corpus[:3]

["we had pretti much given up on gf pasta until we had this in a restaurant. it realli good and doesn't disintegr when you toss it with sauce. my celiac husband is thrilled! fyi, the first ingredi is corn, and quinoa is second, so it not quit as nutriti as you might think -- but who cares!",
 'i bought my first sproutmast year ago at the dowl orchard market when pass through banning, california (back when they sold them).<br /><br />through a bit pricey for a plastic tray, i cannot complain about the eas of use or the qualiti of the sprout grown. the divid is help to grow differ sprouts, but it is more help to grow the same type of sprout in two differ stage of growth. this way, you can alway have fresh sprout handi at anytime.<br /><br />they do grow fast and the sprout master make it so simpl to care for them.',
 'for bold coffe fan like us, this is absolut delici coffee. it has a complex taste, a hint of someth resembl red wine. i almost hesit to review it becaus it will probabl cau

## Sorting the dataset according to Time

In [167]:
sampled_deduplicated_df['Text'] = stemmed_filtered_corpus

In [168]:
working_df = sampled_deduplicated_df

In [169]:
working_df_sorted = working_df.sort_values(by = 'Time')

In [170]:
stemmed_filtered_corpus_sorted = working_df_sorted['Text']

## Vectorizing the reviews and splitting into train, cv and test sets

### 1.1. Bag of Words (CountVectorizer)

In [171]:
from sklearn.feature_extraction.text import CountVectorizer

In [172]:
count_vectorizer = CountVectorizer()

In [173]:
document_term_matrix = count_vectorizer.fit_transform(stemmed_filtered_corpus_sorted)

In [174]:
document_term_matrix.shape

(4938, 13452)

In [175]:
type(document_term_matrix)

scipy.sparse.csr.csr_matrix

In [176]:
X = document_term_matrix

In [177]:
y = working_df_sorted['Score']

### 1.2. Splitting into train, cv and test

In [178]:
# from sklearn.model_selection import train_test_split

In [179]:
### This will not work because train_test_split() splits data randomly. What we want is a time-based splitting on
### the dataset that we have sorted chronologically
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 

In [181]:
print(X.shape, y.shape)

(4938, 13452) (4938,)


In [195]:
type(y)

pandas.core.series.Series

#### 1.2.1. Function to split dataset into train and test datasets

In [196]:
def train_test_splitter(X, y, test_size):
    train_size = 1 - test_size
    train_row_upper_index = round(train_size*X.shape[0])
    test_row_lower_index = train_row_upper_index + 1
    
#     print(train_row_upper_index)
    
    X_train = X[:train_row_upper_index + 1, :]
    X_test = X[test_row_lower_index:, :]
    y_train = y.iloc[:train_row_upper_index + 1]
    y_test = y.iloc[test_row_lower_index:]
    
    return X_train, X_test, y_train, y_test

In [197]:
X_train, X_test, y_train, y_test = train_test_splitter(X, y, test_size = 0.25)

In [199]:
X_train.shape[0] + X_test.shape[0]

4938