In [1]:
import warnings
warnings.filterwarnings('ignore')

### Import the sql dataset

In [2]:
import sqlite3
import pandas as pd

connection = sqlite3.connect('database.sqlite')

# polarisable_dataset = dataset that contains Score = {1,2,4,5} assuming Score = 3 implies neutral comments and
# Score < 3 implies negative comment and Score > 3 implies positive comment
polarisable_dataset = pd.read_sql_query('select * from REVIEWS WHERE Score != 3', connection)
polarisable_dataset.shape

(525814, 10)

### Replace values in Score column in polarisable dataset with 'positive' and 'negative'

In [3]:
scores = polarisable_dataset['Score']

polarised_scores = scores.map(lambda x: 0 if x<3 else 1)

# polarised_scores.head()

polarisable_dataset['Score'] = polarised_scores
polarised_dataset = polarisable_dataset

In [4]:
polarised_dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Exploratory Data Analysis

##### 1. Deduplication
If a user id has multiple entries for the same timestamp, then it should be removed because it is likely that multiple entries at the same timestamp were for the same product of different variety which has a different product id than other variants

##### Observation 1- There are 197082 duplicate entries (using rule-1)

In [5]:
print(polarised_dataset.duplicated(['UserId', 'Time']).sum())



197082


In [6]:
deduplicated_dataset = polarised_dataset.drop_duplicates(subset = {'UserId', 'Time'}, keep = 'first', inplace = False)
deduplicated_dataset.shape

(328732, 10)

#### 2. Text preprocessing- Stemming of words

In [7]:
deduplicated_dataset.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [8]:
deduplicated_dataset['Text'].head(1)

0    I have bought several of the Vitality canned d...
Name: Text, dtype: object

In [9]:
# dataset cleaners

import re

def remove_html(sentence):
    html_tag_re_obj = re.compile('<.*>?')
    return re.sub(html_tag_re_obj, ' ', sentence)

def remove_punctuations(sentence):
    cleaned_sentence = re.sub(r'[^a-zA-Z]', r' ', sentence)
    return cleaned_sentence

In [10]:
# clean dataset

corpus = deduplicated_dataset['Text']

cleaned_corpus = []
for doc in corpus.values:
    cleaned_doc = remove_html(doc)
    cleaned_doc = remove_punctuations(cleaned_doc)
    cleaned_corpus.append(cleaned_doc)

deduplicated_dataset['Text'] = cleaned_corpus

#### 2. Text preprocessing- Removing stop words

In [11]:
from nltk.corpus import stopwords

In [12]:
print(set(stopwords.words('english')))

{'ain', 'through', 'own', 'for', 'we', "needn't", 'were', 'should', 'isn', 'ourselves', 'why', 'aren', "isn't", 'd', 'an', 'which', 'our', 'their', 'the', 'wasn', 'herself', 'a', 's', "aren't", 'on', 'shouldn', 'myself', 'other', "hasn't", 'same', 'about', 'theirs', "don't", 'him', "doesn't", 'until', 'where', 'has', 'once', 'wouldn', 'more', 'y', 'again', 'too', 'll', 'his', 'below', 'under', 'and', 'her', 'this', 'he', 'it', 'of', 'between', 'me', 'you', "won't", "you've", 'them', 'been', 'but', 'needn', 've', "didn't", 'both', 'not', "that'll", 'couldn', "couldn't", 'itself', 'very', 'above', 'after', 'during', 'i', 'hadn', 'before', "wouldn't", 'into', 'mightn', 'because', 'from', 'just', "wasn't", 'haven', 'to', 'further', 'that', 'now', "haven't", 'doesn', 'here', "shan't", 'she', 'while', 'each', 'most', 'do', 'ma', 'being', 'against', 'themselves', "mustn't", "weren't", 'only', 'don', 'nor', 're', 'yourselves', 'as', 'any', 'few', 'hasn', 'or', 'my', "shouldn't", 'no', 'who', '

In [13]:
## Since the negative food reviews are likely to contain words like "don't", "didn't", etc that impart important
## meaning to the review, we check if such words exist in the corpus that we have. If these words are in the corpus,
## then they should not be in the list of stop words that we use for removing the stopwords from our corpus

count = 0
for doc in deduplicated_dataset['Text']:
    if "not" in doc:
        count += 1

print(count)

count = 0
for doc in deduplicated_dataset['Text']:
    if "don't" in doc:
        count += 1

print(count)

count = 0
for doc in deduplicated_dataset['Text']:
    if "didn't" in doc:
        count += 1

print(count)

104301
0
0


In [14]:
stopwords = stopwords.words('english')

In [30]:
## 'not' is present in 104301 docs in the corpus, so we modify the list of stopwords to not contain this word

stopwords_set = set(stopwords)

stopwords_set.remove('not')

In [32]:
print(stopwords_set)

{'ain', 'through', 'own', 'for', 'we', "needn't", 'were', 'should', 'isn', 'ourselves', 'why', 'aren', "isn't", 'd', 'an', 'which', 'our', 'their', 'the', 'wasn', 'herself', 'a', 's', "aren't", 'on', 'shouldn', 'myself', 'other', "hasn't", 'same', 'about', 'theirs', "don't", 'him', "doesn't", 'until', 'where', 'has', 'once', 'wouldn', 'more', 'y', 'again', 'too', 'll', 'his', 'below', 'under', 'and', 'her', 'this', 'he', 'it', 'of', 'between', 'me', 'you', "won't", "you've", 'them', 'been', 'but', 'needn', 've', "didn't", 'both', "that'll", 'couldn', "couldn't", 'itself', 'very', 'above', 'after', 'during', 'i', 'hadn', 'before', "wouldn't", 'into', 'mightn', 'because', 'from', 'just', "wasn't", 'haven', 'to', 'further', 'that', 'now', "haven't", 'doesn', 'here', "shan't", 'she', 'while', 'each', 'most', 'do', 'ma', 'being', 'against', 'themselves', "mustn't", "weren't", 'only', 'don', 'nor', 're', 'yourselves', 'as', 'any', 'few', 'hasn', 'or', 'my', "shouldn't", 'no', 'who', 'whom', 

In [34]:
corpus = deduplicated_dataset['Text'] # corpus contains cleaned docs
print(type(corpus))

<class 'pandas.core.series.Series'>


In [None]:
# TODO
for i, doc in enumerate(corpus):
    for word in doc:
        if word in stopwords_set:
            doc = doc.replace(word, '')
    corpus[i] = doc

#### 3. Using tf-idf to identify stop words in the corpus

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range = (2, 2))

In [None]:
corpus = deduplicated_dataset['Text'] # deduplicated_dataset['Text'] has been replaced with cleaned corpus
corpus.head(2)

In [None]:
sparse_matrix = tfidf_vectorizer.fit_transform(corpus)

In [None]:
type(sparse_matrix)

In [None]:
sparse_matrix.get_shape()

In [None]:
# tfidf_vectorizer.get_feature_names()

In [None]:
# check = tfidf_vectorizer.fit_transform(corpus.head(2))
# tfidf_vectorizer.get_feature_names()

In [None]:
tfidf_vectorizer.vocabulary_