In [22]:
## Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sqlite3    ## SQL Interface
import pickle     ## Used to save your data - Converts objects to byte stream and vice versa

from sklearn.feature_extraction.text import CountVectorizer  ## BOW Model
from sklearn.feature_extraction.text import TfidfVectorizer  ## TFIDF Model

from sklearn.manifold import TSNE    ## To visualize high dimensional data

## Modules to perform Text Preprocessing
import re
import nltk
from nltk.stem.snowball import SnowballStemmer as sno
from nltk.corpus import stopwords

import gensim    ## To build Word2Vec model

from sklearn.utils import shuffle

In [2]:
# Read the review table from the SQLite database.
conn = sqlite3.connect('database.sqlite')

try:
    # Keep only clearly positive or negative reviews:
    # rows whose Score equals 3 are excluded by the query.
    filtered_data = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3
""", conn)
finally:
    # Release the database handle even when the query raises
    # (e.g. the file exists but has no Reviews table).
    conn.close()

filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
def partition(x):
    '''Binarize a rating: return 0 when x is below 3, otherwise return 1.'''
    return 0 if x < 3 else 1

## Series.map calls partition on every element of the Score column.
## NOTE: Score is overwritten in place, so running this cell a second time sends
## the already-binarized 0/1 labels through partition again (both become 0).
filtered_data['Score'] = filtered_data.Score.map(partition)

In [4]:
# Remove duplicate entries: rows that agree on all four key columns count as
# duplicates, and drop_duplicates keeps the first occurrence by default.
# The key columns are passed as a list because `subset` is documented as a
# column label or a sequence of labels (a set is not a sequence).
final = filtered_data.drop_duplicates(subset=["UserId", "ProfileName", "Time", "Text"])
final.shape

(364173, 10)

In [5]:
# Keep only rows whose HelpfulnessNumerator does not exceed HelpfulnessDenominator.
final = final[final['HelpfulnessNumerator'] <= final['HelpfulnessDenominator']]
final.shape

(364171, 10)

In [6]:
# NLTK's English stop-word list, collected into a set.
stop = {*stopwords.words('english')}

In [7]:
# Words that must survive stop-word filtering: the notebook treats them as
# negation cues that are fundamental for the analysis.
lst = ['won', 'nor', 'not', 'against']
# difference_update silently skips words that are already absent, so this cell
# can be re-run safely (set.remove would raise KeyError on the second run).
stop.difference_update(lst)
print(stop)

{'where', "isn't", 'him', 'were', 'at', 'again', 've', 'be', "don't", 'other', 'no', 's', "couldn't", 'with', "doesn't", 'his', "you'll", "you've", 't', 'that', 'down', 'further', 'more', 'such', 'only', 'here', "needn't", 'before', "hasn't", 'we', "you'd", 'whom', 'having', 'couldn', 'this', 'of', "won't", 'does', 'you', 'had', 'just', "shouldn't", 'am', 'them', 'or', 'during', 'a', 'hadn', 'up', 'as', 'if', "hadn't", 'hasn', 'each', 'what', 'why', "wasn't", 'ain', 'hers', 'very', 'because', 'when', 'being', 'over', "wouldn't", 'after', 'below', 'these', 'has', 'in', 'once', 'shouldn', 'is', 'myself', 'itself', 'mustn', 'shan', 'me', 'ourselves', 'which', 'herself', "aren't", 'above', "shan't", 'ours', 'he', "mightn't", 'will', 'don', 'y', 'now', 'then', 'your', 'an', 'm', 'they', 'doing', 'didn', 'but', 'haven', 'ma', 'its', 'those', 'was', 'by', "weren't", 'between', 'into', 'most', 'any', 'under', 'wasn', 'how', 'her', "that'll", 'are', 'o', 'she', 're', 'doesn', 'themselves', 'nee

In [8]:
def cleanhtml(sentence):
    '''Return `sentence` with every html tag replaced by a single space.

    A tag is the shortest run of characters that starts with "<" and ends
    with ">" without spanning a line break.
    '''
    return re.sub('<.*?>', ' ', sentence)

In [9]:
def cleanpunc(sentence):
    '''Strip punctuation and special characters from `sentence`.

    Two passes are applied:
    1. The characters ? | @ ! ^ % ' " # are deleted outright.
    2. The characters . , ) ( \\ / are each replaced by a single space, so the
       words they separate do not get glued together.

    Returns the cleaned string.
    '''
    # Inside a character class "|" is a literal pipe, not an alternation
    # separator, so each character is listed exactly once.
    cleaned = re.sub(r'[?|@!^%\'"#]', r'', sentence)
    # "\\" is an escaped backslash; a pattern written as "\|" would only match
    # a pipe and leave backslashes in the text.
    cleaned = re.sub(r'[.,)(\\/]', r' ', cleaned)
    return cleaned

In [10]:
def preprocessing(series):
    '''Clean and tokenize every text cell of a pandas Series.

    Each cell goes through the following steps:
    1. html tags are replaced by spaces (cleanhtml)
    2. punctuation and special characters are stripped (cleanpunc)
    3. the text is split on whitespace and only purely alphabetic tokens
       (str.isalpha) longer than 2 characters are kept
    4. tokens are lower-cased and those found in the module-level `stop`
       set are dropped

    No stemming is applied; the tokens are kept as lower-cased words.

    Parameters:
    series - pandas Series whose cells are strings

    Return values:
    1. final_string - list with one cleaned, space-joined sentence per cell
    2. list_of_sent - list of token lists (one per cell), usable as input
                      to a W2V model
    '''
    final_string = []    ## One cleaned sentence per input cell
    list_of_sent = []    ## One list of tokens per input cell

    for sent in series.values:
        sent = cleanpunc(cleanhtml(sent))
        filtered_sent = []
        for raw_word in sent.split():
            ## Skip tokens containing non-letters and tokens of length 1 or 2
            if not (raw_word.isalpha() and len(raw_word) > 2):
                continue
            word = raw_word.lower()
            ## The stop-word check is done on the lower-cased form
            if word not in stop:
                filtered_sent.append(word)
        list_of_sent.append(filtered_sent)
        final_string.append(" ".join(filtered_sent))
    return final_string, list_of_sent

In [30]:
final_string, list_of_sent = preprocessing(final['Summary'])  #Preprocessing the 'Summary' column in the DataFrame

In [31]:
# Attach the cleaned summaries as a new column. assign() returns a new frame
# instead of writing into `final`, which was produced by filtering another
# frame; this avoids pandas' SettingWithCopyWarning.
final = final.assign(Summary_Cleaned=final_string)

In [32]:
# Preview the first five rows, now including the Summary_Cleaned column.
final.head(n=5)

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText_NoStem,CleanedText_Stemmed,Summary_Cleaned
253564,343681,89476,B0014WYXQK,A1WXFL6IXQKAM5,"Barb Caffrey ""writer-for-hire""",0,0,1,1208908800,"Tart, but good",Izze's All-Natural Sparkling Pomegranate Juice...,izzes sparkling pomegranate juice good tart ju...,izz sparkl pomegran juic good tart juic tast g...,tart good
123620,148757,232613,B000E243RA,A3EGPIZHPZ4Z4N,"vegasmama ""Angelgirl""",0,0,1,1347840000,Heavenly,Saw these online thought something different. ...,saw online thought something different healthy...,saw onlin thought someth differ healthi tasti ...,heavenly
111670,132523,14254,B0045XE32E,A3OSHB0AACYN95,Tom Tracy,0,0,1,1302048000,My Four Medium - Large Dogs Love Them!,"These ""certified organic"" (by Oregon Tilth) ba...",certified organic oregon tilth baked dog treat...,certifi organ oregon tilth bake dog treat made...,four medium large dogs love
91849,106421,66426,B001JKGQHQ,AGJ80C07WF11Z,pinjam,1,2,1,1297900800,By far the best chocolate,I first had these chocolates over 20 years ago...,first chocolates years ago mom would bring bac...,first chocol year ago mom would bring back wou...,far best chocolate
155724,194252,293177,B003YVMUFK,A27VBMB8NNKEJU,A. Patterson,3,3,1,1308441600,My dogs really like it,I needed a grain free canned dog food (to mix ...,needed grain free canned dog food mix grain fr...,need grain free can dog food mix grain free dr...,dogs really like


In [33]:
# Persist the tokenized summaries (a list of token lists) to disk with pickle.
with open('list_of_sent_summary.pkl', mode='wb') as pkl_out:
    pickle.dump(list_of_sent, pkl_out)

In [35]:
# Save the processed frame into a new sqlite file; an existing Reviews table
# in that file is replaced.
conn = sqlite3.connect('final2.sqlite')
try:
    final.to_sql('Reviews', conn, if_exists='replace')
finally:
    # Close the connection even if the write fails.
    conn.close()