In [2]:
#importing essential libraries
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.externals import joblib
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

In [None]:
# using the SQLite Table to read data.
# `con` is the connection handle that the review query further down reads from.
con = sqlite3.connect('database.sqlite')

In [None]:
# filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
# (rows with Score = 3 are excluded by the WHERE clause itself, before pandas sees them)
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, con) 

# Label a numeric review score: below 3 is 'negative', anything else is 'positive'.
def partition(x):
    """Return 'negative' when x < 3, otherwise 'positive'."""
    return 'negative' if x < 3 else 'positive'

# Replace the numeric Score column with its sentiment label:
# scores below 3 become 'negative', every other score becomes 'positive' (see partition).
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative

In [None]:
# Number of reviews per sentiment label ('positive' / 'negative') after relabelling
filtered_data['Score'].value_counts()

In [None]:
# Sorting data according to ProductId in ascending order
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

# Dealing with Deduplication of entries: keep only the first row for every
# (UserId, ProfileName, Time, Text) combination. The column labels are passed
# as a list, the documented form for `subset`.
final = sorted_data.drop_duplicates(subset=["UserId", "ProfileName", "Time", "Text"], keep='first', inplace=False)

# Only the last expression of a cell is echoed, so the intermediate
# diagnostics are printed explicitly instead of being silently discarded.
print("Shape after de-duplication:", final.shape)

#Checking to see how much % of data still remains
print("Percent of filtered rows kept:", (final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100)

# Keep only rows whose HelpfulnessNumerator does not exceed HelpfulnessDenominator.
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]

In [None]:
# checking whether anything is NaN or not
# (sums the per-column null counts into one total; 0 means no missing cells)
final.isnull().sum().sum()

In [None]:
# Sorting the dataframe for time based slicing.
# sort_values returns a new frame rather than sorting in place, so the result
# has to be assigned back; otherwise the ordering is thrown away and any
# positional slice taken afterwards ignores Time.
final = final.sort_values(by=['Time'])

In [None]:
# Taking only the first 100k rows of `final`.
# (The original run reported 85197 positive and 14803 negative points;
# re-check with value_counts() after any change upstream.)
# .copy() makes the sample an independent frame, so adding columns to it later
# does not write into a slice of `final` (SettingWithCopyWarning).
sample100K_data = final.iloc[:100000,:].copy()

In [None]:
# Sanity check: show the Python type of the sampled data object.
type(sample100K_data)

In [None]:
# checking number of positive and negative points in the 100k sample
sample100K_data['Score'].value_counts()

In [None]:
# Preview the first rows of the sample.
sample100K_data.head()

In [None]:
# Locate the first review whose text contains something that looks like an
# HTML tag; print its position and the text, then stop searching.
import re
html_tag = re.compile('<.*?>')
i = 0
for sent in sample100K_data['Text'].values:
    if html_tag.search(sent) is not None:
        print(i)
        print(sent)
        break
    i += 1


import nltk
# Make sure the NLTK stopword corpus is available before it is read below.
nltk.download('stopwords')

# Both objects are module-level helpers read by the review-cleaning loop.
stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    """Replace every non-greedy `<...>` match in `sentence` with a single space."""
    return re.sub('<.*?>', ' ', sentence)
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    """Strip or blank out punctuation in `sentence`.

    First pass deletes the characters ? ! ' " # (and |, which is also a member
    of that character class). Second pass turns each of . , ( ) / into a single
    space. The `\\|` in the second class is an escaped pipe, so backslashes in
    the input are left untouched.
    """
    deleted_chars = re.compile(r'[?|!|\'|"|#]')
    spaced_chars = re.compile(r'[.|,|)|(|\|/]')
    return spaced_chars.sub(r' ', deleted_chars.sub(r'', sentence))

In [None]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
# For every review: strip HTML, split into words, strip punctuation, keep purely
# alphabetic tokens longer than two characters that are not stopwords, stem them,
# and collect the utf8-encoded stems.
i=0
str1=' '
final_string=[]       # one space-joined byte string of stems per review
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.
s=''
# Fetch both columns once; the original looked the Score array up again for
# every single word inside the innermost loop.
review_texts = sample100K_data['Text'].values
review_labels = sample100K_data['Score'].values
for sent, label in zip(review_texts, review_labels):
    filtered_sentence=[]
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            # skip anything that is not a purely alphabetic token of length > 2
            if not (cleaned_words.isalpha() and len(cleaned_words)>2):
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            s=(sno.stem(lowered)).encode('utf8')
            filtered_sentence.append(s)
            if label == 'positive':
                all_positive_words.append(s) #list of all words used to describe positive reviews
            if label == 'negative':
                all_negative_words.append(s) #list of all words used to describe negative reviews
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    final_string.append(str1)
    i+=1

In [None]:
# (rows, columns) of the sample before the CleanedText column is added.
sample100K_data.shape

In [None]:
# Add the CleanedText column: the pre-processed review text, decoded from the
# utf-8 byte strings collected in final_string.
cleaned_bytes = pd.Series(final_string, index=sample100K_data.index)
sample100K_data['CleanedText'] = cleaned_bytes.str.decode("utf-8")

In [None]:
# (rows, columns) of the sample after the CleanedText column was added.
sample100K_data.shape

In [None]:
# Taking out the class variable (the 'positive'/'negative' Score column) into a separate series.
target = sample100K_data['Score']

In [None]:
# Saving the class labels and the preprocessed 100k sample into separate files for future use.
# NOTE(review): sklearn.externals.joblib is not available in newer scikit-learn releases,
# where `import joblib` is used instead — confirm against the pinned scikit-learn version.
from sklearn.externals import joblib
joblib.dump(target, 'target.joblib')
joblib.dump(sample100K_data, 'sample100K_data_prepros.joblib')

In [1]:
# Reading back the previously saved class labels and the preprocessed 100k sample.
# NOTE(review): joblib.load unpickles the file — only load artifacts produced by this notebook.
from sklearn.externals import joblib
target = joblib.load('target.joblib')
sample100K_data = joblib.load('sample100K_data_prepros.joblib')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import model_selection
X = sample100K_data.iloc[:, :]
y = target

# First split: hold out 30% of the data as the test set, stratified on the label.
X_1, X_test, y_1, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Second split: carve 30% of the remainder off as the cross-validation set,
# again stratified on the label; what is left is the training set.
X_tr, X_cv, y_tr, y_cv = train_test_split(
    X_1, y_1, test_size=0.3, random_state=1, stratify=y_1
)

In [14]:
# Saving the train / cross-validation / test labels into separate files for future use.
from sklearn.externals import joblib
joblib.dump(y_tr, 'y_tr.joblib')
joblib.dump(y_cv, 'y_cv.joblib')
joblib.dump(y_test, 'y_test.joblib')

['y_test.joblib']

# Bag of Words (BoW) implementation for Feature Matrix

In [None]:
# Creating BoW model. Taking max_features = 2000 as obtained from literature.
# The vocabulary is fitted on the training split only (X_tr).
count_vect = CountVectorizer(max_features = 2000, min_df=50)
x_train_BOW = count_vect.fit_transform(X_tr['CleanedText'].values)

In [None]:
# Encode the CV and test splits with the vectorizer already fitted on the train split.
x_CV_BOW = count_vect.transform(X_cv['CleanedText'].values)
x_test_BOW = count_vect.transform(X_test['CleanedText'].values)

In [None]:
# Saving the train / CV / test BoW matrices for future use.
from sklearn.externals import joblib
joblib.dump(x_train_BOW, 'x_train_BOW.joblib')
joblib.dump(x_CV_BOW, 'x_CV_BOW.joblib')
joblib.dump(x_test_BOW, 'x_test_BOW.joblib')

# TF-IDF implementation for Feature Matrix

In [None]:
# Making a tf_idf vector. Taking max_features = 2000 and min_df=50 as obtained from literature.
# ngram_range=(1,2) means both unigrams and bigrams are used as features.
# The vectorizer is fitted on the train split only, then applied to CV and test.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2),max_features = 2000, min_df=50)
x_train_TFIDF = tf_idf_vect.fit_transform(X_tr['CleanedText'].values)
x_CV_TFIDF = tf_idf_vect.transform(X_cv['CleanedText'].values)
x_test_TFIDF = tf_idf_vect.transform(X_test['CleanedText'].values)

In [None]:
# Saving the train / CV / test TF-IDF matrices for future use.
from sklearn.externals import joblib
joblib.dump(x_train_TFIDF, 'x_train_TFIDF.joblib')
joblib.dump(x_CV_TFIDF, 'x_CV_TFIDF.joblib')
joblib.dump(x_test_TFIDF, 'x_test_TFIDF.joblib')

# Word2Vec (W2V)

In [4]:
# Train your own Word2Vec model using your own text corpus
# For train data: every cleaned review becomes a list of whitespace-separated tokens.
i=0
list_of_TRsent = [sent.split() for sent in X_tr['CleanedText'].values]

In [5]:
# min_count = 5 considers only words that occurred at least 5 times
# The model is trained on the train-split sentences only, with 50-dimensional vectors.
# NOTE(review): `size` is the pre-4.0 gensim keyword (later renamed `vector_size`) —
# confirm against the pinned gensim version.
w2v_model=Word2Vec(list_of_TRsent,min_count=5,size=50, workers=8)

# Avg W2V implementation for Feature Matrix

In [6]:
# For CV data: tokenise each cleaned review on whitespace
list_of_CVsent = [sent.split() for sent in X_cv['CleanedText'].values]

# For test data: same tokenisation
list_of_TSsent = [sent.split() for sent in X_test['CleanedText'].values]

In [7]:
# average Word2Vec
# compute average word2vec for each review.
# Vocabulary of the trained model as a plain list of words; Avg_W2V reads this global.
# NOTE(review): `wv.vocab` is the pre-4.0 gensim attribute — confirm against the pinned gensim version.
w2v_words = list(w2v_model.wv.vocab)
def Avg_W2V(list_of_sent):
    """Compute the average Word2Vec vector of every tokenised review.

    Reads the module-level `w2v_model` (trained model) and `w2v_words`
    (its vocabulary).

    Parameters
    ----------
    list_of_sent : iterable of iterables of str
        One sequence of tokens per review/sentence.

    Returns
    -------
    list of numpy.ndarray
        One vector per input review: the mean of the vectors of the tokens
        found in the vocabulary, or an all-zero vector when none are found.
    """
    # Membership tests on the `w2v_words` list scan the whole vocabulary for
    # every token; a set built once per call makes each lookup constant time.
    known_words = set(w2v_words)
    # Take the dimensionality from the model instead of hard-coding 50.
    vector_dim = w2v_model.wv.vector_size
    sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
    for sent in list_of_sent:
        sent_vec = np.zeros(vector_dim)  # running sum, starts as the zero vector
        cnt_words = 0  # num of words with a valid vector in the sentence/review
        for word in sent:
            if word in known_words:
                sent_vec += w2v_model.wv[word]
                cnt_words += 1
        # leave the zero vector untouched when no token was in the vocabulary
        if cnt_words != 0:
            sent_vec /= cnt_words
        sent_vectors.append(sent_vec)
    return sent_vectors

In [13]:
# Average Word2Vec features for each split.
# The train matrix has to be computed here as well: later cells convert and
# save x_train_AVG_W2V, which is otherwise undefined on a fresh kernel.
x_train_AVG_W2V = Avg_W2V(list_of_TRsent)
x_CV_AVG_W2V = Avg_W2V(list_of_CVsent)
x_test_AVG_W2V = Avg_W2V(list_of_TSsent)

In [15]:
# Wrap each list of per-review vectors in a DataFrame (one row per review).
x_train_AVG_W2V = pd.DataFrame(x_train_AVG_W2V)
x_CV_AVG_W2V = pd.DataFrame(x_CV_AVG_W2V)
x_test_AVG_W2V = pd.DataFrame(x_test_AVG_W2V)

In [16]:
# Saving the train / CV / test Avg W2V matrices for future use.
from sklearn.externals import joblib
joblib.dump(x_train_AVG_W2V, 'x_train_AVG_W2V.joblib')
joblib.dump(x_CV_AVG_W2V, 'x_CV_AVG_W2V.joblib')
joblib.dump(x_test_AVG_W2V, 'x_test_AVG_W2V.joblib')

['x_test_AVG_W2V.joblib']