In [None]:
#loading needed libraries
import numpy as np
import pandas as pd
import seaborn as sns
import re
import spacy 
import sqlite3
%matplotlib inline
import nltk
#nltk.download()
from nltk.stem import SnowballStemmer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
import lime 
import lime.lime_tabular
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 60)
from IPython.core.pylabtools import figsize
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from mlxtend.classifier import StackingClassifier
from sklearn.preprocessing import LabelEncoder  
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder  
le = LabelEncoder()
import matplotlib.pyplot as plt

#change sample size
sample_size = 1000

In [None]:
#importing the amazon reviews CSV file
#Df is the database we will be using for our analysis
#fulldf keeps the full, untouched dataset so we can test assumptions and rebalance the classes later
# Read the CSV from disk once; `df` starts as an independent copy so that
# later edits to `df` can never touch the pristine `fulldf`.
fulldf = pd.read_csv("Reviews.csv")
df = fulldf.copy()

In [None]:
#getting information on the dataset
df.info()

In [None]:
#to be able to properly run the program we took a fraction of the dataset 
#random subset of the total dataset
#frac = 0.05 is 5% of the dataset
df = df.sample(frac = 0.05, random_state=42)

In [None]:
#this gives information on the new dataset
df.info()

In [None]:
#first 5 rows of the sample dataset
df.head(5)

In [None]:
#first step of clean is taking the needed columns 
df = df[["Score","ProductId","Summary","Text"]]

In [None]:
#this function cleans up all special characters generated in the amazon review database 
# Build the stemmer once; constructing it on every call is pure overhead when
# `clean` is applied row by row to a DataFrame column.
_STEMMER = SnowballStemmer('english')


def clean(s):
    """Normalise one raw review string into space-separated, stemmed tokens.

    Pipeline: lower-case -> drop digits -> turn <lb>/<br> markup into
    whitespace -> unescape basic HTML entities -> strip URLs -> expand English
    contractions -> delete punctuation -> blank out anything that is not an
    ASCII letter/digit -> Snowball-stem every token.

    Parameters
    ----------
    s : object
        Raw text. It is passed through ``str()`` first, so non-string values
        are accepted (they are cleaned as their string representation).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (may be empty).
    """
    s = str(s).lower()
    s = re.sub(r'\d+', '', s)

    # Markup and HTML entities.
    s = s.replace('<lb>', "\n")
    s = re.sub(r'<br */*>', "\n", s)
    s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
    s = s.replace("&amp;", "&")  # second pass catches double-escaped "&amp;amp;"

    # URLs: stop at whitespace as well as at ")". The previous pattern only
    # stopped at ")", so a bare URL swallowed the remainder of the review.
    s = re.sub(r'\(*https*://[^\s\)]*\)*', "", s)

    # Contractions must be expanded while the apostrophes are still present;
    # they are wiped by the punctuation passes below.
    s = re.sub(r"what's", "what is ", s)
    s = re.sub(r"'s", " ", s)
    s = re.sub(r"'ve", " have ", s)
    s = re.sub(r"n't", " not ", s)
    s = re.sub(r"i'm", "i am ", s)
    s = re.sub(r"'re", " are ", s)
    s = re.sub(r"'d", " would ", s)
    s = re.sub(r"'ll", " will ", s)

    # These characters are deleted outright (no space is inserted).
    s = re.sub(r'\*', '', s)
    s = re.sub(r"[-()\"#/&@;:<>{}`+=~|.!?,]", "", s)
    # Everything else that is not an ASCII letter or digit becomes a space.
    # After this line only [a-z0-9 ] can remain, which is why the former
    # per-symbol substitutions (",", ".", "!", "^", "+", ...) were dead code.
    s = re.sub(r"[^A-Za-z0-9]", " ", s)

    s = re.sub(r" e g ", " eg ", s)
    s = re.sub(r" b g ", " bg ", s)

    # split() already collapses runs of whitespace.
    return " ".join(_STEMMER.stem(word) for word in s.split())

In [None]:
#first 5 rows of the dataset
df.head(5)


In [None]:
#graph plotting the distribution of scores and sentiment of the customers 
#dataset is unbalanced
fig = plt.figure(figsize=(8,6))
df.groupby("Score").Summary.count().plot.bar(ylim=0)
plt.show()

In [None]:
#frequency of unbalanced dataset 
# `np.object` was deprecated in NumPy 1.20 and later removed; the builtin
# `object` selects the same (string/object dtype) columns.
df[["Score","ProductId","Summary","Text"]].describe(include=object).transpose()


In [None]:
df.describe().transpose()

In [None]:
#since we have an unbalanced dataset we reloaded the full dataset
#to be able to balance it 
shuffled_df = fulldf.sample(frac=1,random_state=4)

In [None]:
shuffled_df.info()
shuffled_df["Score"].value_counts()


In [None]:
#each of these represents all the reviews with a score from 1 to 5 which is stored into a seperate sets 
one_df = shuffled_df.loc[shuffled_df['Score'] == 1]
two_df = shuffled_df.loc[shuffled_df['Score'] == 2]
four_df = shuffled_df.loc[shuffled_df['Score'] == 4]
five_df = shuffled_df.loc[shuffled_df['Score'] == 5]

In [None]:
#the smallest class (score 2) has about 29k reviews, so we undersample each of the four scores we keep (1, 2, 4 and 5) to 25k; score 3 is left out
one_df = shuffled_df.loc[shuffled_df['Score'] == 1].sample(n=25000,random_state=42)
two_df = shuffled_df.loc[shuffled_df['Score'] == 2].sample(n=25000,random_state=42)
four_df = shuffled_df.loc[shuffled_df['Score'] == 4].sample(n=25000,random_state=42)
five_df = shuffled_df.loc[shuffled_df['Score'] == 5].sample(n=25000,random_state=42)

In [None]:
def score_sentiment(Score):
    """Map a star rating to a sentiment label.

    Ratings equal to 1 or 2 are "negative"; every other value is "positive".
    """
    return "negative" if Score in (1, 2) else "positive"
    
    
def binary_sentiment(sentiment):
    """Encode a sentiment label as an integer: 0 for "negative", 1 otherwise."""
    return 0 if sentiment == "negative" else 1

In [None]:
#combine the four per-score samples (scores 1, 2, 4 and 5) into one balanced dataset 
normalized_df = pd.concat([one_df, two_df, four_df, five_df])

In [None]:
normalized_df['sentiment'] = normalized_df["Score"].apply (score_sentiment)

In [None]:
normalized_df["binary_sentiment"] = normalized_df["sentiment"].apply(binary_sentiment)

In [None]:
normalized_df.tail(30)

In [None]:
#plot shows that each of the four retained scores has 25k reviews, for a total of 100k entries 
fig = plt.figure(figsize=(8,6))
normalized_df.groupby("Score").Summary.count().plot.bar(ylim=0)
plt.show()

In [None]:
#information on th dataset
normalized_df.info()

In [None]:
#we reduce the balanced dataset to a random sample of sample_size rows (set in the first cell)
#so that the rest of the notebook runs in a reasonable time
df = normalized_df.sample(n = sample_size,random_state=42)

In [None]:
df.info()

In [None]:
#graph reprsenting each of the scores number of datapoints 
fig = plt.figure(figsize=(8,6))
df.groupby("Score").Summary.count().plot.bar(ylim=0)
plt.show()

In [None]:
#data cleaning
#only kept the columns we needed
#used the clean function to remove special characters

df = df[['ProductId','Score','Summary','Text', 'sentiment', "binary_sentiment"]]  # Using only the useful columns
df["Text"] = df.Text.apply(clean)
df["Summary"] = df.Summary.apply(clean)
#clean the dataset further by removing punctuation
import string
table = str.maketrans(' ', ' ', string.punctuation)
df["Text"] = [w.translate(table) for w in df["Text"]]
df["Summary"] = [w.translate(table) for w in df["Summary"]]

In [None]:
df.head(5)  # Outputting the data set 


In [None]:
#removed all duplicate reviews
# `np.object` no longer exists in current NumPy; the builtin `object` is the
# supported spelling. For Text, a `unique` value below `count` means some
# cleaned reviews occur more than once.
df[["Score","ProductId","Summary","Text", 'sentiment']].describe(include=object).transpose()


In [None]:
# NOTE(review): keep=False removes EVERY row whose cleaned Text is duplicated,
# including the first occurrence. Confirm this is intended rather than
# keep="first", which would retain one copy of each review.
df.drop_duplicates(subset ="Text",keep = False, inplace = True) # drop reviews whose cleaned Text appears more than once

In [None]:
df.info()                            #info of the data

In [None]:
# `np.object` no longer exists in current NumPy; use the builtin `object`.
# After de-duplication, `unique` should equal `count` for Text.
df[["Score","ProductId","Summary","Text", 'sentiment']].describe(include=object).transpose()

In [None]:
df.describe().transpose()  # Average rating of products 

In [None]:
#stop words function
#added t.lower because the stop_words are all lower case 

stop_words = set(stopwords.words('english'))
#function to remove stopwords
def remove_stopwords(tokens):
    """Return the tokens that are not stop words, in their original order.

    Each token is lower-cased for the membership test against the global
    ``stop_words`` collection; the token itself is returned unchanged.

    NOTE: the Word2Vec cell further down imports gensim's ``remove_stopwords``
    under this same name (it takes a string, not a token list), so after that
    cell has run this definition is shadowed.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

#return_uncommon collects the rare words and puts them in a list
#it takes a dictionary that maps each word (key) to the number of times it occurs (value)
#and returns the words that occur at most `number` times
def return_uncommon(words, number):
    """Return the words that occur at most ``number`` times.

    Parameters
    ----------
    words : dict
        Maps each word to its number of occurrences.
    number : int
        Inclusive upper bound on the occurrence count.

    Returns
    -------
    list
        The qualifying words, in the dictionary's iteration order
        (empty when ``words`` is empty or nothing qualifies).
    """
    # One pass over the dictionary is all that is needed; the former
    # `while i != len(words)` wrapper only ever ran its body once.
    return [word for word, repetition in words.items() if repetition <= number]


In [None]:
#ran our TfidfVectorizer on the cleaned review Text (unigrams and bigrams)
# NOTE(review): the vectorizer is fitted on every row before the train/test
# splits below, so document frequencies from test rows leak into the features.

tfidf = TfidfVectorizer(min_df=10, norm='l2', encoding='latin-1', 
                        ngram_range=(1, 2),
                        # scikit-learn documents 'english', a list or None for
                        # stop_words; recent releases reject a set, so pass a
                        # sorted (deterministic) list instead.
                        stop_words=sorted(stop_words))

features = tfidf.fit_transform(df.Text).toarray()

labels = df.sentiment
labels_binary = df.binary_sentiment
features.shape

In [None]:
#created a function to split sentences into words vectors (tokens)

def my_tokenizer(text):
    """Split a text into whitespace-separated tokens.

    Returns an empty list when ``text`` is None or consists of a single
    punctuation mark (one of . , ! ) ( "); otherwise ``text.split()``.
    """
    # The previous test `text != (None or "." or ...)` collapsed to
    # `text != "."`: None crashed on .split() and the other marks were
    # never checked.
    if text is None or text in (".", ",", "!", ")", "(", "\""):
        return []
    return text.split()



In [None]:
# Flatten the per-review token lists in one linear pass. Series.sum() on lists
# concatenates pairwise, which is quadratic in the total number of tokens
# (and yields the int 0 instead of [] for an empty frame).
tokens = [token for review_tokens in df.Text.map(my_tokenizer) for token in review_tokens]


In [None]:
#count of words before removal of stopwords
#we notice that most words are stopwords
from collections import Counter
#count the number of times the tokens come
counter = Counter(tokens) 
counter.most_common(20)

In [None]:


#count words without stopwords 
counter = Counter(remove_stopwords(tokens))
list_counter = dict(counter)
uncommon_words = return_uncommon(list_counter,2)
stop_words = stop_words.union(uncommon_words)
counter = Counter(remove_stopwords(tokens))

In [None]:
#20 most common words 
counter.most_common(20)

In [None]:
#frequency plot of the most common words 
freq_df = pd.DataFrame.from_records(counter.most_common(20), columns = ["token", "count"])

freq_df.plot(kind="bar", x="token")

In [None]:
#counting the 20 most common bigrams 
bi_grams = list(ngrams(remove_stopwords(tokens), 2)) 
bi_counter = Counter(bi_grams)
bi_counter.most_common(20)

In [None]:
#frequency plot of the most common bigrams 
freq_df = pd.DataFrame.from_records(bi_counter.most_common(20), columns = ["token", "count"])

freq_df.plot(kind="bar", x="token")

In [None]:
#imported wordcloud function
from wordcloud import WordCloud

#created a word cloud visualize the most common words 
def wordcloud(counter):
    """Draw a word cloud from token frequencies.

    Parameters
    ----------
    counter : mapping
        Token -> frequency, handed unchanged to
        ``WordCloud.generate_from_frequencies``.

    The cloud is rendered at 1200x800 on a white background with at most 200
    words and shown with matplotlib; nothing is returned.
    """
    wc = WordCloud(width=1200, height=800,
                   background_color ="white",
                   max_words=200)
    wc.generate_from_frequencies(counter)

    #plot (the figure handle is not used afterwards, so it is not stored)
    plt.figure(figsize=(6, 4))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
wordcloud(counter)

In [None]:
#Word 2 Vec Model

In [None]:
import nltk
from gensim.models import word2vec
from gensim.parsing.preprocessing import remove_stopwords

# tokenize sentences in corpus
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(text) for text in df["Text"]]
cleaned_tokenized_corpus = [wpt.tokenize(remove_stopwords(text)) for text in df["Text"]]

# Set values for various parameters
feature_size = 100    # Word vector dimensionality  
window_context = 5          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, 
                          window=window_context, min_count=min_word_count,
                          sample=sample, iter=50)

clean_w2v_model = word2vec.Word2Vec(cleaned_tokenized_corpus, size=feature_size, 
                          window=window_context, min_count=min_word_count,
                          sample=sample, iter=50)

In [None]:
# view similar words based on gensim's model
similar_words = {search_term: [item[0] for item in w2v_model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['good', 'bad', 'food', 'market']}
similar_words

In [None]:
# view similar words based on gensim's model without stopwords 
clean_similar_words = {search_term: [item[0] for item in clean_w2v_model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['good', 'bad', 'food', 'market']}
clean_similar_words

In [None]:
from sklearn.manifold import TSNE

words = sum([[k] + v for k, v in similar_words.items()], [])
wvs = w2v_model.wv[words]

tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
w2v_labels = words

plt.figure(figsize=(14, 8))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for w2v_label, x, y in zip(w2v_labels, T[:, 0], T[:, 1]):
    plt.annotate(w2v_label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

In [None]:
#removing stop words
from sklearn.manifold import TSNE

words = sum([[k] + v for k, v in clean_similar_words.items()], [])
wvs = clean_w2v_model.wv[words]

tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
w2v_labels = words

plt.figure(figsize=(14, 8))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for w2v_label, x, y in zip(w2v_labels, T[:, 0], T[:, 1]):
    plt.annotate(w2v_label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

In [None]:
#word embeddings using word2vec
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the embedding vectors of the in-vocabulary words of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of the document.
    model : object
        Either a trained Word2Vec-style model exposing its vectors as
        ``model.wv`` or any mapping that supports ``mapping[word]``.
    vocabulary : container of str
        Words for which a vector may be looked up; other tokens are skipped.
    num_features : int
        Length of the zero vector returned when no token is in the vocabulary.

    Returns
    -------
    numpy.ndarray
        float64 mean vector, or zeros of shape ``(num_features,)``.
    """
    # Index the keyed vectors rather than the model itself: `model[word]` is
    # deprecated in gensim 3.x and was removed in gensim 4.
    vectors = getattr(model, "wv", model)

    known_vectors = [vectors[word] for word in words if word in vocabulary]
    if not known_vectors:
        return np.zeros((num_features,), dtype="float64")
    return np.mean(known_vectors, axis=0, dtype="float64")
    
   
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged embedding per document.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenised documents.
    model : object
        Trained Word2Vec-style model; its vocabulary is read from ``model.wv``.
    num_features : int
        Embedding dimensionality, forwarded to ``average_word_vectors``.

    Returns
    -------
    numpy.ndarray
        One row per document, produced by ``average_word_vectors``.
    """
    keyed_vectors = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`; accept either so the
    # helper works on both major versions.
    vocab_words = getattr(keyed_vectors, "index_to_key", None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    vocabulary = set(vocab_words)

    w2v_features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(w2v_features)


# get document level embeddings
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
pd.DataFrame(w2v_feature_array)

In [None]:
# tokenize sentences in corpus
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(text) for text in df["Text"]]
cleaned_tokenized_corpus = [wpt.tokenize(remove_stopwords(text)) for text in df["Text"]]

# Set values for various parameters
feature_size = 100    # Word vector dimensionality  
window_context = 5          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, 
                          window=window_context, min_count=min_word_count,
                          sample=sample, iter=50)

clean_w2v_model = word2vec.Word2Vec(cleaned_tokenized_corpus, size=feature_size, 
                          window=window_context, min_count=min_word_count,
                          sample=sample, iter=50)

#PROJECT MILESTONE 2 


Tested 4 models Random Forest, Linear SVC, Multinomial NB and Logistic Regression

Linear SVC

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from mlxtend.plotting import plot_learning_curves
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score

model = LinearSVC()

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
linearsvc_pred = y_pred

In [None]:

conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = ["negative", "positive"], yticklabels=["negative", "positive"])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
print(metrics.classification_report(y_test, y_pred))

In [None]:
#plot learning curves
    
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, model, print_model=False, style='ggplot')
plt.show()

Random Forest Classifier

In [None]:

model = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42)

# Use the same split seed (42) as the Linear SVC, Naive Bayes and Logistic
# Regression cells. With random_state=0 the forest was evaluated on a
# different test set, so `rfc_pred` did not line up with the `y_test` used
# later when the ROC curves of all four models are drawn together.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rfc_pred = y_pred

In [None]:
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = ["negative", "positive"], yticklabels=["negative","positive"])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
print(metrics.classification_report(y_test, y_pred))

In [None]:
#plot learning curves
    
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, model, print_model=False, style='ggplot')
plt.show()

In [None]:
model = MultinomialNB()

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
NB_pred = y_pred
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = ["negative", "positive"], yticklabels=["negative", "positive"])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

In [None]:
#plot learning curves
    
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, model, print_model=False, style='ggplot')
plt.show()

Logistic Regression

In [None]:
model = LogisticRegression(random_state=42)

X_train, X_test, y_train, y_test,  indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
lgr_pred = y_pred
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = ["negative", "positive"], yticklabels=["negative","positive"])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
print(metrics.classification_report(y_test, y_pred))

In [None]:
#plot learning curves
    
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, model, print_model=False, style='ggplot')
plt.show()

In [None]:
import scikitplot as skplt
from sklearn.metrics import roc_curve, auc
lw = 2

# Fit the encoder ONCE on the true labels and reuse that mapping for every
# prediction vector. Refitting on each array (as before) silently flips the
# encoding whenever a model predicts only one class.
y_test_enc = le.fit_transform(y_test)

# (legend name, line colour, predictions). Every prediction vector must come
# from the same test split as `y_test`.
roc_models = [
    ('RFC', 'red', rfc_pred),
    ('Linear SVC', 'blue', linearsvc_pred),
    ('Naive Bayes', 'darkorange', NB_pred),
    ('Logistic Reg', 'green', lgr_pred),
]

plt.figure()
# NOTE: the curves are built from hard class predictions, so each one has a
# single operating point rather than a full threshold sweep.
for model_name, color, pred in roc_models:
    pred_enc = le.transform(pred)
    fpr, tpr, _ = metrics.roc_curve(y_test_enc, pred_enc)
    auc_value = metrics.roc_auc_score(y_test_enc, pred_enc)
    plt.plot(fpr, tpr, color=color, lw=lw,
             label='ROC curve %s (area = %0.2f)' % (model_name, auc_value))
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.title('ROC Curve with TFIDF Features')
plt.legend(loc=4)
plt.show()


BAGGING

In [None]:
%matplotlib inline

import itertools

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions


le = LabelEncoder()

In [None]:
X, y = X_train, le.fit_transform(y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
clf1 = DecisionTreeClassifier(criterion='entropy', max_depth=10)
clf2 = KNeighborsClassifier(n_neighbors=1)    

bagging1 = BaggingClassifier(base_estimator=clf1, n_estimators=10, max_samples=0.8, max_features=0.8)
bagging2 = BaggingClassifier(base_estimator=clf2, n_estimators=10, max_samples=0.8, max_features=0.8)

In [None]:
# Keep the list of names and the loop variables under distinct identifiers.
# Reusing `label` (list, then str) and `clf` as loop targets made this cell
# give wrong labels when re-run and clobbered `clf` for later cells.
clf_names = ['Decision Tree', 'K-NN', 'Bagging Tree', 'Bagging K-NN']
clf_list = [clf1, clf2, bagging1, bagging2]


for estimator, clf_name in zip(clf_list, clf_names):
    # 3-fold cross-validation on the training portion, scored twice.
    scores = cross_val_score(estimator, X, y, cv=3, scoring='f1')
    accuracy_scores = cross_val_score(estimator, X, y, cv=3, scoring='accuracy')

    print ("F1 Score: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), clf_name))
    print ("Acccuracy Score: %.2f (+/- %.2f) [%s]" %(accuracy_scores.mean(), accuracy_scores.std(), clf_name))

   

Bagging learning curves (TF-IDF features)

In [None]:
#plot learning curves
    
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, bagging1, print_model=False, style='ggplot')
plt.show()

BOOSTING

In [None]:
#XOR dataset
#X = np.random.randn(200, 2)
#y = np.array(map(int,np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)))
    
clf = DecisionTreeClassifier(criterion='entropy', max_depth=1)

num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=10)']

In [None]:
for n_est, label in zip(num_est, label):     
    boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est)   
    boosting.fit(X, y)


In [None]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=10, learning_rate=learning_rate)
        
    plt.figure()
    plot_learning_curves(X_train, y_train, X_test, y_test, boosting, print_model=False, style='ggplot')
    plt.show()
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(boosting.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(boosting.score(X_test, y_test)))

Adaboost With F1 Score

In [None]:
#Ensemble Size (identical for every learning rate, so computed once)
num_est = np.linspace(1,20,10).astype(int)

for learning_rate in lr_list:
    # Reset per learning rate; after the loop these hold the LAST rate only.
    bg_clf_cv_mean = []
    bg_clf_cv_std = []
    for n_est in num_est:
        ada_clf = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est,learning_rate=learning_rate)
        scores = cross_val_score(ada_clf, X, y, cv=3, scoring='f1')
        bg_clf_cv_mean.append(scores.mean())
        bg_clf_cv_std.append(scores.std())

    # Plot while this rate's scores are still available: they are overwritten
    # on the next iteration, so plotting after the loop can only show the
    # final learning rate.
    plt.figure()
    (_, caps, _) = plt.errorbar(num_est, bg_clf_cv_mean, yerr=bg_clf_cv_std, c='blue', fmt='-o', capsize=5)
    for cap in caps:
        cap.set_markeredgewidth(1)
    plt.ylabel('F1 Score'); plt.xlabel('Ensemble Size'); plt.title('AdaBoost Ensemble learning rate of: '+ str(learning_rate));
    plt.show()

In [None]:
# bg_clf_cv_mean / bg_clf_cv_std hold the scores of the last learning rate
# evaluated only, so draw that one curve with its true learning rate instead
# of repeating the same data under every title in lr_list.
learning_rate = lr_list[-1]
plt.figure()
(_, caps, _) = plt.errorbar(num_est, bg_clf_cv_mean, yerr=bg_clf_cv_std, c='blue', fmt='-o', capsize=5)
for cap in caps:
    cap.set_markeredgewidth(1)
plt.ylabel('F1 Score'); plt.xlabel('Ensemble Size'); plt.title('AdaBoost Ensemble learning rate of: '+ str(learning_rate));
plt.show()

Adaboost with Accuracy Score

In [None]:
#Ensemble Size (identical for every learning rate, so computed once)
num_est = np.linspace(1,20,10).astype(int)

for learning_rate in lr_list:
    # Reset per learning rate; after the loop these hold the LAST rate only.
    acc_bg_clf_cv_mean = []
    acc_bg_clf_cv_std = []
    for n_est in num_est:
        ada_clf = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est,learning_rate=learning_rate)
        scores = cross_val_score(ada_clf, X, y, cv=3, scoring='accuracy')
        acc_bg_clf_cv_mean.append(scores.mean())
        acc_bg_clf_cv_std.append(scores.std())

    # Plot while this rate's scores are still available: they are overwritten
    # on the next iteration, so plotting after the loop can only show the
    # final learning rate.
    plt.figure()
    (_, caps, _) = plt.errorbar(num_est, acc_bg_clf_cv_mean, yerr=acc_bg_clf_cv_std, c='blue', fmt='-o', capsize=5)
    for cap in caps:
        cap.set_markeredgewidth(1)
    plt.ylabel('Accuracy'); plt.xlabel('Ensemble Size'); plt.title('AdaBoost Ensemble learning rate of: '+ str(learning_rate));
    plt.show()

In [None]:
# acc_bg_clf_cv_mean / acc_bg_clf_cv_std hold the scores of the last learning
# rate evaluated only, so draw that one curve with its true learning rate
# instead of repeating the same data under every title in lr_list.
learning_rate = lr_list[-1]
plt.figure()
(_, caps, _) = plt.errorbar(num_est, acc_bg_clf_cv_mean, yerr=acc_bg_clf_cv_std, c='blue', fmt='-o', capsize=5)
for cap in caps:
    cap.set_markeredgewidth(1)
plt.ylabel('Accuracy'); plt.xlabel('Ensemble Size'); plt.title('AdaBoost Ensemble learning rate of: '+ str(learning_rate));
plt.show()

STACKING

In [None]:
import warnings

warnings.simplefilter('ignore')

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr1 = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2],meta_classifier=lr1)
label = ['KNN', 'Random Forest', "Naive Bayes", 'Stacking Classifier']
clf_list = [clf1, clf2, clf3, sclf]

clf_cv_mean = []
clf_cv_std = []
acc_clf_cv_mean = []
acc_clf_cv_std = []

for clf, label in zip(clf_list, label):
        
    scores = cross_val_score(clf, X, y, cv=2, scoring='f1')
    accuracy_scores = cross_val_score(clf, X, y, cv=2, scoring='accuracy')
    print ("F1 Score: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
    print ("Accuracy: %.2f (+/- %.2f) [%s]" %(accuracy_scores.mean(), accuracy_scores.std(), label))

    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())
    acc_clf_cv_mean.append(accuracy_scores.mean())
    acc_clf_cv_std.append(accuracy_scores.std())

In [None]:
#plot classifier accuracy    
plt.figure()
(_, caps, _) = plt.errorbar(range(4), clf_cv_mean, yerr=clf_cv_std, c='blue', fmt='-o', capsize=4)
for cap in caps:
    cap.set_markeredgewidth(1)                                                                                                                                
plt.xticks(range(4), ['KNN', 'RF', "Naive Bayes", 'Stacking Classifier'])        
plt.ylabel('f1'); plt.xlabel('Classifier'); plt.title('Stacking Ensemble W/ F1 Score');
plt.show()

In [None]:
#plot classifier accuracy    
plt.figure()
(_, caps, _) = plt.errorbar(range(4), acc_clf_cv_mean, yerr=acc_clf_cv_std, c='blue', fmt='-o', capsize=4)
for cap in caps:
    cap.set_markeredgewidth(1)                                                                                                                                
plt.xticks(range(4), ['KNN', 'RF', "Naive Bayes", 'Stacking Classifier'])        
plt.ylabel('accurancy'); plt.xlabel('Classifier'); plt.title('Stacking Ensemble W/ Accuracy Score');
plt.show()

In [None]:
#plot learning curves
    
plt.figure()
plot_learning_curves(X_train, le.fit_transform(y_train), X_test, le.fit_transform(y_test), sclf, print_model=False, style='ggplot')
plt.show()

Linear SVC with Word2Vec 

In [None]:
SVCmodel = LinearSVC()
X = w2v_feature_array
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle = True, stratify = None)
SVCmodel.fit(X_train, y_train)
y_pred = SVCmodel.predict(X_test)
linearsvc_pred = y_pred

In [None]:
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = ["negative", "positive"], yticklabels=["negative","positive"])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
print(metrics.classification_report(y_test, y_pred))

In [None]:
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, SVCmodel, print_model=False, style='ggplot')
plt.show()

Random Forest with Word2Vec

In [None]:
RfCmodel = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle = True, stratify = None)
RfCmodel.fit(X_train, y_train)
y_pred = RfCmodel.predict(X_test)
rfc_pred = y_pred
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = ["negative","positive"], yticklabels=["negative","positive"])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
print(metrics.classification_report(y_test, y_pred))

In [None]:
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, RfCmodel, print_model=False, style='ggplot')
plt.show()

Gaussian Naive Bayes with Word2Vec

In [None]:
#using Gaussian NB to take into account negative values 
from sklearn.naive_bayes import GaussianNB
GNBmodel = GaussianNB()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle = True, stratify = None)
GNBmodel.fit(X_train, y_train)
y_pred = GNBmodel.predict(X_test)
gnb_pred = y_pred
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = ["negative","positive"], yticklabels=["negative","positive"])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
print(metrics.classification_report(y_test, y_pred))

In [None]:
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, GNBmodel, print_model=False, style='ggplot')
plt.show()

In [None]:
model = LogisticRegression(random_state=42)

# This cell belongs to the Word2Vec comparison, so train on the averaged
# Word2Vec document vectors. It previously re-used the TF-IDF `features`,
# which put a TF-IDF model on the Word2Vec ROC plot. Seed and test size match
# the other Word2Vec cells, so the test rows are the same.
X_train, X_test, y_train, y_test,  indices_train, indices_test = train_test_split(w2v_feature_array, labels, df.index, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
lgr_pred = y_pred
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = ["negative", "positive"], yticklabels=["negative","positive"])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
print(metrics.classification_report(y_test, y_pred))

In [None]:
#plot learning curves
    
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, model, print_model=False, style='ggplot')
plt.show()

In [None]:
# Fit the encoder ONCE on the true labels and reuse that mapping for every
# prediction vector. Refitting on each array silently flips the encoding
# whenever a model predicts only one class.
y_test_enc = le.fit_transform(y_test)

# (legend name, line colour, predictions). The Naive Bayes entry uses
# `gnb_pred` (GaussianNB on Word2Vec vectors); `NB_pred` is the MultinomialNB
# trained on TF-IDF features and does not belong on this plot.
roc_models = [
    ('RFC', 'red', rfc_pred),
    ('Linear SVC', 'blue', linearsvc_pred),
    ('Naive Bayes', 'darkorange', gnb_pred),
    ('Logistic Reg', 'green', lgr_pred),
]

plt.figure()
# NOTE: the curves are built from hard class predictions, so each one has a
# single operating point rather than a full threshold sweep.
for model_name, color, pred in roc_models:
    pred_enc = le.transform(pred)
    fpr, tpr, _ = metrics.roc_curve(y_test_enc, pred_enc)
    auc_value = metrics.roc_auc_score(y_test_enc, pred_enc)
    plt.plot(fpr, tpr, color=color, lw=lw,
             label='ROC curve %s (area = %0.2f)' % (model_name, auc_value))
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.title('ROC Curve with Word2Vec Features')
plt.legend(loc=4)
plt.show()


Ensemble Models using Word2Vec

Bagging using Word2Vec

In [None]:
X, y = X_train, le.fit_transform(y_train)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#XOR dataset
#X = np.random.randn(200, 2)
#y = np.array(map(int,np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)))
    
clf = DecisionTreeClassifier(criterion='gini', max_depth=10)



In [None]:
# Build the two base learners here instead of reusing `clf1`/`clf2`: the
# STACKING section rebinds those names to a K-NN and a Random Forest, so the
# 'Decision Tree' and 'K-NN' rows would otherwise report the wrong models.
# Hyper-parameters match the base estimators wrapped by bagging1/bagging2.
w2v_tree = DecisionTreeClassifier(criterion='entropy', max_depth=10)
w2v_knn = KNeighborsClassifier(n_neighbors=1)

clf_names = ['Decision Tree', 'K-NN', 'Bagging Tree', 'Bagging K-NN']
clf_list = [w2v_tree, w2v_knn, bagging1, bagging2]


# Distinct loop names so the name list and `clf` are not overwritten.
for estimator, clf_name in zip(clf_list, clf_names):
    scores = cross_val_score(estimator, X, y, cv=3, scoring='f1')
    accuracy_scores = cross_val_score(estimator, X, y, cv=3, scoring='accuracy')

    print ("F1 Score: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), clf_name))
    print ("Acccuracy Score: %.2f (+/- %.2f) [%s]" %(accuracy_scores.mean(), accuracy_scores.std(), clf_name))

In [None]:
#plot learning curves
# Learning curves for `bagging1` (listed above as 'Bagging Tree') on the
# current train/test split.
    
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, bagging1, print_model=False, style='ggplot')
plt.show()

BOOSTING WITH WORD2VEC

In [None]:
   
# Depth-limited decision tree used as the weak learner for the AdaBoost runs below
clf = DecisionTreeClassifier(max_depth=10, criterion='gini')

# Ensemble sizes to try, with one display label per size
num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=%d)' % size for size in num_est]

In [None]:
# Fit one AdaBoost ensemble per size in `num_est`; `boosting` ends up holding
# the last (largest) one. The loop no longer uses `label` as its target:
# doing so replaced the label list with a single string, so re-running this
# cell iterated over that string's characters instead of the labels.
for n_est in num_est:
    boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est)
    boosting.fit(X, y)

In [None]:
# Learning rates to compare for a 10-estimator AdaBoost ensemble
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    boosting = AdaBoostClassifier(
        base_estimator=clf,
        n_estimators=10,
        learning_rate=learning_rate,
    )

    # Learning curves for this rate
    plt.figure()
    plot_learning_curves(X_train, y_train, X_test, y_test, boosting, print_model=False, style='ggplot')
    plt.show()

    # NOTE(review): `boosting` is never fitted explicitly here; the scores below
    # presumably rely on plot_learning_curves leaving it fitted — confirm.
    print("Learning rate: ", learning_rate)
    train_accuracy = boosting.score(X_train, y_train)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    validation_accuracy = boosting.score(X_test, y_test)
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))

AdaBoost

In [None]:
# For every learning rate: cross-validate AdaBoost (3-fold F1) over a range of
# ensemble sizes, then plot that rate's own curve.
#
# The scoring and plotting used to live in two separate loops/cells. Because
# the result lists are reset for each learning rate, only the last rate's
# scores survived the first loop, and the second loop then drew that same curve
# once per learning rate under different titles. Scoring and plotting now
# happen in the same iteration so each figure shows the matching results.

#Ensemble Size
num_est = np.linspace(1, 20, 10).astype(int)

for learning_rate in lr_list:
    bg_clf_cv_mean = []
    bg_clf_cv_std = []
    for n_est in num_est:
        ada_clf = AdaBoostClassifier(base_estimator=clf, n_estimators=n_est, learning_rate=learning_rate)
        scores = cross_val_score(ada_clf, X, y, cv=3, scoring='f1')
        bg_clf_cv_mean.append(scores.mean())
        bg_clf_cv_std.append(scores.std())

    # Mean F1 per ensemble size, error bars = standard deviation across folds
    plt.figure()
    (_, caps, _) = plt.errorbar(num_est, bg_clf_cv_mean, yerr=bg_clf_cv_std, c='blue', fmt='-o', capsize=5)
    for cap in caps:
        cap.set_markeredgewidth(1)
    plt.ylabel('f1')
    plt.xlabel('Ensemble Size')
    plt.title('AdaBoost Ensemble learning rate of: ' + str(learning_rate))
    plt.show()

Stacking

In [None]:
# Base learners and the logistic-regression meta learner for the stack
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr1 = LogisticRegression()
# NOTE(review): only clf1 and clf2 feed the stack; clf3 is only scored on its
# own below — confirm that leaving it out of `classifiers` is intended.
sclf = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=lr1)
label = ['KNN', 'Random Forest', "Naive Bayes", 'Stacking Classifier']
clf_list = [clf1, clf2, clf3, sclf]

# Per-classifier cross-validation results (F1 and accuracy: mean and std)
clf_cv_mean = []
clf_cv_std = []
acc_clf_cv_mean = []
acc_clf_cv_std = []

# Dedicated loop names keep the notebook-level `clf` and the `label` list intact.
for candidate, candidate_label in zip(clf_list, label):

    acc_scores = cross_val_score(candidate, X, y, cv=2, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" % (acc_scores.mean(), acc_scores.std(), candidate_label))
    scores = cross_val_score(candidate, X, y, cv=2, scoring='f1')
    print("F1 Score: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), candidate_label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())
    # Accuracy lists must be filled from the accuracy scores (they were
    # previously filled from the F1 `scores` by mistake).
    acc_clf_cv_mean.append(acc_scores.mean())
    acc_clf_cv_std.append(acc_scores.std())

In [None]:
# F1 score per classifier; error bars show the standard deviation across folds
positions = range(4)
plt.figure()
_, caps, _ = plt.errorbar(positions, clf_cv_mean, yerr=clf_cv_std, c='blue', fmt='-o', capsize=4)
for cap in caps:
    cap.set_markeredgewidth(1)
plt.xticks(positions, ['KNN', 'RF', "Naive Bayes", 'Stacking Classifier'])
plt.ylabel('F1')
plt.xlabel('Classifier')
plt.title('Stacking Ensemble')
plt.show()

In [None]:
# Learning curves for the stacking classifier on the current split
plt.figure()
# Labels are passed through the encoder before plotting (train first, then
# test, so `le` ends up fitted on the test labels).
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.fit_transform(y_test)
plot_learning_curves(
    X_train, y_train_encoded,
    X_test, y_test_encoded,
    sclf,
    print_model=False,
    style='ggplot',
)
plt.show()

Interpreting and Explaining Models

In [None]:
#LIME's classification mode needs class probabilities, which LinearSVC does not provide, so we used LightGBM, which returns a probability

In [None]:
# Encode the labels, then split features, labels and row index 70/30
new_labels = le.fit_transform(labels)
X, y = features, new_labels
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y, df.index,
    test_size=0.3,
    random_state=42,
)


In [None]:


import lightgbm as lgb

# LightGBM settings for a binary classifier.
lgb_params = {
    'task': 'train',
    'boosting_type': 'goss',
    'objective': 'binary',
    # The dict used to contain 'metric' twice; Python keeps only the last
    # value of a repeated key, so 'binary_logloss' was silently discarded and
    # {'l2', 'auc'} was what LightGBM received. The effective metrics are kept,
    # but as a list so their order is deterministic (a set has no stable order).
    'metric': ['l2', 'auc'],
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbose': None,
    # NOTE(review): this looks like it conflicts with num_boost_round=20 passed
    # to lgb.train below — confirm which iteration count is intended.
    'num_iteration': 100,
    'num_threads': 7,
    'max_depth': 12,
    'min_data_in_leaf': 100,
    'alpha': 0.5}

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test)


# training the lightgbm model; the held-out split is used for early stopping
model = lgb.train(lgb_params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, early_stopping_rounds=5)



# this is required as LIME requires class probabilities in case of classification example
# LightGBM directly returns probability for class 1 by default 

def prob(data):
    """Return two-column class probabilities for LIME.

    Calls the notebook-level ``model.predict`` once on ``data`` and treats the
    result as the probability of class 1; column 0 is its complement.

    Parameters
    ----------
    data : array-like
        Rows to score, in whatever form ``model.predict`` accepts.

    Returns
    -------
    numpy.ndarray
        Array with one row per input row and columns ``[1 - p, p]``.
    """
    # Predict once and derive both columns from the same result (the previous
    # version ran the model twice for every call).
    positive = np.asarray(model.predict(data))
    return np.column_stack((1 - positive, positive))
    


# LIME tabular explainer in classification mode, built from the full feature
# matrix and the encoded labels
explainer = lime.lime_tabular.LimeTabularExplainer(
    features,
    training_labels=new_labels,
    mode='classification',
)




In [None]:
#this function takes a row number and explains the prediction by giving the probability and showing
#which features impacted the result
def model_explained(i):
    """Explain the prediction for row ``i`` of ``features`` with LIME.

    Uses the notebook-level ``explainer`` with ``prob`` as the prediction
    function, then renders the explanation as a pyplot figure and inline in
    the notebook. Returns nothing.
    """
    explanation = explainer.explain_instance(features[i], predict_fn=prob)

    # Show the explanation: first as a pyplot figure, then in the notebook
    explanation.as_pyplot_figure()
    explanation.show_in_notebook()

In [None]:
#row 456
#we see here that the sentiment is more positive than negative
model_explained(456)

In [None]:
#row 678
model_explained(678)

Model Interpretation with different models applied in project 

In [None]:
def _make_estimator(name):
    """Return a fresh, unfitted classifier for a lower-cased model name, or None if the name is unknown."""
    if name == "randomf":
        return RandomForestClassifier()
    if name == "gaussiannb":
        return GaussianNB()
    if name == "multinb":
        return MultinomialNB()
    if name == "logreg":
        return LogisticRegression()
    if name == "linearsvc":
        return LinearSVC()
    return None


def model_interpret(model, vec, row_number):
    """Fit a classifier on one feature set and show a LIME explanation for one row.

    Parameters
    ----------
    model : str
        Classifier name, case-insensitive: "RandomF", "GaussianNB", "MultiNB",
        "LogReg" or "LinearSVC".
    vec : str
        Feature set, case-insensitive: "Tfidf" (the historical spelling
        "tfdif" is still accepted) selects ``features``; "Word2Vec" selects
        ``w2v_feature_array``.
    row_number : int or int-like
        Index of the row of the selected feature matrix to explain.

    Returns
    -------
    None
        On invalid input a message is printed and nothing else happens.
        Otherwise the result of ``show_in_notebook()`` is returned after the
        explanation has been displayed.

    Notes
    -----
    The classifier is fitted on the whole selected feature matrix. LIME is run
    in 'regression' mode with ``predict`` as the prediction function, so
    classifiers without probability output (e.g. LinearSVC) can be explained
    as well. The notebook-level encoder ``le`` is re-fitted on ``labels``.
    """
    model_name = str(model).lower()
    vec_name = str(vec).lower()
    row_number = int(row_number)

    if model_name == "multinb" and vec_name == "word2vec":
        print("Multinomial does not take negative values please pick other model or choose tfidf Word embeddings")
        return None

    # Accept the correct spelling that the error message asks for, and keep the
    # old misspelling so existing calls continue to work.
    if vec_name in ("tfidf", "tfdif"):
        X = features
    elif vec_name == "word2vec":
        X = w2v_feature_array
    else:
        print("Please enter Tfidf or Word2Vec")
        return None

    estimator = _make_estimator(model_name)
    if estimator is None:
        print("Please enter RandomF, GaussianNB, MultiNB, LogReg or LinearSVC")
        return None

    # Validate the row against the matrix that is actually indexed below, and
    # do it before the (potentially slow) fit rather than after it.
    n_rows = np.shape(X)[0]
    if row_number < 0:
        print("Please enter number bigger or equal to 0")
        return None
    if row_number >= n_rows:
        print("Please enter number smaller or equal to " + str(n_rows - 1))
        return None

    new_labels = le.fit_transform(labels)

    # Fit on the selected features
    estimator.fit(X, new_labels)

    # Create a lime explainer object
    explainer = lime.lime_tabular.LimeTabularExplainer(training_data=X,
                                                       mode='regression',
                                                       training_labels=new_labels)

    # Explain the requested row using the fitted model's hard predictions
    exp = explainer.explain_instance(X[row_number], predict_fn=estimator.predict)

    # Plot the prediction explanation
    return exp.show_in_notebook()
        

In [None]:
# Run the same interpretation for every feature-set / model combination,
# in the original order, instead of ten copy-pasted cells that differ only in
# their arguments. The last combination (MultiNB on Word2Vec) is expected to
# print the "does not take negative values" message instead of an explanation.
EXPLAINED_ROW = 258
INTERPRET_RUNS = [
    ('tfdif', 'GaussianNB'),
    ('tfdif', 'randomf'),
    ('tfdif', 'logreg'),
    ('tfdif', 'multinb'),
    ('tfdif', 'linearsvc'),
    ('word2vec', 'linearsvc'),
    ('word2vec', 'logreg'),
    ('word2vec', 'randomf'),
    ('word2vec', 'GaussianNB'),
    ('word2vec', 'multinb'),
]

for vec_name, model_name in INTERPRET_RUNS:
    # Header line so each explanation in the shared output can be told apart
    print("model=%s, vec=%s, row=%d" % (model_name, vec_name, EXPLAINED_ROW))
    model_interpret(model=model_name, vec=vec_name, row_number=EXPLAINED_ROW)

Additional Tests

Text Classification Using LSTM W/ Tfidf

In [None]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# Others
import string


In [None]:
# Pad/truncate every row of `features` to length 50.
# NOTE(review): if `features` holds TF-IDF weights rather than token ids,
# pad_sequences' default integer dtype would truncate them — confirm the
# intended input for the Embedding layer.
data = pad_sequences(features, maxlen=50)

## Network architecture: embedding -> LSTM -> single sigmoid unit
model = Sequential([
    Embedding(20000, 100, input_length=50),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## Fit the model on the encoded labels, holding out 40% for validation
encoded_labels = le.fit_transform(labels)
model.fit(data, encoded_labels, validation_split=0.4, epochs=3)

In [None]:
# First weight array of the model's first layer
# (presumably the Embedding layer's weight matrix — confirm `model` is still the LSTM network).
word_embds = model.layers[0].get_weights()[0]

In [None]:
# Keys of `counter`, in its iteration order
# NOTE(review): presumably meant to line up with the embedding rows plotted below — confirm.
word_list = [word for word, _count in counter.items()]

In [None]:
# Reduce the embedding matrix to two dimensions with t-SNE and scatter the
# first `number_of_words` points, using the word list as hover text.
number_of_words = 1000
X_embedded = TSNE(n_components=2).fit_transform(word_embds)
shown_points = X_embedded[:number_of_words]

trace = go.Scatter(
    x=shown_points[:, 0],
    y=shown_points[:, 1],
    mode='markers',
    text=word_list[:number_of_words],
)
layout = {
    'title': 't-SNE 1 vs t-SNE 2 for first 1000 words ',
    'yaxis': {'title': 't-SNE 2'},
    'xaxis': {'title': 't-SNE 1'},
    'hovermode': 'closest',
}
fig = {'data': [trace], 'layout': layout}
py.iplot(fig)

Unsupervised Learning 

Implemented K-Means clustering with TF-IDF

In [None]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer over the "Text" column: English stop words removed, and
# only 9- and 10-word n-grams used as features.
# NOTE(review): ngram_range=(9, 10) is unusually long for clustering —
# confirm this is intended rather than e.g. unigrams/bigrams.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(9, 10))
# NOTE: this rebinds the notebook-level `X` used by the supervised models above.
X = vectorizer.fit_transform(df["Text"])


In [None]:
# Number of clusters to look for
true_k = 2
# With a single initialisation (n_init=1) the result depends entirely on the
# random starting centroids, so the seed is pinned (same value as the splits
# above) to make the clustering reproducible across runs.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1, random_state=42)
model.fit(X)

In [None]:
# For each cluster, feature indices sorted by descending centroid weight
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# Vocabulary terms, indexable by feature index. Newer scikit-learn releases
# provide get_feature_names_out() and no longer ship get_feature_names();
# older ones only have the latter, so use whichever is available.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

In [None]:
# Print the 20 highest-weighted terms of every cluster, one term per line.
# (The trailing comma after the header print was a Python 2 leftover; in
# Python 3 it only built a throwaway tuple, so it has been removed.)
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :20]:
        print('%s' % terms[ind])