# Supervised Models

This notebook tests a variety of classification algorithms on our finalized dataset of reviews.

In [31]:
import itertools
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier

import helper_functions as helper

In [32]:
# load both input CSVs into dataframes
df_engineered = pd.read_csv(filepath_or_buffer='df_engineered.csv')  # engineered features
df_reviews = pd.read_csv(filepath_or_buffer='final_df.csv')  # review corpus

In [33]:
# place the four engineered columns side by side with the review frame
# (axis=1 aligns rows on the index of each frame)
df_final = pd.concat(
    [
        df_engineered[['review_length', 'word_count', 'exclamation_count', 'question_count']],
        df_reviews,
    ],
    axis=1,
)

In [34]:
# discard every row that has a missing value in any column
df_final = df_final.dropna()

In [35]:
df_final= df_final[['reviews','review_length',
                    'word_count','exclamation_count','question_count',
                   'review_length','overall']]

In [36]:
# split data into train and test set
X = df_final[['reviews','review_length','word_count',
              'exclamation_count','question_count',
                   'review_length']]
y = df_final['overall']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [37]:
X_engineered = X_train[['review_length','word_count',
              'exclamation_count','question_count',
                   'review_length']]

In [38]:
X_test_engineered = X_test[['review_length','word_count',
              'exclamation_count','question_count',
                   'review_length']]

## Vectorize

#### TF-IDF

In [39]:
# learn the TF-IDF vocabulary from the training reviews and vectorize them
tf_idf = TfidfVectorizer(
    analyzer='word',
    token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
    min_df=2,
    max_df=0.5,
    max_features=10000,
)
tf_idf_vectors = tf_idf.fit_transform(raw_documents=X_train['reviews'])

In [40]:
# put into dataframe so we can run it through a classification algorithm
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back to the old name otherwise.
vector_df = pd.DataFrame(
    tf_idf_vectors.toarray(),
    columns=(getattr(tf_idf, 'get_feature_names_out', None) or tf_idf.get_feature_names)(),
)

In [41]:
# vectorize the test reviews with the vocabulary already fitted on the train set
tfidf_test = tf_idf.transform(raw_documents=X_test['reviews'])

In [42]:
# put into dataframe
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back to the old name otherwise.
vector_df_test = pd.DataFrame(
    tfidf_test.toarray(),
    columns=(getattr(tf_idf, 'get_feature_names_out', None) or tf_idf.get_feature_names)(),
)

#### Bag of words

In [8]:
# bag-of-words (raw count) vectorizer, configured with the same limits as the TF-IDF one
bow = CountVectorizer(
    analyzer='word',
    token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
    min_df=2,
    max_df=0.5,
    max_features=10000,
)

In [9]:
# learn the bag-of-words vocabulary from the training reviews and count them
bow_vectors = bow.fit_transform(raw_documents=X_train['reviews'])

In [10]:
# put into dataframe
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back to the old name otherwise.
bow_df = pd.DataFrame(
    bow_vectors.toarray(),
    columns=(getattr(bow, 'get_feature_names_out', None) or bow.get_feature_names)(),
)

In [11]:
# count the test reviews against the vocabulary fitted on the train set
bow_vectors_test = bow.transform(raw_documents=X_test['reviews'])

In [12]:
# dense dataframe of the test-set counts, one column per vocabulary term
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back to the old name otherwise.
bow_test_df = pd.DataFrame(
    bow_vectors_test.toarray(),
    columns=(getattr(bow, 'get_feature_names_out', None) or bow.get_feature_names)(),
)

In [None]:
#### Visualize the top words and bigrams

In [None]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a text corpus by total number of occurrences.

    Words are extracted with a default CountVectorizer, so its tokenization
    rules (lower-casing, token pattern) decide what counts as a word.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count words in. It is read exactly once, so a
        one-shot iterator or generator is fine.
    n : int or None
        How many (word, count) pairs to return. None returns every word.

    Returns
    -------
    list of (word, count) tuples, highest count first. Words with equal
    counts keep the iteration order of the vectorizer's vocabulary_.

    Example
    -------
    get_top_n_words(["I love Python", "Python is a language programming",
                     "Hello world", "I love the world"], n=3)
    returns 'love', 'python' and 'world', each with a count of 2.
    """
    vec = CountVectorizer()
    # fit_transform builds the vocabulary and the count matrix in one pass;
    # a separate fit() then transform() would read the corpus twice and see
    # nothing the second time if it is a one-shot iterator
    bag_of_words = vec.fit_transform(corpus)
    # column sums = total occurrences of each vocabulary term over all documents
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [30]:
def get_top_n_bigrams(corpus, n=None):
    """
    List the top n bigrams (pairs of adjacent words) in a text corpus by
    total number of occurrences.

    Bigrams are extracted with CountVectorizer(ngram_range=(2, 2)), so its
    default tokenization rules decide what counts as a word.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count bigrams in. It is read exactly once.
    n : int or None
        How many (bigram, count) pairs to return. None returns every bigram.

    Returns
    -------
    list of (bigram, count) tuples, highest count first. Bigrams with equal
    counts keep the iteration order of the vectorizer's vocabulary_.

    Example
    -------
    get_top_n_bigrams(["hello big world", "hello big moon"], n=1)
    returns 'hello big' with a count of 2.
    """
    # ngram_range must be passed as a keyword with a (min_n, max_n) tuple;
    # (2, 2) keeps bigrams only
    vec = CountVectorizer(ngram_range=(2, 2))
    # build the vocabulary and the count matrix in a single pass over the corpus
    bag_of_words = vec.fit_transform(corpus)
    # column sums = total occurrences of each bigram over all documents
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
# ten most frequent words in the training reviews, then as a two-column frame
# (column 0 = word, column 1 = count)
top10 = get_top_n_words(corpus=X_train['reviews'], n=10)
top10_df = pd.DataFrame(data=top10)

In [None]:
# bar chart of the ten most common words (column 0 = word, column 1 = count)
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=top10_df, x=0, y=1)
ax.set(title='Most Common Words in Corpus', xlabel='Words', ylabel='Count')
plt.tight_layout()
plt.savefig('Most Common Words')

In [None]:
# ten most frequent bigrams in the training reviews, then as a two-column frame
# (column 0 = bigram, column 1 = count)
top10_bigrams = get_top_n_bigrams(corpus=X_train['reviews'], n=10)
top10_bigrams_df = pd.DataFrame(data=top10_bigrams)

In [None]:
# bar chart of the ten most common bigrams (column 0 = bigram, column 1 = count)
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=top10_bigrams_df, x=0, y=1)
ax.set(title='Most Common Bigrams in Corpus', xlabel='Bigrams', ylabel='Count')
plt.tight_layout()
plt.savefig('Most Common Bigrams')

### SVD

We tried using SVD to reduce our feature space so we could use it in conjunction with our engineered features. We did not have the computing power to merge the engineered features with the full TF-IDF dataframe.

In [None]:
# use SVD to reduce dimensionality of feature space
# TruncatedSVD's default solver is randomized, so fix random_state to get the
# same components (and the same downstream scores) on every run
svd = TruncatedSVD(n_components=100, random_state=42)
LSA = svd.fit_transform(tf_idf_vectors)

In [None]:
# wrap the reduced training matrix in a dataframe (default integer column labels)
df_LSA = pd.DataFrame(data=LSA)

In [None]:
# check how much of the variance is explained by the retained SVD components
svd.explained_variance_ratio_.sum()

In [None]:
# project the test TF-IDF vectors with the SVD already fitted on the train set
LSA_test = svd.transform(X=tfidf_test)

In [None]:
# wrap the reduced test matrix in a dataframe (default integer column labels)
df_LSA_test = pd.DataFrame(data=LSA_test)

In [None]:
## concatenate vectorized dataframe to dataframe of engineered features

In [None]:
X_engineered.reset_index(inplace=True)

In [None]:
df_SVD_train = pd.concat([df_LSA,X_engineered], axis=1, ignore_index=True)

In [None]:
## concatenate vectorized TEST dataframe to OG TEST dataframe with engineered feats.

In [None]:
X_test_engineered.reset_index(inplace=True)

In [None]:
df_SVD_test = pd.concat([df_LSA_test,X_test_engineered], axis=1)

## Modeling

### Dummy classifier

In [43]:
# dummy without SVD
# random_state makes the baseline reproducible when the classifier's strategy
# draws random predictions; it has no effect on deterministic strategies
dum_clf = DummyClassifier(random_state=42)
dum_model = dum_clf.fit(vector_df,y_train)
y_hat_test = dum_model.predict(vector_df_test)

In [44]:
# accuracy_score expects (y_true, y_pred); the score itself is the same either way
print(accuracy_score(y_test, y_hat_test))

0.2025553554064179


In [45]:
## dummy with LSA/SVD
# needs df_LSA / df_LSA_test from the SVD section, so run those cells first
# random_state makes the baseline reproducible when the classifier's strategy
# draws random predictions; it has no effect on deterministic strategies
dum_clf_2 = DummyClassifier(random_state=42)
dum_model_2 = dum_clf_2.fit(df_LSA,y_train)
y_hat_test_2 = dum_model_2.predict(df_LSA_test)

NameError: name 'df_LSA' is not defined

In [None]:
# accuracy_score expects (y_true, y_pred); the score itself is the same either way
print(accuracy_score(y_test, y_hat_test_2))

In [None]:
## dummy with LSA/SVD AND engineered feats.
# Fit and predict with this cell's own classifier. The earlier version created
# dum_clf_3 but then refitted dum_clf_2 and predicted with dum_model_2,
# silently overwriting the LSA-only baseline.
dum_clf_3 = DummyClassifier(random_state=42)
dum_model_3 = dum_clf_3.fit(df_SVD_train,y_train)
y_hat_test_3 = dum_model_3.predict(df_SVD_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the score itself is the same either way
print(accuracy_score(y_test, y_hat_test_3))

### Decision Tree

In [None]:
## Decision Tree without SVD
# random_state fixes the tree's internal randomness so the reported accuracy
# is the same on every run
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf_model = dt_clf.fit(vector_df,y_train)
y_hat_test_DT = dt_clf_model.predict(vector_df_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the score itself is the same either way
print(accuracy_score(y_test, y_hat_test_DT))

In [None]:
## Decision Tree with SVD
# random_state fixes the tree's internal randomness so the reported accuracy
# is the same on every run
dt_clf_2 = DecisionTreeClassifier(random_state=42)
dt_clf_model_2 = dt_clf_2.fit(df_LSA,y_train)
y_hat_test_DT_2 = dt_clf_model_2.predict(df_LSA_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the score itself is the same either way
print(accuracy_score(y_test, y_hat_test_DT_2))

In [None]:
## Decision Tree with SVD and engineered feats.
# random_state fixes the tree's internal randomness so the reported accuracy
# is the same on every run
dt_clf_3 = DecisionTreeClassifier(random_state=42)
dt_clf_model_3 = dt_clf_3.fit(df_SVD_train,y_train)
y_hat_test_DT_3 = dt_clf_model_3.predict(df_SVD_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the score itself is the same either way
print(accuracy_score(y_test, y_hat_test_DT_3))

### Naive Bayes

In [None]:
# multinomial Naive Bayes on the TF-IDF features
NB = MultinomialNB()
NB_model = NB.fit(X=vector_df, y=y_train)  # fit() returns the fitted estimator itself
y_hat_test_NB = NB_model.predict(vector_df_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the score itself is the same either way
print(accuracy_score(y_test, y_hat_test_NB))

In [None]:
# plot confusion matrix
cnf_matrix = confusion_matrix(y_test, y_hat_test_NB)

plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[1,2,3,4,5],normalize=True,
                      title='Naive Bayes Confusion Matrix')

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    plot_confusion_matrix(cm, classes,normalize=False,
                          title='Confusion matrix',cmap=plt.cm.Blues)
    
    """
    #Add Normalization Option
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig('confusion')

In [13]:
## NB with bag of words
# Fit and predict on the same representation: the sparse count matrices.
# Fitting on the sparse matrix but predicting on the dense DataFrame mixes an
# input without feature names with one that has them. The counts are the same
# values, so the predictions do not change.
NB = MultinomialNB()
NB_model = NB.fit(bow_vectors, y_train)
y_hat_test_NB_bow = NB_model.predict(bow_vectors_test)

In [14]:
# accuracy_score expects (y_true, y_pred); the score itself is the same either way
print(accuracy_score(y_test, y_hat_test_NB_bow))

0.4630192017982257


In [None]:
def plot_AUC_ROC(y_score,fpr,tpr):
    """
    Print the area under a ROC curve and plot the curve.

    y_score is accepted but not used by this function.
    fpr and tpr are the false-positive and true-positive rates to plot; they
    are also passed to auc() to compute the printed area.
    """
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})
    print('AUC: {}'.format(auc(fpr, tpr)))

    line_width = 2
    # tick marks every 0.05 from 0 to 1 on both axes
    ticks = [step / 20.0 for step in range(21)]

    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label='ROC curve')
    # dashed diagonal from (0, 0) to (1, 1) as a reference line
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.yticks(ticks)
    plt.xticks(ticks)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()