# Supervised Models

In [1]:
import itertools
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier

import helper_functions as helper

%matplotlib inline

In [2]:
df_engineered = pd.read_csv('df_engineered.csv')

In [3]:
df_reviews_final = pd.read_csv('final_df.csv')

In [4]:
df_final = pd.concat([df_engineered,df_reviews_final],axis=1)

In [5]:
df_final.dropna(inplace=True)

In [6]:
# Keep the review text, the engineered numeric features and the target.
# Each column is listed exactly once: repeating 'review_length' creates
# duplicate column labels, and every later selection of that label would
# then return all of the copies.
df_final = df_final[['reviews', 'review_length', 'word_count',
                     'exclamation_count', 'question_count', 'overall']]

In [7]:
from sklearn.model_selection import train_test_split

# Features: review text plus the engineered counts, each listed once so the
# feature frame does not carry a duplicated 'review_length' column.
X = df_final[['reviews', 'review_length', 'word_count',
              'exclamation_count', 'question_count']]
# Target: the 'overall' column.
y = df_final['overall']
# Fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
X_engineered = X_train[['review_length','word_count',
              'exclamation_count','question_count',
                   'review_length']]

In [None]:
X_test_engineered =X_test[['review_length','word_count',
              'exclamation_count','question_count',
                   'review_length']]

### Vectorize

In [None]:
## TF-IDF

In [None]:
# Fit the TF-IDF vocabulary on the training reviews only and vectorize them.
tf_idf = TfidfVectorizer(
    analyzer='word',
    # a token is a run of word characters containing at least one letter a-z (case-insensitive)
    token_pattern = '(?ui)\\b\\w*[a-z]+\\w*\\b',
    min_df=2,            # ignore terms seen in fewer than 2 documents
    max_df=0.5,          # ignore terms seen in more than half of the documents
    max_features=10000,  # cap the vocabulary size
)
tf_idf_vectors = tf_idf.fit_transform(X_train['reviews'])

In [None]:
vector_df = pd.DataFrame(tf_idf_vectors.toarray(), columns=tf_idf.get_feature_names())

In [None]:
# vectorize the test set
tfidf_test = tf_idf.transform(X_test['reviews'])

In [None]:
vector_df_test = pd.DataFrame(tfidf_test.toarray(), columns=tf_idf.get_feature_names())

In [None]:
## Bag of words

In [8]:
# Raw term counts, configured with the same limits as the TF-IDF vectorizer.
bow = CountVectorizer(
    analyzer='word',
    # a token is a run of word characters containing at least one letter a-z (case-insensitive)
    token_pattern = '(?ui)\\b\\w*[a-z]+\\w*\\b',
    min_df=2,            # ignore terms seen in fewer than 2 documents
    max_df=0.5,          # ignore terms seen in more than half of the documents
    max_features=10000,  # cap the vocabulary size
)

In [9]:
bow_vectors = bow.fit_transform(X_train['reviews'])

In [10]:
bow_df =pd.DataFrame(bow_vectors.toarray(), columns=bow.get_feature_names())

In [11]:
bow_vectors_test = bow.transform(X_test['reviews'])

In [12]:
bow_test_df = pd.DataFrame(bow_vectors_test.toarray(), columns=bow.get_feature_names())

In [None]:
def get_top_n_words(corpus, n=None):
    """
    Rank the vocabulary of a text corpus by total number of occurrences.

    corpus -- iterable of documents (strings) to count words in.
    n      -- how many of the most frequent words to return; None returns all.

    Returns a list of (word, count) tuples sorted by count, highest first.

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vectorizer = CountVectorizer()
    vectorizer.fit(corpus)
    # Column-wise sum of the document-term matrix = total count per vocabulary entry.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
# Ten most frequent words in the training reviews, as (word, count) tuples.
features = get_top_n_words(X_train['reviews'], n=10)

In [None]:
# Two positional columns: 0 holds the word, 1 holds its count.
features_df = pd.DataFrame(features)

In [None]:
# Display the table of top words.
features_df

In [None]:
# Bar chart of the top words (column 0 = word on x, column 1 = count on y).
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=features_df, x=0, y=1)
ax.set(title='Most Common Words in Corpus', xlabel='Words', ylabel='Count')
plt.tight_layout()
# Written next to the notebook; matplotlib picks the default image format.
plt.savefig('Most Common Words')

In [None]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) -> 
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [None]:
features2 = get_top_n_words(X_train['reviews'], n=10)

In [None]:
features2_df = pd.DataFrame(features2)

In [None]:
# Bar chart built from features2_df (column 0 on x, column 1 on y).
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=features2_df, x=0, y=1)
ax.set(title='Most Common Bigrams in Corpus', xlabel='Bigrams', ylabel='Count')
plt.tight_layout()
# Written next to the notebook; matplotlib picks the default image format.
plt.savefig('Most Common Bigrams')

### SVD

In [None]:
# Reduce the TF-IDF matrix to 100 latent components (LSA).
# TruncatedSVD uses a randomized solver by default, so the seed is fixed
# (same value as the train/test split) to make the components reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
LSA = svd.fit_transform(tf_idf_vectors)
# LSA = Normalizer(copy=False).fit_transform(tf_idf_vectors)

In [None]:
# Sanity check: dimensions of the reduced training matrix.
LSA.shape

In [None]:
# Wrap the training components in a DataFrame (positional column labels).
df_LSA = pd.DataFrame(LSA)

In [None]:
# Peek at the first rows of the training components.
df_LSA.head()

In [None]:
# Total fraction of variance captured by the retained components.
svd.explained_variance_ratio_.sum()

In [None]:
# transform test set too (reuses the SVD fitted on the training matrix)
LSA_test = svd.transform(tfidf_test)

In [None]:
# Sanity check: dimensions of the reduced test matrix.
LSA_test.shape

In [None]:
# Wrap the test components in a DataFrame (positional column labels).
LSA_test_df = pd.DataFrame(LSA_test)

In [None]:
## concatenate the LSA components with the engineered features of the original dataframe (train set)

In [None]:
X_engineered.reset_index(inplace=True)

In [None]:
df_train = pd.concat([df_LSA,X_engineered], axis=1, ignore_index=True)

In [None]:
## concatenate vectorized TEST dataframe to OG TEST dataframe with engineered feats.

In [None]:
X_test_engineered.reset_index(inplace=True)

In [None]:
df_test = pd.concat([LSA_test_df,X_test_engineered], axis=1)

In [None]:
df_test.shape

## Modeling

### Dummy classifier

In [None]:
# dummy without SVD
dum_clf = DummyClassifier()
dum_model = dum_clf.fit(vector_df,y_train)
y_hat_test = dum_model.predict(vector_df_test)

In [None]:
print(accuracy_score(y_hat_test,y_test))

In [None]:
## dummy with LSA/SVD
dum_clf_LSA = DummyClassifier()
dum_model_LSA = dum_clf_LSA.fit(df_LSA,y_train)
y_hat_test_LSA = dum_model_LSA.predict(LSA_test_df)

In [None]:
print(accuracy_score(y_hat_test_LSA,y_test))

In [None]:
## dummy with LSA/SVD AND engineered feats.

In [None]:
df_train.shape

In [None]:
y_train.shape

In [None]:
dum_clf_eng = DummyClassifier()
dum_model_eng = dum_clf_eng.fit(df_train,y_train)
y_hat_test_eng = dum_model_eng.predict(df_test)

In [None]:
print(accuracy_score(y_hat_test_eng,y_test))

### Decision Tree

In [None]:
## non-SVD DT
dt_clf = DecisionTreeClassifier()
dt_clf_model_noSVD = dt_clf.fit(vector_df,y_train)
y_hat_test_DT = dum_model.predict(vector_df_test)

In [None]:
print(accuracy_score(y_hat_test_DT,y_test))

In [None]:
## SVD DT
dt_clf = DecisionTreeClassifier()

In [None]:
dt_clf_model = dt_clf.fit(df_LSA,y_train)

In [None]:
y_hat_DT = dt_clf_model.predict(LSA_test_df)

In [None]:
print(accuracy_score(y_hat_DT,y_test))

In [None]:
## SVD + engineered DT

In [None]:
dt_clf = DecisionTreeClassifier()
dt_clf_model_eng = dt_clf.fit(df_train,y_train)
y_hat_test_eng = dt_clf_model_eng.predict(df_test)

In [None]:
print(accuracy_score(y_hat_test_eng,y_test))

### Naive Bayes

In [None]:
# NB with tfidf
NB = MultinomialNB()
NB_model = NB.fit(vector_df, y_train)

In [None]:
y_hat_test_NB = NB.predict(vector_df_test)

In [None]:
print(accuracy_score(y_hat_test_NB,y_test))

In [None]:
# Confusion matrix of the TF-IDF Naive Bayes predictions against the true test labels.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_hat_test_NB)

plt.figure()
# Drawing is delegated to the project helper; the class labels passed are the ratings 1-5.
helper.plot_confusion_matrix(
    cnf_matrix,
    classes=[1,2,3,4,5],
    normalize=True,
    title='Naive Bayes Confusion Matrix',
)
plt.show()

In [None]:
def plot_AUC_ROC(y_score,fpr,tpr):
    """Print the area under a ROC curve and draw the curve.

    y_score -- accepted for the caller's convenience; not used by this function.
    fpr     -- false positive rates of the ROC points (x values).
    tpr     -- true positive rates of the ROC points (y values).

    Side effects: changes the global seaborn style, prints the AUC and shows
    a matplotlib figure. Returns None.
    """
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})
    print('AUC: {}'.format(auc(fpr, tpr)))

    line_width = 2
    # Tick marks every 0.05 from 0 to 1, shared by both axes.
    ticks = [step/20.0 for step in range(21)]

    plt.figure(figsize=(10,8))
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label='ROC curve')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.yticks(ticks)
    plt.xticks(ticks)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
## NB with bag of words

In [13]:
NB = MultinomialNB()
NB_model = NB.fit(bow_vectors, y_train)

In [14]:
y_hat_test_NB_bow = NB.predict(bow_test_df)
print(accuracy_score(y_hat_test_NB_bow,y_test))

0.4630192017982257
