In [None]:
from collections import Counter
import datetime
import numpy as np
import pandas as pd
import pickle
import itertools
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import classification_report,accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from textblob import TextBlob

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

In [None]:
from sklearn.feature_extraction import text 

In [None]:
def import_reviews(filename):
    """Load a CSV file into a DataFrame and discard incomplete rows.

    A column named 'Unnamed: 0' is removed when present, then every row that
    contains at least one null value is dropped.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; surviving rows keep their original index labels.
    """
    reviews = pd.read_csv(filename)
    leftover_index = 'Unnamed: 0'
    if leftover_index in reviews.columns:
        reviews = reviews.drop(columns=leftover_index)
    return reviews.dropna()

In [None]:
# Load the raw reviews and tokenize them (ngram=1: unigram tokens only, no n-grams added).
# NOTE(review): text_preprocessing is defined in a later cell, so on a fresh kernel
# that cell has to be executed before this one.
df = import_reviews('in_n_out_reviews')

df_text = text_preprocessing(df,1)

In [None]:
# Sanity check: evaluates to True if any value in the processed frame is null.
df_text.isnull().values.any()

In [None]:
# Column dtypes and non-null counts of the processed frame.
df_text.info()

In [None]:
# Write the tokenized frame to the file 'tokenized_text' so later cells can reload it.
df_text.to_csv('tokenized_text')

In [None]:
#Processes text to be used in topic modeling. Tokenizes and adds n-grams. Also adds the length
# of the review as a column.
def text_preprocessing(df, ngram):
    """Tokenize review text and derive the columns used for modeling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column of strings. When a 'useful' column is
        present it is binarized (1 if the value is > 0, otherwise 0); frames
        without it are processed the same way minus that step.
    ngram : int
        When greater than 1, n-grams of that size (built with ``find_ngrams``)
        are appended to the unigram tokens.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left untouched. The index is
        renumbered 0..n-1 *before* short reviews are removed, so it can contain
        gaps. Added columns: 'tokens', 'length', 'modeling_text_list',
        'modeling_text' and, for ``ngram > 1``, 'grams'.
    """
    # reset_index(drop=True) returns a new frame, so the column assignments below
    # never write into the caller's DataFrame (or into a slice of another frame).
    df_text = df.reset_index(drop=True)
    df_text['tokens'] = df_text['text'].apply(simple_preprocess)

    # Keep only reviews with more than 5 tokens. The explicit copy makes the
    # filtered result an independent frame, avoiding SettingWithCopyWarning.
    df_text = df_text[df_text['tokens'].str.len() > 5].copy()

    if 'useful' in df_text.columns:
        df_text['useful'] = (df_text['useful'] > 0).astype(int)
    df_text['length'] = df_text['text'].apply(len)

    if ngram > 1:
        df_text['grams'] = df_text['tokens'].apply(lambda toks: find_ngrams(toks, n=ngram))
        # Element-wise list concatenation: unigrams followed by the n-grams.
        df_text['modeling_text_list'] = df_text['tokens'] + df_text['grams']
    else:
        # Always provide the list column so downstream code can rely on it
        # regardless of the ngram setting.
        df_text['modeling_text_list'] = df_text['tokens']
    df_text['modeling_text'] = df_text['modeling_text_list'].apply(lambda parts: ' '.join(parts))
    return df_text

In [None]:
#Saves the results of my modeling to a csv
def save_metrics(metric_list, df=None):
    """Write model metrics to the file 'metrics_dataframe' in CSV format.

    Parameters
    ----------
    metric_list : list of dict
        One dictionary of metrics per fitted model.
    df : pandas.DataFrame, optional
        Previously saved metrics. When given, the new rows are added after its
        rows; when omitted, only ``metric_list`` is written.

    Returns
    -------
    bool
        Always True.
    """
    new_rows = pd.DataFrame(metric_list)
    if df is None:
        combined = new_rows
    else:
        # Concatenation returns a new frame, so its result must be the thing
        # that gets written; the input frame itself is never modified.
        combined = pd.concat([df, new_rows], ignore_index=True)
    combined.to_csv('metrics_dataframe')
    return True

In [None]:
# Reload previously saved intermediate CSVs. These files were overwritten as they were
# updated, so the original versions no longer exist.
# NOTE(review): a later cell writes df_text_user to 'user_info_subset', replacing the
# file read into df_user here; re-running the notebook then loads a different table.
df_topic_probs = import_reviews('lda_doc_topic_probs')

# Rebinds `df` (previously the raw reviews) to the tokenized frame saved earlier.
df = import_reviews('tokenized_text')

df_reviews = import_reviews('in_n_out_reviews')

df_user = import_reviews('user_info_subset')

In [None]:
#Trying to match up reviewer data with the review that they wrote. This was to add more features to model on.
# Column labels must be a flat list: a nested list would build a MultiIndex, and the
# 'user_id' merge key below could no longer be matched as an ordinary column.
df_user.columns = ['useful_sum', 'review_count', 'yelping_since', 'cool_sum', 'funny_sum',
       'compliment_cool', 'elite', 'user_id']

# Left merge: every review is kept; reviews without a matching user get nulls.
df_reviews_user = pd.merge(df_reviews, df_user, how='left', on=['user_id'], sort=False)

# .copy() so the selection is an independent frame rather than a view of the merge result.
# NOTE(review): this selection has no 'useful' column - confirm text_preprocessing
# accepts a frame without it.
df_user_with_text = df_reviews_user[['stars', 'text','useful_sum', 'review_count', 'yelping_since',
                                       'cool_sum', 'funny_sum', 'compliment_cool', 'elite']].copy()

df_tokenized_user = text_preprocessing(df_user_with_text, 1)

# NOTE(review): joining on the review text yields extra rows whenever the same text
# occurs more than once in df_tokenized_user.
df_text_user = pd.merge(df, df_tokenized_user, how='left', on=['text'], sort=False)

In [None]:
# The two frames came out with different lengths, which suggested the merge was not
# behaving as intended. NOTE(review): a left merge on 'text' grows the result when a
# text value is duplicated in the right-hand frame - likely the cause; verify.
len(df), len(df_text_user)

In [None]:
#Converting the day the user joined into the age of the account in days.
def days_old(date, now=None):
    """Return the number of whole days between ``date`` and ``now``.

    Parameters
    ----------
    date : str
        Join date formatted as "YYYY-MM-DD". A missing value (None / NaN) gives
        NaN instead of raising, so the function can be applied to a column
        that contains nulls.
    now : datetime.datetime, optional
        Naive reference time. Defaults to the current UTC time; pass a fixed
        value to make the result reproducible.

    Returns
    -------
    int or float
        Whole days elapsed, or NaN for a missing ``date``.
    """
    if pd.isnull(date):
        return float('nan')
    if now is None:
        # Naive UTC "now"; same value as the deprecated datetime.utcnow().
        now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    joined = datetime.datetime.strptime(date, "%Y-%m-%d")
    return (now - joined).days
# Account age in days for every row of the merged review/user frame.
df_text_user['account_age'] = df_text_user['yelping_since'].apply(days_old)

In [None]:
# Same account_age column (in days) on the tokenized user frame.
df_tokenized_user['account_age'] = df_tokenized_user['yelping_since'].apply(days_old)

In [None]:
# Save the user data matched with the reviews.
df_tokenized_user.to_csv('my_lst_hope')

# NOTE(review): 'user_info_subset' is also the file loaded into df_user earlier, so this
# write replaces that input with the merged frame.
df_text_user.to_csv('user_info_subset')

In [None]:
# Compare row counts of the topic-probability frame and the merged frame.
len(df_topic_probs), len(df_text_user)

In [None]:
# #Attempting to include POS as a feature! Doesn't work very well

# adjective = set(["JJ", "JJR", 'JJS'])
# adverb = set(["RB", "RBR", 'RBS'])
# adj_count = []
# adv_count = []
# for i in range(len(df)):
#     adj, adv, pn = 0, 0, 0
#     pos_list = TextBlob(df.iloc[i]['modeling_text']).pos_tags
#     temp_dict = Counter(pos_list[1])
#     for key in temp_dict:
#         value = temp_dict[key]
#         if key in adjective:
#             adj += value
#         elif key in adverb:
#             adv += value
#     adj_count.append(adj)
#     adv_count.append(adv)
# df_topic_probs['adjective'] = adj_count
# df_topic_probs['adv_count'] = adv_count

In [None]:
# df_topic_probs.groupby('useful')['adv_count', 'proper_n'].describe()

In [None]:
# Write the topic-probability frame back to the file it was loaded from.
df_topic_probs.to_csv('lda_doc_topic_probs')

In [None]:
# Add a TextBlob sentiment polarity for each review.
# NOTE(review): the assignment aligns on index labels between `df` and df_topic_probs;
# labels present only in df_topic_probs receive NaN.
df_topic_probs['polarity'] = df['modeling_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
# Moves the current index into a column and renumbers the rows.
# NOTE(review): in-place and not idempotent - re-running the cell changes the frame again.
df_topic_probs.reset_index(inplace=True)

In [None]:
# Compare row counts of the two frames.
len(df_topic_probs), len(df['length'])

In [None]:
# Rows that did not receive a polarity value.
df_test = df_topic_probs[df_topic_probs['polarity'].isnull()]

In [None]:
df_test

In [None]:
# Drop every row that contains a null in any column (including the rows without polarity).
df_topic_probs = df_topic_probs.dropna()

In [None]:
len(df_topic_probs)

In [None]:
def find_ngrams(input_list, n):
    """Build the n-grams of a token list, each joined with underscores.

    The list is paired with n successively shifted copies of itself; zipping
    them yields every run of n consecutive tokens. If the list has fewer than
    n tokens (or n is 0) the result is empty.

    Courtesy http://locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/
    """
    shifted = [input_list[offset:] for offset in range(n)]
    return ['_'.join(window) for window in zip(*shifted)]

In [None]:
# Preview the first rows of the processed review frame.
df_text.head()

In [None]:
# Stop words: NLTK's English list extended with extra terms chosen for this review corpus.
stopword = set(stopwords.words('english')).union({
    'food', 'this', 'place', 'the', 'of', 'is', 'came', 'was', 'for', 'have', 'had',
    'and', 'get', 'one', 'food', 'guy', '?', '!', 'place', 'good', 'fries', 'burger', 'burgers',
    'got', 'eat', 'great', 'us', 'asked', 'service', 'back', 'time', 'like', 'vegas', 'go',
    'try', 'animal', 'style', 'double', 'good', 'just', 'always', 'location', 'fresh',
    'east', 'coast', 'order', 'ordered', 'fast',
})

In [None]:
#Creating count vectors
# Build the vocabulary from 'modeling_text', which text_preprocessing always creates;
# 'modeling_text_list' only exists when n-grams were requested (ngram > 1), so reading
# it here raised a KeyError for the unigram run. Splitting the space-joined text
# recovers the same tokens.
vocabulary = set(itertools.chain.from_iterable(df_text['modeling_text'].str.split()))
# stop_words is passed as a list: scikit-learn documents 'english', a list, or None.
vectorizer = CountVectorizer(vocabulary=vocabulary, stop_words=sorted(stopword))

In [None]:
# Empty list that collects one metrics dictionary per fitted model.
metric_list = []

In [None]:
"""Creates train and test data with the count vectors to be used for Multinomial Naive Bayes modelling"""
# NOTE(review): the training code below is commented out, yet later cells use `nb` and
# the train/test splits it creates; those cells only run if this code was executed
# earlier in the session.
# X_train, X_test, y_train, y_test = train_test_split(df_text['modeling_text'], df_text['useful'], test_size=0.3, random_state = 15)

# # Create X, y vectors
# X_train = vectorizer.fit_transform(X_train).todense()

# X_test = vectorizer.transform(X_test).todense()

# # Create, train model
# nb = MultinomialNB()
# nb.fit(X_train, y_train)

In [None]:
# Positive-class probabilities from the Multinomial NB model and the ROC points built from them.
# NOTE(review): `nb` is only created by the commented-out training code in the previous cell.
y_scoreMNB=nb.predict_proba(X_test)[:,1]
fpr_MNB, tpr_MNB,_ = roc_curve(y_test, y_scoreMNB)

In [None]:
# MultinomialNB: hard predictions plus train and test scores from nb.score.
y_prednb = nb.predict(X_test)
scores = nb.score(X_test, y_test)
nb.score(X_train,y_train), scores

In [None]:
#Saving results to a dictionary
# Multinomial NB is fitted on count vectors, not topic features, so the topic count is 0
# (the argument was previously left empty, which is a SyntaxError).
# nb.score() reports accuracy for a plain classifier; the AUC fields are therefore
# computed explicitly from the positive-class probabilities.
metrics_MNB = metrics_to_dict('MultinomialNB', 0, y_test, nb.predict(X_test),
                              roc_auc_score(y_train, nb.predict_proba(X_train)[:, 1]),
                              roc_auc_score(y_test, nb.predict_proba(X_test)[:, 1]),
                              'Null', 0)

In [None]:
# Row count of the topic-probability frame.
len(df_topic_probs)

In [None]:
#Adding results to my existing results dataframe
df_metric_list.append(pd.DataFrame(metrics_MNB, index=range(1))).to_csv('metrics_dataframe')

In [None]:
#Create a dataframe to store gridsearch results
df_metric_list = pd.read_csv('metrics_dataframe')

In [None]:
# Read in LSI topic probabilities, then copy review length and polarity over for modelling.
# NOTE(review): df_topic_probs_lda is not assigned anywhere in the visible notebook;
# presumably it is the LDA frame that `df_topic_probs` held before being rebound
# here - confirm and define it explicitly.
df_topic_probs = pd.read_csv('lsi_topic_probs')
df_topic_probs['length'] = df_topic_probs_lda['length']
df_topic_probs['polarity'] = df_topic_probs_lda['polarity']

In [None]:
# Show the first row to check the columns.
df_topic_probs.head(1)

In [None]:
# Separates X and y for modeling and creates train/test sets (70/30 split, fixed seed).
# Features: the columns named '3', '1', '2' plus review length.
# NOTE(review): 'polarity' is added to the frame above but is not part of X.
X = df_topic_probs[['3', '1', '2', 'length']]
y = df_topic_probs_lda['useful']
# NOTE(review): scaling is applied to the full data set before the split, so the test
# rows influence the scaling statistics.
X = scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 15)

In [None]:
# Sizes of the resulting splits.
len(X_train), len(X_test), len(y_train)

In [None]:
# GaussianNB for predicting with length and n-topics as features.
# The grid has a single candidate (priors=None); GridSearchCV is used for the 5-fold
# cross-validation and the roc_auc scoring.
params = {'priors' : [None]}
gb = GridSearchCV(GaussianNB(),params, scoring='roc_auc', cv=5)
gb.fit(X_train, y_train)
y_predgb = gb.predict(X_test)
gb_scores = gb.score(X_test, y_test)

In [None]:
# Positive-class probability for each test row.
y_scoregb=gb.predict_proba(X_test)[:,1]

In [None]:
# Full probability matrix (both classes), as expected by change_threshold.
y_probgb = gb.predict_proba(X_test)

In [None]:
# ROC points for the Gaussian NB model.
fpr, tpr,_ = roc_curve(y_test, y_scoregb)

In [None]:
# Shows the test score and prints the classification report at a 0.55 threshold
# (the second tuple element is the None returned by print).
gb_scores, print(change_threshold(y_probgb, 0.55))

In [None]:
# Add the Gaussian NB results to the metrics list.
n=3
metric_list.append(metrics_to_dict('GaussianNB', n, y_test, gb.predict(X_test),
                                   gb.score(X_train, y_train), gb.score(X_test, y_test), "Null",1))

In [None]:
#Grid search for logistic regression
log_params = {'C': [ 1e-2,1e-1, 1,10,100],
             'penalty': ['l1','l2']}
# The grid contains both 'l1' and 'l2'. The liblinear solver supports both penalties,
# whereas the current default solver rejects 'l1', which would make half of the
# grid candidates fail to fit.
log = GridSearchCV(LogisticRegression(solver='liblinear'), log_params, scoring='roc_auc', cv = 5)
log.fit(X_train,np.ravel(y_train))
# Probability matrix for both classes; column 1 is the positive class.
y_problog = log.predict_proba(X_test)
y_scorelog = y_problog[:, 1]
coefficients = list(log.best_estimator_.coef_)
fpr_log, tpr_log,_ = roc_curve(y_test, y_scorelog)
roc_auc = auc(fpr_log, tpr_log)
log.score(X_train, y_train), log.score(X_test, y_test), log.best_params_

In [None]:
# Show the refitted best logistic regression model.
log.best_estimator_

In [None]:
# Observing precision with different thresholds (here 0.8).
print(change_threshold(y_problog, 0.8))

In [None]:
# Add logistic regression metrics to the list (n = number of topics, last argument = polarity flag).
n=3
metric_list.append(metrics_to_dict('Logistic Regression(lsi)', n, y_test,
                                   log.predict(X_test), log.score(X_train, y_train), log.score(X_test, y_test),
                                   log.best_params_, 1))

In [None]:
# Gradient boosting grid search: 3 x 3 x 3 parameter combinations, 5-fold CV, scored by roc_auc.
gb_param = {'n_estimators' : [25,100,200],
        'max_depth' : [2,3,10],
        'learning_rate': [1e-2, 1e-1, 1]}
gradboost = GridSearchCV(GradientBoostingClassifier(), gb_param,scoring = 'roc_auc', cv= 5)
gradboost.fit(X_train, y_train)
y_score_gboost=gradboost.predict_proba(X_test)[:,1]
y_prob_gboost = gradboost.predict_proba(X_test)
fpr_gboost, tpr_gboost,_ = roc_curve(y_test, y_score_gboost)
# Rebinds roc_auc, which previously held the logistic regression value.
roc_auc = auc(fpr_gboost, tpr_gboost)

gradboost.score(X_train, y_train), gradboost.score(X_test, y_test), gradboost.best_params_

In [None]:
# NOTE(review): reuses the name y_predgb, which earlier held the GaussianNB predictions.
y_predgb = gradboost.predict(X_test)

In [None]:
# Observing precision with different thresholds (here 0.5).
print(change_threshold(y_prob_gboost, 0.5))

In [None]:
# Add gradient boosting metrics to the list.
n = 3
metric_list.append(metrics_to_dict('Gradient Boosting(lsi)', n, y_test, gradboost.predict(X_test),
                                   gradboost.score(X_train, y_train), gradboost.score(X_test, y_test),
                                   gradboost.best_params_, 1))

In [None]:
#Saving results
# save_metrics takes the new rows and the previously saved metrics frame; the second
# argument was missing from this call.
save_metrics(metric_list, df_metric_list)

In [None]:
#Returns the classification report for different thresholds, n.
def change_threshold(y_prob, n, y_true=None):
    """Classification report obtained by thresholding positive-class probabilities.

    Parameters
    ----------
    y_prob : array-like with two columns per row
        Class probabilities; column 1 is compared against the threshold.
    n : float
        Decision threshold. A row is predicted 1 when its column-1 value is >= n.
    y_true : array-like, optional
        True labels. Defaults to the notebook-level ``y_test`` so existing
        two-argument calls keep working.

    Returns
    -------
    The value returned by ``classification_report(y_true, y_pred)``.
    """
    if y_true is None:
        # Backward-compatible fallback to the global test labels.
        y_true = y_test
    y_pred = (np.asarray(y_prob)[:, 1] >= n).astype(int)
    return classification_report(y_true, y_pred)

In [None]:
#Returns a dictionary of metrics for each algorithm
def metrics_to_dict(algorithm, n_topics, y_test, y_pred, auc_train, auc_test, params, sa):
    """Collect the evaluation results of one model in a dictionary.

    Parameters
    ----------
    algorithm : label stored under 'Algorithm'.
    n_topics : value stored under 'Number of Topics'.
    y_test, y_pred : true and predicted labels used for precision, recall and F1.
    auc_train, auc_test : scores stored under 'Auc_train' / 'Auc_test'.
    params : value stored under 'Parameters'.
    sa : value stored under 'Polarity'; any falsy value is stored as 0.

    Returns
    -------
    dict
        Keys: 'Algorithm', 'Parameters', 'Number of Topics', 'Polarity',
        'Auc_train', 'Auc_test', 'Precision', 'Recall', 'F1'.
    """
    return {
        'Algorithm': algorithm,
        'Parameters': params,
        'Number of Topics': n_topics,
        'Polarity': sa or 0,
        'Auc_train': auc_train,
        'Auc_test': auc_test,
        'Precision': metrics.precision_score(y_test, y_pred),
        'Recall': metrics.recall_score(y_test, y_pred),
        # F1 used to be computed and then discarded; it is now part of the result.
        'F1': metrics.f1_score(y_test, y_pred),
    }

In [None]:
#Making AUC curve for all the algorithms I used.
plt.figure(figsize=[10,8])
# Chance diagonal, drawn once as the reference line (it was repeated before every curve).
plt.plot([0,1],[0,1], linestyle='--', label='Chance')

# One ROC curve per model; logistic regression is highlighted in red.
plt.plot(fpr,tpr, label='Gaussian Naive Bayes', color = 'grey')
plt.plot(fpr_MNB,tpr_MNB, label='Multinomial Naive Bayes', color = 'grey')
plt.plot(fpr_log,tpr_log, label='Logistic Regression', color = 'red')
plt.plot(fpr_gboost,tpr_gboost, label='Gradient Boosting', color = 'grey')

plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curves by algorithm')
plt.legend();

In [None]:
# Non-null counts per column for each value of 'useful' (class balance check).
df_text.groupby('useful').count()

In [None]:
# Sorting by AUC scores to see which algorithm performed the best and with which hyperparameters.
pd.DataFrame(metric_list).sort_values('Auc_test', ascending = False)

In [None]:
# Same ranking for the metrics loaded from disk, with nulls replaced by 0.
df_metric_list.fillna(0).sort_values('Auc_test', ascending = False)

In [None]:
# Making confusion matrix: display names for the two classes, in label order 0, 1.
class_names = ['Not Useful','Useful']
# NOTE(review): itertools, numpy and pyplot are already imported in the first cell;
# svm and datasets are not used in the visible notebook.
import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    With `normalize=True` every row is divided by its row sum before printing
    and plotting. `classes` supplies the tick labels for both axes; each cell is
    annotated with its value, in white on cells above half the maximum and in
    black otherwise.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        banner = "Normalized confusion matrix"
    else:
        banner = 'Confusion matrix, without normalization'
    print(banner)

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    midpoint = cm.max() / 2.
    # Row-major walk over the matrix; x is the column, y is the row.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True')
    plt.xlabel('Predicted')

In [None]:
# Hard class predictions from the tuned logistic regression model.
y_pred = log.predict(X_test)

In [None]:
# Creating confusion matrix for the logistic regression predictions and plotting raw counts.
from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion Matrix')