In [1]:
import boto3
import json
import pandas as pd
import numpy as np
import gensim
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re
np.random.seed(0)
from pandas.io.json import json_normalize
import io
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle
from scipy.stats import randint as sp_randint
from sklearn.metrics import confusion_matrix, roc_curve, auc



In [2]:
# Create the S3 client used to download the raw review file.
s3 = boto3.client('s3')

# Location of the data on S3.
bucket='yelpreviewsdata' # put your S3 bucket name here
prefix = 'data'
obj = s3.get_object(Bucket=bucket, Key=f'{prefix}/yelp_review-0000.csv')
df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding="iso-8859-15", low_memory=True)

# Keep only the rating and the review text. The explicit copy makes the
# column assignment below operate on an independent frame instead of a slice
# of the loaded one (avoids pandas' SettingWithCopyWarning).
df = df[['stars', 'text']].copy()

# Binarise the rating: ratings >= 2 become 1, everything else becomes 0.
# NOTE(review): with this threshold only ratings below 2 form class 0 -
# confirm that this is the intended positive/negative boundary.
df['stars'] = np.where(df['stars'] >= 2, 1, 0)
print(df.head())
print(df.shape)


   stars                                               text
0      1  Super simple place but amazing nonetheless. It...
1      1  Small unassuming place that changes their menu...
2      1  Lester's is located in a beautiful neighborhoo...
3      1  Love coming here. Yes the place always needs t...
4      1  Had their chocolate almond croissant and it wa...
(296227, 2)


In [3]:
def sampling_dataset(df, count=150000):
    """Return a class-balanced resample of ``df``.

    For every distinct value in ``df.stars`` exactly ``count`` rows are drawn
    with replacement (via the global ``np.random`` state), so a row can appear
    more than once and a class with fewer than ``count`` rows is oversampled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a ``stars`` column.
    count : int, default 150000
        Number of rows to draw per class.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all classes stacked in order of first appearance
        of each class. Rows keep their original index labels and the column
        dtypes of ``df`` are preserved. An empty ``stars``/``text`` frame is
        returned when ``df`` has no rows.
    """
    samples = []
    for c in df.stars.unique():
        class_indexes = df[df.stars == c].index
        random_indexes = np.random.choice(class_indexes, count, replace=True)
        samples.append(df.loc[random_indexes])

    if not samples:
        # Nothing to sample from: hand back an empty frame with the expected columns.
        return pd.DataFrame(columns=["stars", "text"])

    # A single concat of the per-class samples; seeding the result with an
    # empty frame would degrade the column dtypes to ``object``.
    class_df_sampled = pd.concat(samples, axis=0)

    # Keep "stars" and "text" as the leading columns, followed by any others.
    leading = [col for col in ("stars", "text") if col in class_df_sampled.columns]
    trailing = [col for col in class_df_sampled.columns if col not in leading]
    return class_df_sampled[leading + trailing]

# Balance the classes, then renumber the rows so the index is a plain 0..n-1
# range (the sampled rows otherwise carry repeated original index labels).
df = sampling_dataset(df).reset_index(drop=True)
print(df.head())
print(df.shape)

  stars                                               text
0     1  The VIP has one of the best chicken wings in t...
1     1  We so wanted this new restaurant and brew hous...
2     1  I wasn't sure if I'd like Pilates on the refor...
3     1  This place is awesome. Lots of great rolls at ...
4     1  Super delicious Hainanese chicken and rice joi...
(300000, 2)


In [4]:
# WordNet lemmatizer instance (not referenced by the functions shown in this section).
lmtzr = WordNetLemmatizer()
# Word tokenizer: runs of word characters. Written as a raw string so the
# backslash reaches the regex engine without relying on Python passing an
# unrecognised "\w" string escape through unchanged.
w = re.compile(r"\w+", re.I)

def label_sentences(df):
    """Turn every review in ``df`` into a tagged, tokenised document.

    Each row's ``text`` is lower-cased and split into word tokens with the
    module-level pattern ``w``. The document is tagged ``'SENT_<index label>'``
    using the row's index label, which is how the vectors are looked up again
    after training.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    list
        One ``LabeledSentence`` per row, in row order. Rows whose text is not
        a string (e.g. a missing value) yield an empty word list, so every row
        still gets a tag.

    NOTE(review): newer gensim releases provide this document class under the
    name ``TaggedDocument`` - confirm against the installed gensim version.
    """
    labeled_sentences = []
    # Iterate the index and the text column directly; iterrows() would build a
    # full Series object for every row.
    for index, text in zip(df.index, df["text"]):
        tokenized_words = re.findall(w, text.lower()) if isinstance(text, str) else []
        labeled_sentences.append(LabeledSentence(words=tokenized_words, tags=['SENT_%s' % index]))
    return labeled_sentences

def train_doc2vec_model(labeled_sentences, size=300, window=10, min_count=5,
                        workers=11, alpha=0.025, min_alpha=0.025, epochs=20):
    """Build the vocabulary for, and train, a Doc2Vec model on the given documents.

    Parameters
    ----------
    labeled_sentences : list
        Tagged documents, e.g. the output of ``label_sentences``.
    size : int, default 300
        Passed to ``Doc2Vec`` as ``size``.
    window : int, default 10
        Passed to ``Doc2Vec`` as ``window``.
    min_count : int, default 5
        Passed to ``Doc2Vec`` as ``min_count``.
    workers : int, default 11
        Passed to ``Doc2Vec`` as ``workers``.
    alpha : float, default 0.025
        Passed to ``Doc2Vec`` as ``alpha``.
    min_alpha : float, default 0.025
        Passed to ``Doc2Vec`` as ``min_alpha``.
    epochs : int, default 20
        Passed to ``Doc2Vec`` as ``iter`` and to ``train`` as ``epochs``.

    Returns
    -------
    gensim.models.Doc2Vec
        The trained model.

    NOTE(review): with the defaults ``min_alpha`` equals ``alpha``, which
    presumably keeps the learning rate constant for the whole run - confirm
    this is intended rather than a leftover from a manual-decay training loop.
    """
    model = gensim.models.Doc2Vec(size=size, window=window, min_count=min_count,
                                  workers=workers, alpha=alpha, min_alpha=min_alpha,
                                  iter=epochs)
    model.build_vocab(labeled_sentences)
    model.train(labeled_sentences, epochs=epochs, total_examples=model.corpus_count)
    return model

# Tokenise and tag every review, then learn one document vector per review.
sen = label_sentences(df=df)
model = train_doc2vec_model(labeled_sentences=sen)




In [5]:
def vectorize_comments(df,d2v_model):
    """Attach each row's trained document vector to ``df``.

    The vector for a row is read from ``d2v_model.docvecs`` under the tag
    ``'SENT_<index label>'``, i.e. the same index-based tag that
    ``label_sentences`` assigns, so the lookup stays correct even when the
    index of ``df`` is not a plain 0..n-1 range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels match the tags used for training.
    d2v_model
        Object exposing a ``docvecs`` mapping from tag to vector.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; it is modified in place by adding (or
        overwriting) the ``vectorized_comments`` column.
    """
    comments = [d2v_model.docvecs['SENT_%s' % index] for index in df.index]
    df['vectorized_comments'] = comments

    return df

# Add the learned document vectors as a new column and preview the first rows.
df = vectorize_comments(df=df, d2v_model=model)
print(df.head())

  stars                                               text  \
0     1  The VIP has one of the best chicken wings in t...   
1     1  We so wanted this new restaurant and brew hous...   
2     1  I wasn't sure if I'd like Pilates on the refor...   
3     1  This place is awesome. Lots of great rolls at ...   
4     1  Super delicious Hainanese chicken and rice joi...   

                                 vectorized_comments  
0  [-0.33499905, 0.4270599, 0.87942874, 0.2583504...  
1  [-0.69817674, 0.70658445, 0.86826485, -0.21861...  
2  [-0.9702267, 1.2050159, 0.99320436, 0.56507194...  
3  [-0.8135402, 0.7071663, -0.09651267, 0.3303958...  
4  [-0.33375084, 0.29596585, 1.4535037, -0.216465...  


In [6]:
# Hold out 2% of the rows for testing. `train_test_split` is the function
# already imported from sklearn.model_selection at the top of the notebook;
# the sklearn.cross_validation module is its deprecated former location.
# NOTE(review): df was resampled with replacement upstream, so the same review
# can end up in both the training and the test split - confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    df["vectorized_comments"].tolist(), df["stars"], test_size=0.02, random_state=17)
# Make sure the labels are integers before they reach the estimators.
y_train = y_train.astype('int')
y_test = y_test.astype('int')
def train_classifier(X,y):
    """Grid-search a random forest over a small parameter grid and return the fitted search.

    The grid varies ``n_estimators`` over 200 and 400 while
    ``min_samples_leaf`` and ``min_samples_split`` are each fixed to a single
    value; candidates are scored with 4-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature vectors, one per sample.
    y : array-like
        Class labels aligned with ``X``.

    Returns
    -------
    GridSearchCV
        The search object after ``fit(X, y)``.
    """
    # Taken from sklearn.model_selection, the module the rest of this notebook
    # already uses for train_test_split and RandomizedSearchCV.
    from sklearn.model_selection import GridSearchCV

    parameters = {
        'n_estimators': [200, 400],
        'min_samples_leaf': [1],
        'min_samples_split': [2],
    }

    clf = GridSearchCV(RFC(verbose=1, n_jobs=4), cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf


In [7]:
# Randomized search for model selection: an untuned decision tree serves as
# the base estimator whose hyper-parameters are sampled by the search.
clf = tree.DecisionTreeClassifier()
# Utility function to report best scores
def report(results, n_top=5):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Search results with the keys ``rank_test_score``, ``mean_test_score``,
        ``std_test_score`` and ``params`` (the layout of ``cv_results_``).
    n_top : int, default 5
        Highest rank to print; ranks 1..n_top are reported, including ties.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Coerce to an array so the element-wise rank comparison also works when
    # the ranks are supplied as a plain list.
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: {0}".format(i))
            # The values are mean/std of the cross-validation test score,
            # not an out-of-bag estimate, so they are labelled accordingly.
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# Search space: fixed choices are given as lists, integer ranges as
# scipy `randint` distributions that the search draws from.
param_dist = dict(
    max_depth=[3, None],
    max_features=sp_randint(1, 4),
    min_samples_leaf=sp_randint(1, 5),
    criterion=["gini", "entropy"],
)

# Number of parameter settings sampled by the randomized search.
n_iter_search = 30
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
)

# Fitting evaluates every sampled candidate, so this might take a minute to run.
random_search.fit(X_train, y_train)
print("RandomizedSearchCV examined %d candidate parameter settings." % n_iter_search)
report(random_search.cv_results_)



RandomizedSearchCV examined 30 candidate parameter settings.
Model with rank: 1
Mean OOB score: 0.584 (std: 0.002)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1}

Model with rank: 2
Mean OOB score: 0.580 (std: 0.001)
Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1}

Model with rank: 3
Mean OOB score: 0.579 (std: 0.001)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 2, 'min_samples_leaf': 1}

Model with rank: 4
Mean OOB score: 0.579 (std: 0.002)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 4}

Model with rank: 5
Mean OOB score: 0.577 (std: 0.003)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 2, 'min_samples_leaf': 1}



TypeError: '<=' not supported between instances of 'str' and 'int'

In [8]:
# Fit a single decision tree with fixed hyper-parameters on the training split.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=3,
    min_samples_leaf=1,
).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the held-out rows.
y_pred = clf.predict(X_test)
y_scores = clf.predict_proba(X_test)

print('\nconfusion matrix')
print(pd.crosstab(
    y_test,
    y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
))

# ROC curve computed from the second probability column, then the area under it.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_scores[:, 1])
print('\nauc score ' + str(auc(false_positive_rate, true_positive_rate)))

# TODO: show a tradeoff curve for precision vs recall


confusion matrix
Predicted     0     1   All
True                       
0          1817  1197  3014
1          1230  1756  2986
All        3047  2953  6000

auc score 0.5954655234714


In [9]:
# Baseline for comparison: logistic regression with default settings, fitted
# on the same training vectors and labels as the decision tree.
from sklearn import linear_model
logreg = linear_model.LogisticRegression()
# Left as the cell's last expression so the fitted estimator is displayed.
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Predicted class labels of the logistic regression for the held-out rows.
preds = logreg.predict(X_test)

In [11]:
# Accuracy on the held-out rows: the share of predictions equal to the true label.
(preds == y_test).mean()

0.7045