## imports

In [1]:
import sqlite3
from sqlite3 import Error

import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

## helpers

In [2]:
def create_connection(db_file):
    """ 
    create a database connection to a SQLite database specified by the db_file
    :param db_file: path to database file
    :return: Connection object or None
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)
    return conn

def run_query(conn, query):
    """
    query all rows in the tasks table
    :param conn: the Connection object
    :param query: query used on db
    :return:
    """
    cur = conn.cursor()
    cur.execute(query)
    rows = cur.fetchall()
    return rows

def get_X_and_Y(conn, query):
    """
    query all rows in the tasks table, and get X and Y
    :param conn: the Connection object
    :param query: query used on db
    :return X: features
    :return Y: flagged
    """
    cur = conn.cursor()
    cur.execute(query)
    rows = cur.fetchall()
    X = []
    Y = []
    for row in rows:
        X.append(row[:8] + row[9:])
        Y.append(row[8])
    return X, Y

def refine_col_names(list):
    names = []
    for col_info in list:
        names.append(col_info[1])
    return names

def refine_col_names_wout_flag(list):
    names = []
    for i in range(len(list)):
        if i != 8:
            names.append(list[i][1])
    return names

def close_connection(conn):
    """
    close database connection
    """
    if conn:
        conn.close()

## connect to db

In [3]:
pd.options.mode.chained_assignment = None  # default='warn'

db_path = "filteredData/yelpResData.db"
conn = create_connection(db_path)
conn.text_factory = lambda b: b.decode(errors = 'ignore')

## get reviews from database

In [4]:
# get restaurant filtered reviews & regular reviews
reviews_list = run_query(conn, "SELECT * FROM review WHERE flagged IN ('Y', 'N')")
col_names = refine_col_names(run_query(conn, "PRAGMA table_info('review')"))

# turn list to dataframe
reviews = pd.DataFrame(reviews_list, columns=col_names)

reviews.head()

Unnamed: 0,date,reviewID,reviewerID,reviewContent,rating,usefulCount,coolCount,funnyCount,flagged,restaurantID
0,9/22/2012,GtwU21YOQn-wf4vWRUIx6w,bNYesZ944s6IJVowOnB0iA,"Unlike Next, which we'd eaten at the previous ...",5,0,0,0,N,pbEiXam9YJL3neCYHGwLUA
1,9/22/2012,0LpVTc3,TRKxLC3y-ZvP45e5iilMtw,Probably one of the best meals I've had ever. ...,5,0,0,0,N,pbEiXam9YJL3neCYHGwLUA
2,9/19/2012,tljtLzf68Fkwf,0EMm8umAqXZzyhxNpL4M9g,Service was impeccable. Experience and present...,3,2,0,0,N,pbEiXam9YJL3neCYHGwLUA
3,9/6/2012,iSN,DlwexC7z88ymAzu45skODw,"The problem with places like this, given the e...",3,8,0,3,N,pbEiXam9YJL3neCYHGwLUA
4,9/9/2012,Jmwrh7,kW2dk1CWihmh3g7k9N2G8A,I have no idea how to write my review - dining...,5,1,2,0,N,pbEiXam9YJL3neCYHGwLUA


## get the data we are going to use & change flag to int

In [5]:
data = reviews[['reviewContent','flagged']]
data['flagged'] = data['flagged'].replace({'Y':1, 'N':0})
data.head()

Unnamed: 0,reviewContent,flagged
0,"Unlike Next, which we'd eaten at the previous ...",0
1,Probably one of the best meals I've had ever. ...,0
2,Service was impeccable. Experience and present...,0
3,"The problem with places like this, given the e...",0
4,I have no idea how to write my review - dining...,0


## data cleaning

In [6]:
def lower_case(text):
    return text.lower()

data['reviewContent'] = data['reviewContent'].apply(lower_case)
data.head()

Unnamed: 0,reviewContent,flagged
0,"unlike next, which we'd eaten at the previous ...",0
1,probably one of the best meals i've had ever. ...,0
2,service was impeccable. experience and present...,0
3,"the problem with places like this, given the e...",0
4,i have no idea how to write my review - dining...,0


In [7]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

data['reviewContent'] = data['reviewContent'].apply(remove_punctuation)
data.head()

Unnamed: 0,reviewContent,flagged
0,unlike next which wed eaten at the previous ni...,0
1,probably one of the best meals ive had ever i...,0
2,service was impeccable experience and presenta...,0
3,the problem with places like this given the ex...,0
4,i have no idea how to write my review dining ...,0


In [8]:
stop_words = stopwords.words('english')

def remove_stop_words(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

data['reviewContent'] = data['reviewContent'].apply(remove_stop_words)
data.head()

Unnamed: 0,reviewContent,flagged
0,unlike next wed eaten previous night dish comp...,0
1,probably one best meals ive ever performance f...,0
2,service impeccable experience presentation coo...,0
3,problem places like given exhorbitant cost med...,0
4,idea write review dining alinea brings whole d...,0


In [9]:
# lemmatizer = WordNetLemmatizer()

# def lemmatize_words(text):
#     return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

data['reviewContent'] = data['reviewContent'].apply(lemmatize_words)
data.head()

Unnamed: 0,reviewContent,flagged
0,unlike next wed eaten previous night dish comp...,0
1,probably one best meal ive ever performance fo...,0
2,service impeccable experience presentation coo...,0
3,problem place like give exhorbitant cost mediu...,0
4,idea write review din alinea brings whole diff...,0


In [10]:
# def clean_text(text) :
#     # cast characters to lower case
#     lower = text.lower()
#     # remove stopwords
#     no_stop_words = " ".join([word for word in str(lower).split() if word not in stop_words])
#     # remove punctuation
#     no_punc = no_stop_words.translate(str.maketrans('', '', string.punctuation))
#     # lemmatization
#     lammatized = " ".join([lemmatizer.lemmatize(word) for word in no_punc.split()])
#     return lammatized

# data['reviewContent'] = data['reviewContent'].apply(clean_text)
# data.head()

In [11]:
# split reviews into training set and testing set
X_train, X_test, y_train, y_test = train_test_split(data['reviewContent'], data['flagged'], train_size=0.75)

In [12]:
# count vectorizor
count_vectorizor = CountVectorizer()
count_vectorizor.fit(X_train)
X_cv = count_vectorizor.transform(X_train)
X_cv_test = count_vectorizor.transform(X_test)

In [13]:
# count vectorizor ngram
count_ngram_vectorizor = CountVectorizer(ngram_range=(1, 2))
count_ngram_vectorizor.fit(X_train)
X_cv_ngram = count_ngram_vectorizor.transform(X_train)
X_cv_ngram_test = count_ngram_vectorizor.transform(X_test)

In [14]:
# tf-idf vectorizor
tfidf_vectorizor = TfidfVectorizer()
tfidf_vectorizor.fit(X_train)
X_tv = tfidf_vectorizor.transform(X_train)
X_tv_test = tfidf_vectorizor.transform(X_test)

In [15]:
# tf-idf vectorizor ngram
tfidf_ngram_vectorizor = TfidfVectorizer(ngram_range=(1, 2))
tfidf_ngram_vectorizor.fit(X_train)
X_tv_ngram = tfidf_ngram_vectorizor.transform(X_train)
X_tv_ngram_test = tfidf_ngram_vectorizor.transform(X_test)

In [25]:
# logistic regression w/ count vectorizor
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr_cv = LogisticRegression(C=c, max_iter=500000)
    lr_cv.fit(X_cv, y_train)
    print("Accuracy for C = %s : %s" % (c, accuracy_score(y_test, lr_cv.predict(X_cv_test))))

Accuracy for C = 0.01 : 0.8759773202029245
Accuracy for C = 0.05 : 0.8759773202029245
Accuracy for C = 0.25 : 0.8724559832885705
Accuracy for C = 0.5 : 0.8691136974037601
Accuracy for C = 1 : 0.8654133094598627


In [26]:
# logistic regression w/ count vectorizor ngram
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr_cv_ngram = LogisticRegression(C=c, max_iter=500000)
    lr_cv_ngram.fit(X_cv_ngram, y_train)
    print("Accuracy for C = %s : %s" % (c, accuracy_score(y_test, lr_cv_ngram.predict(X_cv_ngram_test))))

Accuracy for C = 0.01 : 0.8756789018203521
Accuracy for C = 0.05 : 0.8762160549089824
Accuracy for C = 0.25 : 0.8729931363772009
Accuracy for C = 0.5 : 0.8716800954938825
Accuracy for C = 1 : 0.8696508504923903


In [27]:
# logistic regression w/ tf-idf vectorizor
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr_tv = LogisticRegression(C=c, max_iter=500000)
    lr_tv.fit(X_tv, y_train)
    print("Accuracy for C = %s : %s" % (c, accuracy_score(y_test, lr_tv.predict(X_tv_test))))

Accuracy for C = 0.01 : 0.8757385854968666
Accuracy for C = 0.05 : 0.8757385854968666
Accuracy for C = 0.25 : 0.876037003879439
Accuracy for C = 0.5 : 0.8762757385854969
Accuracy for C = 1 : 0.8768725753506416


In [28]:
# logistic regression w/ tf-idf vectorizor ngram
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr_tv_ngram = LogisticRegression(C=c, max_iter=500000)
    lr_tv_ngram.fit(X_tv_ngram, y_train)
    print("Accuracy for C = %s : %s" % (c, accuracy_score(y_test, lr_tv_ngram.predict(X_tv_ngram_test))))

Accuracy for C = 0.01 : 0.8757385854968666
Accuracy for C = 0.05 : 0.8757385854968666
Accuracy for C = 0.25 : 0.8759176365264101
Accuracy for C = 0.5 : 0.8757385854968666
Accuracy for C = 1 : 0.8761563712324679


In [29]:
# linear SVM w/ count vectorizor
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm_cv = LinearSVC(C=c, max_iter=500000)
    svm_cv.fit(X_cv, y_train)
    print("Accuracy for C = %s : %s" % (c, accuracy_score(y_test, svm_cv.predict(X_cv_test))))

Accuracy for C = 0.01 : 0.8754401671142943
Accuracy for C = 0.05 : 0.8710832587287377
Accuracy for C = 0.25 : 0.8572366457773799
Accuracy for C = 0.5 : 0.8494777678304983
Accuracy for C = 1 : 0.8377200835571471


In [30]:
# linear SVM w/ count vectorizor ngram
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm_cv_ngram = LinearSVC(C=c, max_iter=500000)
    svm_cv_ngram.fit(X_cv_ngram, y_train)
    print("Accuracy for C = %s : %s" % (c, accuracy_score(y_test, svm_cv_ngram.predict(X_cv_ngram_test))))

Accuracy for C = 0.01 : 0.8752014324082363
Accuracy for C = 0.05 : 0.8666069829901522
Accuracy for C = 0.25 : 0.8533572068039391
Accuracy for C = 0.5 : 0.8460161145926589
Accuracy for C = 1 : 0.8393315428230379


In [31]:
# linear SVM w/ tf-idf vectorizor
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm_tv = LinearSVC(C=c, max_iter=500000)
    svm_tv.fit(X_tv, y_train)
    print("Accuracy for C = %s : %s" % (c, accuracy_score(y_test, svm_tv.predict(X_tv_test))))

Accuracy for C = 0.01 : 0.8757385854968666
Accuracy for C = 0.05 : 0.8757385854968666
Accuracy for C = 0.25 : 0.8765144732915547
Accuracy for C = 0.5 : 0.8756789018203521
Accuracy for C = 1 : 0.8718591465234259


In [32]:
# linear SVM w/ count vectorizor ngram
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm_tv_ngram = LinearSVC(C=c, max_iter=500000)
    svm_tv_ngram.fit(X_tv_ngram, y_train)
    print("Accuracy for C = %s : %s" % (c, accuracy_score(y_test, svm_tv_ngram.predict(X_tv_ngram_test))))

Accuracy for C = 0.01 : 0.8757385854968666
Accuracy for C = 0.05 : 0.8758579528498955
Accuracy for C = 0.25 : 0.8759773202029245
Accuracy for C = 0.5 : 0.8758579528498955
Accuracy for C = 1 : 0.8751417487317219
