In [1]:
__author__ = "Billy Yuan, Nikita Lakhotia, Stuti Maddan, Tyler Nicolas, Wenduo Wang"
__copyright__ = "Well, knowledge is open to curious minds."
__license__ = "GPL-3.0"
__version__ = "0.1"
__maintainer__ = "Wenduo Wang"
__email__ = "wenduo.wang@utexas.edu"
__status__ = "development"
__date__ = "Sep/15/2016"

In [26]:
import pandas as pd
import numpy as np
import random as rd
import time, re, math, functools
from urllib2 import urlopen, Request
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from patsy import dmatrices
from nltk import pos_tag, bigrams
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as stpwds
from bs4 import BeautifulSoup

In [3]:
lmtz = WordNetLemmatizer().lemmatize

In [4]:
def timer(func):
    '''This is a decorator to return a function's running time'''
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print "{:>10}:{:>10.3f} seconds".format(func.__name__, t2-t1)
        return result
    return wrapper

In [5]:
@timer
def readData(portion, random_state=time.time()):
    '''Read in a certain portion of data in a random manner'''
    rd.seed(random_state)
    skip = rd.sample(xrange(1, 19999), int(math.ceil(19999*(1-portion))))
    data = pd.read_csv("yelp.csv", skiprows=skip)
    data["target"]=data.stars.map(lambda v: 1 if v>3 else 0)
    return data

In [6]:
@timer
def generateTrainTest(data, portion, random_state=time.time()):
    rd.seed(random_state)
    train_index = rd.sample(xrange(len(data)), int(math.ceil(len(data)*portion)))
    test_index = list(set(xrange(len(data)))-set(train_index))
    train_data = data.ix[train_index]
    test_data = data.ix[test_index]
    return train_data, test_data

In [7]:
@timer
def generateFormula(data):
    formula = "target~0"
    for var in data.columns.values.tolist():
        if data[var].dtype == "int64" and var not in ["stars", "target", "wc", "Review", "prediction"]:
            formula += "+"+var
        else:
            continue
    return formula

In [8]:
def splitXY(data):
    Y, X = dmatrices(generateFormula(data), data=data, return_type="dataframe")
    return X, np.ravel(Y)

In [9]:
def logistic_model(X, y):
    model = LogisticRegression(random_state=128)
    model.fit(X, y)
    return model

In [10]:
def printAccuracy(prediction, target):
    print "Accuracy: {:>6.4f}".format((prediction == target).mean())

In [11]:
def review2wc(text, lem=False):
    wc = {}
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = stpwds.words("english")
    if lem:
        lmtzi = lmtz
        tokens = map(lmtz, tokens)
    while "" in tokens:
        tokens.remove("")
        
    for token in tokens:
        if token not in stopwords:
            try:
                wc[token] =+ 1
            except KeyError:
                wc[token] = 1
    return wc

In [12]:
@timer
def term_prob(corpus, subset):
    prob_dict = {}
    N = sum([i for (_, i) in list(corpus.items())])
    for key in corpus:
        if key not in subset:
            prob_dict[key] = 1.0 / N
        else:
            prob_dict[key] = subset[key] + 1.0 / N
    return prob_dict

@timer
def log_prob(term_prob_high, term_prob_low):
    term_log_prob = {}
    log = math.log
    for key in term_prob_high:
        term_log_prob[key] = log(term_prob_high[key]/term_prob_low[key])
    return term_log_prob

In [13]:
@timer
def token_count(wc):
    tc = {}
    for dic in wc.tolist():
        if len(dic) == 0: continue
        for token, count in dic.items():
            try:
                tc[token] += count
            except KeyError:
                tc[token] = 1
    return tc

In [14]:
def totalscore(wc, prior, benchmark):
    prob = 0
    for word, count in wc.items():
        try:
            prob += count * benchmark[word]
        except KeyError:
            prob += 0
    prob += math.log(prior/(1-prior+0.00001))
    return prob

In [15]:
class NBClassifier(object):
    
    def __init__(self):
        self.X = None
        self.y = None
        self.term_log_prob = None
        self.prior = None
    
    def fit(self, data, x_label, y_label):
        self.X = data[x_label]
        self.y = data[y_label]
        self.x_label = x_label
        self.y_label = y_label
        token_count_total = token_count(data[x_label])
        token_count_high = token_count(data[data[y_label]==1][x_label])
        token_count_low = token_count(data[data[y_label]==0][x_label])
        term_prob_high = term_prob(token_count_total, token_count_high)
        term_prob_low = term_prob(token_count_total, token_count_low)
        self.term_log_prob = log_prob(term_prob_high, term_prob_low)
        self.prior = len(data[data[y_label]==1])*1.0/len(data)
        
    def predict(self, test, threshold=None):
        totalscore_partial = functools.partial(totalscore, 
                                               prior= self.prior,
                                               benchmark=self.term_log_prob)
        score = test[self.x_label].map(totalscore_partial)
        if threshold == None:
            return score
        else:
            prediction = score.map(lambda x: 1 if x>threshold else 0)
            return prediction
        


In [104]:
def positiveness(test, positive, negative, threshold=1):
    product_positive = 0.1
    product_negative = 0.1
    len_positive = math.sqrt(sum(map(lambda x: x*x, positive.values())))
    len_negative = math.sqrt(sum(map(lambda x: x*x, negative.values())))
    for key in positive.keys():
        try:
            product_positive += positive[key] * test[key]
        except KeyError:
            continue
    product_positive = product_positive*1.0/len_positive    
        
    for key in negative.keys():
        try:
            product_negative += negative[key] * test[key]
        except KeyError:
            continue
    product_negative = product_negative*1.0/len_negative
    
    return ((product_positive*1.0/product_negative)>threshold)*1

@timer
def predRating(train, test):
    base_vec = train.wc.tolist()
    prediction = [0]*len(test)
    test_vec = test.wc.tolist()
    for i in range(len(test_vec)):
        vecCosine_partial = functools.partial(vecCosine, test=test_vec[i])
        cosine_list = map(vecCosine_partial, base_vec)
        top_5 = sorted(range(len(base_vec)), key=lambda i: cosine_list[i], reverse=True)[:5]
        prediction[i] = train.ix[top_5].stars.mean()
    pred_star = np.array(map(lambda x: (x>3)*1, prediction))
    return pred_star

#### Task A. Ignore the text (reviews) and run a classification model with the numeric data (you can use standard methods like logistic regression, k-nearest neighbors or anything else). What is the best accuracy of your model with numeric data?

In [17]:
data = readData(0.2, random_state=6)
train, test = generateTrainTest(data, 0.7, random_state=6)
X, y = splitXY(data)
model_1 = logistic_model(X, y)
X_test, y_test = splitXY(test)
prediction = model_1.predict(X_test)
printAccuracy(prediction, y_test)

  readData:     0.116 seconds
generateTrainTest:     0.006 seconds
generateFormula:     0.001 seconds
generateFormula:     0.001 seconds
Accuracy: 0.6964


#### Task B. Perform a supervised classification on a subset of the corpus using the reviews only. You can write your code in Python or R. What accuracy do you get from this text mining exercise?

In [18]:
data["wc"] = data.Review.map(review2wc)
train, test = generateTrainTest(data, 0.7, random_state=6)

generateTrainTest:     0.004 seconds


In [19]:
classifier = NBClassifier()
classifier.fit(train, "wc", "target")
prediction = classifier.predict(test, threshold=0)
printAccuracy(prediction, test.target)

token_count:     0.088 seconds
token_count:     0.029 seconds
token_count:     0.017 seconds
 term_prob:     0.008 seconds
 term_prob:     0.009 seconds
  log_prob:     0.006 seconds
Accuracy: 0.7048


#### Task C. Combine the numeric data and the text classification model (in task B) to create a “hybrid” model. It is your task to figure out how to do this. Now run this hybrid classification model and compare the results with those in A and B. Does the numeric data add to the predictive power relative to text?

In [20]:
data["total_score"] = classifier.predict(data, threshold=None)
train, test = generateTrainTest(data, 0.7, random_state=6)
X, y = splitXY(train)
model_2 = logistic_model(X, y)
X_test, y_test = splitXY(test)
prediction = model_2.predict(X_test)
printAccuracy(prediction, y_test)

generateTrainTest:     0.015 seconds
generateFormula:     0.001 seconds
generateFormula:     0.001 seconds
Accuracy: 0.6956


#### Task D. Use unsupervised sentiment analysis on the reviews (with SentiStrength or any other tool) and use the sentiment scores to predict high/low rating. Compare and contrast the results of tasks B and D. What can you conclude from your analysis?

In [25]:
totally_positive = "This restaurant is very good. It is actually the best on that I have ever been to.\
                    The queue could be long, but if you have booked well in advance it would not be a problem.\
                    Everyone smiles and their service is definitely professional. The foods are fantastic,\
                    and the price is low, I mean affordable. The wines are very nice, and there is a good collection\
                    of desserts which tastes phenomenal. The waiter and waitress are attentative and helpful.\
                    I believe they have been trained very well. Tables are clean, dishes\
                    served in time and they taste absolutely delicious. I totally recommend it."

totally_negative = "I can't believe this restaurant could be so bad. We waited for a long time before we were attended\
                    to by a waiter, who was so crude, maybe because he thought I couldn't afford the meal, the price of\
                    which by the way is riculously high. We each ordered 3 courses, but nothing showed up in the following\
                    30 minutes. Nobody even explained that to us. Finally I called the manager, and he just said they were\
                    busy. Well, I could see they were busy, but it doesn't make sense that other people were served better\
                    than us. And the end, we decided to give a smaller tip to the waitor (I preferred not at all), and\
                    I can still remember his face -- disgusting. Please don't go there!"

In [27]:
positive_vec = review2wc(totally_positive)
negative_vec = review2wc(totally_negative)

In [115]:
positiveness_partial = functools.partial(positiveness, positive=positive_vec, negative=negative_vec, threshold=.5)
unsupervised_prediction = data.wc.map(positiveness_partial)
printAccuracy(unsupervised_prediction, data.target)

Accuracy: 0.6712


#### Task E. Implement the PMI approach to sentiment analysis (in either Python or R), and run the classification model with the sentiment scores. How do your results compare with those in Task D?

In [None]:
def review2list(text):
    wc = {}
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = stpwds.words("english")
    remove = tokens.remove
    while "" in tokens:
        remove("")
    for token in tokens:
        if token in stopwords:
            remove(token)
    token_pos = pos_tag(tokens)
    pos_list = map(lambda tup: tup[1])
    pos_bigram = bigrams(pos_list)
    return wc

In [None]:
pattern_1 = [("JJ", "NN"), ("JJ", "NNS"), 
           ("RB", "VB"), ("RB", "VBD"), ("RB", "VBN"), ("RB", "VBG"),
          ("RBR", "VB"), ("RBR", "VBD"), ("RBR", "VBN"), ("RBR", "VBG"),
          ("RBS", "VB"), ("RBS", "VBD"), ("RBS", "VBN"), ("RBS", "VBG")]
pattern_2 = [("RB", "JJ"), ("RBR", "JJ"), ("RBS", "JJ"),
            ("JJ", "JJ"),
            ("NN", "JJ"), ("NNS", "JJ")]
no_match = ["NN", "NNS"]

In [127]:
url = "http://www.google.com/search?q=good%20restaurant"
request = Request(url=url, headers={"user-agent":"Chrome/53.0.2785.113"})
soup = BeautifulSoup(urlopen(request).read(100000), "lxml")

In [138]:
int("".join(re.split("\D+",soup.find("div", id="resultStats").get_text().encode("utf-８"))))

857000000

In [139]:
def semanticOrientation(phrases, 
                        positive="excellent", 
                        negative="poor", 
                        url="http://www.google.com/search?q=%",
                        distance=5,
                        threshold=0):
    so_positive = 0.01
    so_negative = 0.01
    so_avg = 0
    request_partial = functools.partial(Request, headers={"user-agent":"Chrome/53.0.2785.113"})
    soup = functools.partial(BeautifulSoup, "lxml")
    for phrase in phrases:
        term = "%22{}+{}%22+AROUND({})+%22{}%22".format(phrase[0], distance, phrase[1])
        request = request_partial(url=url % term)
        s = soup(urlopen)

#### Task F. What are the top 5 “attributes” of a restaurant that are associated with (i) high and (ii) low ratings? That is, when people rate a restaurant high or low, are they more likely to mention service, ambiance, etc.? 