In [39]:
import pandas as pd
import numpy as np
import random as rd
import re, math, functools
from sklearn.linear_model import LogisticRegression
from patsy import dmatrices
from nltk import pos_tag, bigrams
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as stpwds

In [2]:
rd.seed(6)
lmtz = WordNetLemmatizer().lemmatize

In [3]:
def timer(func):
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print "{:>10}:{:>10.3f} seconds".format(func.__name__, t2-t1)
        return result
    return wrapper

In [4]:
@timer
def readData(portion):
    skip = rd.sample(xrange(1, 19999), int(math.ceil(19999*(1-portion))))
    data = pd.read_csv("yelp.csv", skiprows=skip)
    data["target"]=data.stars.map(lambda v: 1 if v>3 else 0)
    return data

In [5]:
data=readData(0.1)

In [6]:
@timer
def generateTrainTest(data, portion):
    train_index = rd.sample(xrange(len(data)), int(math.ceil(len(data)*portion)))
    test_index = list(set(xrange(len(data)))-set(train_index))
    train_data = data.ix[train_index]
    test_data = data.ix[test_index]
    return train_data, test_data

In [7]:
@timer
def generateFormula(data, num=False):
    formula = "target~0"
    for var in data.columns.values.tolist():
        if num:
            if data[var].dtype == "int64" and var not in ["stars", "target"]:
                formula += "+"+var
            else:
                continue
        else:
            if var not in ["stars", "target", "wc", "Review", "prediction"]:
                formula += "+"+var
            else:
                continue
    return formula

In [8]:
def logistic_model(data, num=False):
    Y, X = dmatrices(generateFormula(data, num=num), data=data, return_type="dataframe")
    model = LogisticRegression(random_state=128)
    model.fit(X, np.ravel(Y))
    return model

In [9]:
def predAccuracy(model, data, num):
    y, x = dmatrices(generateFormula(data, num=num), data=data, return_type="dataframe")
    print "Accuracy: {:>6.4f}".format((model.predict(x) == np.ravel(y)).mean())

In [10]:
def review2wc(text, lem=False):
    wc = {}
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = stpwds.words("english")
    if lem:
        lmtzi = lmtz
        tokens = map(lmtz, tokens)
    while "" in tokens:
        tokens.remove("")
        
    for token in tokens:
        if token not in stopwords:
            try:
                wc[token] =+ 1
            except KeyError:
                wc[token] = 1
    return wc

In [11]:
@timer
def term_prob(corpus, subset):
    prob_dict = {}
    N = sum([i for (_, i) in list(corpus.items())])
    for key in corpus:
        if key not in subset:
            prob_dict[key] = 1.0 / N
        else:
            prob_dict[key] = subset[key] + 1.0 / N
    return prob_dict

@timer
def log_prob(term_prob_high, term_prob_low):
    term_log_prob = {}
    log = math.log
    for key in term_prob_high:
        term_log_prob[key] = log(term_prob_high[key]/term_prob_low[key])
    return term_log_prob

In [12]:
@timer
def token_count(wc):
    tc = {}
    for dic in wc.tolist():
        if len(dic) == 0: continue
        for token, count in dic.items():
            try:
                tc[token] += count
            except KeyError:
                tc[token] = 1
    return tc

In [13]:
def totalscore(text, prior, benchmark):
    prob = 0
    review2wci = review2wc
    wc = review2wci(text)
    for word, count in wc.items():
        try:
            prob += count * benchmark[word]
        except KeyError:
            prob += 0
    # add log(prior)
    prob += math.log(prior/(1-prior+0.00001))
    return prob

In [38]:
def vecCosine(base, test):
    product = 0
    len_base = math.sqrt(sum(map(lambda x: x*x, base.values())))
    len_test = math.sqrt(sum(map(lambda x: x*x, test.values())))
    for key in base.keys():
        try:
            product += base[key] * test[key]
        except KeyError:
            continue
    return product

@timer
def predStar(train, test):
    base_vec = train.wc.tolist()
    prediction = [0]*len(test)
    test_vec = test.wc.tolist()
    for i in range(len(test_vec)):
        vecCosine_partial = functools.partial(vecCosine, test=test_vec[i])
        cosine_list = map(vecCosine_partial, base_vec)
        top_5 = sorted(range(len(base_vec)), key=lambda i: cosine_list[i], reverse=True)[:5]
        prediction[i] = train.ix[top_5].stars.mean()
    pred_star = np.array(map(lambda x: (x>3)*1, prediction))
    return pred_star

#### Task A. Ignore the text (reviews) and run a classification model with the numeric data (you can use standard methods like logistic regression, k-nearest neighbors or anything else). What is the best accuracy of your model with numeric data?

In [14]:
data = readData(0.2)
train, test = generateTrainTest(data, 0.7)
model_1 = logistic_model(train, num=True)
predAccuracy(model_1, test, num=True)

Accuracy: 0.7056


#### Task B. Perform a supervised classification on a subset of the corpus using the reviews only. You can write your code in Python or R. What accuracy do you get from this text mining exercise?

In [15]:
data["wc"] = data.Review.map(review2wc)
token_count_total = token_count(data.wc)
token_count_high = token_count(data[data["target"]==1].wc)
token_count_low = token_count(data[data["target"]==0].wc)
term_prob_high = term_prob(token_count_total, token_count_high)
term_prob_low = term_prob(token_count_total, token_count_low)
term_log_prob = log_prob(term_prob_high, term_prob_low)
totalscore_partial = functools.partial(totalscore, prior=len(data[data.target==1])*1.0/len(data), benchmark=term_log_prob)
data["review_score"] = data.Review.map(totalscore_partial)
prediction = data.review_score.map(lambda x: 1 if x>0 else 0)

In [16]:
print "Accuracy: {:>6.4f}".format((prediction == data.target).mean())

Accuracy: 0.8617


#### Task C. Combine the numeric data and the text classification model (in task B) to create a “hybrid” model. It is your task to figure out how to do this. Now run this hybrid classification model and compare the results with those in A and B. Does the numeric data add to the predictive power relative to text?

In [17]:
train, test = generateTrainTest(data, 0.7)
model_2 = logistic_model(train, num=False)
predAccuracy(model_2, test, num=False)

Accuracy: 0.8874


#### Task D. Use unsupervised sentiment analysis on the reviews (with SentiStrength or any other tool) and use the sentiment scores to predict high/low rating. Compare and contrast the results of tasks B and D. What can you conclude from your analysis?

1. take test set
2. measure length, create an empty list to store predictions
3. compare vector angle between wc and base line by line 
4. take the top 5 best matches and take average of stars, if average>3, predict 1.

In [None]:
pred = predStar(train, test)
print "Accuracy: {:>6.4f}".format((pred == test.target).mean())

#### Task E. Implement the PMI approach to sentiment analysis (in either Python or R), and run the classification model with the sentiment scores. How do your results compare with those in Task D?

In [None]:
def review2list(text):
    wc = {}
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = stpwds.words("english")
    remove = tokens.remove
    while "" in tokens:
        remove("")
    for token in tokens:
        if token in stopwords:
            remove(token)
    token_pos = pos_tag(tokens)
    pos_list = map(lambda tup: tup[1])
    pos_bigram = bigrams(pos_list)
    return wc