In [16]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [17]:
# Load the beer-review dataset from the shared data directory (path is relative
# to the notebook's working directory). The 'review/text' column is read as str.
df = pd.read_csv("../../data400_share/beer.csv", dtype={'review/text' : str})

In [18]:
# Display the column labels of the loaded frame.
df.columns

Index(['index', 'beer/ABV', 'beer/beerId', 'beer/brewerId', 'beer/name',
       'beer/style', 'review/appearance', 'review/aroma', 'review/overall',
       'review/palate', 'review/taste', 'review/text', 'review/timeStruct',
       'review/timeUnix', 'user/ageInSeconds', 'user/birthdayRaw',
       'user/birthdayUnix', 'user/gender', 'user/profileName'],
      dtype='object')

In [19]:
# Keep only the review text and the overall rating, dropping rows where either
# is missing. The index is reset so the surviving rows are numbered 0..n-1:
# later cells build fresh DataFrames from plain lists (RangeIndex) and combine
# them with `y`, which would silently misalign on the gapped index that
# dropna() leaves behind.
df = df[['review/text', 'review/overall']].dropna().reset_index(drop=True)
reviews_raw = df['review/text']
y = df['review/overall']

In [20]:
# Punctuation to delete outright (replaced by the empty string, not a space):
#   . ; : ! ' ? , " ( ) [ ]
# Written as a raw-string character class; the previous non-raw alternation
# relied on invalid string escapes such as "\." and "\(", which Python reports
# as a DeprecationWarning/SyntaxWarning. The set of matched characters is the same.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")

# Lower-case every review and strip the punctuation above.
reviews_clean = [REPLACE_NO_SPACE.sub("", row.lower()) for row in reviews_raw]

In [21]:
# Binary bag-of-words: each entry records whether a token occurs in a review,
# not how many times.
cv = CountVectorizer(binary = True)

# Learn the vocabulary and build the document-term matrix in a single pass
# instead of fitting and then transforming the same corpus separately.
X = cv.fit_transform(reviews_clean)

# NOTE(review): X_test is built from the very same reviews as X, so it is NOT a
# held-out set. It is kept (as an independent copy) only so the name remains
# defined; previously it was produced by vectorising the whole corpus a second time.
X_test = X.copy()

In [22]:
# Binary sentiment label: 1 when the overall rating is above 4, otherwise 0.
target = (y > 4).astype(int).tolist()

# 75/25 train/validation split. The seed is fixed so the accuracies printed
# below are reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=0
)

# Sweep the inverse regularisation strength C over 0.01 .. 0.09. The grid is
# built from integers so neither its length nor its printed values depend on
# floating-point step rounding.
for c in np.arange(1, 10) / 100.0:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.762936093033
Accuracy for C=0.02: 0.764002987304
Accuracy for C=0.03: 0.761975888189
Accuracy for C=0.04: 0.761762509335
Accuracy for C=0.05: 0.762829403606
Accuracy for C=0.06: 0.762722714179
Accuracy for C=0.07: 0.761762509335
Accuracy for C=0.08: 0.760802304492
Accuracy for C=0.09: 0.760375546783


In [23]:
# Refit a logistic regression on the full matrix X (no hold-out) with C=.04.
# NOTE(review): C is hard-coded; the sweep output recorded in this notebook
# peaks at C=0.02 — confirm .04 is the intended choice.
final_model = LogisticRegression(C=.04)
final_model.fit(X, target)

# `lr` keeps pointing at the same fitted estimator as `final_model`.
lr = final_model



In [24]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() when it exists and fall back
# to the old accessor on older installations.
get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names

# Map each vocabulary term to its coefficient in the fitted model
# (coef_[0] is the single row of weights of the binary classifier).
feature_to_coef = dict(zip(get_names(), final_model.coef_[0]))

# The 1500 terms with the largest coefficients, as (word, coef) pairs,
# ordered from most positive downwards.
best_positive = sorted(
    feature_to_coef.items(),
    key=lambda pair: pair[1],
    reverse=True)[:1500]

In [25]:
# The 1500 terms with the smallest coefficients, as (word, coef) pairs,
# ordered from most negative upwards.
best_negative = list(feature_to_coef.items())
best_negative.sort(key=lambda pair: pair[1])
best_negative = best_negative[:1500]


In [29]:
# Vocabulary for the tf-idf export: the strongest positive terms followed by the
# strongest negative ones. When the fitted vocabulary has fewer than 3000 terms
# the two lists overlap, and TfidfVectorizer rejects a vocabulary containing
# duplicates, so repeated words are dropped while preserving first-seen order.
wordlist = []
seen_words = set()
for word, coef in best_positive + best_negative:
    if word not in seen_words:
        seen_words.add(word)
        wordlist.append(word)

# Tf-idf weights restricted to that vocabulary, one row per review.
# NOTE(review): .toarray() materialises a dense (n_reviews x len(wordlist))
# matrix in memory before it is written out.
tfidf = TfidfVectorizer(vocabulary = wordlist).fit_transform(reviews_clean).toarray()
pd.DataFrame(tfidf, columns=wordlist).to_csv("tfidfsentiment.csv")

In [26]:
def _mentions(word, texts):
    """Return a list of 0/1 flags, one per text, marking whole-word mentions.

    A text is flagged 1 only when `word` occurs as a complete token. A plain
    substring test also fires on unrelated words (for example 'ok' inside
    'look' or 'smoke', 'corn' inside 'corner', 'bad' inside 'badge').

    Parameters
    ----------
    word : str
        The keyword to look for; it is matched literally.
    texts : iterable of str
        The cleaned review texts.

    Returns
    -------
    list of int
        1 where the keyword is present as a whole word, otherwise 0.
    """
    pattern = re.compile(r"\b" + re.escape(word) + r"\b")
    return [1 if pattern.search(text) else 0 for text in texts]


# Indicator features for hand-picked positive keywords.
exceptional = _mentions('exceptional', reviews_clean)
excellent = _mentions('excellent', reviews_clean)
fantastic = _mentions('fantastic', reviews_clean)
wonderful = _mentions('wonderful', reviews_clean)
highly = _mentions('highly', reviews_clean)

# Indicator features for hand-picked lukewarm / negative keywords.
ok = _mentions('ok', reviews_clean)
average = _mentions('average', reviews_clean)
bad = _mentions('bad', reviews_clean)
corn = _mentions('corn', reviews_clean)
sipper = _mentions('sipper', reviews_clean)

In [28]:
# Two-topic LDA over the binary document-term matrix X.
# - random_state fixes the initialisation, so which topic ends up as column 0
#   versus column 1 is stable between runs (later cells compare those columns).
# - n_jobs=-1 runs the E-step on all available cores; the recorded run of this
#   cell was interrupted before it finished.
lda = LatentDirichletAllocation(n_components = 2, random_state = 0, n_jobs = -1)
lda.fit(X)


KeyboardInterrupt: 

In [None]:
# Transform X with the fitted LDA model; this needs the lda.fit(X) cell to have
# run to completion. Presumably yields one row of topic weights per review —
# the next cell compares element 0 against element 1 of each row.
groups = lda.transform(X)

In [None]:
# Hard topic label per review: 1 when the first topic weight strictly exceeds
# the second, otherwise 0.
topics = (groups[:, 0] > groups[:, 1]).astype(int).tolist()

In [None]:
# Collect the keyword indicator flags and the LDA topic label into one frame,
# one row per review. The explicit `columns` list pins the column order.
feature_columns = [
    'exceptional', 'excellent', 'fantastic', 'wonderful', 'highly',
    'ok', 'average', 'bad', 'corn', 'sipper',
    'group_lda',
]
new = pd.DataFrame(
    {
        'exceptional': exceptional,
        'excellent': excellent,
        'fantastic': fantastic,
        'wonderful': wonderful,
        'highly': highly,
        'ok': ok,
        'average': average,
        'bad': bad,
        'corn': corn,
        'sipper': sipper,
        'group_lda': topics,
    },
    columns=feature_columns,
)

new.to_csv("sentiments.csv")

# Preview as the cell's final expression so it is actually rendered; a bare
# .head() in the middle of a cell is evaluated and thrown away.
new.head()

In [None]:
# Pair each review's LDA topic label with its overall rating.
# `y.values` is used so the scores are attached by position: `y` carries the
# index left over from dropna(), and assigning the Series directly to a frame
# with a fresh 0..n-1 index aligns on labels, shifting scores or filling NaN.
group_correctness = pd.DataFrame(
    {'group': topics, 'score': y.values},
    columns=['group', 'score'],
)

# Mean overall rating within each topic group.
group1 = group_correctness.loc[group_correctness['group'] == 1, 'score'].mean()
group0 = group_correctness.loc[group_correctness['group'] == 0, 'score'].mean()

print("group1: ", group1, "group0: ", group0)