In [None]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the beer-review CSV; the dtype mapping forces `review/text` to be read as str.
# NOTE(review): later cells read "~/data400_share/beer.csv" and "beer.csv" instead —
# confirm that all three paths refer to the same file.
df = pd.read_csv("../../data400_share/beer.csv", dtype={'review/text' : str})

In [None]:
# Show the column labels of the frame that was just loaded.
df.columns

In [None]:
# Narrow the frame to the review text and the overall rating, dropping every
# row in which either of the two is missing.
df = df.loc[:, ['review/text', 'review/overall']].dropna()
reviews_raw, y = df['review/text'], df['review/overall']

In [None]:
# Punctuation that is deleted outright (replaced by the empty string, not by a
# space): . ; : ! ' ? , " ( ) [ ]
# Written as a raw-string character class: the previous non-raw literal relied
# on invalid escape sequences such as "\." which newer Python versions warn on.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")

# Lower-case every review and strip the punctuation above.
reviews_clean = [REPLACE_NO_SPACE.sub("", row.lower()) for row in reviews_raw]

In [None]:
# Binary bag-of-words: each entry records whether a term occurs in a review,
# not how often. fit() hands back the vectoriser itself, so it can be chained.
cv = CountVectorizer(binary = True).fit(reviews_clean)
X = cv.transform(reviews_clean)
# NOTE(review): X_test is produced from exactly the same reviews as X, so it is
# not a held-out set.
X_test = cv.transform(reviews_clean)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Binary target: 1 when the overall rating is strictly greater than 4, else 0.
target = [1 if rating > 4 else 0 for rating in y]

# Hold out 25% of the reviews for validation. The seed is fixed so that the
# accuracies printed below (and the C value chosen from them) can be reproduced
# after a kernel restart.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state = 0
)

# Sweep the inverse regularisation strength C over 0.01 .. 0.09.
for c in np.arange(.01, .1, .01):
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

In [None]:
# Refit on every review (no hold-out) with C fixed at .04.
lr = LogisticRegression(C=.04)
lr.fit(X, target)
# fit() returns the estimator it was called on, so both names refer to one model.
final_model = lr

In [None]:
# CountVectorizer.get_feature_names() no longer exists in current scikit-learn
# releases; use get_feature_names_out() when available and fall back to the old
# accessor on installations that predate it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Map every vocabulary term to its logistic-regression coefficient.
feature_to_coef = dict(zip(feature_names, final_model.coef_[0]))

# The 500 terms with the largest coefficients, as (term, coefficient) pairs.
best_positive = sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:500]


In [None]:
# Order all (term, coefficient) pairs from the smallest coefficient upwards and
# keep the first 500 of them.
_ascending = sorted(feature_to_coef.items(), key=lambda pair: pair[1])
best_negative = _ascending[:500]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vocabulary: the terms of the strongest positive and negative coefficients.
wordlist = best_positive + best_negative
# dict.fromkeys() drops repeated terms while keeping their order; the two lists
# can only overlap when the fitted vocabulary is small, but TfidfVectorizer
# refuses a vocabulary that contains duplicates.
wordlist = list(dict.fromkeys(x[0] for x in wordlist))

# Dense matrix with one row per cleaned review and one column per vocabulary term.
tfidf = TfidfVectorizer(vocabulary = wordlist).fit_transform(reviews_clean).toarray()
tfidfsentiment = pd.DataFrame(tfidf, columns=wordlist)
# Fixed: the frame is named `tfidfsentiment`; the previous `tfidfsentiments`
# raised a NameError before anything was written.
# NOTE(review): the final merge cell joins this file on an "index" column, which
# this frame does not define — confirm how the join key is meant to be supplied.
tfidfsentiment.to_csv("tfidfsentiment.csv")

In [None]:
def _word_flags(word):
    """Return one 0/1 flag per cleaned review: 1 when `word` occurs as a whole word.

    A plain substring test (`word in review`) also fires inside longer words —
    'ok' in "look", 'corn' in "corner", 'bad' in "badly" — so the match is
    anchored on word boundaries instead.
    """
    pattern = re.compile(r"\b" + re.escape(word) + r"\b")
    return [1 if pattern.search(r) else 0 for r in reviews_clean]


# Indicator features for words that presumably signal a high rating.
exceptional = _word_flags('exceptional')
excellent = _word_flags('excellent')
fantastic = _word_flags('fantastic')
wonderful = _word_flags('wonderful')
highly = _word_flags('highly')

# Indicator features for words that presumably signal a middling or low rating.
ok = _word_flags('ok')
average = _word_flags('average')
bad = _word_flags('bad')
corn = _word_flags('corn')
sipper = _word_flags('sipper')

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Two-topic LDA over the binary bag-of-words matrix. The seed is fixed because
# the fit is stochastic: without it the topic assignments (and which topic ends
# up as component 0) can differ from run to run.
lda = LatentDirichletAllocation(n_components = 2, random_state = 0)
lda.fit(X)

In [None]:
# Per-review topic weights from the fitted LDA model (one row per row of X).
groups = lda.transform(X)

In [None]:
# Flag a review with 1 when its first topic weight is strictly larger than its
# second, otherwise 0; .tolist() yields plain Python ints.
topics = (groups[:, 0] > groups[:, 1]).astype(int).tolist()

In [None]:
# Collect the keyword indicators and the LDA group into a single frame; the
# dict order fixes the column order of the exported CSV.
new = pd.DataFrame({
    'exceptional': exceptional,
    'excellent': excellent,
    'fantastic': fantastic,
    'wonderful': wonderful,
    'highly': highly,
    'ok': ok,
    'average': average,
    'bad': bad,
    'corn': corn,
    'sipper': sipper,
    'group_lda': topics,
})

# Not the last expression of the cell, so this preview is computed and discarded.
new.head()
sentiments = new.copy()
sentiments.to_csv("sentiments.csv")

In [None]:
# Reload the raw reviews for the count-based features.
beer = pd.read_csv("~/data400_share/beer.csv")
print(beer.shape)

# Drop every row that lacks any of the rating columns or the review text.
REQUIRED_REVIEW_COLUMNS = ['review/appearance', 'review/aroma', 'review/overall',
                           'review/palate', 'review/taste', 'review/text']
beer = beer.dropna(subset=REQUIRED_REVIEW_COLUMNS)

# Keep the first 13 columns; .copy() makes the column assignments below act on
# an independent frame instead of a slice of the loaded one.
beer = beer.iloc[:, list(range(0,13))].copy()

# Replace everything that is neither a word character nor whitespace by a space.
# regex=True is required: without it recent pandas treats the pattern as a
# literal string and no punctuation is removed.
beer["review/text"] = beer["review/text"].str.replace(r'[^\w\s]', ' ', regex=True)
# Tabs count as whitespace above, so they are turned into spaces separately.
beer["review/text"] = beer["review/text"].str.replace('\t', ' ', regex=False)


In [None]:
# Append the four counter columns, zero-filled, in this exact order (the
# counting cell further down addresses them by position).
for _count_column in ("stopwordcount", "charcount",
                      "cursewordcount", "uniquecursewordcount"):
    beer[_count_column] = 0
# Profanity lexicon used for the curse-word counters. The counting cell compares
# whole lower-cased tokens against these entries (exact equality), so inflected
# forms are not matched. "damn" is listed twice; the counting cell keys its
# tallies by word, so the repeat does not change any count.
cursewords = ["anal",
"anus",
"arse",
"ass",
"ballsack",
"balls",
"bastard",
"bitch",
"biatch",
"bloody",
"blowjob",
"bollock",
"bollok",
"boner",
"boob",
"bugger",
"bum",
"butt",
"clitoris",
"cock",
"coon",
"crap",
"cunt",
"damn",
"dick",
"dildo",
"dyke",
"fag",
"feck",
"fellate",
"fellatio",
"felching",
"fuck",
"fudgepacker",
"flange",
"goddamn",
"damn",
"hell",
"homo",
"jerk",
"jizz",
"knobend",
"labia",
"lmao",
"lmfao",
"muff",
"nigger",
"nigga",
"omg",
"penis",
"piss",
"poop",
"prick",
"pube",
"pussy",
"queer",
"scrotum",
"sex",
"shit",
"sh1t",
"slut",
"smegma",
"spunk",
"tit",
"tosser",
"turd",
"twat",
"vagina",
"wank",
"whore",
"wtf"]
# English stop-word list used for the stop-word counter; matched against whole
# lower-cased tokens in the counting cell.
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

In [None]:
# Replace each review string by its list of tokens, split on single spaces.
# The result is written back to the same column, so re-running this cell applies
# the split to the already-split column.
beer["review/text"] = beer["review/text"].str.split(" ")

In [None]:
# Per-review token statistics, computed in a single pass over the token lists.
# Columns are addressed by name rather than by position; this assumes the
# positions 11 and 13-16 used previously referred to "review/text" and the four
# counter columns — NOTE(review): confirm against the CSV column order.
_stopword_set = set(stopwords)
_curseword_set = set(cursewords)


def _token_counts(tokens):
    """Return (stopwordcount, charcount, cursewordcount, uniquecursewordcount).

    `tokens` is one review's list of string tokens; they are lower-cased before
    being compared (by exact equality) with the stop-word and curse-word sets.
    """
    lowered = [token.lower() for token in tokens]
    curse_hits = [token for token in lowered if token in _curseword_set]
    return (
        sum(1 for token in lowered if token in _stopword_set),  # stop-word tokens
        sum(len(token) for token in lowered),                   # characters over all tokens
        len(curse_hits),                                        # curse-word tokens
        len(set(curse_hits)),                                   # distinct curse words present
    )


_count_columns = ["stopwordcount", "charcount", "cursewordcount", "uniquecursewordcount"]
_counts = pd.DataFrame(
    [_token_counts(tokens) for tokens in beer["review/text"]],
    columns=_count_columns,
)
for _name in _count_columns:
    # .to_numpy() assigns row by row in order, independent of beer's index labels.
    beer[_name] = _counts[_name].to_numpy()

In [None]:
# Snapshot the frame with its count features and write the snapshot to disk.
generalfeatures = beer.copy(deep=True)
generalfeatures.to_csv(path_or_buf="generalfeatures.csv")

In [None]:
# Read the tab-separated, header-less NRC lexicon file and label its three columns.
nrcsentiment = pd.read_csv(
    "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
    sep = "\t",
    header = None,
)
nrcsentiment.columns = ["word", "sentiment", "indicator"]

# Keep only the (word, sentiment) pairs whose indicator equals 1.
_is_flagged = nrcsentiment["indicator"] == 1
nrcsentiment = nrcsentiment.loc[_is_flagged, ["word", "sentiment"]]

In [None]:
# Reload the reviews and discard the rows that have no review text.
df = pd.read_csv("beer.csv").dropna(subset=['review/text'])

In [None]:
# Punctuation (and tab) characters that cleaner() turns into spaces. Compiled
# once, from a raw-string character class, instead of being rebuilt on every
# call from a literal full of invalid escape sequences such as "\.".
_CLEANER_PUNCT = re.compile(r'[.;:!?,"()\[\]\t]')


def cleaner(text):
    """Lower-case `text`, blank out punctuation and tokenise the result.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    The tokens produced by `word_tokenize` after each of . ; : ! ? , " ( ) [ ]
    and tab has been replaced by a single space. Apostrophes are left in place.
    """
    return word_tokenize(_CLEANER_PUNCT.sub(" ", text.lower()))
 
def unique_over_total(text):
    """Return the share of tokens in `text` that are distinct.

    Parameters
    ----------
    text : sequence of tokens

    Returns
    -------
    float
        Number of distinct tokens divided by the total number of tokens, or
        0.0 for an empty sequence (which used to raise ZeroDivisionError, e.g.
        for a review that consists of punctuation only).
    """
    total = len(text)
    if total == 0:
        return 0.0
    return len(np.unique(text)) / total

def tidy_text(df):
    """Reshape the review text of `df` into tidy (one row per review/word) form.

    Parameters
    ----------
    df : DataFrame with an "index" column (review identifier) and a
        "review/text" column (raw review string).

    Returns
    -------
    untidydf : DataFrame
        One row per review: "index" and "proportion_unique", the share of
        distinct tokens in the cleaned review.
    tidydf : DataFrame
        One row per (review, distinct token): "index", "word" and
        "part_of_speech".
    """
    identifier = df["index"]
    words = {"index": [], "word": [], "part_of_speech": []}
    uniquefeature = []

    # Walk identifiers and texts side by side. Filtering the whole frame once
    # per identifier (a full scan for every row) made this loop quadratic.
    # NOTE: every row now contributes its own text; with repeated identifiers
    # the earlier lookup always took the first matching row's text.
    for i, texttosplit in zip(identifier, df["review/text"]):
        cleaned = cleaner(texttosplit)

        uniquefeature.append(unique_over_total(cleaned))

        for j in np.unique(cleaned):
            words["index"].append(i)
            words["word"].append(j)
            # Each distinct word is tagged on its own, without its sentence.
            words["part_of_speech"].append(pos_tag([j])[0][1])

    untidydf = pd.DataFrame({"index": identifier, "proportion_unique": uniquefeature})
    tidydf = pd.DataFrame(words)

    return untidydf, tidydf

In [None]:
# Run the tidy-text pass over every review: `metrics` holds the per-review
# proportion of unique tokens, `tidydata` one row per (review index, distinct word).
metrics, tidydata = tidy_text(df)

In [None]:
# Inner-join the tidy words with the lexicon on "word"; words without a lexicon
# entry drop out, words with several sentiments yield one row per sentiment.
sentimentjoined = tidydata.merge(nrcsentiment, on = "word")

In [None]:
# One-hot encode the `sentiment` column into `sentiment_<label>` indicator columns.
sentimentjoined = pd.get_dummies(sentimentjoined, columns = ["sentiment"])
# Names of the indicator columns to aggregate: "sentiment_" followed by each label.
groupcols = ['sentiment_' + label for label in (
    'anger', 'anticipation', 'disgust', 'fear', 'joy',
    'negative', 'positive', 'sadness', 'surprise', 'trust',
)]

In [None]:
# Count, per review, how many of its words carry each sentiment.
finalsentiment = sentimentjoined.groupby(["index"])[groupcols].sum()
# Move the review identifier from the row labels into a trailing "index" column.
finalsentiment = finalsentiment.assign(index=finalsentiment.index).reset_index(drop = True)

# Turn the counts into shares of each review's total sentiment count.
summedsentiments = finalsentiment[groupcols].sum(axis = 1)
finalsentiment = pd.concat(
    [finalsentiment[groupcols].div(summedsentiments, axis = 0),
     finalsentiment[["index"]]],
    axis = 1,
)

In [None]:
# Attach the uniqueness metric and then the original review columns to the
# sentiment shares, joining on "index" each time.
additionalfeatures = (
    finalsentiment
    .merge(metrics, on = "index")
    .merge(df, on = "index")
)

In [None]:
# Persist the combined sentiment / uniqueness / review frame for the final join.
additionalfeatures.to_csv("additionalfeatures.csv")

In [None]:
# The five rating columns: "review/" followed by each rated aspect.
depcols = ['review/' + aspect for aspect in (
    'appearance', 'aroma', 'overall', 'palate', 'taste',
)]

In [None]:
# Reload the count features, keeping the rating columns, the join key and the
# four counters.
generalfeatures = pd.read_csv("generalfeatures.csv")[depcols + ["index", "stopwordcount", "charcount", "cursewordcount", "uniquecursewordcount"]]

# Join the three text-feature files on "index".
# NOTE(review): tfidfsentiment.csv and sentiments.csv are written earlier in this
# notebook from frames that have no "index" column, so this key may be missing
# from them — confirm how those files are meant to be keyed.
df = pd.merge(pd.merge(pd.read_csv("tfidfsentiment.csv"), 
                       pd.read_csv("sentiments.csv"), 
                       on = "index"), 
              pd.read_csv("additionalfeatures.csv"), 
              on = "index")

# Fixed: the frame is named `generalfeatures`; `generalfeatureS` raised a NameError.
df = pd.merge(df, generalfeatures, on = "index")

In [None]:
# Write the fully joined feature set to disk.
df.to_csv("finalset.csv")