This notebook derives two kinds of text features from each review:

- \# of sentiment words by sentiment group
- \# of unique words / \# of words

In [24]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [25]:
# Sanity check: tag a short, already-tokenized sentence to confirm the tagger works.
pos_tag("we are out here".split())

[('we', 'PRP'), ('are', 'VBP'), ('out', 'RP'), ('here', 'RB')]

In [2]:
# Read the tab-separated lexicon file; it is parsed without a header row,
# so the three columns are named explicitly afterwards.
nrcsentiment = pd.read_csv(
    "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
    sep="\t",
    header=None,
)
nrcsentiment.columns = ["word", "sentiment", "indicator"]

# Keep only rows whose indicator equals 1, and drop the indicator column.
nrcsentiment = nrcsentiment.loc[
    nrcsentiment["indicator"] == 1, ["word", "sentiment"]
]

In [3]:
# Load the reviews and discard rows that have no review text.
df = pd.read_csv("beer.csv").dropna(subset=["review/text"])

In [34]:
def cleaner(text):
    """Lower-case a piece of text and split it into word tokens.

    Each of the characters . ; : ! ? , " ( ) [ ] and the tab character is
    replaced by a single space before the text is handed to
    ``word_tokenize``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list
        Tokens produced by ``word_tokenize`` on the cleaned text.
    """
    # Raw-string character class: matches the same single characters as the
    # original alternation, without the invalid "\." / "\;" / "\(" escapes
    # that a non-raw Python string literal warns about.
    punct = re.compile(r'[.;:!?,"()\[\]\t]')
    lowered = text.lower()
    return word_tokenize(punct.sub(" ", lowered))
 
def unique_over_total(text):
    """Return the share of distinct tokens in a token sequence.

    Parameters
    ----------
    text : sequence
        Tokens (e.g. the list returned by ``cleaner``).

    Returns
    -------
    float
        ``len(np.unique(text)) / len(text)``; NaN when ``text`` is empty,
        because the ratio is undefined for zero tokens.
    """
    total = len(text)
    if total == 0:
        # Previously this divided by zero and raised ZeroDivisionError.
        return float("nan")
    numunique = len(np.unique(text))
    return numunique / total

def tidy_text(df):
    """Build a per-review metrics table and a tidy per-word table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "index" column (review identifier) and a
        "review/text" column (the review string).

    Returns
    -------
    untidydf : pandas.DataFrame
        One row per input row, with columns "index" and
        "proportion_unique" (distinct tokens / total tokens; NaN when the
        review has no tokens after cleaning).
    tidydf : pandas.DataFrame
        One row per (review, distinct token), with columns "index",
        "word" and "part_of_speech". Each word is tagged on its own, not
        in sentence context.

    Notes
    -----
    Every row is processed with its own text. If "index" contains
    duplicate values, each duplicate row therefore contributes its own
    words rather than the words of the first row sharing that id.
    """
    identifier = df["index"]
    words = {"index": [], "word": [], "part_of_speech": []}
    uniquefeature = []
    # pos_tag is always called on a one-word list here, so its result only
    # depends on the word; remember it instead of re-tagging repeated words.
    tag_cache = {}

    # Iterate ids and texts in lockstep instead of re-filtering the whole
    # frame for every id (which made the loop quadratic in the row count).
    for i, texttosplit in zip(identifier, df["review/text"]):
        cleaned = cleaner(texttosplit)

        # A review consisting only of stripped characters yields no tokens;
        # record NaN rather than dividing by zero.
        if len(cleaned) == 0:
            uniquefeature.append(np.nan)
        else:
            uniquefeature.append(unique_over_total(cleaned))

        for j in np.unique(cleaned):
            if j not in tag_cache:
                tag_cache[j] = pos_tag([j])[0][1]
            words["index"].append(i)
            words["word"].append(j)
            words["part_of_speech"].append(tag_cache[j])

    untidydf = pd.DataFrame({"index": identifier, "proportion_unique": uniquefeature})
    tidydf = pd.DataFrame(words)

    return untidydf, tidydf

In [None]:
# metrics: per-review table (index, proportion_unique);
# tidydata: per-word table (index, word, part_of_speech).
metrics, tidydata = tidy_text(df)

In [6]:
# Preview the first rows of the per-review metrics table.
metrics.head()

Unnamed: 0,index,proportion_unique
0,40163,0.744681
1,8135,0.660194
2,10529,0.850746
3,44610,0.814286
4,37062,0.747368


In [7]:
# Preview the first rows of the per-word table.
# NOTE(review): the saved output below has no part_of_speech column although
# tidy_text builds one, so it appears to predate the current tidy_text —
# re-run to refresh.
tidydata.head()

Unnamed: 0,index,word
0,40163,a
1,40163,absinthe
2,40163,added
3,40163,after
4,40163,aftertaste


In [8]:
# Preview the filtered lexicon (word, sentiment); a word can appear on several rows.
nrcsentiment.head()

Unnamed: 0,word,sentiment
19,abacus,trust
23,abandon,fear
25,abandon,negative
27,abandon,sadness
30,abandoned,anger


In [9]:
# Attach lexicon sentiments to each review word. The default inner join keeps
# only words present in the lexicon, once per matching sentiment.
sentimentjoined = tidydata.merge(nrcsentiment, on="word")

In [10]:
# One-hot encode the sentiment label into sentiment_<name> columns.
# NOTE(review): this overwrites sentimentjoined, so the cell cannot be re-run
# without first re-running the merge that creates the "sentiment" column.
sentimentjoined = pd.get_dummies(sentimentjoined, columns=["sentiment"])

# Names of the indicator columns that are aggregated per review.
groupcols = [
    "sentiment_" + emotion
    for emotion in (
        "anger", "anticipation", "disgust", "fear", "joy",
        "negative", "positive", "sadness", "surprise", "trust",
    )
]

In [11]:
# Count sentiment hits per review, one column per sentiment.
finalsentiment = sentimentjoined.groupby(["index"])[groupcols].sum()

# Move the review id from the row labels into a trailing "index" column.
finalsentiment = finalsentiment.assign(index=finalsentiment.index).reset_index(drop=True)

# Total sentiment hits per review, computed before any column is rescaled.
summedsentiments = finalsentiment[groupcols].sum(axis=1)

# Turn counts into per-review proportions (each row of groupcols sums to 1).
finalsentiment[groupcols] = finalsentiment[groupcols].div(summedsentiments, axis=0)

In [19]:
# Combine sentiment proportions, per-review metrics and the original review
# columns, joining on the review id.
finaldf = finalsentiment.merge(metrics, on="index").merge(df, on="index")

In [21]:
# List the columns available in the combined frame.
finaldf.columns

Index(['sentiment_anger', 'sentiment_anticipation', 'sentiment_disgust',
       'sentiment_fear', 'sentiment_joy', 'sentiment_negative',
       'sentiment_positive', 'sentiment_sadness', 'sentiment_surprise',
       'sentiment_trust', 'index', 'proportion_unique', 'beer/ABV',
       'beer/beerId', 'beer/brewerId', 'beer/name', 'beer/style',
       'review/appearance', 'review/aroma', 'review/overall', 'review/palate',
       'review/taste', 'review/text', 'review/timeStruct', 'review/timeUnix',
       'user/ageInSeconds', 'user/birthdayRaw', 'user/birthdayUnix',
       'user/gender', 'user/profileName'],
      dtype='object')

In [22]:
# Feature columns: the sentiment proportions plus the unique-word ratio.
# NOTE(review): the saved output below is a beer/ABV Series, which this
# assignment cannot produce — it looks left over from an earlier version of
# the cell.
features = [*groupcols, "proportion_unique"]

0        5.0
1        6.2
2        6.5
3        5.0
4        7.7
5        4.7
6        4.7
7        4.7
8        4.7
9        4.7
10       4.7
11       4.7
12       7.2
13       5.6
14       7.4
15       7.4
16       7.4
17       7.4
18       7.4
19       7.4
20       7.4
21       7.4
22       7.4
23       7.4
24       7.4
25       7.4
26       7.4
27       7.4
28       7.4
29       7.4
        ... 
37454    9.0
37455    9.0
37456    9.0
37457    9.0
37458    9.0
37459    9.0
37460    9.0
37461    9.0
37462    9.0
37463    9.0
37464    9.0
37465    9.0
37466    9.0
37467    9.0
37468    9.0
37469    9.0
37470    9.0
37471    9.0
37472    9.0
37473    9.0
37474    9.0
37475    9.0
37476    9.0
37477    9.0
37478    9.0
37479    9.0
37480    9.0
37481    9.0
37482    9.0
37483    9.0
Name: beer/ABV, Length: 37484, dtype: float64

In [None]:
# NOTE(review): dead scratch cell. This is a bare string holding an unused
# .agg({...}) spec; the groupby(...)[groupcols].sum() cell already sums these
# same columns. Candidate for deletion.
""".agg({
    'sentiment_anger' : np.sum,
    'sentiment_anticipation' : np.sum,
    'sentiment_disgust' : np.sum,
    'sentiment_fear' : np.sum,
    'sentiment_joy' : np.sum,
    'sentiment_negative' : np.sum,
    'sentiment_positive' : np.sum,
    'sentiment_sadness' : np.sum,
    'sentiment_surprise' : np.sum,
    'sentiment_trust' : np.sum,
})"""
