We shall do

- \# of sentiment words by group
- \# of unique / \# of words

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [25]:
pos_tag(["we", "are", "out", "here"])

[('we', 'PRP'), ('are', 'VBP'), ('out', 'RP'), ('here', 'RB')]

In [2]:
nrcsentiment = pd.read_csv("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt", sep = "\t", header = None)
nrcsentiment.columns = ["word", "sentiment", "indicator"]
nrcsentiment = nrcsentiment[nrcsentiment.indicator == 1][["word", "sentiment"]]

In [3]:
df = pd.read_csv("beer.csv")

In [34]:
def cleaner(text):
    punct = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\t)")
    text = text.lower()
    return word_tokenize(re.sub(punct, " ", text))
 
def unique_over_total(text):
    numunique = len(np.unique(text))
    return numunique / len(text)

def tidy_text(df):
    """Takes our df and makes it into tidy data wrt text:
    id, word
    id, word etc...
    """
    identifier = df["index"]
    words = {"index": [], "word": [], "part_of_speech": []}
    uniquefeature = []
    
    for i in identifier:
        texttosplit = list(df[df["index"] == i]["review/text"])[0]
        cleaned = cleaner(texttosplit)
        
        uniquefeature.append(unique_over_total(cleaned))
        
        for j in np.unique(cleaned):
            words["index"].append(i)
            words["word"].append(j)
            words["part_of_speech"].append(pos_tag([j])[0][1])
        
    untidydf = pd.DataFrame({"index": identifier, "proportion_unique": uniquefeature})
    tidydf = pd.DataFrame({"index": words["index"], "word": words["word"], "part_of_speech": words["part_of_speech"]})
    
    return untidydf, tidydf

In [35]:
metrics, tidydata = tidy_text(df)

In [36]:
metrics.head()

Unnamed: 0,index,proportion_unique
0,40163,0.744681
1,8135,0.660194
2,10529,0.850746
3,44610,0.814286
4,37062,0.747368


In [37]:
tidydata.head()

Unnamed: 0,index,part_of_speech,word
0,40163,DT,a
1,40163,NN,absinthe
2,40163,VBD,added
3,40163,IN,after
4,40163,NN,aftertaste


In [38]:
nrcsentiment.head()

Unnamed: 0,word,sentiment
19,abacus,trust
23,abandon,fear
25,abandon,negative
27,abandon,sadness
30,abandoned,anger


# Sentiment sums

In [72]:
sentimenttidy = tidydata[["index", "word"]]

In [75]:
sentimentjoined = pd.merge(sentimenttidy, nrcsentiment, on = "word")

In [76]:
sentimentjoined = pd.get_dummies(sentimentjoined, columns = ["sentiment"])
groupcols = ['sentiment_anger', 'sentiment_anticipation',
             'sentiment_disgust', 'sentiment_fear', 'sentiment_joy',
             'sentiment_negative', 'sentiment_positive', 'sentiment_sadness',
             'sentiment_surprise', 'sentiment_trust']

In [77]:
finalsentiment = sentimentjoined.groupby(["index"])[groupcols].sum()
finalsentiment["index"] = finalsentiment.index
finalsentiment.reset_index(drop = True, inplace = True)
summedsentiments = finalsentiment[groupcols].sum(axis = 1)


for i in groupcols:
    finalsentiment[i] = finalsentiment[i] / summedsentiments

In [120]:
sentfinal = pd.merge(pd.merge(finalsentiment, metrics, on = "index"), df, on = "index")

# POS sums

In [99]:
tidydata = pd.get_dummies(tidydata, columns = ["part_of_speech"])

ValueError: labels ['part_of_speech'] not contained in axis

In [114]:
groupcols_pos = [
    'part_of_speech_#', 'part_of_speech_$',
    'part_of_speech_\'\'', 'part_of_speech_(', 'part_of_speech_)',
    'part_of_speech_:', 'part_of_speech_CC', 'part_of_speech_CD',
    'part_of_speech_DT', 'part_of_speech_IN', 'part_of_speech_JJ',
    'part_of_speech_JJR', 'part_of_speech_JJS', 'part_of_speech_LS',
    'part_of_speech_MD', 'part_of_speech_NN', 'part_of_speech_NNS',
    'part_of_speech_POS', 'part_of_speech_PRP', 'part_of_speech_PRP$',
    'part_of_speech_RB', 'part_of_speech_RBR', 'part_of_speech_TO',
    'part_of_speech_VB', 'part_of_speech_VBD', 'part_of_speech_VBG',
    'part_of_speech_VBN', 'part_of_speech_VBP', 'part_of_speech_VBZ',
    'part_of_speech_WDT', 'part_of_speech_WP', 'part_of_speech_WP$',
    'part_of_speech_WRB', 'part_of_speech_``'
    
]

In [115]:
tidydata.head().columns

Index(['index', 'word', 'part_of_speech_#', 'part_of_speech_$',
       'part_of_speech_''', 'part_of_speech_(', 'part_of_speech_)',
       'part_of_speech_:', 'part_of_speech_CC', 'part_of_speech_CD',
       'part_of_speech_DT', 'part_of_speech_IN', 'part_of_speech_JJ',
       'part_of_speech_JJR', 'part_of_speech_JJS', 'part_of_speech_LS',
       'part_of_speech_MD', 'part_of_speech_NN', 'part_of_speech_NNS',
       'part_of_speech_POS', 'part_of_speech_PRP', 'part_of_speech_PRP$',
       'part_of_speech_RB', 'part_of_speech_RBR', 'part_of_speech_TO',
       'part_of_speech_VB', 'part_of_speech_VBD', 'part_of_speech_VBG',
       'part_of_speech_VBN', 'part_of_speech_VBP', 'part_of_speech_VBZ',
       'part_of_speech_WDT', 'part_of_speech_WP', 'part_of_speech_WP$',
       'part_of_speech_WRB', 'part_of_speech_``'],
      dtype='object')

In [116]:
finalpos = tidydata.groupby(["index"])[groupcols_pos].sum()
finalpos["index"] = finalpos.index
finalpos.reset_index(drop = True, inplace = True)
summedpos = finalpos[groupcols_pos].sum(axis = 1)

for i in groupcols_pos:
    finalpos[i] = finalpos[i] / summedpos

# Joining it all together

In [118]:
finalpos.head()

Unnamed: 0,part_of_speech_#,part_of_speech_$,part_of_speech_'',part_of_speech_(,part_of_speech_),part_of_speech_:,part_of_speech_CC,part_of_speech_CD,part_of_speech_DT,part_of_speech_IN,...,part_of_speech_VBG,part_of_speech_VBN,part_of_speech_VBP,part_of_speech_VBZ,part_of_speech_WDT,part_of_speech_WP,part_of_speech_WP$,part_of_speech_WRB,part_of_speech_``,index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.057143,0.0,0.085714,0.142857,...,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.054054,0.0,0.054054,0.081081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.042553,0.0,0.106383,0.085106,...,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.046512,0.0,0.069767,0.093023,...,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.008,0.0,0.056,0.064,...,0.032,0.016,0.008,0.008,0.008,0.0,0.0,0.0,0.0,4


In [122]:
finaldf = pd.merge(finalpos, sentfinal, on = "index")

In [123]:
features = groupcols_pos + groupcols + ["proportion_unique", "index"]

In [124]:
finaldf = finaldf[features]

In [126]:
finaldf.to_csv("more_features.csv", encoding = "utf-8", index = False)