In [72]:
import re
import pandas as pd
import numpy as np

stop = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", 
        "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", 
        "she", "her", "hers", "herself", "it", "its", "itself", "they", 
        "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", 
        "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", 
        "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", 
        "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", 
        "for", "with", "about", "against", "between", "into", "through", "during", "before", 
        "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", 
        "over", "under", "again", "further", "then", "once", "here", "there", "when", 
        "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", 
        "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", 
        "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

data = pd.read_csv("beer.csv")
data.drop('index',axis =1, inplace = True)
newCols = [col.split('/')[1] for col in data.columns]
data = pd.DataFrame(data.values, columns = newCols).drop(['beerId','brewerId'], axis = 1)

In [73]:
train = data
# Additional featureW
train['word_count'] = train['text'].apply(lambda x: len(str(x).split(" ")))
train['char_count'] = train['text'].str.len() ## this also includes spaces
train['avg_word_len'] = train['text'].apply(lambda x: avg_word(str(x)))
train[['text','avg_word_len']].head()
train['stopwords'] = train['text'].apply(lambda x: len([x for x in str(x).split() if x in stop]))
train['questions'] = train['text'].apply(lambda x: len([x for x in str(x).split() if '?' in str(x)]))
train['ellipses'] = train['text'].apply(lambda x: len([x for x in str(x).split() if '...' in str(x)]))
train['numerics'] = train['text'].apply(lambda x: len([x for x in str(x).split() if x.isdigit()]))
train['upper'] = train['text'].str.findall(r'[A-Z]').str.len()
# Convert reviews to lower case
train['text'] = train['text'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
# Remove stop words
train['text'] = train['text'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))
# Remove punctuation characters from  text review
train['text'] = train['text'].str.replace('[^\w\s]','')
train['text'].head()

0    pours clouded gold thin white head nose quite ...
1    12oz bottle 8oz snifter deep ruby red hue one ...
2    first enjoyed brewpub 2 years ago finally mana...
3    first thing noticed pouring green bottle glass...
4    a pours amber one finger head strong pour head...
Name: text, dtype: object

In [74]:
# Remove most freq words
m_freq = pd.Series(' '.join(train['text']).split()).value_counts()[:10]
m_freq = list(freq.index) #take out most popular words
train['text'] = train['text'].apply(lambda x: " ".join(x for x in str(x).split() if x not in m_freq))
train['text'].head()

0    pours clouded gold thin white head nose quite ...
1    12oz bottle 8oz snifter deep ruby red hue one ...
2    first enjoyed brewpub 2 years ago finally mana...
3    first thing noticed pouring green bottle glass...
4    a pours amber one finger head strong pour head...
Name: text, dtype: object

In [75]:
# Remove least frequent words
l_freq = pd.Series(' '.join(train['text']).split()).value_counts()[-10:]
l_freq = list(freq.index)
train['text'] = train['text'].apply(lambda x: " ".join(x for x in str(x).split() if x not in l_freq))
train['text'].head() #remove least common words

0    pours clouded gold thin white head nose quite ...
1    12oz bottle 8oz snifter deep ruby red hue one ...
2    first enjoyed brewpub 2 years ago finally mana...
3    first thing noticed pouring green bottle glass...
4    a pours amber one finger head strong pour head...
Name: text, dtype: object

In [76]:
'''
import textblob
import nltk
from textblob import Word
train['review/text'] = train['review/text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in str(x).split()]))
train['review/text'].head()
'''


'\nimport textblob\nimport nltk\nfrom textblob import Word\ntrain[\'review/text\'] = train[\'review/text\'].apply(lambda x: " ".join([Word(word).lemmatize() for word in str(x).split()]))\ntrain[\'review/text\'].head()\n'

In [77]:
train.columns

Index(['ABV', 'name', 'style', 'appearance', 'aroma', 'overall', 'palate',
       'taste', 'text', 'timeStruct', 'timeUnix', 'ageInSeconds',
       'birthdayRaw', 'birthdayUnix', 'gender', 'profileName', 'word_count',
       'char_count', 'avg_word_len', 'stopwords', 'questions', 'ellipses',
       'numerics', 'upper'],
      dtype='object')

In [134]:
open_beer = pd.read_csv('open-beer-database.csv', sep = ';').rename(columns = {'Name':'name'})
open_beer = open_beer[['name','Country']]
open_beer.drop_duplicates(inplace = True)
one_hot = pd.get_dummies(open_beer['Country'])
open_beer.drop('Country', axis = 1, inplace = True)
open_beer = open_beer.join(one_hot)
beer_names = list(set(open_beer['name'].values))

In [147]:
# Compress one hot encoding to get one beer and all locations into one row
beer_locs = []
for beer in beer_names:
    cur_beer = open_beer[open_beer['name'] == beer]
    t = np.zeros(one_hot.shape[1])
    for row in nut.iterrows():
        index, data = row
        t = np.add( t, np.array(data)[1:])
    beer_locs.append(t)   
final_hot = pd.DataFrame.from_items(zip(one_hot.columns, np.array(beer_locs).T))
final_hot['name'] = beer_names

In [153]:
merged = pd.merge(train, final_hot, on = 'name', how = 'left')

In [154]:
merged.shape

(37500, 85)