In [1]:
import json
import re

import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
# Characters that are blanked out before tokenizing: . ; : ! ? , " ( ) [ ] and tab.
# Written as a raw string so the backslashes reach the regex engine untouched
# (escapes like "\." in a normal string literal are invalid and warn on
# current Python versions), and compiled once instead of on every call.
_PUNCT_RE = re.compile(r'[.;:!?,"()\[\]\t]')


def cleaner(text):
    """Lower-case `text`, replace punctuation with spaces and tokenize it.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    The result of `word_tokenize` applied to the lower-cased text in which
    every character matched by `_PUNCT_RE` has been replaced by one space.
    """
    return word_tokenize(_PUNCT_RE.sub(" ", text.lower()))

def build_vocabulary(df):
    """
    Build the vocabulary of the review texts and count token occurrences.

    Each value of the "review/text" column is tokenized with `cleaner` and
    every resulting token is counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "review/text" column holding the review strings.

    Returns
    -------
    vocabulary : set
        Every distinct token seen in any review.
    counts : dict
        Maps each token to its total number of occurrences over all rows.
    """
    counts = {}

    # Iterate the text column directly rather than looking every row up by its
    # "index" value: that lookup scans the whole frame once per row and picks
    # the wrong text when two rows share the same index value.
    for text in df["review/text"]:
        for word in cleaner(text):
            counts[word] = counts.get(word, 0) + 1

    # The vocabulary is exactly the set of counted tokens.
    return set(counts), counts

In [3]:
# Load the reviews and discard rows whose review text is missing.
df = pd.read_csv("beer.csv").dropna(subset=["review/text"])

In [4]:
# Token set and per-token occurrence counts over all review texts.
vocab, frequencies = build_vocabulary(df=df)

In [5]:
def build_vectors(df, vocab, frequencies, vocabsize = 3000):
    """
    Encode every review as a binary word-presence vector.

    The `vocabsize` most frequent words of `frequencies` form the feature
    vocabulary. For each row, position k of its vector is 1 when the k-th
    vocabulary word occurs in the tokenized "review/text", otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "index" column (row identifier) and a "review/text"
        column.
    vocab : set
        Unused; kept so existing callers keep working.
    frequencies : dict
        Maps word -> occurrence count.
    vocabsize : int
        Maximum number of words to keep. If fewer words are available, all
        of them are used.

    Returns
    -------
    realvocab : list of (word, count) tuples
        The selected words, most frequent first.
    vectors : dict
        Maps each "index" value to its list of 0/1 flags, ordered like
        `realvocab`. If an identifier occurs on several rows, the first
        row's text is the one encoded.
    """
    # Slicing copes with vocabularies smaller than `vocabsize`; indexing
    # positions 0..vocabsize-1 explicitly would raise IndexError there.
    realvocab = sorted(frequencies.items(),
                       key = lambda x: x[1],
                       reverse = True)[:vocabsize]
    words = [word for word, _ in realvocab]

    vectors = {}
    # Walk identifiers and texts in lockstep instead of re-filtering the
    # whole frame for every identifier.
    for identifier, text in zip(df["index"], df["review/text"]):
        if identifier in vectors:
            # Repeated identifier: keep the vector built from its first row.
            continue
        tokens = set(cleaner(text))
        vectors[identifier] = [1 if word in tokens else 0 for word in words]
    return realvocab, vectors
    

In [6]:
# Binary word-presence vectors over the 5000 most frequent words.
realvocab, vectors = build_vectors(
    df=df,
    vocab=vocab,
    frequencies=frequencies,
    vocabsize=5000,
)

In [7]:
# One row per key of `vectors`, one column per vector position.
df_vectors = pd.DataFrame.from_dict(data=vectors, orient="index")

In [8]:
# Keep only the word of each (word, count) pair, preserving order.
allvocab = [token for token, _count in realvocab]

In [9]:
# Label each vector position with its word.
df_vectors.columns = allvocab
# Expose the row labels as a regular "index" column.
# NOTE(review): if "index" is itself one of the vocabulary words, that word's
# column is overwritten here - confirm this cannot happen with the real data.
df_vectors = df_vectors.assign(index=df_vectors.index)

In [10]:
# Words that are excluded from the plain vocabulary columns; each of them is
# used below through its "<word>_y" column instead.
bad = [
    "exceptional", "excellent", "fantastic", "wonderful", "highly",
    "ok", "average", "bad", "corn", "sipper",
]

# Part-of-speech columns: "part_of_speech_" followed by the tag.
groupcols_pos = [
    "part_of_speech_" + tag
    for tag in (
        "#", "$", "''", "(", ")", ":",
        "CC", "CD", "DT", "IN",
        "JJ", "JJR", "JJS", "LS", "MD",
        "NN", "NNS", "POS", "PRP", "PRP$",
        "RB", "RBR", "TO",
        "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
        "WDT", "WP", "WP$", "WRB", "``",
    )
]

# Sentiment columns: "sentiment_" followed by the category.
groupcols = [
    "sentiment_" + category
    for category in (
        "anger", "anticipation", "disgust", "fear", "joy",
        "negative", "positive", "sadness", "surprise", "trust",
    )
]

# Target columns.
depcols = [
    "review/appearance",
    "review/aroma",
    "review/overall",
    "review/palate",
    "review/taste",
]

# Drop the excluded words from the vocabulary columns.
allvocab = [token for token in allvocab if token not in bad]

# Full feature list, in order: vocabulary words, scalar attributes,
# "<word>_y" columns, group_lda, sentiment columns, part-of-speech columns,
# proportion_unique.
indepcols = (
    allvocab
    + ["beer/ABV", "stopwordcount", "charcount", "cursewordcount", "uniquecursewordcount"]
    + [flagged + "_y" for flagged in bad]
    + ["group_lda"]
    + groupcols
    + groupcols_pos
    + ["proportion_unique"]
)

In [11]:
# Write the feature and target column name lists to JSON files.
# (`json` is imported in the notebook's top import cell.)
for filename, columns in (
    ("independent_variables.json", indepcols),
    ("dependent_variables.json", depcols),
):
    with open(filename, "w") as outfile:
        json.dump(columns, outfile)

In [12]:
# Load the attribute table and drop every row that has a missing value.
# Note: this rebinds `df`, which previously held the beer.csv reviews.
df = pd.read_csv("all_attributes.csv")
df = df.dropna()

# Join the word-presence vectors with the attributes on the shared "index"
# column (pandas' default inner join) and export the result.
finaldf = df_vectors.merge(df, on="index")
finaldf.to_csv("for_neural_regression.csv", encoding="utf-8", index=False)

# Neural Network predictions