In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import math
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
# CSV with one row per beer review, resolved relative to the working directory.
path = "beer.csv"
data = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows.
data.head(5)

Unnamed: 0,index,beer/ABV,beer/beerId,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,review/timeStruct,review/timeUnix,user/ageInSeconds,user/birthdayRaw,user/birthdayUnix,user/gender,user/profileName
0,40163,5.0,46634,14338,Chiostro,Herbed / Spiced Beer,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,"{'min': 38, 'hour': 3, 'mday': 16, 'sec': 10, ...",1229398690,,,,,RblWthACoz
1,8135,11.0,3003,395,Bearded Pat's Barleywine,American Barleywine,4.0,3.5,3.5,3.5,3.0,12oz bottle into 8oz snifter.\t\tDeep ruby red...,"{'min': 38, 'hour': 23, 'mday': 8, 'sec': 58, ...",1218238738,,,,,BeerSox
2,10529,4.7,961,365,Naughty Nellie's Ale,American Pale Ale (APA),3.5,4.0,3.5,3.5,3.5,First enjoyed at the brewpub about 2 years ago...,"{'min': 7, 'hour': 18, 'mday': 26, 'sec': 2, '...",1101492422,,,,Male,mschofield
3,44610,4.4,429,1,Pilsner Urquell,Czech Pilsener,3.0,3.0,2.5,3.0,3.0,First thing I noticed after pouring from green...,"{'min': 7, 'hour': 1, 'mday': 20, 'sec': 5, 'y...",1308532025,1209827000.0,"Aug 10, 1976",208508400.0,Male,molegar76
4,37062,4.4,4904,1417,Black Sheep Ale (Special),English Pale Ale,4.0,3.0,3.0,3.5,2.5,A: pours an amber with a one finger head but o...,"{'min': 51, 'hour': 6, 'mday': 12, 'sec': 48, ...",1299912708,,,,,Brewbro000


In [3]:
# The five review scores; "review/overall" is the one modelled further down.
targets = ["review/appearance", "review/aroma", "review/overall", "review/palate", "review/taste"]

# Hand-picked structured predictors. The three identifier/category columns are
# one-hot encoded; ABV and the review timestamp stay numeric.
predictors = ['beer/ABV', 'review/timeUnix', "beer/beerId", 'beer/brewerId', 'beer/style']
predictor_data = pd.get_dummies(data[predictors], columns = ['beer/beerId', 'beer/brewerId', 'beer/style'])

# Missing reviews become empty strings and tabs become spaces so that the text
# vectorizer receives a clean string for every row.
data["review/text"] = data["review/text"].fillna("").str.replace("\t", " ", regex=False)

In [4]:
# Review text, one document per row.
text = data["review/text"]
# TF-IDF over at most 2000 terms; terms appearing in more than 85% of the
# reviews are dropped. stop_words="english" selects scikit-learn's built-in
# English stop-word list (passing the frozenset itself is rejected by the
# parameter validation in recent scikit-learn releases).
vectorizer = TfidfVectorizer(stop_words = "english", max_features = 2000, max_df=0.85,smooth_idf=True,use_idf=True)
# Learn the vocabulary and build the sparse document-term matrix in one pass.
vector = vectorizer.fit_transform(text)

In [5]:
# vocabulary_ maps term -> column index of `vector`. Its key order is insertion
# order, not column order, so sort the terms by their index to make every label
# line up with the column it actually names.
words = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
vectorized_data = pd.DataFrame(vector.toarray(), columns = words)

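## X = feature matrix: the one-hot encoded predictors combined with the TF-IDF text features
## y = regression target: the overall review score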

In [6]:
# Column-wise join of the structured predictors and the TF-IDF features.
# The original column labels are kept: resetting them to integers and then
# adding the string-named "log(ABV)" column would give the frame mixed int/str
# labels, which scikit-learn estimators refuse as feature names.
X = pd.concat([predictor_data, vectorized_data], axis=1)
# NOTE(review): np.log yields -inf / NaN for zero or missing ABV values;
# confirm the data contains none before fitting.
X["log(ABV)"] = np.log(data["beer/ABV"])
y = data["review/overall"]

In [7]:
def lin_reg(X, y, add_ones = False):
    """Fit ordinary least squares by solving the normal equations.

    Parameters
    ----------
    X : DataFrame or array-like, shape (n_samples, n_features)
        Design matrix. It is never modified; the computation runs on a
        float copy.
    y : Series or array-like, shape (n_samples,)
        Target values, matched to the rows of ``X`` by position.
    add_ones : bool, default False
        When true, a leading column of ones (intercept term) is prepended,
        so the first returned coefficient is the intercept.

    Returns
    -------
    numpy.ndarray
        Coefficient vector ``b`` solving ``(X^T X) b = X^T y``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular (e.g. perfectly collinear columns).
    """
    # Work on a private float array so the caller's frame is left untouched
    # and the function can be called repeatedly with add_ones=True.
    design = np.asarray(X, dtype=float)
    if add_ones:
        design = np.column_stack([np.ones(design.shape[0]), design])
    target = np.asarray(y, dtype=float)
    gram = design.T @ design
    moment = design.T @ target
    return np.linalg.solve(gram, moment)

In [8]:
def get_predictions(X, result, y, add_ones = False):
    """Apply fitted linear-regression coefficients to a design matrix.

    Parameters
    ----------
    X : DataFrame or array-like, shape (n_samples, n_features)
        Design matrix. It is never modified.
    result : array-like, shape (n_coefficients,)
        Coefficient vector, e.g. the output of ``lin_reg``.
    y : any
        Unused; kept so existing call sites keep working.
    add_ones : bool, default False
        When true, a leading column of ones is prepended to ``X`` so that
        the first coefficient acts as the intercept.

    Returns
    -------
    numpy.ndarray, shape (1, n_samples)
        Row vector of predicted values.
    """
    # Private float array: the caller's frame is not touched, and the matrix
    # product below runs on plain ndarrays rather than pandas objects.
    design = np.asarray(X, dtype=float)
    if add_ones:
        design = np.column_stack([np.ones(design.shape[0]), design])
    coefficients = np.asarray(result).reshape((1, len(result)))
    predictions = np.matmul(coefficients, design.T)
    return predictions

In [9]:
def calculate_r2(predictions, y):
    """Return the coefficient of determination of ``predictions`` against ``y``.

    Computed as ``1 - SSE / SST``, where SSE is the sum of squared residuals
    and SST is the sum of squared deviations of ``y`` from its mean.
    """
    observed = list(y)
    # Residual sum of squares.
    residuals = np.subtract(observed, predictions)
    residual_ss = np.sum(np.square(residuals))
    # Total sum of squares around the mean of the observed values.
    centered = np.subtract(observed, np.mean(observed))
    total_ss = np.sum(np.square(centered))
    return 1 - residual_ss / total_ss

In [None]:
# Alternative using the hand-written helpers, left disabled:
#   betas = lin_reg(X, y, True)
#   predictions = get_predictions(X, betas, y)
#   calculate_r2(predictions, y)
regr = linear_model.LinearRegression()
regr.fit(X, y)
# R^2 measured on the same rows the model was fitted on (in-sample).
regr.score(X, y)

In [None]:
y = data["review/overall"]
# Plot every review straight from `data`, the frame loaded at the top of the
# notebook. Points rather than a connected line, because the rows are not
# ordered by ABV.
plt.plot(np.log(data["beer/ABV"]), data["review/overall"], ".")
plt.xlabel("log(beer/ABV)")
plt.ylabel("review/overall");

In [None]:
# Mean overall score for each distinct ABV value; ABV stays an ordinary column.
reviews_by_abv = (
    data.loc[:, ["beer/ABV", "review/overall"]]
    .groupby("beer/ABV", as_index=False)
    .mean()
)

In [None]:
y = data["review/overall"]
# Mean overall score as a function of log(ABV), one point per distinct ABV.
plt.plot(np.log(reviews_by_abv["beer/ABV"]), reviews_by_abv["review/overall"])
plt.xlabel("log(beer/ABV)")
plt.ylabel("mean review/overall")
plt.title("Mean overall review score by log(ABV)");