# Kaggle Competition

Now it's your turn to determine what machine learning model you want to fit to the data! You may use any machine model you like, including ones that we did not cover in class. Remember, your goal is to win the [Kaggle competition](https://inclass.kaggle.com/c/beer2), so try to get your prediction error down, any way you can!

In [1]:
from random import randint
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 7)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor



In [2]:
scores = []

## Question 1

Fit _at least_ 5 different models to the training data (`/data/beer_train.csv`). Each model must include at least one categorical and one quantitative input variable. At least one model must use linear regression, and at least one model must use $k$-nearest neighbors. Other than that, you are free to fit any machine learning model you like, with any input variables you like, in your pursuit of the model with the best prediction accuracy. (_Hint:_ You might find it worthwhile to create new input variables out of the descriptions of the beers, which are rich in information.)

Estimate the test error of each of the models using cross-validation. Determine which of the models you tried is the best.

In [5]:
keywords = ['Light','Dark','Bitter', 'Sweet',
            'Hop', 'Pale', 'Sour', 'IBU', 
            'Refreshing', 'Citrus', 'Rich', 'Malt',
            'IPA', 'Ale', 'Dry', 'Black','Balanced',
            'German', 'Strong', 'Stout','India', 
            'Imperial', 'Wheat', 'Lager', 'Crisp',
            'Traditional', 'Finish', 'Golden','Belgian',
            'America', 'Flavor', 'Yeast', 'Character',
            'Caramel', 'Roast', 'Pumpkin','Honey','Clove',
            'Note', 'Big','Barley', 'Tropical', 
            'Intense', 'Herb', 'Complex', 'Perfect', 
            'Backbone', 'Subtle', 'Abbey', 'Berlin',
            'Fruit', 'Berry', 'Berries', 'ABV',
            'Craft', 'Brew', 'Floral', 'Filter', 'Amber',
            'Red', 'Gravity', 'Very', 'Extremely', 'Rye',
            'India Pale Ale', 'We'
           ]

name_keywords = ['IPA', 'Double', 'Ale', 'Imperial',
                 'Stout', 'Light', 'Wheat',
                 'Blonde', 'Pale'
                ]
variables = ['originalGravity','srm','abv']

# Unknown Data
data_test = pd.read_csv("/data/beer_test.csv")
data_test['description'] = data_test['description'].str.lower()
data_test['isOrganic'] = (data_test['isOrganic'] == 'Y') * 1
data_test['srm'] = pd.to_numeric(data_test['srm'], errors='coerce').fillna(45)
data_test['originalGravity'] = pd.to_numeric(data_test['originalGravity'], errors='coerce').fillna(1.05)
data_test['abv'] = pd.to_numeric(data_test['abv'], errors='coerce').fillna(6.5)
for kw in keywords:
    data_test[kw] = (data_test['description'].str.count(kw.lower()).fillna(0)) * 1
for nkw in name_keywords:
    data_test['Name %s' %nkw] = (data_test['name'].str.contains(nkw.lower()).fillna(False)) * 1


#----------------------------------------------------------------------------------------------


# Training Data
data_train = pd.read_csv("/data/beer_train.csv")
data_train = data_train[data_train['ibu'] < 150]
data_train['description'] = data_train['description'].str.lower()
data_train['description'] = data_train['description'].str.replace('-|\.|!', ' ')
data_train['name'] = data_train['name'].str.lower()
data_train['isOrganic'] = (data_train['isOrganic'] == 'Y') * 1
data_train['srm'] = pd.to_numeric(data_train['srm'], errors='coerce').fillna(55)
data_train['originalGravity'] = pd.to_numeric(data_train['originalGravity'], errors='coerce').fillna(1.05)
data_train['abv'] = pd.to_numeric(data_train['abv'], errors='coerce').fillna(6.5)
for kw in keywords:
    data_train[kw] = ((data_train['description'].str.count(kw.lower()).fillna(0)) * 1 ) + ((data_train['name'].str.count(kw.lower()).fillna(0)) * 1 ) 
for nkw in name_keywords:
    data_train['Name %s' %nkw] = (data_train['name'].str.contains(nkw.lower()).fillna(False)) * 1


#----------------------------------------------------------------------------------------------



# Add dummy variables for the type of glass
glass_types = pd.get_dummies(data_train['glass'])
glass_types = glass_types.drop('Flute', axis=1)

test_glass_types = pd.get_dummies(data_test['glass'])
test_glass_types = test_glass_types.drop('Flute', axis=1)

#Add dummy variables for availability of beer
avails = pd.get_dummies(data_train['available'])
avails = avails.drop('Limited availability.', axis=1)

test_avails = pd.get_dummies(data_test['available'])
test_avails = test_avails.drop('Limited availability.', axis=1)

data_train

Index(['id', 'abv', 'available', 'description', 'glass', 'ibu', 'isOrganic',
       'name', 'originalGravity', 'srm', 'Light', 'Dark', 'Bitter', 'Sweet',
       'Hop', 'Pale', 'Sour', 'IBU', 'Refreshing', 'Citrus', 'Rich', 'Malt',
       'IPA', 'Ale', 'Dry', 'Black', 'Balanced', 'German', 'Strong', 'Stout',
       'India', 'Imperial', 'Wheat', 'Lager', 'Crisp', 'Traditional', 'Finish',
       'Golden', 'Belgian', 'America', 'Flavor', 'Yeast', 'Character',
       'Caramel', 'Roast', 'Pumpkin', 'Honey', 'Clove', 'Note', 'Big',
       'Barley', 'Tropical', 'Intense', 'Herb', 'Complex', 'Perfect',
       'Backbone', 'Subtle', 'Abbey', 'Berlin', 'Fruit', 'Berry', 'Berries',
       'ABV', 'Craft', 'Brew', 'Floral', 'Filter', 'Amber', 'Red', 'Gravity',
       'Very', 'Extremely', 'Rye', 'India Pale Ale', 'We', 'Name IPA',
       'Name Double', 'Name Ale', 'Name Imperial', 'Name Stout', 'Name Light',
       'Name Wheat', 'Name Blonde', 'Name Pale'],
      dtype='object')

In [4]:
#Normalize abv, srm, gravity to get better results
from sklearn import preprocessing

for var in variables:
    data_train[var] = preprocessing.scale(data_train[var])
    data_test[var] = preprocessing.scale(data_test[var])

for k in keywords:
    data_train[k] = preprocessing.scale(data_train[k])
    data_test[k] = preprocessing.scale(data_test[k])



In [None]:
# Find very bitter beers and very not bitter beers
bitters = data_train[data_train['ibu'] >= 40]
non = data_train[data_train['ibu'] <= 25]

In [None]:
# Look at 500 most common words used in description of bitter beers
d = bitters['description'].fillna('')
bitter_words = {}

for ds in d:
    if(len(ds) <= 0):
        continue
    words = ds.split(' ')
    
    for word in words:
        if word in bitter_words:
            bitter_words[word] += 1
        else:
            bitter_words[word] = 1
        
bitter_words

import operator

sorted_bitter = sorted(bitter_words.items(), key=operator.itemgetter(1))
sorted_bitter[len(sorted_bitter)-500:]

In [None]:
# Look at 500 most common words used in description of low ibu beers
d = non['description'].fillna('')
non_words = {}

for ds in d:
    if(len(ds) <= 0):
        continue
    words = ds.split(' ')
    
    for word in words:
        if word in non_words:
            non_words[word] += 1
        else:
            non_words[word] = 1
        

sorted_non = sorted(non_words.items(), key=operator.itemgetter(1))
sorted_non[len(sorted_non)-500:]

In [None]:
# Linear Regression model, using abv to the seventh power,type of glass, and the availability. 
model = LinearRegression()
abv = ['abv']
for i in range(2,8):
    new_col = "abv^%s" % str(i)
    data_train[new_col] = data_train['abv'] ** i
    abv.append(new_col)

abvs = data_train[abv]

X = pd.get_dummies(data_train['available']).drop('Beer is not available.', axis=1)
test_variables = pd.concat([X, abvs, glass_types], axis=1)
print(-cross_val_score(model,test_variables, data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())


In [None]:
# K Nearest Neighbors model. 15 neighbours was found to be the optimal number in this case.
model = KNeighborsRegressor(n_neighbors=15)

test_variables = pd.concat([data_train[['originalGravity','abv', 'srm']+keywords], glass_types, avails], axis=1)
print(-cross_val_score(model,test_variables, data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())




In [None]:
# This cell checks every possible combination of keywords and prints the
# combination with the lowest error. Since there are 2^47 possible combinations,
# this won't finish running. Probably should not run this cell
"""
model = RandomForestRegressor()
scores = []

# Prints error using every keyword
print(-cross_val_score(model,data_train[variables+keywords], data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())


#Loop through all possible combinations of keywords. 
kws = np.array(keywords)
for i in range(2**len(keywords)):
    mask = list(format(i, '016b'))
    mask = np.array([x=='1' for x in mask])
    combo = kws[mask].tolist()
    scores.append((-cross_val_score(model,data_train[variables+combo], data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean(), combo))
print(min(scores))
"""

In [None]:
# This cell will use a pseudorandom number as a mask to randomly choose
# combinations of keywords. It then prints the one with the lowest error.
model = RandomForestRegressor(n_estimators= 40, min_samples_split=13, min_samples_leaf=2, max_features=14)

inputs= ['originalGravity', 'srm', 'abv', 
         'Light','Dark','Bitter', 'Sweet',
         'Hop', 'Pale', 'Sour', 'IBU', 'Brew',
         'Refreshing','Citrus', 'Rich', 'Complex',
         'IPA', 'Ale', 'Dry', 'Black','Balanced',
         'German', 'Strong','India', 'Berlin',
         'Wheat', 'Lager','Crisp','Traditional',
         'Finish', 'Golden','Belgian','America', 
         'Flavor', 'Yeast', 'Character','Caramel', 
         'Roast', 'Pumpkin','Clove', 'Big', 'Gravity',
         'Barley', 'Tropical', 'Intense','Perfect',
         'Backbone', 'Subtle', 'Abbey', 'Herb', 'ABV',
         'Name IPA',  'Name Ale', 'Name Imperial', 'Red', 
         'Name Light', 'Name Wheat', 'Name Blonde', 'Name Pale',
        ]
    
#Prints score using every keyword in the model
print(-cross_val_score(model,data_train[inputs], data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())


#Only checks a given amount of random combinations. In this case, 5.
kws = np.array(keywords)
for j in range(10):
    i = randint(2**len(keywords)-5000,2**len(keywords))
    mask = list(format(i, '0%db' % len(keywords)))
    mask = np.array([x=='1' for x in mask])
    combo = kws[mask].tolist()
    scores.append((-cross_val_score(model,data_train[variables+combo], data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean(), combo))
    if j%5 == 0:
        print(min(scores))

print(min(scores))
print('Finished')

In [22]:
#This is the random forest model tuned for words and for the RF Regressor
rf_model = RandomForestRegressor(n_estimators= 150, min_samples_split=3, 
                              min_samples_leaf=2, max_features=14)

inputs= ['originalGravity', 'srm', 'abv', 
         'Light','Dark','Bitter', 'Sweet',
         'Hop', 'Pale', 'Sour', 'IBU',
         'Refreshing','Citrus', 'Rich',
         'IPA', 'Ale', 'Dry', 'Black',
         'German', 'Strong','India', 'Berlin',
         'Wheat', 'Lager','Crisp','Traditional',
         'Golden','Belgian','America', 
         'Flavor', 'Yeast', 'Red',
         'Roast', 'Pumpkin','Clove', 
         'Barley', 'Tropical', 'Intense',
         'Backbone',  'Abbey', 
         'Name IPA',  'Name Ale', 'Name Imperial',
         'Name Light', 'Name Wheat', 'Name Blonde'
        ]
    

    
test_variables = data_train[inputs].join(glass_types)
    
print(-cross_val_score(rf_model,test_variables, data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())

191.953859994


In [None]:
from sklearn.ensemble import AdaBoostRegressor

ada_model = AdaBoostRegressor(xgb_model, n_estimators=300)

test_variables = data_train[inputs].join(glass_types)
print(-cross_val_score(ada_model,test_variables, data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())

In [None]:
# Loop used to find optimal values for each relevant parameter.
for i in range(1,10, 10):
    model = RandomForestRegressor(n_estimators= i, min_samples_split=13, min_samples_leaf=2, max_features=14)
    print(i)
    print(-cross_val_score(model,data_train[variables+keywords], data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbr_model = GradientBoostingRegressor(n_estimators=250, min_samples_split = 2, learning_rate = 0.4)

inputs= ['originalGravity', 'srm', 'abv', 
         'Light','Dark','Bitter', 'Sweet',
         'Hop', 'Pale', 'Sour', 'IBU', 'Brew',
         'Refreshing','Citrus', 'Rich', 'Complex',
         'IPA', 'Ale', 'Dry', 'Black','Balanced',
         'German', 'Strong','India', 'Berlin',
         'Wheat', 'Lager','Crisp','Traditional',
         'Finish', 'Golden','Belgian','America', 
         'Flavor', 'Yeast', 'Character','Red',
         'Roast', 'Pumpkin','Clove', 'Big', 'Gravity',
         'Barley', 'Tropical', 'Intense','Perfect',
         'Backbone', 'Subtle', 'Abbey', 'Herb', 'ABV',
         'Name IPA',  'Name Ale', 'Name Imperial',
         'Name Light', 'Name Wheat', 'Name Blonde', 'Name Pale',
        ]
    
 
print(-cross_val_score(gbr_model,data_train[inputs].join(glass_types).join(avails), data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())

In [None]:
xgb_model = XGBRegressor(min_child_weight = 7, max_depth=6, gamma=0.4,
                     subsample = 0.85, colsample_bytree = 0.55, scale_pos_weight=1,
                     n_estimators = 150, nthread=4)

inputs= ['originalGravity', 'srm', 'abv', 
         'Light','Dark','Bitter', 'Sweet',
         'Hop', 'Pale', 'Sour', 'IBU', 'Brew',
         'Refreshing','Citrus', 'Rich', 'Complex',
         'IPA', 'Ale', 'Dry', 'Black','Balanced',
         'German', 'Strong','India', 'Berlin',
         'Wheat', 'Lager','Crisp','Traditional',
         'Finish', 'Golden','Belgian','America', 
         'Flavor', 'Yeast', 'Character','Red',
         'Roast', 'Pumpkin','Clove', 'Big', 'Gravity',
         'Barley', 'Tropical', 'Intense','Perfect',
         'Backbone', 'Subtle', 'Abbey', 'Herb', 'ABV',
         'Name IPA',  'Name Ale', 'Name Imperial', 
         'Name Light', 'Name Wheat', 'Name Blonde', 'Name Pale',
        ]
    

    
test_variables = data_train[inputs].join(glass_types)#.join(avails)
    
print(-cross_val_score(xgb_model,test_variables, data_train['ibu'], cv = 10,
          scoring = "neg_mean_squared_error").mean())

Linear regression is a decent model if you train it with useful data. It is quick, but falls short compared to other
models. 

K nearest neighbors regression is slightly better than linear regression in this project, using similar training data.

The adaboost regressor takes a long time to run if it takes another model to ensemble. The results are not much better than when used alone.

I found the random forest model and multilayer perceptron model to have the best results by far, especially when the parameters are fine tuned. Random forest is very comparable to mlp, but takes much less time to run. That is why it is my chosen model. 

The xgbregressor had the lowest cv score andtakes less time to run than random forest. However, it had a worse mse when submitting to kaggle.  

### Grader Comments

- 
- 

[This question is worth 30 points]

In [None]:
scores.append(None)

## Question 2

Use the model that you determined to be optimal in Question 1, and predict the IBU for the test data. Export your predictions to a CSV file (using `.to_csv()`) in the format expected by Kaggle (see `/data/beer_test_sample_submission.csv`). Then, upload your predictions to [Kaggle](https://inclass.kaggle.com/c/beer2). You'll be able to see how well you did on the Leaderboard. You can upload as often as twice a day until the contest ends on Tuesday, June 6.

The top 5 teams will earn up to 5 bonus points. In addition, the team that wins the competition will get another prize!

_Hint:_ Be extra careful when encoding the categorical variables. Make sure your encoding for the test data matches the encoding you used for the training data **exactly**.

In [37]:
xgb_model = XGBRegressor(min_child_weight = 7, max_depth=6, gamma=0.4,
                     subsample = 0.85, colsample_bytree = 0.55, scale_pos_weight=1,
                     n_estimators = 150, nthread=4)

inputs= ['originalGravity', 'srm', 'abv', 
         'Light','Dark','Bitter', 'Sweet',
         'Hop', 'Pale', 'Sour', 'IBU', 'Brew',
         'Refreshing','Citrus', 'Rich', 'Complex',
         'IPA', 'Ale', 'Dry', 'Black','Balanced',
         'German', 'Strong','India', 'Berlin',
         'Wheat', 'Lager','Crisp','Traditional',
         'Finish', 'Golden','Belgian','America', 
         'Flavor', 'Yeast', 'Character','Red',
         'Roast', 'Pumpkin','Clove', 'Big', 'Gravity',
         'Barley', 'Tropical', 'Intense','Perfect',
         'Backbone', 'Subtle', 'Abbey', 'Herb', 'ABV',
         'Name IPA',  'Name Ale', 'Name Imperial', 
         'Name Light', 'Name Wheat', 'Name Blonde', 'Name Pale',
        ]
    

    
fit_variables = data_train[inputs].join(glass_types)
    
model.fit(fit_variables, data_train['ibu'])


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=17, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [49]:
test_variables = pd.concat([data_test[inputs], test_glass_types], axis=1)
predicted_ibus = model.predict(test_variables)
predicted_ibus

array([ 37.97158261,  35.36412532,  22.24054342, ...,  34.26337563,
        24.86615651,  54.94756519])

In [48]:
import csv

with open('predictions.csv', 'w') as f:  
    fieldnames = ['id', 'ibu']
    writer = csv.DictWriter(f, fieldnames=fieldnames)

    writer.writeheader()    
    for i in range(6000, 10753):
        writer.writerow({'id':i, 'ibu': predicted_ibus[i-6000]})
  

### Grader Comments

- 
- 

[This question is worth 20 points]

In [None]:
scores.append(None)