In [64]:
import pandas as pd
import numpy as np
import os, re
import matplotlib.pyplot as plt
import nltk
import math
%matplotlib inline

In [65]:
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of ``x``.

    Plain Python numbers are evaluated with a numerically stable formulation,
    so large negative inputs return a value near 0 instead of raising
    ``OverflowError`` from ``e**-x``. Any other input (for example a NumPy
    array) goes through the original element-wise expression.
    """
    if isinstance(x, (int, float)):
        if x >= 0:
            return 1 / (1 + math.exp(-x))
        # For x < 0, exp(x) <= 1, so this form cannot overflow.
        exp_x = math.exp(x)
        return exp_x / (1 + exp_x)
    return 1 / (1 + math.e ** -x)

In [66]:
# Working directory that holds the competition files. The original hard-coded
# location stays the default; set BEER_DATA_DIR to run on another machine.
os.chdir(os.environ.get('BEER_DATA_DIR', r'D:\Hackathon\How-To-Choose-The-Perfect-Beer-Data-Set'))

In [67]:
# Read the cleaned train/test CSVs from the working directory and show both shapes.
train_data = pd.read_csv(filepath_or_buffer='train_cleaned.csv')
test_data = pd.read_csv(filepath_or_buffer='test_cleaned.csv')
(train_data.shape, test_data.shape)

((185643, 18), (20628, 18))

In [68]:
# Read the sample-submission workbook and keep its Score column as a NumPy array.
y_act_final = pd.read_excel('Beer_Sample_Submission.xlsx', sheet_name='Sheet1')
y_act_score = np.array(y_act_final['Score'])

In [69]:
# Character length of each row's 'Glassware Used' string, stored as a new column.
train_data['glassware_len'] = train_data['Glassware Used'].map(len)
test_data['glassware_len'] = test_data['Glassware Used'].map(len)

In [70]:
# Summary statistics of Ratings, inspected to decide how to categorise it
# (the following code cell bins it with a quantile cut).
train_data['Ratings'].describe()

count    185643.000000
mean         32.955210
std         254.863803
min           0.000000
25%           1.000000
50%           2.000000
75%           7.000000
max       16954.000000
Name: Ratings, dtype: float64

In [71]:
# Bin Ratings into four classes (A-D) at the TRAIN quartiles, then reuse those
# same edges for the test set so that a given Ratings value maps to the same
# class in both frames (two separate qcut calls derive different edges per frame).
train_data['ratings_cat'], ratings_bins = pd.qcut(train_data['Ratings'], q=[0, 0.25, 0.5, 0.75, 1],
                                                  labels=['A', 'B', 'C', 'D'], retbins=True)
# Open the outer edges so test values outside the train range still get a class.
ratings_bins[0], ratings_bins[-1] = -np.inf, np.inf
test_data['ratings_cat'] = pd.cut(test_data['Ratings'], bins=ratings_bins, labels=['A', 'B', 'C', 'D'])

In [72]:
# Summary statistics of ABV; per the original note, the distribution looks reasonable.
train_data['ABV'].describe()

count    185643.000000
mean          6.354961
std           1.827834
min           0.010000
25%           5.100000
50%           6.100000
75%           7.000000
max          80.000000
Name: ABV, dtype: float64

In [73]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0.1,Unnamed: 0,ABV,Brewing Company,Food Paring,Glassware Used,Beer Name,Ratings,Style Name,Cellar Temperature,Serving Temperature,Score,style_len,min_cellar_temp,max_cellar_temp,min_serving_temp,max_serving_temp,item_score,glass_score,glassware_len,ratings_cat
0,0,6.5,8929,"(Curried,Thai)Cheese(pepperyMontereyPepperJack...","PintGlass(orBecker,Nonic,Tumbler),Mug(orSeidel...",15121,22,AmericanIPA,40-45,45-50,3.28,11,40,45,45,50,3.0,2.0,53,D
1,1,5.5,13187,"(PanAsian)Cheese(earthyCamembert,Fontina,nutty...","PintGlass(orBecker,Nonic,Tumbler),Mug(orSeidel...",59817,1,AmericanPaleAle(APA),35-40,40-45,3.52,20,35,40,40,45,4.0,2.0,53,A
2,2,8.1,6834,"Meat(Pork,Poultry)","PintGlass(orBecker,Nonic,Tumbler),Mug(orSeidel...",32669,3,IrishRedAle,35-40,40-45,4.01,11,35,40,40,45,1.0,2.0,53,C
3,3,6.354961,11688,"(Indian,LatinAmerican,PanAsian)General(Aperitif)","PintGlass(orBecker,Nonic,Tumbler),PilsenerGlas...",130798,0,AmericanMaltLiquor,35-40,35-40,0.0,18,35,40,35,40,2.0,3.0,76,A
4,4,6.0,10417,"Meat(Poultry,Fish,Shellfish)",PilsenerGlass(orPokal),124087,1,EuroPaleLager,35-40,40-45,2.73,13,35,40,40,45,1.0,1.0,22,A


In [74]:
# Preview the first rows of the test frame.
test_data.head()

Unnamed: 0.1,Unnamed: 0,ABV,Brewing Company,Food Paring,Glassware Used,Beer Name,Ratings,Style Name,Cellar Temperature,Serving Temperature,Score,style_len,min_cellar_temp,max_cellar_temp,min_serving_temp,max_serving_temp,item_score,glass_score,glassware_len,ratings_cat
0,0,6.2,8803,"(Curried,Thai)Cheese(pepperyMontereyPepperJack...","PintGlass(orBecker,Nonic,Tumbler),Mug(orSeidel...",34558,3,AmericanIPA,40-45,45-50,,11,40,45,45,50,3.0,2.0,53,C
1,1,5.3,8558,"(Barbecue)Cheese(butteryBrie,Gouda,Havarti,Swi...","PintGlass(orBecker,Nonic,Tumbler),Mug(orSeidel...",86826,20,AmericanPorter,40-45,45-50,,14,40,45,45,50,4.0,2.0,53,D
2,2,7.0,7519,"Cheese(earthyCamembert,Fontina)General(Aperitif)","Snifter,Tulip,OversizedWineGlass",17051,0,BelgianDarkAle,45-50,45-50,,14,45,50,45,50,2.0,3.0,32,A
3,3,5.2,9852,"(LatinAmerican,German)Meat(Pork,Poultry)","Flute,PilsenerGlass(orPokal),Mug(orSeidel,Stein)",49156,2,ViennaLager,35-40,40-45,,11,35,40,40,45,2.0,3.0,48,B
4,4,8.1,8991,"(Barbecue)Cheese(butteryBrie,Gouda,Havarti,Swi...","PintGlass(orBecker,Nonic,Tumbler),Mug(orSeidel...",162723,5,AmericanPorter,40-45,45-50,,14,40,45,45,50,4.0,2.0,53,C


In [75]:
# One-hot encode the rating class of each frame into indicator columns.
train_data_1 = pd.get_dummies(data=train_data['ratings_cat'])
test_data_1 = pd.get_dummies(data=test_data['ratings_cat'])

In [76]:
# Append the rating-class indicator columns to their source frames (index-aligned join).
train_data = train_data.join(other=train_data_1)
test_data = test_data.join(other=test_data_1)

In [None]:
# Beer types referred from http://www.thebeerstore.ca/beer-101/beer-types
beer_types = ['Ales', 'Lagers', 'Stouts & Porters', 'Malts']
beer_styles = [
    'Amber', 'Blonde', 'Brown', 'Cream', 'Dark', 'Fruit', 'Golden', 'Honey',
    'IPA', 'Light', 'Lime', 'Pale', 'Pilsner', 'Red', 'Strong', 'Wheat',
]

In [146]:
# First capitalised chunk of each style name: an uppercase letter followed by
# everything up to the next uppercase letter, '-' or '('.
train_data_final['style_name_mod'] = train_data_final['Style Name'].map(
    lambda style_name: re.findall('[A-Z][^A-Z-(]*', style_name)[0]
)

In [172]:
# Row count and mean ABV per style prefix, largest count first.
style_df = (
    train_data_final
    .groupby('style_name_mod')
    .agg({"style_name_mod": 'count', 'ABV': 'mean'})
    .sort_values(by='style_name_mod', ascending=False)
)

In [173]:
# Running total of the counts, and the share of all rows (in percent) it represents.
style_df['cum_sum'] = style_df['style_name_mod'].cumsum()
style_df['cum_percentage'] = (100 * style_df['cum_sum']) / style_df['style_name_mod'].sum()

In [174]:
# Show up to the first 20 rows of the per-prefix summary.
style_df.head(20)

Unnamed: 0_level_0,style_name_mod,ABV,cum_sum,cum_percentage
style_name_mod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
American,99118,6.703907,99118,53.391725
English,10182,5.443981,109300,58.876446
Saison,8982,6.150862,118282,63.714764
Belgian,7392,7.12255,125674,67.6966
Fruit,4088,5.382545,129762,69.898676
Euro,3867,5.601643,133629,71.981707
Hefeweizen,3670,5.302382,137299,73.95862
Witbier,3265,5.30441,140564,75.717372
German,2955,5.171126,143519,77.309136
Irish,2441,5.384938,145960,78.624026


In [203]:
# First capitalised chunk of each style name: an uppercase letter followed by
# everything up to the next uppercase letter, '-' or '('.
test_data_final['style_name_mod'] = test_data_final['Style Name'].map(
    lambda style_name: re.findall('[A-Z][^A-Z-(]*', style_name)[0]
)

In [205]:
# Preview the test frame after adding style_name_mod.
test_data_final.head()

Unnamed: 0,ABV,Ratings,Style Name,Score,style_len,min_cellar_temp,min_serving_temp,glass_score,A,B,C,D,style_name_mod
0,6.2,3,AmericanIPA,,11,40,45,2.0,0,0,1,0,American
1,5.3,20,AmericanPorter,,14,40,45,2.0,0,0,0,1,American
2,7.0,0,BelgianDarkAle,,14,45,45,3.0,1,0,0,0,Belgian
3,5.2,2,ViennaLager,,11,35,40,3.0,0,1,0,0,Vienna
4,8.1,5,AmericanPorter,,14,40,45,2.0,0,0,1,0,American


In [204]:
# Pareto-style grouping (original intent: keep the prefixes covering roughly 80%
# of the rows): listed prefixes stay as their own category, all others become 'others'.
train_data_final['country_style'] = [
    prefix if prefix in ('American', 'English', 'Saison', 'Belgian', 'Fruit', 'Euro',
                         'Hefeweizen', 'Witbier', 'German', 'Irish', 'Berliner', 'Mrzen')
    else 'others'
    for prefix in train_data_final['style_name_mod']
]

In [206]:
# Pareto-style grouping (original intent: keep the prefixes covering roughly 80%
# of the rows): listed prefixes stay as their own category, all others become 'others'.
test_data_final['country_style'] = [
    prefix if prefix in ('American', 'English', 'Saison', 'Belgian', 'Fruit', 'Euro',
                         'Hefeweizen', 'Witbier', 'German', 'Irish', 'Berliner', 'Mrzen')
    else 'others'
    for prefix in test_data_final['style_name_mod']
]

In [207]:
# Preview the training frame after adding country_style.
train_data_final.head()

Unnamed: 0,ABV,Ratings,Style Name,Score,style_len,min_cellar_temp,min_serving_temp,glass_score,A,B,C,D,style_name_mod,country_style
0,6.5,22,AmericanIPA,3.28,11,40,45,2.0,0,0,0,1,American,American
1,5.5,1,AmericanPaleAle(APA),3.52,20,35,40,2.0,1,0,0,0,American,American
2,8.1,3,IrishRedAle,4.01,11,35,40,2.0,0,0,1,0,Irish,Irish
3,6.354961,0,AmericanMaltLiquor,0.0,18,35,35,3.0,1,0,0,0,American,American
4,6.0,1,EuroPaleLager,2.73,13,35,40,1.0,1,0,0,0,Euro,Euro


In [208]:
# Preview the test frame after adding country_style.
test_data_final.head()

Unnamed: 0,ABV,Ratings,Style Name,Score,style_len,min_cellar_temp,min_serving_temp,glass_score,A,B,C,D,style_name_mod,country_style
0,6.2,3,AmericanIPA,,11,40,45,2.0,0,0,1,0,American,American
1,5.3,20,AmericanPorter,,14,40,45,2.0,0,0,0,1,American,American
2,7.0,0,BelgianDarkAle,,14,45,45,3.0,1,0,0,0,Belgian,Belgian
3,5.2,2,ViennaLager,,11,35,40,3.0,0,1,0,0,Vienna,others
4,8.1,5,AmericanPorter,,14,40,45,2.0,0,0,1,0,American,American


In [209]:
# One-hot encode country_style. The test dummies are re-indexed to the train
# columns so both frames expose the same indicator columns in the same order,
# even when a category is missing from (or only present in) the test set.
country_dummies_train = pd.get_dummies(train_data_final['country_style'])
country_dummies_test = pd.get_dummies(test_data_final['country_style']).reindex(
    columns=country_dummies_train.columns, fill_value=0)

In [211]:
# Append the country_style indicator columns to each frame (index-aligned join).
train_data_final = train_data_final.join(other=country_dummies_train)
test_data_final = test_data_final.join(other=country_dummies_test)

In [212]:
# Preview the training frame after joining the country_style indicator columns.
train_data_final.head()

Unnamed: 0,ABV,Ratings,Style Name,Score,style_len,min_cellar_temp,min_serving_temp,glass_score,A,B,...,English,Euro,Fruit,German,Hefeweizen,Irish,Mrzen,Saison,Witbier,others
0,6.5,22,AmericanIPA,3.28,11,40,45,2.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.5,1,AmericanPaleAle(APA),3.52,20,35,40,2.0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,8.1,3,IrishRedAle,4.01,11,35,40,2.0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,6.354961,0,AmericanMaltLiquor,0.0,18,35,35,3.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,6.0,1,EuroPaleLager,2.73,13,35,40,1.0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [135]:
# Columns removed before modelling. errors='ignore' keeps the cell runnable
# whether or not every label (e.g. 'style_name_mod', 'country_style') exists on
# the frame, instead of failing with a KeyError.
dropped_columns = ['Unnamed: 0', 'Beer Name', 'Brewing Company', 'max_cellar_temp', 'style_name_mod',
                   'country_style', 'max_serving_temp', 'item_score', 'glassware_len', 'Food Paring',
                   'Glassware Used', 'Cellar Temperature', 'Serving Temperature', 'ratings_cat']
train_data_final = train_data.drop(labels=dropped_columns, axis=1, errors='ignore')
test_data_final = test_data.drop(labels=dropped_columns, axis=1, errors='ignore')

In [129]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

# Model Packages
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

def model_eval(train_data, regressor, final_test_set, actual_score):
    """Fit ``regressor`` on an 80/20 split of ``train_data`` and report two scores.

    Parameters
    ----------
    train_data : DataFrame
        Must contain a 'Score' column (the target); all other columns are features.
    regressor : estimator
        Object exposing ``fit``/``predict``; it is fitted by this call.
    final_test_set : DataFrame
        Must contain every feature column of ``train_data``; features are
        selected by name, so extra columns (such as 'Score') are ignored.
    actual_score : array-like
        Reference scores compared with the predictions for ``final_test_set``.

    Returns
    -------
    Predictions of the fitted model for ``final_test_set``.

    Raises
    ------
    ValueError
        If ``final_test_set`` lacks any training feature column.

    Prints the RMSE and ``1 - sigmoid(RMSE)`` for the 20% hold-out rows and for
    ``final_test_set``.
    """
    y = train_data['Score']
    X = train_data.drop(labels=['Score'], axis=1)

    # Split the train data to train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=45)

    # Training model object
    print('Training the model with 80-20 Split.....')
    model_obj = regressor.fit(X_train, y_train)
    y_pred = model_obj.predict(X_test)
    # Despite the printed label, this RMSE is measured on the 20% hold-out rows.
    root_mean_sqr_err = sqrt(mean_squared_error(y_test, y_pred))
    print('RMSE_Train_model: ', root_mean_sqr_err)
    sigmoid_value = sigmoid(root_mean_sqr_err)
    print('Score_trained_model: ', (1 - sigmoid_value))

    # Final test set: select features by name so the column set and order always
    # match the matrix the model was fitted on.
    missing_columns = [col for col in X.columns if col not in final_test_set.columns]
    if missing_columns:
        raise ValueError('final_test_set is missing feature columns: %s' % missing_columns)
    pred_score = model_obj.predict(final_test_set[list(X.columns)])
    root_mean_sqr_err_final = sqrt(mean_squared_error(actual_score, pred_score))
    print('RMSE_Test_set: ', root_mean_sqr_err_final)
    sigmoid_value_test = sigmoid(root_mean_sqr_err_final)
    print('Score_on_test_set: ', (1 - sigmoid_value_test))

    return pred_score

In [130]:
# Linear regression through the shared evaluation routine.
pred_score_lreg = model_eval(
    train_data=train_data_final,
    regressor=LinearRegression(),
    final_test_set=test_data_final,
    actual_score=y_act_score,
)

Training the model with 80-20 Split.....
RMSE_Train_model:  1.186345034817271
Score_trained_model:  0.2339132605794414
RMSE_Test_set:  1.709645931686735
Score_on_test_set:  0.1532096456910632




In [131]:
# Random forest (default settings) through the shared evaluation routine.
pred_score_rforest = model_eval(
    train_data=train_data_final,
    regressor=RandomForestRegressor(),
    final_test_set=test_data_final,
    actual_score=y_act_score,
)



Training the model with 80-20 Split.....
RMSE_Train_model:  0.37903801612002397
Score_trained_model:  0.4063589368168271
RMSE_Test_set:  2.0622009237384713
Score_on_test_set:  0.11282533908886827


In [132]:
# Gradient boosting (default settings) through the shared evaluation routine.
pred_score_gbm = model_eval(
    train_data=train_data_final,
    regressor=GradientBoostingRegressor(),
    final_test_set=test_data_final,
    actual_score=y_act_score,
)

Training the model with 80-20 Split.....




RMSE_Train_model:  0.3686872552082452
Score_trained_model:  0.4088582650334929
RMSE_Test_set:  2.050587005439614
Score_on_test_set:  0.11399308108280959


In [133]:
# XGBoost regressor (default settings) through the shared evaluation routine.
pred_score_xgb = model_eval(
    train_data=train_data_final,
    regressor=XGBRegressor(),
    final_test_set=test_data_final,
    actual_score=y_act_score,
)

Training the model with 80-20 Split.....




RMSE_Train_model:  0.36901694381564876
Score_trained_model:  0.40877858393920274
RMSE_Test_set:  2.050828995249628
Score_on_test_set:  0.11396864271951623


### Saving to Submission Format

In [106]:
# Empty frame with the single 'Score' column used for the submission file.
final_sub=pd.DataFrame(columns=['Score'])

In [107]:
# Fill the submission column with the linear-regression predictions.
final_sub['Score']=pred_score_lreg

In [134]:
# Write the submission to 'Sheet1' of output_lreg.xlsx. The context manager saves
# and closes the workbook (also on error), replacing the explicit writer.save()
# call, which current pandas no longer provides.
with pd.ExcelWriter('output_lreg.xlsx') as writer:
    final_sub.to_excel(writer, sheet_name='Sheet1', index=False)

In [61]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

# Target is Score; every remaining column is a feature.
X = train_data_final.drop(columns=['Score'])
y = train_data_final['Score']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=45)

# Fit a linear regression and predict the held-out rows.
regressor = LinearRegression()
model_obj = regressor.fit(X_train, y_train)
y_act = y_test
y_pred = model_obj.predict(X_test)

# Error metrics on the held-out rows.
mean_abs_err = mean_absolute_error(y_act, y_pred)
root_mean_sqr_err = sqrt(mean_squared_error(y_act, y_pred))

print('Mean absolute error: ', mean_abs_err)
print('Root Mean squared error: ', root_mean_sqr_err)

Mean absolute error:  0.851313921177971
Root Mean squared error:  1.1862212444393614




In [62]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

# Target is Score; every remaining column is a feature.
X = train_data_final.drop(columns=['Score'])
y = train_data_final['Score']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=45)

# Fit a random forest with default settings (no random_state is passed to the
# model) and predict the held-out rows.
regressor = RandomForestRegressor()
model_obj = regressor.fit(X_train, y_train)
y_act = y_test
y_pred = model_obj.predict(X_test)

# Error metrics on the held-out rows.
mean_abs_err = mean_absolute_error(y_act, y_pred)
root_mean_sqr_err = sqrt(mean_squared_error(y_act, y_pred))

print('Mean absolute error: ', mean_abs_err)
print('Root Mean squared error: ', root_mean_sqr_err)



Mean absolute error:  0.25742239529883504
Root Mean squared error:  0.3796847578021573


In [63]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

# Target is Score; every remaining column is a feature.
X = train_data_final.drop(columns=['Score'])
y = train_data_final['Score']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=45)

# Fit gradient boosting with default settings and predict the held-out rows.
regressor = GradientBoostingRegressor()
model_obj = regressor.fit(X_train, y_train)
y_act = y_test
y_pred = model_obj.predict(X_test)

# Error metrics on the held-out rows.
mean_abs_err = mean_absolute_error(y_act, y_pred)
root_mean_sqr_err = sqrt(mean_squared_error(y_act, y_pred))

print('Mean absolute error: ', mean_abs_err)
print('Root Mean squared error: ', root_mean_sqr_err)



Mean absolute error:  0.2513746984276792
Root Mean squared error:  0.36701487306977854


In [21]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt

# Target is Score; every remaining column is a feature.
X = train_data_final.drop(columns=['Score'])
y = train_data_final['Score']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=45)

# Fit XGBoost with default settings; the fitted model is kept as model_obj_xgb
# because a later cell uses it for the final predictions.
regressor = XGBRegressor()
model_obj_xgb = regressor.fit(X_train, y_train)
y_act = y_test
y_pred = model_obj_xgb.predict(X_test)

# Error metrics on the held-out rows.
mean_abs_err = mean_absolute_error(y_act, y_pred)
root_mean_sqr_err = sqrt(mean_squared_error(y_act, y_pred))

print('Mean absolute error: ', mean_abs_err)
print('Root Mean squared error: ', root_mean_sqr_err)



Mean absolute error:  0.2512569377176331
Root Mean squared error:  0.36714119973700055


### Predictions on the test dataset

In [None]:
# Predict the test set with the fitted XGBoost model, leaving out the Score column.
y_pred_final = model_obj_xgb.predict(test_data_final.drop(columns=['Score']))

In [None]:
# Display the final predictions.
y_pred_final

In [None]:

# RMSE of the final predictions against the sample-submission scores, reported
# as 1 - sigmoid(RMSE).
root_mean_sqr_err_final = sqrt(mean_squared_error(y_act_score, y_pred_final))
sigmoid_value = sigmoid(root_mean_sqr_err_final)
print('Score: ', 1 - sigmoid_value)

In [25]:
# Re-computes 1 - sigmoid(RMSE) for a pasted literal; the value equals the RMSE
# printed by the XGBoost hold-out cell above.
sigmoid_value=sigmoid(0.36714119973700055)
print('Score: ',(1-sigmoid_value))

Score:  0.40923198869023103


In [None]:
# Scratch arithmetic. NOTE(review): the meaning of 0.88 is not evident from this
# notebook - confirm or remove.
1-0.88

### Grid Search - GBM

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# GridSearchCV is imported from sklearn.model_selection, the module this notebook
# already takes train_test_split from; sklearn.grid_search is the legacy location.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid (3 values for each of 4 parameters). The key must be
# exactly 'max_depth': the previous 'max_depth ' (trailing space) is not a
# GradientBoostingRegressor parameter and makes the search fail.
param_test1 = {'n_estimators': [50, 100, 300], 'min_samples_leaf': [10, 50, 70],
               'learning_rate': [0.01, 0.05, 0.1], 'max_depth': [2, 3, 5]}

gbm = GradientBoostingRegressor(random_state=10)

# 5-fold cross-validated search scored by explained variance.
gsearch1 = GridSearchCV(estimator=gbm, param_grid=param_test1, scoring='explained_variance', cv=5)

In [None]:
# Run the grid search on the training split.
gsearch1.fit(X_train, y_train.values)
# Per-candidate results live in cv_results_ on sklearn.model_selection.GridSearchCV
# and in grid_scores_ on the legacy sklearn.grid_search class; use whichever exists.
cv_summary = gsearch1.cv_results_ if hasattr(gsearch1, 'cv_results_') else gsearch1.grid_scores_
cv_summary, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Show the training feature matrix as a raw array.
X_train.values

In [None]:
# Preview the first rows of the training feature frame.
X_train.head()