In [176]:
#suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


import pickle
# Import the relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the race dataset; the path is relative to the notebook's working directory.
z_file = "./The_Zodiac_Race_Challenge_Dataset.csv"
z_df = pd.read_csv(z_file)

# Remove wrong col
z_df = z_df.drop(columns='Had_A_Good_Sleep')

# Positional split: the first 8 columns are used as features, the remaining
# columns as the race outcome. `.copy()` makes X an independent frame so the
# in-place feature engineering below does not write into a slice of z_df.
X = z_df.iloc[:, :8].copy()
Y = z_df.iloc[:, 8:]

animal_list = ['rat', 'ox', 'tiger', 'rabbit', 'dragon', 'snake',
               'horse', 'goat', 'monkey', 'rooster', 'dog', 'pig']

top_3 = ['Winner', 'Second', 'Third']

# Generate df (one-hot encoded) indicating whether each animal (column) finishes in the top 3.
# Vectorised: an animal is "in the top 3" when any of the three placing columns equals its name.
Y_top3 = pd.DataFrame(
    {animal: z_df[top_3].eq(animal).any(axis=1) for animal in animal_list}
)

# Data preprocessing

# Engineer Features
# Total distance: sum of the six terrain columns
X.eval('Total_distance = Desert + Mountain + Water + Land + Forest + Snow', inplace=True)

# Encode the rain flag numerically; any value other than 'Yes'/'No' becomes NaN.
X['Is_Raining'] = X['Is_Raining'].map({'Yes': 1, 'No': 0})

# Scale data: column-wise min-max scaling to [0, 1].
# NOTE(review): min/max are taken over the full dataset, i.e. before the
# train/test split performed later in the notebook.
X_scaled = (X - X.min()) / (X.max() - X.min())

In [None]:
# Feature values of the single race to predict. The same seven numbers are
# written out again, positionally for the first seven feature columns, where
# X_submit is built at the end of the notebook.
quiz_features_val = [2345, 8764, 6689, 7332, 1050, 3741, 40]

In [177]:
# Sanity check on the min-max scaling: every column should show min 0 and max 1.
X_scaled.describe()

Unnamed: 0,Desert,Mountain,Water,Land,Forest,Snow,Average_Temperature,Is_Raining,Total_distance
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,0.497754,0.500196,0.500136,0.49875,0.498832,0.498901,0.497934,0.5,0.499481
std,0.288067,0.288185,0.287242,0.288154,0.287536,0.289164,0.298589,0.500006,0.139036
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.24815,0.25175,0.252951,0.25005,0.25245,0.24795,0.233333,0.0,0.402832
50%,0.4977,0.5014,0.50175,0.497049,0.495599,0.4986,0.5,0.5,0.500237
75%,0.746249,0.7496,0.74775,0.74925,0.74825,0.748775,0.766667,1.0,0.595553
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [178]:
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
import copy

In [180]:
def get_time(top3_series, animal):
    """Look up the finishing time recorded for ``animal`` in one race result.

    INPUTS
    top3_series: 6-entry series holding the top 3 animals and their finishing
        times, labelled
        [Winner, Winner_Time, Second, Second_Time, Third, Third_Time]
        (each row of Y is such a series)
    animal: name of animal

    OUTPUTS
    The time stored next to the first entry equal to ``animal``, or NaN when
    no entry of the series equals ``animal``.
    """
    time_label_of = {'Winner': 'Winner_Time',
                     'Second': 'Second_Time',
                     'Third': 'Third_Time'}

    is_animal = (top3_series == animal)

    # Animal did not place in this race: no time is recorded for it.
    if not any(is_animal):
        return np.nan

    # idxmax on a boolean series yields the label of the first True entry,
    # i.e. the placing column in which the animal appears.
    placing = is_animal.idxmax()
    return top3_series[time_label_of[placing]]

In [181]:
# Set to True to rebuild the per-animal timing table from Y; when False the
# table is read back from the "Y_time" pickle written by an earlier run.
regen_timings = True

if regen_timings:
    # One row per race, one column per animal: the animal's finishing time if
    # it placed in the top 3 of that race, NaN otherwise (see get_time).
    # The index is taken from Y so the table stays row-aligned with X / Y_top3.
    Y_time = pd.DataFrame(
        [[get_time(race, animal) for animal in animal_list] for _, race in Y.iterrows()],
        columns=animal_list,
        index=Y.index,
    )

    # Context manager so the cache file is flushed and closed deterministically.
    with open("Y_time", 'wb') as cache_file:
        pickle.dump(Y_time, cache_file)

else:
    # NOTE: unpickling can execute arbitrary code - only load a cache file
    # that was produced by this notebook.
    with open("Y_time", 'rb') as cache_file:
        Y_time = pickle.load(cache_file)

In [182]:
# 1st stage:
# Predict whether each animal finishes in top 3

# Split features, top-3 indicators (cast to 0/1 integers) and finishing times
# in a single call so all three stay row-aligned; the seed is fixed.
(X_train, X_test,
 Y_top3_train, Y_top3_test,
 Y_time_train, Y_time_test) = train_test_split(
    X_scaled,
    Y_top3.astype(int),
    Y_time,
    random_state=0,
)

In [183]:
def acc_top3(clf_fitted, X_test, y_test):
    """Share of true "top 3" rows that the classifier also predicts as top 3.

    Only the rows whose true label is 1 are scored, leaving out the majority
    of instances without a "top 3" label.

    INPUTS
    clf_fitted: FITTED classifier exposing ``predict``
    X_test: features, row-aligned with ``y_test``
    y_test: true labels (1 = finished in top 3)

    OUTPUTS
    Mean of the predictions on the rows with true label 1, or NaN when
    ``y_test`` contains no such row (nothing to score).
    """
    in_top3 = (y_test == 1)

    # Without positives there is nothing to predict on; avoid handing an
    # empty feature set to the classifier.
    if not in_top3.any():
        return np.nan

    return clf_fitted.predict(X_test[in_top3]).mean()

In [179]:
# Candidate models for stage 1 (top-3 classification). Each is an unfitted
# template; copies of these are fitted per animal further down.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(random_state=0),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    # QuadraticDiscriminantAnalysis(),  (left out of the comparison)
    LogisticRegression(),
]

# Class names, used as column labels of the fitted-classifier table.
clf_names = [type(clf).__name__ for clf in classifiers]

In [184]:
# When False, previously fitted classifiers are read from disk instead of
# being fitted again.
refit_clfs = False


def helper(clf, animal, X_train, Y_top3_train):
    """Fit one classifier to the top-3 indicator of one animal.

    INPUTS
    clf: estimator exposing ``fit``
    animal: column of ``Y_top3_train`` to use as the target
    X_train: training features
    Y_top3_train: frame of top-3 indicators, one column per animal

    OUTPUTS
    Whatever ``clf.fit`` returns. A progress line is printed first.
    """
    clf_name = clf.__class__.__name__
    print('fitting {} to {} \n'.format(clf_name, animal))

    target = Y_top3_train[animal]
    return clf.fit(X_train, target)

if refit_clfs:
    # Rows: animals, columns: classifier class names. A fresh deep copy of
    # the template list is made per animal so every cell holds its own
    # independently fitted estimator.
    clfs_fitted_df = pd.DataFrame(
        [[helper(clf, animal, X_train, Y_top3_train) for clf in copy.deepcopy(classifiers)]
         for animal in animal_list],
        columns=clf_names,
        index=animal_list,
    )

    # Context manager so the cache file is flushed and closed deterministically.
    with open("clfs_fitted_df", 'wb') as cache_file:
        pickle.dump(clfs_fitted_df, cache_file)

else:
    # NOTE: unpickling can execute arbitrary code - only load a cache file
    # that was produced by this notebook.
    with open("clfs_fitted_df", 'rb') as cache_file:
        clfs_fitted_df = pickle.load(cache_file)


In [187]:
def compare_clfs(animal):
    """Tabulate the test-set score of every fitted classifier for one animal.

    Reads the notebook-level ``clfs_fitted_df``, ``X_test`` and ``Y_top3_test``.

    INPUTS
    animal: name of animal (row of ``clfs_fitted_df``)

    OUTPUTS
    DataFrame indexed by 'classifier' with a single column 'acc' holding the
    value returned by each classifier's ``score`` on the test set.
    """
    y_true = Y_top3_test[animal]

    rows = []
    for clf_fitted in clfs_fitted_df.loc[animal]:
        # acc_top3(clf_fitted, X_test, y_true) was tried as an extra column
        # and is currently left out.
        rows.append((clf_fitted.__class__.__name__,
                     clf_fitted.score(X_test, y_true)))

    scores = pd.DataFrame(rows, columns=['classifier', 'acc'])
    return scores.set_index('classifier')

In [316]:
# Side-by-side score table: one top-level column per animal, one row per classifier.
clf_all_animals = pd.concat(
    [compare_clfs(animal) for animal in animal_list],
    axis=1,
    keys=animal_list,
)
clf_all_animals

Unnamed: 0_level_0,rat,ox,tiger,rabbit,dragon,snake,horse,goat,monkey,rooster,dog,pig
Unnamed: 0_level_1,acc,acc,acc,acc,acc,acc,acc,acc,acc,acc,acc,acc
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
KNeighborsClassifier,0.8972,0.9364,0.9082,0.9409,0.9247,0.8777,0.9061,0.9166,0.9053,0.9475,0.9036,0.9273
SVC,0.9022,0.9571,0.9084,0.9555,0.9291,0.8768,0.8813,0.9111,0.8817,0.9562,0.8977,0.9238
DecisionTreeClassifier,0.9234,0.9829,0.9218,0.9786,0.9249,0.9074,0.9192,0.9252,0.9196,0.9577,0.9274,0.9589
RandomForestClassifier,0.9456,0.9868,0.9421,0.9854,0.9484,0.9266,0.9465,0.946,0.9438,0.9688,0.9483,0.9711
AdaBoostClassifier,0.9242,0.9783,0.9356,0.9786,0.9273,0.8959,0.9112,0.9298,0.892,0.9634,0.9238,0.9546
GradientBoostingClassifier,0.945,0.9851,0.9423,0.9852,0.955,0.9241,0.9459,0.9493,0.934,0.9695,0.9453,0.9667
GaussianNB,0.8443,0.9502,0.8654,0.9471,0.896,0.8157,0.863,0.8792,0.8481,0.9496,0.8519,0.882
LinearDiscriminantAnalysis,0.7804,0.9445,0.8348,0.9379,0.9125,0.7187,0.8676,0.9,0.8527,0.9441,0.7933,0.869
LogisticRegression,0.7794,0.9467,0.8354,0.9408,0.9123,0.719,0.8683,0.9002,0.8533,0.9504,0.7932,0.8683


In [317]:
# Work from the numeric score rows only. A previous run of this cell leaves a
# string-valued 'best_clfs' row in the table; dropping it (if present) and
# casting back to float keeps the cell re-runnable.
score_rows = clf_all_animals.drop(index='best_clfs', errors='ignore').astype(float)

# Name of the highest-scoring classifier for each animal, also appended to the
# table as the 'best_clfs' row.
best_clf_by_animal = score_rows.idxmax()
clf_all_animals.loc['best_clfs'] = best_clf_by_animal

name_clf_dict = {clf.__class__.__name__: clf for clf in classifiers}

# Overall choice: the classifier that wins for the most animals. mode() returns
# its values sorted, so position 0 is a deterministic pick when there is a tie.
best_clf = name_clf_dict[best_clf_by_animal.mode().loc[0]]
print('best clf: {}'.format(best_clf.__class__.__name__))

best clf: RandomForestClassifier


In [318]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LogisticRegression

In [319]:
# LinearRegression is imported in this cell because the notebook's import
# cells only bring in LogisticRegression from sklearn.linear_model.
from sklearn.linear_model import LinearRegression

# Candidate models for stage 2 (finishing-time regression).
# The linear baseline is LinearRegression: LogisticRegression is a classifier,
# so it would treat each distinct finishing time as a class label and its
# `.score` would be accuracy, which is not comparable with the other models.
# NOTE: a cached "rggs_fitted_df" pickle produced with the previous list still
# contains a LogisticRegression column; set refit_rggs = True to regenerate it.
regressors = [GradientBoostingRegressor(),
              RandomForestRegressor(),
              AdaBoostRegressor(),
              LinearRegression()
             ]

# Class names, used as column labels of the fitted-regressor table.
rgg_names = [rgg.__class__.__name__ for rgg in regressors]

In [320]:
# When False, previously fitted regressors are read from disk instead of
# being fitted again.
refit_rggs = False


def helper_rgg(rgg, animal, X_train, Y_time_train):
    """Fit one regressor to the finishing times of one animal.

    Only the rows where ``Y_time_train[animal]`` is not null are used.

    INPUTS
    rgg: estimator exposing ``fit``
    animal: column of ``Y_time_train`` to use as the target
    X_train: training features, row-aligned with ``Y_time_train``
    Y_time_train: frame of finishing times, one column per animal

    OUTPUTS
    Whatever ``rgg.fit`` returns. A progress line is printed first.
    """
    has_time = Y_time_train[animal].notnull()

    X_selected = X_train[has_time]
    y_selected = Y_time_train[animal][has_time]

    rgg_name = rgg.__class__.__name__
    print('fitting {} to {} \n'.format(rgg_name, animal))
    return rgg.fit(X_selected, y_selected)

if refit_rggs:
    # Rows: animals, columns: regressor class names. A fresh deep copy of the
    # template list is made per animal so every cell holds its own
    # independently fitted estimator.
    rggs_fitted_df = pd.DataFrame(
        [[helper_rgg(rgg, animal, X_train, Y_time_train) for rgg in copy.deepcopy(regressors)]
         for animal in animal_list],
        columns=rgg_names,
        index=animal_list,
    )

    # Context manager so the cache file is flushed and closed deterministically.
    with open("rggs_fitted_df", 'wb') as cache_file:
        pickle.dump(rggs_fitted_df, cache_file)

else:
    # NOTE: unpickling can execute arbitrary code - only load a cache file
    # that was produced by this notebook.
    with open("rggs_fitted_df", 'rb') as cache_file:
        rggs_fitted_df = pickle.load(cache_file)


In [321]:
# Show the table of fitted regressors (rows: animals, columns: regressor class names).
rggs_fitted_df

Unnamed: 0,GradientBoostingRegressor,RandomForestRegressor,AdaBoostRegressor,LogisticRegression
rat,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
ox,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
tiger,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
rabbit,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
dragon,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
snake,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
horse,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
goat,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
monkey,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."
rooster,([DecisionTreeRegressor(criterion='friedman_ms...,"(DecisionTreeRegressor(criterion='mse', max_de...","(DecisionTreeRegressor(criterion='mse', max_de...","LogisticRegression(C=1.0, class_weight=None, d..."


In [322]:
def compare_rggs(animal):
    """Tabulate the held-out score of every fitted regressor for one animal.

    Scoring uses the test split (``X_test`` / ``Y_time_test``), matching
    ``compare_clfs``; scoring on the rows the models were fitted on would
    favour whichever model memorises the training data best. Only test rows
    with a recorded time for ``animal`` are used.

    Reads the notebook-level ``rggs_fitted_df``, ``X_test`` and ``Y_time_test``.

    INPUTS
    animal: name of animal (row of ``rggs_fitted_df``)

    OUTPUTS
    DataFrame indexed by 'regressor' with a single column 'acc' holding the
    value returned by each model's ``score`` (R^2 for scikit-learn regressors).
    """
    has_time = Y_time_test[animal].notnull()
    X_eval = X_test[has_time]
    y_eval = Y_time_test[animal][has_time]

    rows = [(rgg_fitted.__class__.__name__, rgg_fitted.score(X_eval, y_eval))
            for rgg_fitted in rggs_fitted_df.loc[animal]]

    return pd.DataFrame(rows, columns=['regressor', 'acc']).set_index('regressor')

In [323]:
# Side-by-side score table: one top-level column per animal, one row per regressor.
rgg_all_animals = pd.concat(
    [compare_rggs(animal) for animal in animal_list],
    axis=1,
    keys=animal_list,
)

# Extra row naming the highest-scoring regressor for each animal.
rgg_all_animals.loc['best_rggs'] = rgg_all_animals.idxmax()

In [324]:
# Map class name -> unfitted regressor template.
name_rgg_dict = {type(rgg).__name__: rgg for rgg in regressors}

# Tally of per-animal winners (not displayed: it is not the cell's last expression).
rgg_all_animals.loc['best_rggs'].value_counts()

# Overall choice: the regressor named most often in the 'best_rggs' row.
best_rgg = name_rgg_dict[
    rgg_all_animals.loc['best_rggs'].mode().loc[0]
]
print('best rgg: {}'.format(type(best_rgg).__name__))

best rgg: RandomForestRegressor


In [389]:
# Predict top3 using regression
# Compare results vs using classifer technique to predict top 3

def pred_time(X, best_rgg):
    """Predict a finishing time for every animal on every row of ``X``.

    For each animal, the fitted model stored in the notebook-level
    ``rggs_fitted_df`` under the class name of ``best_rgg`` is used.

    INPUTS
    X: feature frame
    best_rgg: regressor whose class name selects the column of ``rggs_fitted_df``

    OUTPUTS
    DataFrame of predicted timings: index ``X.index``, one column per animal.
    """
    rgg_name = best_rgg.__class__.__name__

    predictions = []
    for animal in animal_list:
        fitted = rggs_fitted_df.loc[animal, rgg_name]
        predictions.append(fitted.predict(X))

    # Built animal-by-row, then transposed to row-by-animal.
    by_animal = pd.DataFrame(predictions, index=animal_list, columns=X.index)
    return by_animal.T
    
    

In [390]:
def pred_top3_rgg(X, best_rgg):
    """Predict top-3 membership by ranking regression-predicted times.

    INPUTS
    X: feature frame to predict on
    best_rgg: regressor whose class name selects the fitted models (see pred_time)

    OUTPUTS
    Boolean DataFrame (index ``X.index``, columns ``animal_list``): True where
    the animal has one of the 3 smallest predicted times in that row.
    """
    # Predict on the frame that was passed in, not on the notebook-level X_test.
    times = pred_time(X, best_rgg)

    # Keeps only the 3 smallest times per row; every other entry becomes NaN.
    fastest3 = times.apply(lambda row: row.nsmallest(3), axis=1)

    # An animal that is never among the fastest three has no column at this
    # point; restore it as all-False instead of NaN.
    return fastest3.notnull().reindex(columns=animal_list, fill_value=False)
    

In [391]:
def pred_top3_clf(X, best_clf):
    """Predict top-3 membership with the per-animal classifiers.

    For each animal, the fitted model stored in the notebook-level
    ``clfs_fitted_df`` under the class name of ``best_clf`` is used.

    INPUTS
    X: feature frame
    best_clf: classifier whose class name selects the column of ``clfs_fitted_df``

    OUTPUTS
    DataFrame of predicted indicators: index ``X.index``, one column per animal.
    """
    clf_name = best_clf.__class__.__name__

    predictions = []
    for animal in animal_list:
        fitted = clfs_fitted_df.loc[animal, clf_name]
        predictions.append(fitted.predict(X))

    # Built animal-by-row, then transposed to row-by-animal.
    by_animal = pd.DataFrame(predictions, index=animal_list, columns=X.index)
    return by_animal.T

In [392]:
# Top-3 indicators predicted for the test split by the per-animal classifiers.
pred_top3_clf_df = pred_top3_clf(X_test, best_clf)

In [393]:
# Top-3 indicators for the test split derived from the regression-predicted times.
pred_top3_rgg_df= pred_top3_rgg(X_test,best_rgg)

In [394]:
# Element-wise agreement with the true top-3 indicators, averaged first over
# races and then over animals, for each of the two approaches.
print(
    "Accuracy in predicting whether animal finishes in top3 \nUsing Regression: {} \nUsing Classification: {}".format(
        (Y_top3_test == pred_top3_rgg_df).mean().mean(),
        (Y_top3_test == pred_top3_clf_df).mean().mean(),
    )
)

Accuracy in predicting whether animal finishes in top3 
Using Regression: 0.7658 
Using Classification: 0.9549499999999999


In [None]:
# Predict the top 3 finishers, in order, using one of two methods:

# Method 1
# Pure regression: regression, then sort by timings

# Method 2
# Classification to pick animals predicted to finish in top3, then sort by timings 
# If < 3 finishers are predicted to be in top3, choose next fastest timing from animal not predicted in top3

In [505]:
# Generate the test rank set: rank each race's recorded finishing times across
# the animals. Races where animals are tied are kept (the filter that would
# remove them, in the next cell, is commented out).

Y_rank_test = Y_time_test.rank(axis=1)
X_rank_test = X_test

In [506]:
# # Remove entries where 2 or more of the top3 finishers are tied
# tied = Y_rank_test.mode(axis=1)[2].isnull()

# Y_rank_test = Y_rank_test[~tied]
# X_rank_test = X_test[~tied]

In [507]:
# Method 1

# Dataframe of predicted rankings using regression only: rank the predicted
# times within each race and keep the three best ranks (other entries are NaN).
pred_rank_rgg = (
    pred_time(X_rank_test, best_rgg)
    .rank(axis=1)
    .apply(lambda row: row.nsmallest(3), axis=1)
    .reindex(animal_list, axis=1)
)

# Number of positions predicted correctly for each entry
print((pred_rank_rgg == Y_rank_test).sum(axis=1).value_counts())

# Proportion of times where all top3 rankings were predicted correctly
rank_acc_rgg = ((pred_rank_rgg == Y_rank_test).sum(axis=1) == 3).mean()

0    5552
1    2086
3    1358
2    1004
dtype: int64


In [508]:
# Share of test races whose three podium positions were all predicted correctly by method 1.
rank_acc_rgg

0.1358

In [537]:
#Method 2

def pred_rank_m2(X, best_rgg, best_clf):
    """Predict the ordered top 3 by combining classification and regression.

    Animals the classifier predicts to be in the top 3 are ordered by their
    regression-predicted time. If fewer than 3 animals are predicted, the
    remaining places are filled with the fastest of the other animals.

    INPUTS
    X: feature frame to predict on
    best_rgg: regressor whose class name selects the fitted regressors (see pred_time)
    best_clf: classifier whose class name selects the fitted classifiers (see pred_top3_clf)

    OUTPUTS
    DataFrame (index ``X.index``, columns ``animal_list``) holding ranks 1-3
    for the predicted podium of each row and NaN everywhere else.
    """
    pred_rank = pred_time(X, best_rgg).rank(axis=1)
    flagged_top3 = pred_top3_clf(X, best_clf).astype(bool)

    # Ranks lie in [1, n_animals], so adding n_animals places every animal the
    # classifier did NOT flag behind all flagged ones while preserving the
    # regression order inside each group.
    n_animals = pred_rank.shape[1]
    adjusted_rank = pred_rank.where(flagged_top3, pred_rank + n_animals)

    # Keep the three best adjusted ranks per row and re-rank them to 1..3.
    return (adjusted_rank
            .apply(lambda row: row.nsmallest(3), axis=1)
            .reindex(animal_list, axis=1)
            .rank(axis=1))


In [538]:
# Method 2 podium predictions for the rank test set.
pred_rank_m2_df = pred_rank_m2(X_rank_test, best_rgg, best_clf)

In [539]:
# Number of positions predicted correctly for each entry (method 2), tallied over the test set.
print((pred_rank_m2_df==Y_rank_test).sum(axis=1).value_counts())

3    3971
1    2953
2    2087
0     989
dtype: int64


In [540]:
# Proportion of entries where all 3 podium positions were predicted correctly by method 2.
rank_acc_m2 = ((pred_rank_m2_df==Y_rank_test).sum(axis=1)==3).mean()

In [552]:
# Refit only the selected model types on the full (scaled) dataset.
# This rebinds the notebook-level clfs_fitted_df / rggs_fitted_df, which are
# the tables pred_top3_clf and pred_time read their fitted models from, so the
# predictions made after this cell use the full-data fits.
clfs_fitted_df = pd.DataFrame(
    [[helper(copy.deepcopy(best_clf), animal, X_scaled, Y_top3)]
     for animal in animal_list],
    columns=[type(best_clf).__name__],
    index=animal_list,
)

rggs_fitted_df = pd.DataFrame(
    [[helper_rgg(copy.deepcopy(best_rgg), animal, X_scaled, Y_time)]
     for animal in animal_list],
    columns=[type(best_rgg).__name__],
    index=animal_list,
)



fitting RandomForestClassifier to rat 

fitting RandomForestClassifier to ox 

fitting RandomForestClassifier to tiger 

fitting RandomForestClassifier to rabbit 

fitting RandomForestClassifier to dragon 

fitting RandomForestClassifier to snake 

fitting RandomForestClassifier to horse 

fitting RandomForestClassifier to goat 

fitting RandomForestClassifier to monkey 

fitting RandomForestClassifier to rooster 

fitting RandomForestClassifier to dog 

fitting RandomForestClassifier to pig 

fitting RandomForestRegressor to rat 

fitting RandomForestRegressor to ox 

fitting RandomForestRegressor to tiger 

fitting RandomForestRegressor to rabbit 

fitting RandomForestRegressor to dragon 

fitting RandomForestRegressor to snake 

fitting RandomForestRegressor to horse 

fitting RandomForestRegressor to goat 

fitting RandomForestRegressor to monkey 

fitting RandomForestRegressor to rooster 

fitting RandomForestRegressor to dog 

fitting RandomForestRegressor to pig 



In [555]:
# Raw feature values of the race to predict, in the order of the first seven
# feature columns.
X_submit = pd.DataFrame([[2345, 8764, 6689, 7332, 1050, 3741, 40]], columns=X_test.columns[:7])
X_submit['Is_Raining'] = 0

# Total distance is the sum of the six terrain columns only, the same
# definition used when the training features were engineered. Summing the
# whole row would also add the temperature and the rain flag.
terrain_cols = ['Desert', 'Mountain', 'Water', 'Land', 'Forest', 'Snow']
X_submit['Total_distance'] = X_submit[terrain_cols].sum(axis=1)

# Same column order as the training features.
X_submit = X_submit[X.columns]

# The models were fitted on min-max scaled features, so the submission row is
# scaled with the same per-column min/max taken from X.
X_submit_scaled = (X_submit - X.min()) / (X.max() - X.min())

pred_rank_m2(X_submit_scaled, best_rgg, best_clf)

Unnamed: 0,rat,ox,tiger,rabbit,dragon,snake,horse,goat,monkey,rooster,dog,pig
0,,1.0,,2.0,,,,,,,,3.0
