# Modeling
For the modeling portion of this project, we will be taking a slightly different approach. In this, all years will be treated equally as separate observations. We will only be predicting Gold medal winners in select sports. 

In [70]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

First we will pre process our data to get just our target labels and our input data.

In [71]:
df = pd.read_csv("./data/summer.csv")
for col in df.columns[1:]:
    df[col] = pd.Categorical(df[col])
df = df.drop("Sport", axis = 1).drop("Athlete",axis  =1)
df = df.drop_duplicates()

In [72]:
df["yearWeight"] = (2/3)**(29 - (df["Year"] - 1896)/4) # 2/3 to an exponent = to how many olympiads ago that olympiad was. so 2012's exponent is 0, 2008's is 1, etc.

In [73]:
df["medalWeight"] = 0
def weightMedal (row):
    if row['Medal'] == "Gold" :
        return 4
    if row["Medal"] == "Silver":
        return 2
    if row['Medal'] == "Bronze" :
        return 1
df['medalWeight'] = df.apply(lambda row: weightMedal(row), axis=1)
df["Weight"] = df['medalWeight'] * df["yearWeight"]

In [74]:
df = df[df.Year != 1896]
allscores  = df.groupby(["Country","Event"]).sum()["Weight"]
countries = df["Country"].drop_duplicates().dropna()
events = df[(df["Discipline"] == "Swimming") |(df["Discipline"] == "Athletics")].Event.drop_duplicates()
scores = pd.DataFrame(index = events,columns = countries)
for country in countries:
    for event in events:
        scores.loc[event,country] = allscores[country,event]
for event in scores.index:
    if scores.loc[event].count() < 20:
        scores.drop(event,inplace = True)

In [84]:
londonGolds = df[(df["Gender"] == "Men") &(df["Medal"] == "Gold") &(df["Year"] == 2012) & ((df["Discipline"] == "Swimming")|(df["Discipline"] == "Athletics"))]
y = londonGolds[['Event','Country']]
scores.index = scores.index.astype(str)
scores.columns = scores.columns.astype(str)
Xy = pd.merge(scores, y, on= "Event")

Let's separate our truth and our input data.

In [85]:
Xy = Xy.set_index("Event")

In [90]:
X = Xy.iloc[:,:-1]
y = Xy.iloc[:,-1]
X = X.fillna(0)

In [122]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from sklearn.preprocessing import StandardScaler

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .2, train_size = .8,random_state = 0)

In [99]:
clf = DecisionTreeClassifier(min_samples_leaf = 5,random_state = 100)
clf.fit(X_train,y_train)
print(clf)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=100, splitter='best')


In [100]:
y_pred_train = pd.DataFrame(clf.predict(X_train),columns = ["winner"])
y_pred_test = pd.DataFrame(clf.predict(X_test),columns = ['winner'])

In [110]:
from sklearn.model_selection import cross_validate

clf = DecisionTreeClassifier()
df_results = cross_validate(clf,X,y,cv=5,scoring=["f1_macro","accuracy"],return_train_score=True)
    
df_results = pd.DataFrame(df_results)
df_results


The least populated class in y has only 1 members, which is less than n_splits=5.



Unnamed: 0,fit_time,score_time,test_f1_macro,train_f1_macro,test_accuracy,train_accuracy
0,0.002993,0.001994,0.0,1.0,0.0,1.0
1,0.002026,0.002004,0.0,1.0,0.0,1.0
2,0.002021,0.001994,0.214286,1.0,0.333333,1.0
3,0.001995,0.001994,0.074074,1.0,0.166667,1.0
4,0.00299,0.002997,0.142857,1.0,0.2,1.0


In [112]:
from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(DecisionTreeClassifier(),X,y,cv=5)
print(classification_report(y,y_pred))

              precision    recall  f1-score   support

         ALG       0.00      0.00      0.00         1
         AUS       0.00      0.00      0.00         1
         BAH       0.00      0.00      0.00         1
         CHN       0.00      0.00      0.00         1
         DOM       0.00      0.00      0.00         1
         FRA       0.00      0.00      0.00         2
         GBR       0.50      0.67      0.57         3
         GER       0.00      0.00      0.00         1
         GRN       0.00      0.00      0.00         1
         HUN       0.00      0.00      0.00         2
         JAM       1.00      0.33      0.50         3
         KEN       0.00      0.00      0.00         1
         POL       0.00      0.00      0.00         1
         RSA       0.00      0.00      0.00         2
         RUS       0.00      0.00      0.00         1
         TTO       0.00      0.00      0.00         1
         UGA       0.00      0.00      0.00         1
         USA       0.25    


The least populated class in y has only 1 members, which is less than n_splits=5.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [116]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
print("Default decision tree")
y_pred = cross_val_predict(DecisionTreeClassifier(),X,y)
print(classification_report(y,y_pred))
print("Decision Tree using entropy")
y_pred = cross_val_predict(DecisionTreeClassifier(criterion = "entropy"),X,y)
print(classification_report(y,y_pred))
print("K neighbors=3")
y_pred = cross_val_predict(KNeighborsClassifier(3),X,y)
print(classification_report(y,y_pred))
print("Bayes")
y_pred = cross_val_predict(MultinomialNB(),X,y)
print(classification_report(y,y_pred))

Default decision tree
              precision    recall  f1-score   support

         ALG       0.00      0.00      0.00         1
         AUS       0.00      0.00      0.00         1
         BAH       0.00      0.00      0.00         1
         CHN       0.00      0.00      0.00         1
         DOM       0.00      0.00      0.00         1
         FRA       0.00      0.00      0.00         2
         GBR       0.17      0.33      0.22         3
         GER       0.00      0.00      0.00         1
         GRN       0.00      0.00      0.00         1
         HUN       0.00      0.00      0.00         2
         JAM       1.00      0.67      0.80         3
         KEN       0.00      0.00      0.00         1
         POL       0.00      0.00      0.00         1
         RSA       0.00      0.00      0.00         2
         RUS       0.00      0.00      0.00         1
         TTO       0.00      0.00      0.00         1
         UGA       0.00      0.00      0.00         1
     


The least populated class in y has only 1 members, which is less than n_splits=5.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The least populated class in y has only 1 members, which is less than n_splits=5.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The least populated class in y has only 1 members, which is less than n_splits=5.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The least populated class in y has only 1 members, which is less than n_splits=5.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



the best accuracy was find using bayes. Let's expand on that a little.

In [118]:
y_pred = cross_val_predict(MultinomialNB(alpha = .25),X,y)
print("Default bayes accuracy is 27%")
print("lower alpha (less smoothing):")
print(classification_report(y,y_pred))

y_pred = cross_val_predict(MultinomialNB(alpha = 2),X,y)
print("higher alpha (more smoothing):")
print(classification_report(y,y_pred))

Default bayes accuracy is 27%
lower alpha (less smoothing):
              precision    recall  f1-score   support

         ALG       0.00      0.00      0.00         1
         AUS       0.00      0.00      0.00         1
         BAH       0.00      0.00      0.00         1
         CHN       0.00      0.00      0.00         1
         DOM       0.00      0.00      0.00         1
         FRA       0.00      0.00      0.00         2
         GBR       0.50      0.67      0.57         3
         GER       0.00      0.00      0.00         1
         GRN       0.00      0.00      0.00         1
         HUN       0.00      0.00      0.00         2
         JAM       1.00      1.00      1.00         3
         KEN       0.00      0.00      0.00         1
         POL       0.00      0.00      0.00         1
         RSA       0.25      0.50      0.33         2
         RUS       0.00      0.00      0.00         1
         TTO       0.00      0.00      0.00         1
         UGA       0.


The least populated class in y has only 1 members, which is less than n_splits=5.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


The least populated class in y has only 1 members, which is less than n_splits=5.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [121]:

y_pred = cross_val_predict(MultinomialNB(alpha = 3,fit_prior = False),X,y)
print("even more smoothing and a uniform prior:")
print(classification_report(y,y_pred))

even more smoothing and a uniform prior:
              precision    recall  f1-score   support

         ALG       0.00      0.00      0.00         1
         AUS       0.00      0.00      0.00         1
         BAH       0.00      0.00      0.00         1
         CHN       0.00      0.00      0.00         1
         DOM       0.00      0.00      0.00         1
         FRA       0.00      0.00      0.00         2
         GBR       0.40      0.67      0.50         3
         GER       0.00      0.00      0.00         1
         GRN       0.00      0.00      0.00         1
         HUN       0.00      0.00      0.00         2
         JAM       0.60      1.00      0.75         3
         KEN       0.00      0.00      0.00         1
         POL       0.00      0.00      0.00         1
         RSA       0.00      0.00      0.00         2
         RUS       0.00      0.00      0.00         1
         TTO       0.00      0.00      0.00         1
         UGA       0.00      0.00      0


The least populated class in y has only 1 members, which is less than n_splits=5.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



With 34% accuracy, it is not bad, considering how many options there are to choose from. We see that it is easiest to be correct when the athletics and swimming are carried by the winner, which makes sense.