In [None]:
import numpy as np
import pandas as pd
import cleandata
import plots
import regressor
import classifier
from sklearn.metrics import mean_squared_error as mse

In [None]:
# --- Pipeline configuration -------------------------------------------------
# Year handed to cleandata.train_test_split for the validation split.
valid_year = 2012
# Year handed to cleandata.train_test_split for the final test split.
test_year = 2016

# When True the driver trains classifiers first and feeds the best one's
# predictions into the regressors; when False only regressors are trained.
run_classifier = True
# Forwarded as `run_cv` to the training functions (fit_cv vs. plain fit).
run_tuning = True

# Model-type names passed to classifier.Classifier(model_type=...).
classifier_list = [
    'LogisticRegression',
    'SVC',
    'GaussianNB',
    'RandomForest',
    'MLP',
]
# Model-type names passed to regressor.Regressor(model_type=...).
regressor_list = [
    'LinearRegression',
    'Ridge',
    'Lasso',
    'SVR',
    'RandomForest',
]

# Full dataset; the scoring helpers also read it to look up the Nation column.
data = pd.read_csv('clean_data2.csv')


In [None]:
def train_classifiers(x_t, y_t, x_v, y_v, run_cv=False):
    """Fit every classifier in ``classifier_list`` and score it on the validation rows.

    Parameters
    ----------
    x_t, y_t : training features and training labels passed to the
        classifier's ``fit`` / ``fit_cv``.
    x_v : validation features passed to ``predict``.
    y_v : accepted for signature symmetry but not read here; scoring is
        delegated to ``score_clf_models``, which reads the module-level
        ``y_valid``.
    run_cv : when True use ``fit_cv`` and the resulting ``model_cv``;
        otherwise use ``fit`` and the resulting ``model``.

    Returns
    -------
    (clf_predictions, clf_performance, clf_tuned) : three dicts keyed by
        classifier name holding the rounded, non-negative predictions, the
        value returned by ``score_clf_models`` and the fitted model object.
    """
    clf_predictions = {}
    clf_performance = {}
    clf_tuned = {}

    for clf_type in classifier_list:
        clf = classifier.Classifier(model_type=clf_type)

        # Predict with the model that was actually fitted in this branch
        # (previously the non-CV branch also reached for `model_cv`).
        if run_cv:
            clf.fit_cv(x_t, y_t)
            fitted_model = clf.model_cv
        else:
            clf.fit(x_t, y_t)
            fitted_model = clf.model

        y_predict = np.rint(fitted_model.predict(x_v))
        y_predict[y_predict < 0] = 0

        clf_performance[clf_type] = score_clf_models(y_predict, clf_type)
        clf_predictions[clf_type] = y_predict
        clf_tuned[clf_type] = fitted_model

    # Plot and return only once every classifier has been evaluated; doing
    # this inside the loop returned after the first model.
    plots.plot_clf(clf_performance)

    return clf_predictions, clf_performance, clf_tuned
 

In [None]:
def train_regressor(x_t, y_t, x_v, y_v, clf_predict=None, run_cv=False):
    """Fit every regressor in ``regressor_list`` and score it on the validation rows.

    Parameters
    ----------
    x_t, y_t : training features / targets.
    x_v, y_v : validation features / targets. ``y_v`` is used for the
        per-model prediction plot.
    clf_predict : optional array of 0/1 classifier predictions aligned with
        ``x_v``. When given, rows predicted 0 are forced to a prediction of 0,
        the regressors are trained only on rows with ``y_t > 0`` and only
        predict the rows the classifier marked 1.
    run_cv : when True use ``fit_cv`` / ``model_cv``; otherwise ``fit`` and
        the wrapper's ``predict``.

    Returns
    -------
    (reg_performance, reg_tuned) : dicts keyed by regressor name holding the
        value returned by ``score_reg_model`` and the fitted model object.
    """
    # 99 is a sentinel: any row the classifier labels neither 0 nor 1 keeps it.
    y_predict = 99 * np.ones(len(y_v))

    use_classifier = clf_predict is not None
    if use_classifier:
        predicted_medal = clf_predict == 1
        y_predict[clf_predict == 0] = 0
        has_medal = y_t > 0
        x_t = x_t[has_medal]
        y_t = y_t[has_medal]
        x_v = x_v[predicted_medal]
        # y_v is deliberately left unfiltered: y_predict is assembled at full
        # length, so the plot below compares against every validation row.

    reg_performance = {}
    reg_tuned = {}

    for reg_type in regressor_list:
        reg = regressor.Regressor(model_type=reg_type)
        if run_cv:
            reg.fit_cv(x_t, y_t)
            yp = np.rint(reg.model_cv.predict(x_v))
        else:
            reg.fit(x_t, y_t)
            yp = np.rint(reg.predict(x_v))

        yp[yp < 0] = 0

        if use_classifier:
            y_predict[predicted_medal] = yp
        else:
            y_predict = yp

        # Plot against the targets passed in rather than the notebook-global
        # `y_valid`, so the function does not depend on hidden kernel state.
        # NOTE(review): every regressor writes to the same save_path, so each
        # plot appears to overwrite the previous one - confirm this is intended.
        plots.plot(y_v, y_predict, reg_type+'Prediction',
                   line=True, save_path='regressorPred.ps')

        reg_performance[reg_type] = score_reg_model(y_predict, reg_type)
        reg_tuned[reg_type] = reg.model_cv if run_cv else reg.model

    if use_classifier:
        save_as = 'RegPerformanceWithClassification.ps'
    else:
        save_as = 'RegPerformanceWithoutClassification.ps'

    plots.plot_reg(reg_performance, save_path = save_as)

    return reg_performance, reg_tuned


In [None]:
def predict_test_set(x_t, y_t, x_v, y_v, clf=None, reg=None, classify=False):
    """Refit the chosen models on the training rows and predict the held-out rows.

    Parameters
    ----------
    x_t, y_t : training features / targets.
    x_v, y_v : held-out features / targets; ``y_v`` is used for the
        prediction plot.
    clf : estimator exposing ``fit(...).predict(...)``; required when
        ``classify`` is True. It is fitted on a 0/1 version of ``y_t``.
    reg : estimator exposing ``fit(...).predict(...)``; always required.
    classify : when True, rows the classifier predicts as 0 get a prediction
        of 0 and only rows predicted 1 are sent to the regressor.

    Returns
    -------
    Whatever ``score_reg_model(y_predict, 'Final Alg')`` returns.

    Raises
    ------
    ValueError : if ``reg`` is None, or ``classify`` is True and ``clf`` is None.
    """
    if reg is None:
        raise ValueError('predict_test_set requires a regressor (reg=...)')
    if classify and clf is None:
        raise ValueError('classify=True requires a classifier (clf=...)')

    # Default: treat every row as "has medals" so the regressor sees all rows.
    clf_predict = np.ones(len(y_v))
    if classify:
        yt_bool = np.zeros(len(y_t))
        yt_bool[y_t != 0] = 1
        clf_predict = clf.fit(x_t, yt_bool).predict(x_v)

    # 99 is a sentinel for rows the classifier labels neither 0 nor 1.
    y_predict = 99 * np.ones(len(y_v))
    y_predict[clf_predict == 0] = 0
    medal_rows = clf_predict == 1

    # NOTE(review): train_regressor drops rows with y_t == 0 when classifier
    # predictions are supplied, whereas the regressor here is fitted on all
    # training rows - confirm which behaviour is intended.
    yp = np.rint(reg.fit(x_t, y_t).predict(x_v[medal_rows]))
    yp[yp < 0] = 0

    y_predict[medal_rows] = yp

    # Plot against the targets passed to this call. The previous code used the
    # module-level `yv`, which the driver binds to the validation year.
    plots.plot(y_v, y_predict, 'Final Alg' + ' Predictions',
               line=True, save_path='final_prediction.ps')

    # NOTE(review): score_reg_model reads the module-level `y_valid`, so the
    # caller must make sure that global refers to the same rows as `y_v`.
    test_performance = score_reg_model(y_predict, 'Final Alg')

    return test_performance


In [None]:
def score_clf_models(y_predict, model):
    """Return the accuracy of 0/1 predictions against the validation targets.

    The ground truth is derived from the module-level ``y_valid``: a row
    counts as 1 when its value is greater than 0 and as 0 otherwise.

    Parameters
    ----------
    y_predict : array-like of 0/1 predictions, one per entry of ``y_valid``
        and in the same order.
    model : label prefixed to the printed accuracy line.

    Returns
    -------
    Fraction of rows where the prediction equals the 0/1 ground truth.

    Raises
    ------
    ValueError : if ``y_predict`` and ``y_valid`` differ in length.
    """
    actual = (np.asarray(y_valid).ravel() > 0).astype(float)
    predicted = np.asarray(y_predict, dtype=float).ravel()

    if len(actual) != len(predicted):
        raise ValueError(
            'y_predict has %d entries but y_valid has %d'
            % (len(predicted), len(actual))
        )

    # Compare row by row. The previous loop advanced its write position only
    # on medal-winning rows, which paired predictions with the wrong rows.
    accuracy = np.sum(actual == predicted) / len(actual)

    print(model+'Accuracy: ' + str(accuracy))

    return accuracy


In [None]:
def score_reg_model(y_predict, reg_type, top_n=100):
    """Report and return RMSE of predictions against the validation targets.

    Reads the module-level ``y_valid`` (targets) and ``data`` (for the
    ``Nation`` column). Prints the ``top_n`` rows with the highest actual
    values and writes them to ``final_alg_predictions.csv``.

    Parameters
    ----------
    y_predict : array-like of predictions, one per entry of ``y_valid`` and in
        the same order.
    reg_type : label prefixed to the printed loss lines.
    top_n : number of highest-actual rows used for the second RMSE and for the
        printed / saved table (default 100, the value previously hard-coded).

    Returns
    -------
    ``np.array([rmse_all_rows, rmse_top_n_rows])``.

    Raises
    ------
    ValueError : if ``y_predict`` and ``y_valid`` differ in length.
    """
    # The index labels of y_valid are used as row positions in `data`.
    # NOTE(review): this assumes `data` keeps its default 0..n-1 index - confirm.
    nations = [data.iloc[index].Nation for index in y_valid.index]

    actual = np.asarray(y_valid, dtype=float).ravel()
    predicted = np.asarray(y_predict, dtype=float).ravel()
    if len(actual) != len(predicted):
        raise ValueError(
            'y_predict has %d entries but y_valid has %d'
            % (len(predicted), len(actual))
        )

    my_df = pd.DataFrame(data={'Nation':nations,'Actual_Medals':actual,'Predicted_Medals':predicted})

    # Sort once and reuse the result for printing, saving and scoring.
    top_sorted = my_df.sort_values(by=['Actual_Medals'],ascending=False).head(top_n)
    print(top_sorted)
    top_sorted.to_csv('final_alg_predictions.csv',index=False)

    # Score against the same targets shown in the table. The previous code
    # used the separate module-level `yv`, which can drift from `y_valid`.
    rmse_all = np.sqrt(mse(actual, predicted))
    rmse_top = np.sqrt(mse(top_sorted['Actual_Medals'], top_sorted['Predicted_Medals']))

    print(reg_type + 'Average Std Loss: ' + str(rmse_all))
    # The label now reports the real slice size (it used to say "Top 10"
    # while scoring the top 100).
    print(reg_type + 'Avg Std Loss Top ' + str(top_n) + ': ' + str(rmse_top))
    return np.array([rmse_all, rmse_top])


In [None]:
def dict_argmin(mydict):
    """Return the first key (in iteration order) whose value is the minimum.

    Returns None when ``mydict`` is empty or otherwise falsy.
    """
    if not mydict:
        return None

    smallest = min(mydict.values())
    for key, value in mydict.items():
        if value == smallest:
            return key
    # Only reachable when no value compares equal to the minimum (e.g. NaN);
    # mirrors the IndexError raised by indexing an empty match list.
    raise IndexError('list index out of range')



In [None]:
def dict_argmax(mydict):
    """Return the first key (in iteration order) whose value is the maximum.

    Returns None when ``mydict`` is empty or otherwise falsy.
    """
    if not mydict:
        return None

    largest = max(mydict.values())
    for key, value in mydict.items():
        if value == largest:
            return key
    # Only reachable when no value compares equal to the maximum (e.g. NaN);
    # mirrors the IndexError raised by indexing an empty match list.
    raise IndexError('list index out of range')

In [None]:
if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Stage 1: model iteration and selection.
    # Split data into model tuning set (1988-2008) and validation set (2012).
    # ------------------------------------------------------------------
    trainvalid = cleandata.train_test_split(data, valid_year, normalized=False)
    x_train = trainvalid[0]
    y_train = trainvalid[1]
    x_valid = trainvalid[2]
    # `y_valid` (and `yv` below) are read as globals by the scoring helpers.
    y_valid = trainvalid[3]

    xt, yt, xv, yv = cleandata.to_numpy(x_train, y_train, x_valid, y_valid)

    yt_clf, yv_clf = cleandata.to_clf_data(yt, yv)

    if not run_classifier:
        reg_performance, reg_tuned = train_regressor(xt, yt, xv, yv,
                                                     clf_predict=None,
                                                     run_cv=run_tuning)
    else:
        clf_predictions, clf_performance, clf_tuned = train_classifiers(xt, yt_clf, xv, yv_clf,
                                                                        run_cv=run_tuning)

        best_classifier = dict_argmax(clf_performance)

        reg_performance, reg_tuned = train_regressor(xt, yt, xv, yv,
                                                     clf_predict=clf_predictions[best_classifier],
                                                     run_cv=run_tuning)

    # Combined selection score: overall RMSE plus a quarter of the top-rows RMSE.
    # The loop variable is deliberately not called `regressor`: that name is
    # the imported module, and rebinding it breaks train_regressor on re-run.
    reg_scores = {
        reg_name: perf[0] + 0.25 * perf[1]
        for reg_name, perf in reg_performance.items()
    }

    best_regressor = dict_argmin(reg_scores)

    # ------------------------------------------------------------------
    # Stage 2: final predictions.
    # Split data into final training set (1988-2012) and test set (2016).
    # ------------------------------------------------------------------
    traintest = cleandata.train_test_split(data, test_year, normalized=False)
    x_train = traintest[0]
    y_train = traintest[1]
    x_test = traintest[2]
    y_test = traintest[3]

    xt, yt, xtest, ytest = cleandata.to_numpy(x_train, y_train, x_test, y_test)

    # The scoring helpers read the module-level `y_valid` / `yv`. Point them at
    # the test year so the final predictions are not scored (and labelled with
    # nations) from the validation year.
    y_valid, yv = y_test, ytest

    if not run_classifier:
        predict_test_set(xt, yt, xtest, ytest, clf=None, reg=reg_tuned[best_regressor], classify=False)
    else:
        # A fitted search object exposes its winner as `best_estimator_`;
        # a plain estimator is used as-is.
        best_clf = getattr(clf_tuned[best_classifier], 'best_estimator_',
                           clf_tuned[best_classifier])
        best_reg = getattr(reg_tuned[best_regressor], 'best_estimator_',
                           reg_tuned[best_regressor])

        # Use the regressor selected above rather than a hard-coded 'Ridge'.
        predict_test_set(xt, yt, xtest, ytest, clf=best_clf, reg=best_reg, classify=True)

        print(best_reg.get_params())
        print(best_clf.get_params())

    # For plotting final results
    # NOTE(review): both arguments are the same dict, so "before" and "after"
    # are identical here - confirm what the "before" run should be.
    reg_perf_after = reg_performance
    plots.plot_before_after(reg_performance, reg_perf_after)