# In [None]:
import math
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from time import time 
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost import plot_importance
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer


def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the training data and print how long the fit took.

    Args:
        clf: Estimator exposing ``fit(X, y)``; it is fitted in place.
        X_train: Training features, passed unchanged to ``clf.fit``.
        y_train: Training targets, passed unchanged to ``clf.fit``.
    """
    # Only the fit call itself is timed.
    began = time()
    clf.fit(X_train, y_train)
    elapsed = time() - began

    print("Trained model in {:.4f} seconds".format(elapsed))

    
def predict_labels(clf, features, target, label):
    """Predict with a fitted classifier and score the predictions.

    Args:
        clf: Fitted estimator exposing ``predict``.
        features: Feature rows to predict on.
        target: True labels, compared element-wise with the predictions.
        label: Class treated as the positive one when computing the F1 score.

    Returns:
        A ``(f1, accuracy)`` tuple, where accuracy is the fraction of
        predictions that equal ``target``.
    """
    # Only the predict call itself is timed.
    began = time()
    y_pred = clf.predict(features)
    elapsed = time() - began

    print("Made predictions in {:.4f} seconds.".format(elapsed))

    f1 = f1_score(target, y_pred, pos_label=label)
    accuracy = sum(target == y_pred) / float(len(y_pred))
    return f1, accuracy


def train_predict(clf, X_train, y_train, X_test, y_test, label):
    """Fit ``clf`` and print its F1 / accuracy on the training and test sets.

    Args:
        clf: Estimator to fit; it is fitted in place.
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features.
        y_test: Held-out targets.
        label: Positive class used for the F1 score.
    """
    model_name = clf.__class__.__name__
    print("Training a {} using a training set size of {}. . .".format(model_name, len(X_train)))

    train_classifier(clf, X_train, y_train)

    # Training-set scores are printed twice: raw values first, then formatted.
    train_f1, train_acc = predict_labels(clf, X_train, y_train, label)
    print(train_f1, train_acc)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(train_f1, train_acc))

    test_f1, test_acc = predict_labels(clf, X_test, y_test, label)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(test_f1, test_acc))
    

def show_features_importances(clf_H):
    """Plot the importance of at most 20 features of ``clf_H`` and show it.

    Args:
        clf_H: Model handed to xgboost's ``plot_importance``.
    """
    shown_features = 20
    plot_importance(clf_H, max_num_features=shown_features)
    plt.show()

def _train_label(clf, X_all, y_all, label):
    """Split the data, then train and evaluate ``clf`` on the two parts.

    Args:
        clf: Estimator to fit; it is fitted in place.
        X_all: Full feature set.
        y_all: Full target labels; also used to stratify the split.
        label: Positive class used for the F1 score.

    Returns:
        The same ``clf`` object, now fitted.
    """
    # 7% of the rows are held out; the fixed seed keeps the split repeatable.
    split = train_test_split(
        X_all,
        y_all,
        test_size=0.07,
        random_state=2,
        stratify=y_all,
    )
    X_train, X_test, y_train, y_test = split

    train_predict(clf, X_train, y_train, X_test, y_test, label)
    return clf

def train_label_xgb(X_all, y_all, label, weight=0):
    """Train and evaluate an XGBoost classifier for ``label``.

    Args:
        X_all: Full feature set.
        y_all: Full target labels; must support ``value_counts()``.
        label: Positive class used for the F1 score and the class weight.
        weight: Pass ``1`` to use ``scale_pos_weight=1``. Any other value is
            replaced by the ratio of ``'N'`` rows to ``label`` rows in
            ``y_all``.

    Returns:
        The fitted ``XGBClassifier``.
    """
    if weight != 1:
        class_counts = y_all.value_counts()
        weight = class_counts['N'] / class_counts[label]

    model = xgb.XGBClassifier(seed=2, scale_pos_weight=weight)
    return _train_label(model, X_all, y_all, label)

def train_label_svc(X_all, y_all, label):
    """Train and evaluate an RBF-kernel SVC (with probabilities) for ``label``.

    Returns:
        The fitted ``SVC``.
    """
    return _train_label(
        SVC(random_state=2, kernel='rbf', probability=True),
        X_all,
        y_all,
        label,
    )

def train_label_lg(X_all, y_all, label):
    """Train and evaluate a logistic-regression classifier for ``label``.

    Returns:
        The fitted ``LogisticRegression``.
    """
    return _train_label(LogisticRegression(random_state=42), X_all, y_all, label)


def tune_parameters(X_all, y_all, label, parameters, random=False):
    """Search XGBoost hyper-parameters by cross-validated F1 on ``label``.

    The data is split into a stratified train/test partition (7% held out),
    a 5-fold search is fitted on the training part, and the best estimator's
    F1 and accuracy are displayed for both parts.

    Args:
        X_all: Full feature set.
        y_all: Full target labels; also used to stratify the split.
        label: Positive class for the F1 scorer and the reported scores.
        parameters: Search space, used as ``param_grid`` for the grid search
            or ``param_distributions`` for the randomized search.
        random: When truthy, run a 200-iteration ``RandomizedSearchCV``
            instead of an exhaustive ``GridSearchCV``.

    Returns:
        The fitted search object.
    """
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                    test_size = 0.07,
                                                    random_state = 2,
                                                    stratify = y_all)

    clf = xgb.XGBClassifier(seed=2)

    # Score on the requested class rather than a hard-coded 'H', so the
    # ``label`` argument is honoured.
    f1_scorer = make_scorer(f1_score, pos_label=label)

    if random:
        grid_obj = RandomizedSearchCV(clf,
                            scoring=f1_scorer,
                            param_distributions=parameters,
                            n_iter=200,
                            cv=5)
    else:
        grid_obj = GridSearchCV(clf,
                            scoring=f1_scorer,
                            param_grid=parameters,
                            cv=5)

    grid_obj = grid_obj.fit(X_train, y_train)

    # Older search objects expose ``grid_scores_``; newer scikit-learn
    # releases only provide ``cv_results_``. Show whichever is available.
    scores = getattr(grid_obj, "grid_scores_", None)
    if scores is None:
        scores = grid_obj.cv_results_
    display(scores)

    # Keep the best estimator found by the search.
    clf = grid_obj.best_estimator_
    display("Best score: {:.4f}".format(grid_obj.best_score_))
    display(clf)

    # Report the final F1 score for training and testing after parameter tuning
    f1, acc = predict_labels(clf, X_train, y_train, label)
    display("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

    f1, acc = predict_labels(clf, X_test, y_test, label)
    display("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

    return grid_obj



def get_model(clf, X_all,y_all,parameters, label, weight=0):
    """Grid-search ``clf`` on the full data set and return the best estimator.

    Args:
        clf: Base estimator handed to ``GridSearchCV``.
        X_all: Full feature set; used for both fitting and the final report.
        y_all: Full target labels; must support ``value_counts()``.
        parameters: Parameter grid for the search. It is not modified.
        label: Positive class for the F1 scorer and the reported scores.
        weight: Pass ``1`` to leave the grid as given. Any other value adds a
            ``scale_pos_weight`` entry equal to the ratio of ``'N'`` rows to
            ``label`` rows in ``y_all``.

    Returns:
        The best estimator found by the 5-fold grid search.
    """
    if weight != 1:
        class_counts = y_all.value_counts()
        weight = class_counts['N'] / class_counts[label]
        # Work on a copy so the caller's grid is not silently extended and
        # can be reused for another label.
        parameters = dict(parameters)
        parameters['scale_pos_weight'] = [weight]

    f1_scorer = make_scorer(f1_score, pos_label=label)

    grid_obj = GridSearchCV(clf,
                            scoring=f1_scorer,
                            param_grid=parameters,
                            cv=5)

    # The search is fitted on all rows; no hold-out set is kept here.
    grid_obj = grid_obj.fit(X_all, y_all)

    # Keep the best estimator found by the search.
    clf = grid_obj.best_estimator_
    display(clf)

    # Scores below are measured on the same rows the search was fitted on.
    f1, acc = predict_labels(clf, X_all, y_all, label)
    display("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    return clf

def simulate_predict(clf, X_last, y_last, label):
    """Predict classes and class probabilities for ``X_last`` and print scores.

    Args:
        clf: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_last: Feature rows to predict on.
        y_last: True labels for ``X_last``, used only for the printed scores.
        label: Positive class used for the F1 score.

    Returns:
        A ``(predictions, probabilities)`` tuple as returned by the estimator.
    """
    predictions = clf.predict(X_last)
    probabilities = clf.predict_proba(X_last)

    # predict_labels runs its own prediction pass to compute the scores.
    scores = predict_labels(clf, X_last, y_last, label)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(scores[0], scores[1]))

    return predictions, probabilities



def simulate_bets(y_pred_, y_pred_prod_, y_last_bets, label, column, prob, umbral):
    """Replay a fixed-stake betting strategy and track the running balance.

    The simulation starts with a balance of 100 and stakes a constant
    ``100 / 20`` on every row that qualifies. A winning bet is credited with
    ``stake * odds``.

    Args:
        y_pred_: Predicted class per row, indexed by row position.
        y_pred_prod_: Per-row probability sequences, indexed by row position.
            Only the first entry of each is used, presumably the probability
            of ``label``; verify against the classifier's class order.
        y_last_bets: DataFrame with an ``'FTR'`` column (actual result) and
            the odds column named by ``column``.
        label: Outcome being bet on.
        column: Name of the odds column in ``y_last_bets``.
        prob: Probability threshold the first probability entry must exceed.
        umbral: Threshold the odds must exceed for a bet to be placed.

    Returns:
        A list holding the balance as it stood before each row was processed.
    """
    balance = 100
    stake = balance / 20
    wins = 0
    skipped = 0
    balance_history = []

    for position, (_, match) in enumerate(y_last_bets.iterrows()):
        actual = match['FTR']
        odds = match[column]
        predicted = y_pred_[position]
        probabilities = y_pred_prod_[position]

        # Record the balance before deciding anything about this row.
        balance_history.append(balance)

        # Rows without odds can never be bet on.
        if math.isnan(odds):
            skipped += 1
            continue

        # A bet needs a sufficient probability and sufficient odds. With a
        # threshold of 0.5 or more, the predicted class must also be ``label``.
        places_bet = (
            probabilities[0] > prob
            and (prob < 0.5 or predicted == label)
            and odds > umbral
        )
        if not places_bet:
            skipped += 1
            continue

        balance -= stake
        if actual == label:
            wins += 1
            balance += stake * odds

    bets_placed = len(y_pred_) - skipped
    print(bets_placed, wins)

    # Avoid dividing by zero when no bet was placed.
    total = bets_placed if bets_placed != 0 else 1
    print("Balance and accuracy score for training set: {:.4f} , {:.4f}.".format(balance, (wins / total)))
    return balance_history

def custom_loss_func(y_true, y_pred, X_used = None):
    """Score ``'H'`` predictions as a betting profit.

    Rows whose prediction is not ``'H'`` contribute nothing. A wrong ``'H'``
    prediction costs 1. A correct one earns the row's ``IWH`` value minus 1.

    Args:
        y_true: pandas Series of true labels.
        y_pred: Predictions, indexed by position in the order of ``y_true``.
        X_used: Frame providing the ``IWH`` column. When it is missing or the
            row cannot be looked up, a correct ``'H'`` prediction adds
            nothing.

    Returns:
        The accumulated score (higher is better).
    """
    error = 0
    # ``Series.items()`` replaces ``iteritems()``, which pandas 2.0 removed.
    for position, (row_label, actual) in enumerate(y_true.items()):
        if y_pred[position] != 'H':
            continue

        if actual != 'H':
            error += -1
            continue

        try:
            # NOTE(review): the index label from y_true is used as a
            # *positional* index into X_used; this only lines up when the
            # labels match row positions. Confirm against the caller.
            error += 1 * (X_used.iloc[row_label].IWH - 1)
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # Best effort: no usable odds for this row, so it adds nothing.
            continue
    return error
