## Import Pandas and the classifiers to experiment with

In [None]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

## Evaluate models on test data

In [None]:
from sklearn import metrics

def evaluate_model(model, row_name, training_file, test_file):
    """Fit ``model`` on one CSV file and score its predictions on another.

    Both files are read with their first column as the index. The
    ``'won_match'`` column is the target; every other column of the training
    file is used as a feature, and the same columns are selected from the
    test file.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place on the training data.
        row_name: Label placed first in the returned row.
        training_file: Path of the CSV file used for fitting.
        test_file: Path of the CSV file used for scoring. It must contain
            the training feature columns and ``'won_match'``.

    Returns:
        A list ``[row_name, misclassification, recall, precision, f1,
        true_positive_rate, false_positive_rate, true_negative_rate,
        false_negative_rate]``.
    """
    target_feature = 'won_match'

    training_df = pd.read_csv(training_file, index_col=0)
    test_df = pd.read_csv(test_file, index_col=0)
    training_columns = [col for col in training_df.columns if col != target_feature]

    model.fit(training_df[training_columns], training_df[target_feature])
    predictions = model.predict(test_df[training_columns])
    actual = test_df[target_feature]

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps recall with precision and transposes the confusion matrix.
    misclassification = 1 - metrics.accuracy_score(actual, predictions)
    recall = metrics.recall_score(actual, predictions)
    precision = metrics.precision_score(actual, predictions)
    f1 = metrics.f1_score(actual, predictions)

    # With (y_true, y_pred) the rows are the actual class and the columns the
    # predicted class, so [0][1] is "actual 0, predicted 1" (a false positive)
    # and [1][0] is "actual 1, predicted 0" (a false negative).
    confusion_matrix = metrics.confusion_matrix(actual, predictions)
    true_negatives = confusion_matrix[0][0]
    false_positives = confusion_matrix[0][1]
    false_negatives = confusion_matrix[1][0]
    true_positives = confusion_matrix[1][1]

    actual_positives = true_positives + false_negatives
    actual_negatives = true_negatives + false_positives
    true_positive_rate = true_positives / actual_positives
    true_negative_rate = true_negatives / actual_negatives
    false_positive_rate = false_positives / actual_negatives
    false_negative_rate = false_negatives / actual_positives

    return [
        row_name,
        misclassification,
        recall,
        precision,
        f1,
        true_positive_rate,
        false_positive_rate,
        true_negative_rate,
        false_negative_rate,
    ]


def evaluate_model_by_league(model):
    """Build a per-league comparison table for ``model``.

    Each league is evaluated twice against its own test file: first with the
    model trained on the league's own training file, then with the model
    trained on the shared 'data/individual_teams.csv' file (the rows whose
    name ends in ' - Base').

    Args:
        model: Estimator passed unchanged to ``evaluate_model`` for every row.

    Returns:
        A ``pandas.DataFrame`` with one row per evaluation and the columns
        League, Misclassification, Recall, Precision, F1, TPR, FPR, TNR, FNR.
    """
    base_training_file = 'data/individual_teams.csv'

    # (row name, league-specific training file, test file)
    leagues = (
        ('Premier League', 'data/premier_league.csv', 'data/test_premier_league.csv'),
        ('Bundesliga', 'data/bundesliga.csv', 'data/test_bundesliga.csv'),
        ('La Liga', 'data/la_liga.csv', 'data/test_la_liga.csv'),
        ('Ligue Un', 'data/ligue_un.csv', 'data/test_ligue_un.csv'),
        ('Serie A', 'data/serie_a.csv', 'data/test_serie_a.csv'),
    )

    rows = []
    for league, league_training_file, league_test_file in leagues:
        # League-specific row first, then its baseline, matching table order.
        rows.append(
            evaluate_model(model, league, league_training_file, league_test_file)
        )
        rows.append(
            evaluate_model(model, league + ' - Base', base_training_file, league_test_file)
        )

    header = [
        'League',
        'Misclassification',
        'Recall',
        'Precision',
        'F1',
        'TPR',
        'FPR',
        'TNR',
        'FNR',
    ]
    return pd.DataFrame(rows, columns=header)

In [None]:
# Logistic regression using the L-BFGS solver; the resulting per-league
# table is the value of this cell.
model = LogisticRegression(solver='lbfgs')
evaluate_model_by_league(model)

In [None]:
# Random forest with the library's default hyperparameters.
evaluate_model_by_league(RandomForestClassifier())

In [None]:
# Single decision tree with the library's default hyperparameters.
evaluate_model_by_league(DecisionTreeClassifier())

In [None]:
# Support vector classifier with a linear kernel.
evaluate_model_by_league(SVC(kernel='linear'))

In [None]:
# Linear classifier trained with SGD using the modified Huber loss.
# NOTE(review): scikit-learn documents eta0 as unused by the 'optimal'
# learning-rate schedule, so eta0=1 probably has no effect here; confirm.
evaluate_model_by_league(SGDClassifier(loss='modified_huber', learning_rate='optimal', eta0=1, alpha=0.00001))

In [None]:
# Multi-layer perceptron: one hidden layer of 90 units with logistic
# activation, capped at 200 training iterations.
evaluate_model_by_league(MLPClassifier(hidden_layer_sizes=(90,), activation='logistic', max_iter=200))