# Load Libraries and Set Hyperparameters

In [177]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# import feather

In [190]:
# Set hyperparameter values
num_trees = 30
# rand_state = np.random.normal()
rand_state = 420
num_jobs = 1 # parameter for parallelization
test_prop = 0.15 # proportion of data to use for testing

# Read in Data and Construct Predictors and Set Outcome Variable

Assumption is that data has AT LEAST the following columns: (1) "league", (2) "outcome" (i.e., who won which is either home, away, or draw), and (2) home-, (3) away-, and (4) draw-odds at every hour up to 72 hours before the game. The form of the odds should be "`<home/away/draw>_<{0,1,...,71}>`" where 0 corresponds to closing odds (i.e., odds closest to game time) and 71 corresponds to opening odds (i.e., odds furthest from game time).

### Read in Data

In [191]:
os.getcwd()
os.chdir("/Users/Erik/Desktop/Eulers-Men-Sports-Betting/")

'C:\\Users\\Erik\\Desktop\\Eulers-Men-Sports-Betting'

In [192]:
df = pd.read_csv('DataPreprocessing/ml_df_test.csv', sep=',')
# df.describe()
# df.dtypes
# df["league"].astype('category')
# df.dtypes

### Construct Predictors and Outcome Variables

Ignoring "league" right now because of one-hot-encoding issues

In [193]:
# Predictors including home-away odds differences, draw odds, and league
# predictors = [("home_away_" in column) or ("draw_" in column) or ("league" in column) for column in list(df.columns)]

# Predictors including home-away odds differences and draw odds
# predictors = [("home_away_" in column) or ("draw_" in column) for column in list(df.columns)]

# Predictors including home-away odds differences and just closing draw odds
# predictors = [("home_away_" in column) or ("draw_" in column) for column in list(df.columns)]

# Predictors including just closing home-away odds differences
# predictors = [("home_away_diff_0" in column) for column in list(df.columns)]

# Predictors just home_0, away_0, draw_0
predictors = [("_0" in column) for column in list(df.columns)]

df.columns[predictors]

X = df.iloc[:,predictors]

Index(['home_away_diff_0', 'draw_0', 'home_0', 'away_0'], dtype='object')

In [194]:
y = df["outcome"]

In [195]:
X.head()
y.head()

Unnamed: 0,home_away_diff_0,draw_0,home_0,away_0
0,-12.86,6.0,1.14,14.0
1,-2.27,3.5,1.73,4.0
2,-0.35,3.4,2.45,2.8
3,-11.71,4.75,1.29,13.0
4,-1.0,3.75,2.1,3.1


0    home
1    away
2    home
3    draw
4    home
Name: outcome, dtype: object

```

```

# ML Model

In [196]:
# Use random state
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_prop, random_state=rand_state)
# RF_clf = RandomForestClassifier(n_estimators=num_trees, random_state=rand_state)

# Does not use random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_prop)
RF_clf = RandomForestClassifier(n_estimators=num_trees)

In [197]:
RF_clf = RandomForestClassifier(n_estimators=num_trees, random_state=rand_state)
RF_clf = RandomForestClassifier(n_estimators=num_trees)

In [198]:
RF_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [199]:
y_pred = RF_clf.predict(X_test)
# y_pred[:10], y_test[:10]

(array(['home', 'home', 'home', 'home', 'home', 'home', 'away', 'home',
        'away', 'home'], dtype=object), 14815    home
 13123    draw
 6893     away
 24054    home
 11575    away
 19289    away
 22701    home
 6469     home
 3200     draw
 6218     draw
 Name: outcome, dtype: object)

In [201]:
sum(y_pred==y_test)/len(y_test)

# feature_importances = pd.DataFrame(RF_clf.feature_importances_,
#                                    index = X_train.columns,
#                                     columns=['importance']).sort_values('importance',                                                                 ascending=False)
# feature_importances


0.44731878199946107

# Compare our Model to Other Betting Strategies

### Bet on Team with Higher Odds

In [202]:
def pred_team_with_higher_odds(X_test):
    predictions = ["home" if odds_diff<0 else "away" for odds_diff in X_test['home_away_diff_0']]
    return(np.array(predictions))

In [203]:
bet_high_odds_preds = pred_team_with_higher_odds(X_test)
y_test.shape
y_pred.shape
bet_high_odds_preds.shape
sum(bet_high_odds_preds==y_test)/len(y_test)


(3711,)

(3711,)

(3711,)

0.4925895984909728

In [99]:
df = X_test[['home_away_diff_71', 'draw_71']]
predictions = ["home" if odds_diff>=0 else "away" for odds_diff in X_test['home_away_diff_71']]
predictions[:10]
X_test['home_away_diff_71'][:10]


['away',
 'away',
 'away',
 'away',
 'away',
 'home',
 'away',
 'home',
 'away',
 'home']

18654    -1.30
5997     -4.25
28235    -1.10
15046    -2.20
6881    -13.78
3462      1.00
23143    -0.50
23735     2.94
17508    -3.97
15237     6.56
Name: home_away_diff_71, dtype: float64

In [91]:
X_test[['home_away_diff_71', 'draw_71']]

Unnamed: 0,home_away_diff_71,draw_71
18654,-1.30,3.40
5997,-4.25,4.33
28235,-1.10,3.00
15046,-2.20,3.80
6881,-13.78,5.50
3462,1.00,3.30
23143,-0.50,3.40
23735,2.94,4.10
17508,-3.97,4.33
15237,6.56,3.75


# Ideas
1. Predict winnings given bet size
2. Predict who will win or lose
2. Classifier that will spit out probabilities of winning

# Steps
1. For given dataset (i.e., csv file), create $X$ matrix with features and $y$ vector with outcomes. This is the one step that is dependent on the dataset. Everything else that follows should be independent of dataset.
2. Split data into train/test split
3. Classify
4. Spit out plots and metrics

# Notes
* **Predictors being used now**: 
* Some ML models (e.g., regression and SVM) are algebraic and thus their input must be numeric. To use these categories must be transformed into numbers

# To Dos
### Model evaluation-wise
1. Compare performance to just choosing favored team
2. Tune model re number of trees and any other parameters
3. How does model perform on those games for which difference in odds is high? Same question for difference in odds being low? 

### Miscellaneous
1. How is variable importance calculated for RFs? See [here](https://towardsdatascience.com/random-forest-in-python-24d0893d51c0)
2. How 

# Old Code

In [50]:
# label_colmn = "outcome"
# predictor_colmns = ['closing_odds_outcome', 'home_team', 'away_team', 'home_opening', 'home_closing', 'draw_opening',
#        'draw_closing', 'away_opening', 'away_closing',
#        'home_opening_minus_closing', 'draw_opening_minus_closing',
#        'away_opening_minus_closing', 'home_min', 'home_max', 'home_range',
#        'draw_min', 'draw_max', 'draw_range', 'away_min', 'away_max',
#        'away_range']
# # predictor_colmns = ['closing_odds_outcome']


# label_colmn 

In [27]:
# null_columns=X_train.columns[X_train.isnull().any()]
# print(X_train[X_train.isnull().any(axis=1)][null_columns].head())
# X_train['closing_odds_outcome'][289]
# null_columns
# X_train.isnull().any(axis=1).sum()

In [23]:
os.getcwd()
# df.shape
# df
tmp = df['outcome'].astype('category')
tmp.dtype
df.columns
pd.factorize(y)

'C:\\Users\\Erik\\Desktop\\Eulers-Men-Sports-Betting\\ML_Script'

CategoricalDtype(categories=['away', 'draw', 'home'], ordered=False)

Index(['Unnamed: 0', 'V1', 'match_id', 'match_date', 'match_time', 'outcome',
       'closing_odds_outcome', 'home_opening', 'home_closing', 'draw_opening',
       'draw_closing', 'away_opening', 'away_closing',
       'home_opening_minus_closing', 'draw_opening_minus_closing',
       'away_opening_minus_closing', 'home_min', 'home_max', 'home_range',
       'draw_min', 'draw_max', 'draw_range', 'away_min', 'away_max',
       'away_range'],
      dtype='object')

(array([0, 1, 0, 0, 0, 0, 2, 2, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2, 0, 2, 2, 0,
        1, 1, 0, 2, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 2, 2, 1, 1,
        0, 0, 1, 1, 2, 0, 1, 0, 1, 1, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 0, 2,
        2, 1, 0, 2, 1, 0, 2, 2, 0, 0, 1, 2, 0, 1, 2, 1, 0, 0, 1, 1, 1, 1,
        1, 2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 0, 0, 1, 0, 2, 1, 0, 1, 0, 0, 2,
        0, 2, 1, 0, 0, 2, 0, 0, 0, 0, 1, 1, 2, 0, 1, 2, 2, 2, 0, 0, 1, 2,
        1, 0, 0, 2, 0, 0, 0, 2, 1, 2, 0, 1, 1, 0, 1, 2, 0, 2, 2, 1, 1, 2,
        0, 1, 1, 0, 1, 0, 2, 0, 2, 1, 0, 2, 0, 1, 2, 0, 0, 1, 2, 0, 1, 1,
        1, 2, 1, 0, 0, 1, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0,
        1, 0, 0, 0, 1, 0, 1, 2, 2, 1, 0, 2, 1, 0, 1, 1, 2, 2, 0, 2, 0, 0,
        2, 0, 1, 0, 0, 2, 0, 2, 1, 2, 2, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 2,
        0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 1, 0, 2, 1, 2, 1, 0, 0, 0, 0, 2, 2,
        1, 0, 1, 2, 0, 1, 1, 0, 0, 1, 1, 2, 2, 2, 0, 1, 1, 0, 0, 2, 0, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 

In [28]:
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

Series([], dtype: float64)