In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# Classification
When importing dataset, by sorting with MatchID, dataset is now timeseries dataset. Every feature is (home-away) and result is decided with score diff. We have total 50 *STANDARDIZED* feature differences. This data frame is for classification.

In [8]:
data=pd.read_csv('Data/Dataset.csv').drop(['Unnamed: 0'],axis=1).sort_values(by=['MatchID'])
df_home=data.filter(regex='home').rename(columns=lambda x:x.replace('_home',''))
df_away=data.filter(regex='away').rename(columns=lambda x:x.replace('_away',''))
df_diff=df_home-df_away
cond=[(df_diff['Score']<0),(df_diff['Score']==0),(df_diff['Score']>0)]
val=['Away','Draw','Home']
df_diff['Result']=np.select(cond,val)
df_diff=df_diff.drop(columns=['Score'])
X=df_diff.iloc[:,:-1]
y=df_diff.iloc[:,-1:].values.ravel()

In [9]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## Train,Test,Validation set split
Split train,test,validation set into 0.8,0.1,0.1

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2)
X_test,X_val,y_test,y_val=train_test_split(X_test,y_test,test_size=0.5)

In [11]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

((2835, 50), (2835,), (354, 50), (354,), (355, 50), (355,))

## SVM

In [16]:
parameters = {'kernel': ('linear', 'rbf'), 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'C': np.arange(1, 10)}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print("test score:"+ str(clf.score(X_test,y_test)))
print("val score:"+str(clf.score(X_val,y_val)))

{'C': 7, 'gamma': 0.0001, 'kernel': 'rbf'}
0.5541446208112875
test score:0.5621468926553672
val score:0.5774647887323944


In [18]:
SVM_result=clf.cv_results_

In [38]:
tmp_score=pd.DataFrame(SVM_result['mean_test_score'])
tmp_attr=pd.DataFrame(SVM_result['params'])

In [54]:
SVM_score=pd.concat([tmp_score,tmp_attr],axis=1)
SVM_score.columns=['Score','C','gamma','kernel']
SVM_score=SVM_score.sort_values(by='Score',ascending=False)

In [55]:
SVM_score

Unnamed: 0,Score,C,gamma,kernel
69,0.554145,7,0.0001,rbf
59,0.553792,6,0.0001,rbf
79,0.553086,8,0.0001,rbf
49,0.552381,5,0.0001,rbf
39,0.552028,4,0.0001,rbf
...,...,...,...,...
81,0.450441,9,1.0000,rbf
61,0.450441,7,1.0000,rbf
21,0.450441,3,1.0000,rbf
71,0.450441,8,1.0000,rbf


more refine with fixed kernel

In [74]:
parameters = {'kernel': ['rbf'], 'gamma': [0.0001,0.00001,0.000001], 'C': np.arange(1, 10)}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print("test score:"+ str(clf.score(X_test,y_test)))
print("val score:"+str(clf.score(X_val,y_val)))

{'C': 7, 'gamma': 0.0001, 'kernel': 'rbf'}
0.5541446208112875
test score:0.5621468926553672
val score:0.5774647887323944


### optimal parameter for SVM classifier
> 'C': 7, 'gamma': 0.0001, 'kernel': 'rbf'

test score:0.56
val score:0.57

## SGD classifier

In [56]:
parameters = {'loss':['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],'max_iter':[500, 1000, 1500, 2000, 2500, 3000]}
svc = SGDClassifier()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print("test score:"+ str(clf.score(X_test,y_test)))
print("val score:"+str(clf.score(X_val,y_val)))

{'loss': 'hinge', 'max_iter': 2500}
0.5178130511463845
test score:0.5084745762711864
val score:0.504225352112676


In [57]:
SGD_result=clf.cv_results_

In [58]:
SGD_result

{'mean_fit_time': array([0.04657016, 0.04790359, 0.04653454, 0.04534016, 0.05208573,
        0.04828176, 0.06549859, 0.0649354 , 0.06655421, 0.07005467,
        0.06573906, 0.06637435, 0.05401974, 0.0549511 , 0.05217109,
        0.05947337, 0.05278435, 0.05589805, 0.04504905, 0.05192137,
        0.06072502, 0.05273027, 0.04371166, 0.04225407, 0.04774833,
        0.04566865, 0.04479065, 0.04643216, 0.04516473, 0.04904084,
        0.09585299, 0.09775386, 0.10857973, 0.10312271, 0.08956218,
        0.08989706, 0.03339806, 0.03314543, 0.03145885, 0.03165965,
        0.03286886, 0.03487067, 0.1069818 , 0.09043384, 0.09237661,
        0.08668704, 0.08226018, 0.09150434, 0.13087816, 0.12944274,
        0.1207067 , 0.12655139, 0.13280582, 0.14331365]),
 'std_fit_time': array([0.00634389, 0.01058731, 0.0081599 , 0.00489757, 0.00861178,
        0.00340617, 0.00385678, 0.00920806, 0.00482274, 0.01136555,
        0.00953257, 0.00462567, 0.00722481, 0.00530628, 0.01263617,
        0.01111073, 0.011

In [62]:
tmp_score=pd.DataFrame(SGD_result['mean_test_score'])
tmp_attr=pd.DataFrame(SGD_result['params'])

In [70]:
SGD_score=pd.concat([tmp_score,tmp_attr],axis=1)
SGD_score.columns=['Score','loss','max_iter']
SGD_score=SGD_score.sort_values(by='Score',ascending=False)

In [71]:
SGD_score

Unnamed: 0,Score,loss,max_iter
4,0.517813,hinge,2500
6,0.506173,log_loss,500
37,0.503704,huber,1000
2,0.500529,hinge,1500
10,0.500176,log_loss,2500
36,0.500176,huber,500
0,0.497707,hinge,500
40,0.496649,huber,2500
38,0.495944,huber,1500
8,0.493122,log_loss,1500


more refine with fixed loss function(hinge)

In [78]:
parameters = {'loss':['hinge'],'max_iter':np.arange(2000,6000,step=500),'random_state':np.arange(0,10)}
svc = SGDClassifier()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print("test score:"+ str(clf.score(X_test,y_test)))
print("val score:"+str(clf.score(X_val,y_val)))

{'loss': 'hinge', 'max_iter': 2000, 'random_state': 5}
0.508994708994709
test score:0.5338983050847458
val score:0.5774647887323944


### optimal parameter for SGD classifier
> 'loss': 'hinge', 'max_iter': 2000

test score:0.53
val score:0.57

## Decision Tree

In [12]:
parameters = {'random_state':np.arange(0,10)}
svc = SGDClassifier()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
print("test score:"+ str(clf.score(X_test,y_test)))
print("val score:"+str(clf.score(X_val,y_val)))
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)

{'random_state': 7}
0.5174603174603174
test score:0.5508474576271186
val score:0.543661971830986


### visualize decision tree

In [13]:
from sklearn.tree import export_graphviz
export_graphviz(clf, out_file="tree.dot", class_names = y_train,
                feature_names = X.columns, impurity=True, filled=True)

In [15]:
import graphviz
with open("tree.dot") as f:
 dot_graph = f.read()
graphviz.Source(dot_graph)

ModuleNotFoundError: No module named 'graphviz'

# Regression

## Logistic Regression

In [26]:
parameters={'max_iter':[500, 1000, 1500, 2000, 2500, 3000]}
 = LogisticRegression()
clf.max_iter=500
clf.fit(X_train, y_train)

In [27]:
clf.score(X_test,y_test)

0.519774011299435

In [28]:
clf.score(X_val,y_val)

0.5492957746478874