In [19]:
import pandas as pd 
import sqlite3
import pickle
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [20]:
# Getting the input data
con = sqlite3.connect("database.sqlite")

In [21]:
# Converting query to DataFrame
raw_data = pd.read_sql_query("SELECT * FROM football_data", con)
print(raw_data.columns)

Index(['Season', 'Date', 'Div', 'Country', 'League', 'Referee', 'HomeTeam',
       'AwayTeam', 'FTHG', 'FTAG',
       ...
       'VCCD', 'VCCH', 'VCD', 'VCH', 'WHA', 'WHCA', 'WHCD', 'WHCH', 'WHD',
       'WHH'],
      dtype='object', length=174)


In [22]:
first_division = ['F1','SP1','I1','D1','E0']
second_division = ['N1','P1','E1','SC0','T1','B1']
third_division = ['G1','I2','F2','E2','SP2','SC1','D2']
fourth_division = ['SC2','SC3','EC','E3']

'''
    Div = League Division
        FTHG and HG = Full Time Home Team Goals
        FTAG and AG = Full Time Away Team Goals
        FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)
        HTHG = Half Time Home Team Goals
        HTAG = Half Time Away Team Goals
        HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)

    Match Statistics (where available)
        HS = Home Team Shots
        AS = Away Team Shots
        HST = Home Team Shots on Target
        AST = Away Team Shots on Target
        HHW = Home Team Hit Woodwork
        AHW = Away Team Hit Woodwork
        HF = Home Team Fouls Committed
        AF = Away Team Fouls Committed
        HFKC = Home Team Free Kicks Conceded
        AFKC = Away Team Free Kicks Conceded
        HO = Home Team Offsides
        AO = Away Team Offsides
        HY = Home Team Yellow Cards
        AY = Away Team Yellow Cards
        HR = Home Team Red Cards
        AR = Away Team Red Cards
'''

work_data = raw_data[['Div','HS','AS','HST','AST','HY','AY','HR','AR','HF','AF','FTHG','FTAG','FTR','HTR','HTHG','HTAG','HC','AC','IWA','IWD','IWH', 'B365H', 'B365D', 'B365A']]

work_data = work_data.dropna()
work_data.reset_index(drop = True)

Unnamed: 0,Div,HS,AS,HST,AST,HY,AY,HR,AR,HF,...,HTHG,HTAG,HC,AC,IWA,IWD,IWH,B365H,B365D,B365A
0,E0,13.0,6.0,6.0,4.0,2.0,0.0,0.0,0.0,10.0,...,0,1,8.0,1.0,4.20,3.30,1.95,1.830,3.40,4.600
1,E0,12.0,20.0,5.0,9.0,3.0,1.0,0.0,0.0,10.0,...,0,1,9.0,9.0,5.10,4.25,1.63,1.570,4.33,5.500
2,F1,18.0,18.0,2.0,6.0,3.0,3.0,0.0,0.0,15.0,...,1,2,12.0,6.0,2.50,3.05,3.10,3.200,3.00,2.500
3,SC0,3.0,11.0,1.0,2.0,1.0,1.0,0.0,0.0,19.0,...,0,0,2.0,7.0,4.05,3.25,1.90,1.830,3.60,4.200
4,SC0,8.0,16.0,3.0,6.0,1.0,1.0,0.0,0.0,10.0,...,0,2,3.0,7.0,2.55,3.10,2.75,2.750,3.50,2.400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70530,SC0,16.0,11.0,5.0,4.0,0.0,2.0,0.0,0.0,15.0,...,0,1,9.0,6.0,2.50,2.90,2.30,2.375,3.20,2.625
70531,SC0,12.0,8.0,4.0,2.0,3.0,0.0,0.0,0.0,17.0,...,1,0,7.0,3.0,2.60,3.00,2.20,2.375,3.20,2.625
70532,SC0,6.0,16.0,4.0,9.0,1.0,0.0,0.0,0.0,17.0,...,0,1,2.0,10.0,1.40,3.60,5.50,7.500,3.75,1.400
70533,SC0,9.0,9.0,2.0,3.0,1.0,2.0,0.0,0.0,11.0,...,0,0,6.0,2.0,2.70,2.90,2.20,2.250,3.25,2.750


In [23]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  
label_encoder_x = LabelEncoder()  

work_data['FTR'] = label_encoder_x.fit_transform(work_data['FTR']) 
work_data['HTR'] = label_encoder_x.fit_transform(work_data['HTR']) 

work_data = work_data.replace(first_division,1)
work_data = work_data.replace(second_division,2)
work_data = work_data.replace(third_division,3)
work_data = work_data.replace(fourth_division,4)

In [24]:
from sklearn.model_selection import train_test_split  
x_train, x_test, y_train, y_test = train_test_split(work_data.iloc[:, 1:25], work_data.iloc[:, 0], test_size= 0.2, random_state=0)  

In [25]:
from sklearn.preprocessing import StandardScaler

st_x = StandardScaler()

x_train = st_x.fit_transform(x_train)
x_test= st_x.transform(x_test)  

In [26]:
from sklearn.linear_model import LogisticRegression

logiRegModel = LogisticRegression()
logiRegModel.fit(x_train, y_train)
logiRegModelOutput = logiRegModel.predict(x_test)

print("LogisticRegression Confusion Matrix:\n",confusion_matrix(y_test, logiRegModelOutput))

print("LogisticRegression Accuray:\n",accuracy_score(y_test, logiRegModelOutput))

filename_logiRegModel = "logiRegModel.sav"
pickle.dump(logiRegModel, open(filename_logiRegModel, 'wb'))
     

LogisticRegression Confusion Matrix:
 [[4346  486  166  280]
 [1341  332  271 1111]
 [1010  152  184 1258]
 [ 472  252  222 2224]]
LogisticRegression Accuray:
 0.5023038207981853


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
from sklearn.ensemble import RandomForestClassifier

randomForesModel = RandomForestClassifier(n_estimators = 100) 
randomForesModel.fit(x_train, y_train)
randomForesModelOutput = randomForesModel.predict(x_test)

print("RandomForestClassifier Confusion Matrix:\n",confusion_matrix(y_test, randomForesModelOutput))

print("RandomForestClassifier Accuray:\n",accuracy_score(y_test, randomForesModelOutput))

filename_randomForesModel = "randomForesModel.sav"
pickle.dump(randomForesModel, open(filename_randomForesModel, 'wb'))

RandomForestClassifier Confusion Matrix:
 [[4807  276   86  109]
 [1166  825  185  879]
 [ 807  333  306 1158]
 [ 338  433  262 2137]]
RandomForestClassifier Accuray:
 0.5724108598568087


In [28]:
from sklearn.naive_bayes import GaussianNB

gaussModel = GaussianNB()
gaussModel.fit(x_train, y_train)
gaussModelOutput = gaussModel.predict(x_test)

print("GaussianNB Confusion Matrix:\n",confusion_matrix(y_test, gaussModelOutput))

print("GaussianNB Accuray:\n",accuracy_score(y_test, gaussModelOutput))

filename_gaussModel = "gaussModel.sav"
pickle.dump(gaussModel, open(filename_gaussModel, 'wb'))

GaussianNB Confusion Matrix:
 [[1480  400 2272 1126]
 [ 411  182  926 1536]
 [ 151  118  907 1428]
 [  97  194  656 2223]]
GaussianNB Accuray:
 0.3396895158431984


In [29]:
from sklearn.neighbors import KNeighborsClassifier

knnModel = KNeighborsClassifier(n_neighbors=10)
knnModel.fit(x_train, y_train)
knnModelOutput = knnModel.predict(x_test)

print("KNeighborsClassifier Confusion Matrix: \n",confusion_matrix(y_test, knnModelOutput))

print("KNeighborsClassifier Accuray:\n",accuracy_score(y_test, knnModelOutput))

filename_knnModel = "knnModel.sav"
pickle.dump(knnModel, open(filename_knnModel, 'wb'))

KNeighborsClassifier Confusion Matrix: 
 [[3812  592  379  495]
 [1364  630  363  698]
 [1036  548  390  630]
 [ 853  779  523 1015]]
KNeighborsClassifier Accuray:
 0.4144750832919827


In [30]:
from sklearn.tree import DecisionTreeClassifier

treeModel = DecisionTreeClassifier(max_depth=10, random_state=101,max_features = None, min_samples_leaf = 15)
treeModel.fit(x_train, y_train)
treeModelOutput = treeModel.predict(x_test)

print("DecisionTreeClassifier Confusion Matrix:\n",confusion_matrix(y_test, treeModelOutput))

print("DecisionTreeClassifier Accuray:\n",accuracy_score(y_test, treeModelOutput))

filename_treeModel = "treeModel.sav"
pickle.dump(treeModel, open(filename_treeModel, 'wb'))

DecisionTreeClassifier Confusion Matrix:
 [[4458  454   90  276]
 [1011  788   60 1196]
 [ 663  379  264 1298]
 [ 298  505  116 2251]]
DecisionTreeClassifier Accuray:
 0.5501524066066492


In [31]:
# Nagyon lassúra sikeredett, 8 és fél perces futásidő 
'''
from sklearn.svm import SVC
model = SVC(kernel ="linear", C=0.5, random_state=101)
model.fit(x_train, y_train)
kimenet = model.predict(x_test)
print(confusion_matrix(y_test, kimenet))

print(accuracy_score(y_test, kimenet))
'''

In [32]:
print("DecisionTreeClassifier Accuray:\n",accuracy_score(y_test, treeModelOutput))
print("KNeighborsClassifier Accuray:\n",accuracy_score(y_test, knnModelOutput))
print("GaussianNB Accuray:\n",accuracy_score(y_test, gaussModelOutput))
print("RandomForestClassifier Accuray:\n",accuracy_score(y_test, randomForesModelOutput))
print("LogisticRegression Accuray:\n",accuracy_score(y_test, logiRegModelOutput))

DecisionTreeClassifier Accuray:
 0.5501524066066492
KNeighborsClassifier Accuray:
 0.4144750832919827
GaussianNB Accuray:
 0.3396895158431984
RandomForestClassifier Accuray:
 0.5724108598568087
LogisticRegression Accuray:
 0.5023038207981853
