## Checking the DataSet

### Functions and imports

In [1]:
from time import time

from sklearn.metrics import r2_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Read the project dataset into the module-level frame `data` used by the
# cells below, then show its summary statistics as the cell output.
data = pd.read_csv('projectdata.csv')
data.describe()

### Corr, clustermap and pairplot

In [None]:
# Columns regarded as non-relevant; they are removed from the frame before
# it is analysed or used for training.
colToDrop = [
    'Date',
    'HomeTeam',
    'AwayTeam',
    'FTR',
    'HTGD',
    'HTR',
    'Referee',
]
#df_for_corr = data.drop(colToDrop, axis=1)
#df_for_corr.describe()

In [None]:
#mask = np.zeros_like(df_for_corr.corr(),dtype=np.bool)
#mask[np.triu_indices_from(mask)] = True
#cmap = sns.diverging_palette(10,150,as_cmap=True)

#plt.figure(figsize=(18,18))
#sns.heatmap(df_for_corr.corr(),cmap=cmap,annot=True,mask = mask,square = True,center = 0)
#plt.title('Correlation Matrix',size=20)
#plt.show()

In [None]:
#sns.clustermap(data.drop(colToDrop, axis=1), standard_scale=1)
#plt.show()

In [None]:
#corr_features = df_for_corr.columns
#sns.pairplot(data=df_for_corr)
#plt.show()

### Prediction Functions

In [None]:
def generate_CDF(data):
    """Return the empirical cumulative distribution of the values in *data*.

    Parameters
    ----------
    data : pandas.Series
        Observations to summarise.

    Returns
    -------
    CDF : numpy.ndarray
        Cumulative fraction of observations, one entry per distinct value,
        ordered by ascending value. Empty when *data* has no countable values.
    data_indice_list : pandas.Index
        The distinct values, in the same (ascending) order as ``CDF``.
    """
    data_counts = data.value_counts().sort_index()
    data_indice_list = data_counts.index

    total = data_counts.sum()
    if total == 0:
        # Nothing to accumulate; avoids indexing into an empty array.
        return np.zeros(0), data_indice_list

    # A cumulative sum over the raw count array does not depend on how
    # value_counts()/reset_index() name their columns, which differs between
    # pandas versions and between named and unnamed Series.
    CDF = np.cumsum(data_counts.to_numpy(dtype=float) / total)
    return CDF, data_indice_list

# Separate the target column from the remaining columns, binarise it around
# its median and plot its empirical cumulative distribution.
target = 'FTGD'
shares = data[target]
feature_vector = data.columns[data.columns != target]
X = data.loc[:, feature_vector].values

# y is 1 where the target lies strictly above its median, otherwise 0.
med = shares.median()
y = shares.map(lambda value: int(value > med))

shares_CDF, indice_list = generate_CDF(shares)

# NOTE(review): the legend/title/axis texts below speak of "Shares" while the
# plotted series is the 'FTGD' column -- confirm the intended wording.
plt.figure(figsize=(12,5))
plt.plot(
    indice_list.values,
    shares_CDF,
    label='Shares',
    marker='o',
    color='blue',
)
plt.title("Shares Cumulative Distribution Function", size=15)
plt.xlabel("Num of Shares", size=15)
plt.ylabel("Percentage %", size=15)
plt.legend()
plt.grid()
plt.show()

In [None]:
def training(df, target):
    """Clean *df* and split it into train, validation and test arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; the columns listed in the module-level ``colToDrop`` are
        removed before anything else happens.
    target : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(train, X_train, X_test, X_val, y_train, y_test, y_val)`` where
        ``train`` is the cleaned frame and the rest are NumPy arrays.
        20% of the rows form the test split; 20% of the remainder forms the
        validation split.
    """
    train = df.drop(colToDrop, axis=1)
    # DataFrame.apply returns a new frame; the conversion has to be assigned
    # back, otherwise it is silently thrown away.
    train = train.apply(pd.to_numeric)
    train = train.fillna(0)

    X = train.drop([target], axis=1).values
    y = train[target].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
    return train, X_train, X_test, X_val, y_train, y_test, y_val

In [None]:
#Decision Tree Classifier
def DTC(X_train, y_train, print_Flag=False, X_test=None, y_test=None):
    """Tune a decision tree by grid search and predict the rows of ``newData``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels handed to ``GridSearchCV.fit``.
    print_Flag : bool, default False
        When true, print the hold-out accuracy, the best cross-validation
        score and parameters, a confusion matrix and a classification report.
    X_test, y_test : array-like, optional
        Hold-out split used for that report. When omitted, the module-level
        ``X_test`` / ``y_test`` are used (the previous implicit behaviour).

    Returns
    -------
    numpy.ndarray
        Predictions of the tuned model for the module-level ``newData`` once
        the columns in the module-level ``colToDropForPred`` are removed.
    """
    hyperparams = {"max_depth":[5,10,15], "min_samples_split":[3,5,7]}
    clf = GridSearchCV(DecisionTreeClassifier(), hyperparams, cv=5, n_jobs=-1, verbose=True)
    t1 = time()
    clf.fit(X_train, y_train)
    print("It took {:.2f} seconds".format(time() - t1))

    if print_Flag:
        # The hold-out split is only needed for the report, so it is resolved
        # (and predicted) only when a report was requested.
        X_eval = globals()["X_test"] if X_test is None else X_test
        y_eval = globals()["y_test"] if y_test is None else y_test
        y_pred = clf.predict(X_eval)

        print('----------Decision Tree Classifier----------')
        print("ACCURACY SCORE: ", clf.score(X_eval, y_eval))
        print("CROSS VALIDATION ACCURACY SCORE: ", clf.best_score_)
        print("BEST PARAMETERS: ", clf.best_params_)
        print("# === CONFUSION MATRIX === #\n", confusion_matrix(y_true=y_eval, y_pred=y_pred))
        print("\n\t\t# === CLASSIFICATION REPORT === #\n\n", classification_report(y_true=y_eval, y_pred=y_pred))

    # NOTE(review): colToDropForPred has to leave exactly the columns X_train
    # was built from (so it must also remove the target) -- confirm where the
    # global is assigned.
    pred = newData.drop(colToDropForPred, axis=1)
    # Keep the converted frame and fill gaps with 0, mirroring the
    # preparation of the training frame.
    pred = pred.apply(pd.to_numeric).fillna(0)
    predictions = clf.predict(pred.values)
    return predictions

In [None]:
#K Neighbors Classifier
def NC(X_train, y_train, print_Flag=False, X_test=None, y_test=None):
    """Tune a k-nearest-neighbours classifier and predict the rows of ``newData``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels handed to ``GridSearchCV.fit``.
    print_Flag : bool, default False
        When true, print the hold-out accuracy, the best cross-validation
        score, the selected number of neighbours, a confusion matrix and a
        classification report.
    X_test, y_test : array-like, optional
        Hold-out split used for that report. When omitted, the module-level
        ``X_test`` / ``y_test`` are used (the previous implicit behaviour).

    Returns
    -------
    numpy.ndarray
        Predictions of the tuned model for the module-level ``newData`` once
        the columns in the module-level ``colToDropForPred`` are removed.
    """
    hyperparams = {"n_neighbors":[11,21,31,41,51]}  # candidate neighbourhood sizes
    # The neighbourhood size is selected with 10-fold cross validation.
    clf = GridSearchCV(KNeighborsClassifier(), hyperparams, cv=10, n_jobs=-1, verbose=True)
    t1 = time()
    clf.fit(X_train, y_train)
    print("It took {:.2f} seconds".format(time() - t1))

    if print_Flag:
        # The hold-out split is only needed for the report, so it is resolved
        # (and predicted) only when a report was requested.
        X_eval = globals()["X_test"] if X_test is None else X_test
        y_eval = globals()["y_test"] if y_test is None else y_test
        y_pred = clf.predict(X_eval)

        print('----------Neighbour Classifier----------')
        print("ACCURACY SCORE: ", clf.score(X_eval, y_eval))
        print("CROSS VALIDATION ACCURACY SCORE: ", clf.best_score_)
        print("BEST NUM OF NEIGHBOURS: ", clf.best_params_['n_neighbors'])
        print("# === CONFUSION MATRIX === #\n", confusion_matrix(y_true=y_eval, y_pred=y_pred))
        print("\n\t\t# === CLASSIFICATION REPORT === #\n", classification_report(y_true=y_eval, y_pred=y_pred))

    # NOTE(review): colToDropForPred has to leave exactly the columns X_train
    # was built from (so it must also remove the target) -- confirm where the
    # global is assigned.
    pred = newData.drop(colToDropForPred, axis=1)
    # Keep the converted frame and fill gaps with 0, mirroring the
    # preparation of the training frame.
    pred = pred.apply(pd.to_numeric).fillna(0)
    predictions = clf.predict(pred.values)
    return predictions

In [None]:
def NBC(X_train, y_train, print_Flag=False, X_test=None, y_test=None):
    """Fit a Gaussian naive Bayes classifier and predict the rows of ``newData``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    print_Flag : bool, default False
        When true, print the hold-out accuracy, the mean 10-fold
        cross-validation accuracy, a confusion matrix and a classification
        report.
    X_test, y_test : array-like, optional
        Hold-out split used for that report. When omitted, the module-level
        ``X_test`` / ``y_test`` are used (the previous implicit behaviour).

    Returns
    -------
    numpy.ndarray
        Predictions of the fitted model for the module-level ``newData`` once
        the columns in the module-level ``colToDropForPred`` are removed.
    """
    nb_clf = GaussianNB()
    t1 = time()
    nb_clf.fit(X_train, y_train)
    print("It took {:.2f} seconds".format(time() - t1))

    if print_Flag:
        # The hold-out split is only needed for the report, so it is resolved
        # (and predicted) only when a report was requested.
        X_eval = globals()["X_test"] if X_test is None else X_test
        y_eval = globals()["y_test"] if y_test is None else y_test
        y_pred = nb_clf.predict(X_eval)

        print('----------GaussianNB Classifier----------')
        print("ACCURACY SCORE: ", nb_clf.score(X_eval, y_eval))
        print("K-Fold Cross Validation Accuracy Score :", np.mean(cross_val_score(nb_clf, X_train, y_train, cv=10, scoring='accuracy')))
        # Elapsed time since fitting started, i.e. including the evaluation above.
        print("It took {:.2f} seconds".format(time() - t1))
        print("# === CONFUSION MATRIX === #\n", confusion_matrix(y_true=y_eval, y_pred=y_pred))
        print("\n\t\t# === CLASSIFICATION REPORT === #\n", classification_report(y_true=y_eval, y_pred=y_pred))

    # NOTE(review): colToDropForPred has to leave exactly the columns X_train
    # was built from (so it must also remove the target) -- confirm where the
    # global is assigned.
    pred = newData.drop(colToDropForPred, axis=1)
    # Keep the converted frame and fill gaps with 0, mirroring the
    # preparation of the training frame.
    pred = pred.apply(pd.to_numeric).fillna(0)
    predictions = nb_clf.predict(pred.values)
    return predictions

In [None]:
def RFRC(X_train, y_train, print_Flag=False, X_test=None, y_test=None):
    """Tune a random-forest regressor and predict the rows of ``newData``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and target values handed to ``GridSearchCV.fit``.
    print_Flag : bool, default False
        When true, print the hold-out R^2 score and the mean 10-fold
        cross-validated R^2 score.
    X_test, y_test : array-like, optional
        Hold-out split used for that report. When omitted, the module-level
        ``X_test`` / ``y_test`` are used (the previous implicit behaviour).

    Returns
    -------
    numpy.ndarray
        Predictions of the tuned regressor for the module-level ``newData``
        once the columns in the module-level ``colToDropForPred`` are removed.
    """
    parameters = {'n_estimators': [3, 5, 10, 50],
                  #'criterion': ['mse'],
                  #'max_depth': [5, 10, 15],
                  #'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1,5]
                 }

    # Run the grid search. With the default refit=True the best parameter
    # combination is refitted on all of X_train, so best_estimator_ is ready
    # to use and needs no further fit() call.
    grid_obj = GridSearchCV(RandomForestRegressor(), parameters, cv=5, n_jobs=-1, verbose=1)
    t1 = time()
    grid_obj.fit(X_train, y_train)
    print("It took {:.2f} seconds".format(time() - t1))
    RFR = grid_obj.best_estimator_

    if print_Flag:
        # The hold-out split is only needed for the report, so it is resolved
        # only when a report was requested.
        X_eval = globals()["X_test"] if X_test is None else X_test
        y_eval = globals()["y_test"] if y_test is None else y_test

        print('----------RandomForestRegressor Classifier----------')
        # A regressor's score() is R^2, and 'accuracy' cannot be computed from
        # continuous predictions, so both figures are reported as R^2.
        print("R^2 SCORE: ", RFR.score(X_eval, y_eval))
        print("K-Fold Cross Validation R^2 Score :", np.mean(cross_val_score(RFR, X_train, y_train, cv=10, scoring='r2')))
        print("It took {:.2f} seconds".format(time() - t1))

    # NOTE(review): colToDropForPred has to leave exactly the columns X_train
    # was built from (so it must also remove the target) -- confirm where the
    # global is assigned.
    pred = newData.drop(colToDropForPred, axis=1)
    # Keep the converted frame and fill gaps with 0, mirroring the
    # preparation of the training frame.
    pred = pred.apply(pd.to_numeric).fillna(0)
    predictions = RFR.predict(pred.values)
    return predictions

In [None]:
def getNewData(newGames):
    """Cut the last ``len(newGames)`` rows off the global ``data`` frame.

    The module-level ``data`` is rebound to the rows in front of the cut; the
    rows behind it are returned with a fresh 0-based index.
    """
    global data

    split_at = len(data) - len(newGames)
    tail = data.iloc[split_at:].reset_index(drop=True)
    data = data[:split_at]
    return tail

In [None]:
def trainingData(df, target, print_Flag):
    """Split *df*, run every model helper on the training split and collect results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed on to ``training``.
    target : str
        Name of the column to predict.
    print_Flag : bool
        Forwarded to each model helper to switch its evaluation report on.

    Returns
    -------
    tuple
        The return values of ``DTC``, ``NC``, ``NBC`` and ``RFRC``, in that order.
    """
    # DTC/NC/NBC/RFRC read X_test and y_test from module scope when they
    # report, so the split made here is published there. Otherwise they would
    # score against whatever split an earlier cell left behind (or fail).
    global X_test, y_test

    train, X_train, X_test, X_val, y_train, y_test, y_val = training(df, target)
    print("There are {} of samples in the training set".format(X_train.shape[0]))
    print("There are {} of samples in the test set".format(X_test.shape[0]))

    outputs = []
    for model in (DTC, NC, NBC, RFRC):
        t1 = time()
        # Every helper takes print_Flag as its third argument.
        outputs.append(model(X_train, y_train, print_Flag))
        print("This took ",time() - t1," seconds")

    DTC_pred, NC_pred, NB_pred, RFR_pred = outputs
    return DTC_pred, NC_pred, NB_pred, RFR_pred

In [None]:
def Result(newData, DTC_clf, NC_clf, NB_clf, RFR_clf):
    """Compare random-forest predictions with the recorded FTGD and save them.

    Parameters
    ----------
    newData : pandas.DataFrame
        Rows to score; must contain every column in ``colToDropForPred``
        (including ``FTGD``) plus the feature columns.
    DTC_clf, NC_clf, NB_clf
        Accepted for call compatibility; not used.
    RFR_clf
        Either an object with a ``predict`` method, which is applied to the
        feature columns of *newData*, or an array-like of already computed
        predictions with one entry per row of *newData*.

    Side effects
    ------------
    Prints how often the sign of the rounded prediction equals the sign of
    ``FTGD`` and writes the table to ``result.csv``.
    """
    colToDropForPred = ['Date','HomeTeam','AwayTeam','FTGD','FTR','HTGD','HTR','Referee']

    tool = RFR_clf
    if hasattr(tool, "predict"):
        pred = newData.drop(colToDropForPred, axis=1)
        # Keep the converted frame and replace gaps with 0 so the model is
        # not handed NaN values.
        pred = pred.apply(pd.to_numeric).fillna(0)
        predictions = tool.predict(pred.values)
        tool_name = str(tool)
    else:
        # Already computed predictions were passed instead of a model.
        predictions = np.asarray(tool)
        tool_name = "RFR"

    pred_col = tool_name + "_GD_Predictions"
    result = newData[colToDropForPred].copy()
    result[pred_col] = np.around(predictions)  # rounded to the nearest integer

    # A row counts as correct when prediction and FTGD are both positive,
    # both zero or both negative, i.e. when their signs are equal.
    total = len(result)
    count = int((np.sign(result[pred_col]) == np.sign(result["FTGD"])).sum())
    percent = count / total * 100 if total else 0.0

    # Printed once, after all rows are counted.
    print("The Predictions of " + tool_name + " is Currect: ",count,"/",total,'of times, it:',percent,"precent")
    result.to_csv("result.csv", index=False, sep=',')

In [None]:
# Load the dataset, build train/validation/test splits and tune a random
# forest regressor on the training split.
data = pd.read_csv('projectdata.csv')

target = 'FTGD'
print_Flag = False
colToDropForPred = ['Date','HomeTeam','AwayTeam','FTR','HTGD','HTR','Referee']

train = data.drop(colToDropForPred, axis=1)
# DataFrame.apply returns a new frame; the conversion has to be assigned back.
train = train.apply(pd.to_numeric)
train = train.fillna(0)
X = train.drop([target], axis=1).values
y = train[target].values
# 20% of the rows are held out for testing, then 20% of the rest for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

RFR = RandomForestRegressor()

parameters = {'n_estimators': [3, 5, 10, 50],
              #'criterion': ['mse'],
              #'max_depth': [5, 10, 15],
              #'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1,5]
             }

# Run the grid search. With the default refit=True the best parameter
# combination is refitted on all of X_train, so best_estimator_ needs no
# additional fit() call.
t1 = time()
grid_obj = GridSearchCV(RFR, parameters, cv=5, n_jobs=-1, verbose=1)
grid_obj = grid_obj.fit(X_train, y_train)
print("It took {:.2f} seconds".format(time() - t1))

# Keep the tuned model for the following cells.
RFR = grid_obj.best_estimator_

y_pred = RFR.predict(X_test)
if print_Flag:
    print('----------RandomForestRegressor Classifier----------')
    # A regressor's score() is R^2, and 'accuracy' cannot be computed from
    # continuous predictions, so both figures are reported as R^2.
    print("R^2 SCORE: ", RFR.score(X_test, y_test))
    print("K-Fold Cross Validation R^2 Score :", np.mean(cross_val_score(RFR, X_train, y_train, cv=10, scoring='r2')))
    print("It took {:.2f} seconds".format(time() - t1))



In [None]:
# Report the hold-out and cross-validated quality of the tuned regressor.
print('----------RandomForestRegressor Classifier----------')
# Restart the clock so the elapsed time covers this cell only rather than
# everything since an earlier cell set t1.
t1 = time()
# A regressor's score() is R^2, and 'accuracy' cannot be computed from
# continuous predictions, so both figures are reported as R^2.
print("R^2 SCORE: ", RFR.score(X_test, y_test))
print("K-Fold Cross Validation R^2 Score :", np.mean(cross_val_score(RFR, X_train, y_train, cv=10, scoring='r2')))
print("It took {:.2f} seconds".format(time() - t1))


In [None]:
# Predict the goal difference for the games in newGames111.csv with the tuned
# random forest, compare the sign of each prediction with the recorded FTGD
# and save the table.
CSV_Name = 'season-1920_csv.csv'
#newGames = pd.read_csv(CSV_Name)
#games(newGames)
#newData = getNewData(newGames)
newData = pd.read_csv('newGames111.csv')

colToDropForPred2 = ['Date','HomeTeam','AwayTeam','FTR','HTGD','HTR','Referee','FTGD']
# `tool` was referenced below without ever being assigned; the model applied
# in this cell is RFR.
tool = RFR
pred = newData.drop(colToDropForPred2, axis=1)
# Keep the converted frame and fill gaps with 0, matching the preparation of
# the training frame, so the model is not handed NaN values.
pred = pred.apply(pd.to_numeric).fillna(0)
X = pred.values
predictions = tool.predict(X)
result = newData[colToDropForPred2].copy()
result[str(tool) + "_GD_Predictions"] = np.around(predictions)  # rounded to the nearest integer

# A row counts as correct when prediction and FTGD are both positive, both
# zero or both negative, i.e. when their signs are equal.
count = int((np.sign(result[str(tool) + "_GD_Predictions"]) == np.sign(result["FTGD"])).sum())
total = len(result)
percent = count / total * 100 if total else 0.0

print("The Predictions of " + str(tool) + " is Currect: ",count,"/",total,'of times, it:',percent,"precent")
result.to_csv(str(tool) + "result.csv", index=False, sep=',')

In [None]:
# Read the new season's games, cut the matching number of rows off the end of
# the global `data` frame and save those rows to result.csv.
#data = pd.read_csv('projectdata.csv')
CSV_Name = 'season-1920_csv.csv'
newGames = pd.read_csv(CSV_Name)
# `games` is not defined in this part of the notebook; presumably it merges
# newGames into `data` -- TODO confirm.
games(newGames)
# getNewData returns the last len(newGames) rows of `data` and removes them
# from the global frame.
newData = getNewData(newGames)
#newData.fillna(0)
#target = 'FTGD'
#print_Flag = False

#DTC_clf, NC_clf, NB_clf, RFR_clf = trainingData(data, target, print_Flag)
#Result(newData, DTC_clf, NC_clf, NB_clf, RFR_clf)

newData.to_csv("result.csv", index=False, sep=',')

In [None]:
#Future Matches
# Split the upcoming games off the global `data` frame, train every model on
# the remaining rows and score/save the predictions for the upcoming games.
target = 'FTGD'
print_Flag = False

CSV_Name = 'newGames.csv'
newGames = pd.read_csv(CSV_Name)
# `games` is not defined in this part of the notebook; presumably it merges
# newGames into `data` -- TODO confirm.
games(newGames)
newData = getNewData(newGames)
# getNewData leaves the historical rows in the global `data`, which is the
# frame to train on (no `df` exists at module level). trainingData returns
# one value per model in the order Result expects them, so the tuple is
# unpacked into Result's four model arguments.
Result(newData, *trainingData(data, target, print_Flag))

In [None]:
# Fit one ordinary-least-squares model "FTGD ~ <column>" per column of
# df_SVLR, keep each summary, and draw the data with its fitted line.
df_SVLR = train.drop(['HomeGoals','AwayGoals'], axis=1)
idx = 1
summary = []
# Four plots per row; at least the original 10 rows, more when there are
# more than 40 columns so plt.subplot never runs out of grid cells.
n_plot_cols = 4
n_plot_rows = max(10, -(-len(df_SVLR.columns) // n_plot_cols))
plt.figure(figsize=(30,20))
plt.suptitle("Single Variate Linear Regression", size=20)
# Every column is fitted (including FTGD itself) so that `summary` stays
# index-aligned with df_SVLR.columns.
for col in df_SVLR.columns:

    formula = "FTGD ~ " + col

    single_variate_lin_reg = sm.ols(formula=formula, data=df_SVLR).fit()
    summary.append(single_variate_lin_reg.summary())
    plt.subplot(n_plot_rows, n_plot_cols, idx)
    idx += 1
    plt.title(col)
    # Plot what the model describes: the regressor on x, the response FTGD on
    # y, and the fitted values over the same x.
    sns.scatterplot(data=df_SVLR, x=col, y="FTGD")
    sns.lineplot(data=df_SVLR, x=col, y=single_variate_lin_reg.predict())
plt.show()

In [None]:
# Print each column name followed by the regression summary stored at the
# same position, separated by a 78-character rule.
for position, column_name in enumerate(df_SVLR.columns):
    print(column_name)
    print(summary[position])
    print("-"*78)

In [None]:
# Transposed summary statistics: one row per column of df_SVLR.
df_SVLR.describe().T

In [None]:
# Regress `target` on every other column of df_SVLR in a single OLS model and
# show the fit summary as the cell output.
# NOTE(review): the earlier cells all model 'FTGD'; confirm that 'VSNum' is
# the intended response here and that it is a column of df_SVLR.
target = 'VSNum'
cols = "+".join(df_SVLR.columns[df_SVLR.columns != target])
formula = "{} ~ {}".format(target, cols)

multivariate_linear_regression = sm.ols(formula=formula, data=df_SVLR).fit()
multivariate_linear_regression.summary()