In [1]:
import pandas as pd
df=pd.read_csv('/workspaces/match_football_prediction/dangdienra/dulieu/data_train_final.csv', index_col=0)

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Assuming 'df' is your DataFrame containing the match data
# and 'score_home' and 'score_away' are columns in 'df' representing the home and away scores respectively

# Define the target variable 'y' to represent win (1), lose (-1), or draw (0)
y = df.apply(lambda row: 1 if row['score_home'] > row['score_away'] else (-1 if row['score_home'] < row['score_away'] else 0), axis=1)

# Define the features 'X' by dropping non-feature columns
X = df.drop(['home_name', 'away_name', 'score_home', 'score_away'], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=0, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Predict on the test set using the best found parameters
y_pred = grid_search.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy with Grid Search: {accuracy}")
best_model = grid_search.best_estimator_
filename = 'best_model.joblib'
joblib.dump(best_model, filename)



Best parameters found:  {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best accuracy found:  0.8445434664567776
Model accuracy with Grid Search: 0.8413705583756346


['best_model.joblib']

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import joblib
import pandas as pd

# Assuming 'df' is your DataFrame containing the match data
# and 'score_home' and 'score_away' are columns in 'df' representing the home and away scores respectively

# Define the target variable 'y' to represent win (1), lose (-1), or draw (0)
y = df.apply(lambda row: 1 if row['score_home'] > row['score_away'] else (-1 if row['score_home'] < row['score_away'] else 0), axis=1)

# Define the features 'X' by dropping non-feature columns
X = df.drop(['home_name', 'away_name', 'score_home', 'score_away'], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the K-Nearest Neighbors Classifier
knn = KNeighborsClassifier()

# Define the parameter grid to search
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Predict on the test set using the best found parameters
y_pred = grid_search.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy with Grid Search: {accuracy}")

# Save the best model to a file using joblib
best_model = grid_search.best_estimator_
filename = 'knn_best_model.joblib'
joblib.dump(best_model, filename)

print(f"Model saved to {filename}")


Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.3s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.3s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.2s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   0.2s
[CV] END ..metric=euclidean, n_neighbors=5, weig

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib
import pandas as pd

# Assuming 'df' is your DataFrame containing the match data
# and 'score_home' and 'score_away' are columns in 'df' representing the home and away scores respectively

# Define the target variable 'y' to represent win (1), lose (-1), or draw (0)
y = df.apply(lambda row: 1 if row['score_home'] > row['score_away'] else (-1 if row['score_home'] < row['score_away'] else 0), axis=1)

# Define the features 'X' by dropping non-feature columns
X = df.drop(['home_name', 'away_name', 'score_home', 'score_away'], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree Classifier
dt = DecisionTreeClassifier()

# Define the parameter grid to search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Predict on the test set using the best found parameters
y_pred = grid_search.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy with Grid Search: {accuracy}")

# Save the best model to a file using joblib
best_model = grid_search.best_estimator_
filename = 'decision_tree_best_model.joblib'
joblib.dump(best_model, filename)

print(f"Model saved to {filename}")

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s

[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_spl