In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import ElasticNetCV
import numpy as np

In [4]:
# Read the newline-delimited JSON feature file and replace every missing
# value with 0 in a single chained expression.
df = pd.read_json('data/feature_extracted.ndjson', lines=True).fillna(0)

In [5]:
# Preview the first two rows of the freshly loaded frame.
df.head(n=2)

Unnamed: 0,result,moves,white_elo,black_elo,ECO,Opening,white_cpl,black_cpl,total_moves,end_r,...,end_N,end_R,end_Q,end_K,queen_moved_at,total_checks,first_check_at,queen_changed_at,promotion,can_claim_draw
0,1/2-1/2,"[e2e4, e7e5, g1f3, g8f6, d2d4, f6e4, f3e5, d7d...",2523,2460,C42,Petrov,"[-11, 81, -20, 3]","[78, -7, -6]",7,2.0,...,2.0,2.0,1.0,1,0.0,0.0,0.0,0.0,0.0,0.0
1,0-1,"[e2e4, d7d5, e4d5, g8f6, d2d4, f6d5, g1f3, g7g...",1915,1999,B01,Scandinavian,"[20, 0, 17, 28, 28, 92, 10, 9, -1, 5, -1, 16, ...","[25, -4, -13, 8, 34, 21, 31, 2, 11, 0, -5, 8, ...",53,1.0,...,0.0,1.0,0.0,1,13.0,8.0,13.0,16.0,0.0,0.0


In [81]:
# Store the low-cardinality text columns with the pandas category dtype.
for categorical_column in ("ECO", "Opening", "result"):
    df[categorical_column] = df[categorical_column].astype('category')

# Per-game Elo aggregates: the row-wise average of both ratings and the
# signed difference (white minus black).
elo_columns = ['white_elo', 'black_elo']
df["mean_elos"] = df[elo_columns].mean(axis=1)
df["diff_elos"] = df['white_elo'].sub(df['black_elo'])

In [82]:
# Row count before filtering.
print(len(df))
# Keep a row only when its `white_cpl` entry is non-empty, then apply the
# same check to `black_cpl` on the rows that remain.
for cpl_column in ('white_cpl', 'black_cpl'):
    has_entries = df[cpl_column].apply(lambda values: len(values) > 0)
    df = df[has_entries]
# Row count after filtering.
print(len(df))

2400
2395


In [83]:
# Reduce each per-game `*_cpl` sequence to four scalar columns
# (<side>_mean, <side>_std, <side>_min, <side>_max), white first, then black.
summary_functions = {'mean': np.mean, 'std': np.std, 'min': np.min, 'max': np.max}

for side in ('white', 'black'):
    for suffix, reducer in summary_functions.items():
        df[f'{side}_{suffix}'] = df[f'{side}_cpl'].apply(reducer)

In [84]:
# Preview the first two rows again, now including the derived columns.
df.head(n=2)

Unnamed: 0,result,moves,white_elo,black_elo,ECO,Opening,white_cpl,black_cpl,total_moves,end_r,...,mean_elos,diff_elos,white_mean,white_std,white_min,white_max,black_mean,black_std,black_min,black_max
0,1/2-1/2,"[e2e4, e7e5, g1f3, g8f6, d2d4, f6e4, f3e5, d7d...",2523,2460,C42,Petrov,"[-11, 81, -20, 3]","[78, -7, -6]",7,2.0,...,2491.5,63,13.25,39.964828,-20,81,21.666667,39.835774,-7,78
1,0-1,"[e2e4, d7d5, e4d5, g8f6, d2d4, f6d5, g1f3, g7g...",1915,1999,B01,Scandinavian,"[20, 0, 17, 28, 28, 92, 10, 9, -1, 5, -1, 16, ...","[25, -4, -13, 8, 34, 21, 31, 2, 11, 0, -5, 8, ...",53,1.0,...,1957.0,-84,36.22,71.202329,-48,430,23.94,42.293929,-39,162


In [85]:
# One-hot encode the game result and the ECO code into indicator frames.
results, ecos = (pd.get_dummies(df[column]) for column in ('result', 'ECO'))

In [86]:
# Columns excluded from the numeric feature frame:
#   * the Elo ratings and the two aggregates computed from them
#     (`mean_elos` is the prediction target),
#   * the raw list columns (`moves`, `white_cpl`, `black_cpl`),
#   * the categorical columns; `ECO` and `result` are added back below in
#     one-hot form, `Opening` is left out entirely.
to_drop = [
    "white_elo", "black_elo", "mean_elos", "diff_elos",
    "moves", "ECO", "Opening", "black_cpl", "white_cpl", "result",
]

features = df.drop(columns=to_drop)

# Join the numeric features with the one-hot ECO and result indicators.
# `ignore_index=True` is deliberately NOT passed: with axis=1 it would
# replace every column name with an integer position, leaving the columns
# of X unlabeled.
X = pd.concat([features, ecos, results], axis=1)

# Prediction target: the average Elo of the two players.
y = df['mean_elos']


In [87]:
features.head(2)

Unnamed: 0,total_moves,end_r,end_n,end_b,end_q,end_k,end_p,end_P,end_B,end_N,...,promotion,can_claim_draw,white_mean,white_std,white_min,white_max,black_mean,black_std,black_min,black_max
0,7,2.0,2.0,2.0,1.0,1,7.0,7.0,2.0,2.0,...,0.0,0.0,13.25,39.964828,-20,81,21.666667,39.835774,-7,78
1,53,1.0,0.0,0.0,0.0,1,3.0,2.0,0.0,0.0,...,0.0,0.0,36.22,71.202329,-48,430,23.94,42.293929,-39,162


In [None]:
# Forest sizes to compare.
num_estims = [50, 100, 200, 300, 400, 500, 600]

# Split ONCE, with a fixed seed, before the loop. Splitting inside the loop
# without `random_state` would score every forest size on a different random
# hold-out set, so differences between the MAE values would reflect split
# noise rather than the effect of `n_estimators`, and the cell would give
# different numbers on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hold-out mean absolute error for each entry of `num_estims`, in order.
maes = []

for n in num_estims:
    # Fit a forest with `n` trees on the shared training split.
    model = RandomForestRegressor(n_estimators=n, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    # Score it on the shared hold-out split.
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    maes.append(mae)
    

In [102]:
# Report the hold-out error next to the forest size that produced it.
# `maes` decides how many lines are printed, so it comes first in the zip.
for error, estimator_count in zip(maes, num_estims):
    print(f"Number of Estimators: {estimator_count}. Mean Absolute Error: {error}")

Number of Estimators: 50. Mean Absolute Error: 177.65728322894918
Number of Estimators: 100. Mean Absolute Error: 178.38083211551844
Number of Estimators: 200. Mean Absolute Error: 178.32543258524703
Number of Estimators: 300. Mean Absolute Error: 178.19903705636744
Number of Estimators: 400. Mean Absolute Error: 177.86608733472514
Number of Estimators: 500. Mean Absolute Error: 177.9613423799583
Number of Estimators: 600. Mean Absolute Error: 177.82597367200182


In [None]:
# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate forest sizes for the search.
param_grid = {'n_estimators': [50, 100, 200, 300, 400, 500]}

# Seeded base estimator whose `n_estimators` is tuned.
rf = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over the grid, ranked by negative mean
# squared error, with the fits run in parallel (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

In [104]:
# Best parameters
best_params = grid_search.best_params_
print(f"Best n_estimators: {best_params['n_estimators']}")

# Best model
best_rf = grid_search.best_estimator_

Best n_estimators: 100


In [107]:
# Make predictions with the best model
y_pred = best_rf.predict(X_test)

# Calculate the Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error with the best model: {mse}')
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error with the best model: {mae}')

Mean Squared Error with the best model: 48843.06967633294
Mean Absolute Error with the best model: 178.38083211551844


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
