Step 1:
  - Load the data
  - Define features to drop, in this case we want to ignore goal keeper stats, and other miscillaneous features.
  - Use one-hot encoding for categorical features we want, like position
  - Apply PCA


In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
url = 'https://raw.githubusercontent.com/liam-rich/transfermarket_data/main/transfermarkt_fbref_201920.csv'
data = pd.read_csv(url, delimiter=';', engine='python')

# Features to drop
bad_features = ['saves', 'save_pct', 'clean_sheets', 'clean_sheets_pct', 'pens_allowed', 'pens_saved', 'psnpxg_per_shot_on_target_against', 'goal_kicks','pct_goal_kicks_launched', 'xGDiff/90', 'xGDiff', 'xGA', 'xG', 'Pts/G', 'Pts', 'GDiff', 'GA', 'GF', 'L', 'D', 'W', 'MP', 'LgRk', 'xg_net', 'npxg_net', 'goal_kick_length_avg']
data.drop(columns=bad_features, inplace=True)

# Define non-numeric columns to drop
non_numeric_columns_to_drop = ['player', 'foot', 'position2', 'Season', 'Attendance', 'Column1']
data.drop(columns=non_numeric_columns_to_drop, inplace=True)

# Define the categorical columns to encode
categorical_columns = ['squad', 'nationality', 'position', 'league']

# Initialize a LabelEncoder for each categorical column
label_encoders = {}

# Encode each categorical column
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Drop rows with NaN values consistently for X and y
data.dropna(subset=['value'], inplace=True)

# Filter out rows with "GK" in the position column
data = data[data['position'] != 'GK']

# For 'CLBestScorer', fill with 0 (assuming no goals implies not applicable/missing)
data['CLBestScorer'].fillna(0, inplace=True)

# Identify and drop columns ending in 'gk'
gk_columns = [col for col in data.columns if col.endswith('gk')]
data.drop(gk_columns, axis=1, inplace=True)

# Define a list of 'm' columns to keep
keep_columns = ['passes_completed_medium', 'passes_medium', 'passes_pct_medium']

# Optionally, identify columns that end with 'm' but are not in 'keep_columns' to review or remove
m_columns = [col for col in data.columns if col.endswith('m') and col not in keep_columns]
data.drop(m_columns, axis=1, inplace=True)

# Ensure your data has no missing values
data.dropna(inplace=True)

# Separate features (X) and target variable (y)
X = data.drop(columns=['value'])
y = data['value']

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)

# Apply PCA to reduce dimensionality
pca = PCA(n_components=0.70)  # Retain 70% of variance
X_pca = pca.fit_transform(scaled_data)

# Extract the principal component loadings
loadings = pca.components_

# Get the absolute values of the loadings
abs_loadings = np.abs(loadings)

# Find the indices of the features with the highest loadings for each principal component
top_features_indices = np.argsort(abs_loadings, axis=1)[:, -3:]  # Choose top 3 features per component

# Get the names of the original features
feature_names = X.columns.values

# Display the top features for each principal component
for i, component in enumerate(top_features_indices):
    print(f"Principal Component {i+1}:")
    for feature_index in component:
        print(f"\t{feature_names[feature_index]}")

# Use only the top features for modeling
X_top_features = X.iloc[:, top_features_indices.flatten()]



# Now we can proceed with modeling using X_top_features


Principal Component 1:
	carries
	passes_pressure
	pass_targets
Principal Component 2:
	touches_def_3rd
	npxg_xa_per90
	xg_xa_per90
Principal Component 3:
	passes_dead
	passes_other_body
	shots_on_target_against
Principal Component 4:
	shots_on_target_against
	passes_other_body
	touches_def_pen_area
Principal Component 5:
	goals_pens_per90
	goals_assists_per90
	goals_assists_pens_per90
Principal Component 6:
	passes_pct_medium
	passes_pct_long
	passes_pct
Principal Component 7:
	corner_kicks
	crosses_into_penalty_area
	throw_ins
Principal Component 8:
	throw_ins
	pressures_mid_3rd
	position
Principal Component 9:
	sca_per90
	goals_per_shot
	shots_total_per90
Principal Component 10:
	assists
	assists_per90
	xa_net


Step 2:
  - Linear regression
  - Linear regression with polynomial features
  - Ridge Regression
    - Compare

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Assuming 'X' and 'y' have been defined as in your previous code snippet
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_top_features, y, test_size=0.3, random_state=42)

# Feature Engineering: Add interaction terms or polynomial features
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)

# Hyperparameter Tuning for Random Forest Regression
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=5)
grid_search_rf.fit(X_train, y_train)
best_params_rf = grid_search_rf.best_params_

# Model Selection: Random Forest Regression
rf_model = RandomForestRegressor(**best_params_rf, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate Random Forest Model
rf_y_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_y_pred)
print(f"Random Forest MSE: {rf_mse}")

# Evaluate Linear Regression Model with Polynomial Features
lr_poly = LinearRegression()
lr_poly.fit(X_poly_train, y_train)
X_poly_test = poly.transform(X_test)
lr_y_pred_poly = lr_poly.predict(X_poly_test)
lr_mse_poly = mean_squared_error(y_test, lr_y_pred_poly)
print(f"Polynomial Regression MSE: {lr_mse_poly}")
