## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

#### Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv("data/FIFA4.csv")


#### Show Top 5 Records

In [3]:
df.head()

Unnamed: 0,Age,Nationality,Overall,Potential,Club,Value,Wage,Special,Preferred Foot,International Reputation,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,29,Uruguay,92,92,FC Barcelona,€83M,€525K,2291,Right,5.0,...,85.0,83.0,30.0,45.0,38.0,27.0,25.0,31.0,33.0,37.0
1,28,Belgium,86,86,Roma,€37.5M,€130K,2290,Right,3.0,...,63.0,85.0,78.0,85.0,88.0,11.0,11.0,14.0,8.0,11.0
2,29,Chile,87,87,FC Bayern München,€41.5M,€180K,2285,Right,4.0,...,84.0,86.0,76.0,89.0,84.0,4.0,2.0,4.0,2.0,4.0
3,24,Austria,86,89,FC Bayern München,€41.5M,€140K,2279,Left,4.0,...,80.0,79.0,81.0,83.0,83.0,5.0,7.0,14.0,15.0,9.0
4,23,France,88,94,Manchester United,€71.5M,€225K,2271,Right,4.0,...,76.0,83.0,68.0,73.0,73.0,5.0,6.0,2.0,4.0,3.0


#### Data Preprocessing for model training

##### 1.Handle Categorical Columns

In [4]:
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


##### 1.1 Convert Height & Weight to Numeric

In [5]:
# Convert Height from "5'9"" to total inches
df["Height"] = df["Height"].apply(lambda x: int(x.split("'")[0]) * 12 + int(x.split("'")[1]) if isinstance(x, str) else x)

# Convert Weight from "165lbs" to integer
df["Weight"] = df["Weight"].apply(lambda x: int(x.replace("lbs", "")) if isinstance(x, str) else x)


In [6]:
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

Categorical columns: Index([], dtype='object')


##### 1.3 convert value and wage 

In [7]:
def convert_currency(value):
    if isinstance(value, str):  # Check if value is a string
        value = value.replace("€", "").replace("K", "000").replace("M", "000000")
        return float(value)
    return value  # Return as-is if already numeric

# Apply function to Value and Wage columns
df["Value"] = df["Value"].apply(convert_currency)
df["Wage"] = df["Wage"].apply(convert_currency)


##### 2. Feature Selection

In [8]:
important_features = [
    'Age', 'Overall', 'Potential','Dribbling', 'BallControl', 
    'Acceleration', 'Agility', 'Reactions','Aggression', 'Jumping', 'Stamina', 'LongShots',  'Vision', 'Penalties', 
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',  'GKPositioning', 
]

# Filter X to only keep the selected numerical features
X_filtered = df[important_features]

In [9]:
target = "Value"
features = [col for col in df.columns if col != target]


##### 3. Splitting Data into Training & Testing Sets

In [10]:
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


##### 4. Standardize numerical features

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


##### 4.Train the Model using Random Forest model with hyperparameter tuning

In [12]:
rf_model = RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42)
rf_model.fit(X_train, y_train)


##### 5.Make prediction

In [21]:
y_pred = rf_model.predict(X_test)

##### 6.evaluate model

In [22]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

r2, mae


(0.8638257862474628, 11.410481692260676)