In [2]:
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [3]:
response = requests.get("https://www.fotmob.com/api/playerData?id=1083323", headers = {
    "sec-ch-ua": "\"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": "\"Android\"",
    "x-mas": "eyJib2R5Ijp7InVybCI6Ii9hcGkvcGxheWVyRGF0YT9pZD0xMDgzMzIzIiwiY29kZSI6MTc0NzQyMTg5MzQxOCwiZm9vIjoicHJvZHVjdGlvbjo4OTMzMDRhNDBhYmUyZTM4YzJmOTE1NTk4YzAzMDZmZjMzZjRmYjE1LXVuZGVmaW5lZCJ9LCJzaWduYXR1cmUiOiJCQkY3ODA0ODhBQTM2M0ZBMEU0RjhGQzhCQjNBRTY4RiJ9"
  })


In [6]:
data = response.json()
stats_section = data['firstSeasonStats']['statsSection']['items']
filtered_stats = {}
for group in stats_section:
    group_title = group['title']
    for stat in group['items']:
        stat_title = stat['title']
        per90_value = stat['per90']
        
        # Round the per90 value to 2 decimal places
        if isinstance(per90_value, float):
            per90_value = round(per90_value, 2)
        
        # Store the stat with its title and per90 value
        filtered_stats[stat_title] = per90_value

for stat_name, stat_value in filtered_stats.items():
    print(f"{stat_name}: {stat_value}")


Goals: 0.13
xG: 0.07
xGOT: 0.11
xG excl. penalty: 0.07
Shots: 0.81
Shots on target: 0.36
Assists: 0.16
xA: 0.29
Accurate passes: 71.75
Pass accuracy: 88.73
Accurate long balls: 4.33
Long ball accuracy: 69.63
Chances created: 2.15
Successful crosses: 0.33
Cross accuracy: 17.24
Dribbles: 1.24
Dribbles success rate: 65.52
Touches: 97.35
Touches in opposition box: 1.72
Dispossessed: 1.07
Fouls won: 1.59
Tackles won: 1.04
Tackles won %: 54.24
Duels won: 4.98
Duels won %: 57.74
Aerials won: 0.23
Aerials won %: 53.85
Interceptions: 0.85
Blocked scoring attempt: 0.2
Fouls committed: 0.33
Recoveries: 8.04
Possession won final 3rd: 0.88
Dribbled past: 1.43
Yellow cards: 0.1
Red cards: 0


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Read data
df = pd.read_csv('test2.csv')

# Clean column names
df.columns = df.columns.str.strip().str.replace(' ', '_')

# Verify target column exists
if 'Market_Value' not in df.columns:
    # Fallback to last numeric column if 'Market_Value' not found
    numeric_cols = df.select_dtypes(include=['number']).columns
    df['Market_Value'] = df[numeric_cols[-1]]
    print(f"Using column '{numeric_cols[-1]}' as Market Value")

# Create feature matrix and target
X = df.drop(['Player', 'Market_Value'], axis=1)
y = df['Market_Value']

# Handle missing values
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Modified train-test split - remove stratification or handle duplicates
try:
    # First try with stratification
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=pd.qcut(y, q=5, duplicates='drop'))  # Handle duplicate bins
except ValueError:
    # Fallback to simple split if stratification fails
    print("Couldn't stratify - duplicate values found. Using simple split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42
    )

# Feature scaling
numeric_features = X.select_dtypes(include=['number']).columns
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Model training with adjusted parameters
model = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=5,  # Higher to prevent overfitting
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
print("\nModel Performance:")
print(f"MAE: ${mean_absolute_error(y_test, y_pred):.2f}M")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

# Feature importance
importances = model.feature_importances_
features = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print("\nTop 10 Important Features:")
print(features.head(10))

Couldn't stratify - duplicate values found. Using simple split.

Model Performance:
MAE: $29.03M
R²: -0.22

Top 10 Important Features:
                      Feature  Importance
32                 Recoveries    0.178281
34              Dribbled_past    0.130794
19  Touches_in_opposition_box    0.089286
35               Yellow_cards    0.089286
1                          xG    0.053571
4            xG_excl._penalty    0.041667
25                  Duels_won    0.035714
3               Penalty_goals    0.035693
16                   Dribbles    0.030036
31            Fouls_committed    0.029762


In [8]:
# 1. Get top 10 features from your model
top_10_features = features['Feature'].head(10).tolist()

# 2. Initialize dictionary with model's top features
required_features = {feature: 0 for feature in top_10_features}

# 3. Enhanced matching function
def find_matching_stat(api_stat_title, model_feature_name):
    # Convert to lowercase and remove special chars for comparison
    api_title = api_stat_title.lower().replace('%', '').replace('_', ' ').strip()
    model_feature = model_feature_name.lower().replace('_', ' ').strip()
    
    # Direct match
    if model_feature in api_title:
        return True
    
    # Special cases mapping
    aliases = {
        'accurate passes': ['passes completed', 'completed passes', 'accurate pass'],
        'pass accuracy': ['passing accuracy', 'pass success', 'pass completion %'],
        'dribbles success rate': ['successful dribbles', 'dribble success', 'dribbles won'],
        'duels won %': ['duel success rate', 'duels won', 'aerial duels won'],
        'possession won final 3rd': ['final third recoveries', 'attacking third tackles'],
        'xa': ['expected assists', 'expected goal assists'],
        'yellow cards': ['yellows', 'cards yellow']
    }
    
    # Check all possible aliases
    for alias in aliases.get(model_feature, [model_feature]):
        if alias in api_title:
            return True
    
    return False

# 4. Populate the dictionary from API response
print("Attempting to match API stats to model features...")
for group in stats_section:
    for stat in group['items']:
        stat_title = stat['title']
        per90_value = stat.get('per90', 0)
        
        # Convert and round numerical values
        if isinstance(per90_value, (int, float)):
            per90_value = round(float(per90_value), 2)
        
        # Try to match to each model feature
        for model_feature in required_features:
            if find_matching_stat(stat_title, model_feature):
                if required_features[model_feature] == 0:  # Only set if not already found
                    required_features[model_feature] = per90_value
                break




# 6. Create final DataFrame for prediction
player_data = pd.DataFrame([required_features])[top_10_features]  # Maintain correct column order
required_features


Attempting to match API stats to model features...


{'Recoveries': 8.04,
 'Dribbled_past': 1.43,
 'Touches_in_opposition_box': 1.72,
 'Yellow_cards': 0.1,
 'xG': 0.07,
 'xG_excl._penalty': 0,
 'Duels_won': 4.98,
 'Penalty_goals': 0,
 'Dribbles': 1.24,
 'Fouls_committed': 0.33}