In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [9]:
# Read the four input tables from CSV files in the working directory.
# The files are read in the order listed and bound to the names on the left.
(
    ball_to_ball_data,
    matchwise_player_stats,
    tournament_player_stats,
    tournament_team_stats,
) = map(
    pd.read_csv,
    (
        'ball_to_ball_data.csv',
        'matchwise_player_stats.csv',
        'tournament_player_stats.csv',
        'tournament_team_stats.csv',
    ),
)

# Preprocess ball-to-ball data
ball_to_ball_data['date'] = pd.to_datetime(ball_to_ball_data['date'])


def _ball_number_from_over(over):
    """Convert an ``<over>.<ball>`` value into a running ball count.

    The integer part is multiplied by 6 and the first decimal digit is added
    to it.

    The decimal digit is rounded to the nearest integer instead of being
    truncated: binary floating point makes ``(2.3 - 2) * 10`` evaluate to
    2.999..., which plain ``int()`` would turn into 2 rather than 3.
    """
    completed_overs = int(over)
    ball_in_over = int(round((over - completed_overs) * 10))
    return completed_overs * 6 + ball_in_over


ball_to_ball_data['ball_number'] = ball_to_ball_data['over'].apply(_ball_number_from_over)


In [10]:
# Sum the bat runs for every (match_id, striker) pair in the ball-by-ball data
# and flatten the result back into ordinary columns.
runs_by_match_and_striker = ball_to_ball_data.groupby(['match_id', 'striker'])['runs_of_bat']
player_runs = runs_by_match_and_striker.sum().reset_index()

# Left-join those totals onto the per-match player table, pairing `player`
# with `striker`; rows without a matching striker entry keep NaN in the
# joined columns.
matchwise_player_stats = pd.merge(
    matchwise_player_stats,
    player_runs,
    how='left',
    left_on=['match_id', 'player'],
    right_on=['match_id', 'striker'],
)

# Feature Engineering: Recent Form
# Per-player rolling mean of `runs_scored` over up to the last 3 rows
# (the current row included; fewer rows are used at the start of a player's
# history because min_periods=1). The result is indexed by (player, row label).
# NOTE(review): rows are taken in the frame's current order; confirm that this
# is chronological per player.
rolling_mean_runs = (
    matchwise_player_stats
    .groupby('player')['runs_scored']
    .rolling(window=3, min_periods=1)
    .mean()
)

# Flat helper table kept under its original name and column layout.
recent_form = rolling_mean_runs.reset_index()
recent_form = recent_form.rename(columns={'runs_scored': 'recent_form'})

# Attach the feature by index alignment instead of merging on the
# auto-generated `level_1` column: the frame keeps its own index, and
# re-running this cell overwrites the column rather than producing
# `recent_form_x` / `recent_form_y` duplicates.
matchwise_player_stats['recent_form'] = rolling_mean_runs.droplevel(0)

# Tournament-level player table: strike rate = runs per ball faced, times 100.
tournament_player_stats['strike_rate'] = (
    tournament_player_stats['runs_scored']
    .div(tournament_player_stats['balls_faced'])
    .mul(100)
)

# Tournament-level team table: wins divided by losses.
# NOTE(review): a row with matches_lost == 0 yields inf (or NaN when
# matches_won is also 0); confirm how downstream consumers should treat it.
tournament_team_stats['win_loss_ratio'] = (
    tournament_team_stats['matches_won']
    .div(tournament_team_stats['matches_lost'])
)

In [11]:
# Shared hold-out settings: 20% test rows, fixed shuffle seed.
split_options = dict(test_size=0.2, random_state=42)

# --- Player task: predict runs scored in a match -------------------------
# NOTE(review): `recent_form` is a rolling mean of `runs_scored` that includes
# the current row, so it carries information about the target; confirm
# whether that is intended.
player_features = matchwise_player_stats[['recent_form', 'balls_faced', 'fours', 'sixes']]
player_target = matchwise_player_stats['runs_scored']

# --- Team task: predict matches won ---------------------------------------
# NOTE(review): `win_loss_ratio` is computed from `matches_won`, which is also
# the target here; confirm whether that is intended.
team_features = tournament_team_stats[['total_runs_scored', 'total_wickets_taken', 'win_loss_ratio']]
team_target = tournament_team_stats['matches_won']

# Split each task into train and test partitions.
(
    X_player_train,
    X_player_test,
    y_player_train,
    y_player_test,
) = train_test_split(player_features, player_target, **split_options)

(
    X_team_train,
    X_team_test,
    y_team_train,
    y_team_test,
) = train_test_split(team_features, team_target, **split_options)

In [12]:
# Fit a 100-tree random forest (fixed seed) on the player training split.
player_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_player_train, y_player_train)

# Score the held-out player rows and report the mean absolute error.
player_predictions = player_model.predict(X_player_test)
player_mae = mean_absolute_error(y_player_test, player_predictions)

print(f'Player Prediction MAE: {player_mae}')

Player Prediction MAE: 1.8030384914035829


In [13]:
# Consistency: per-player rolling standard deviation of `runs_scored` over up
# to the last 3 rows (current row included).
# A window holding a single value has no sample standard deviation, so those
# rows come back as NaN and are filled with 0.
# The filled result is assigned back to the frame explicitly: calling
# `df['col'].fillna(..., inplace=True)` acts on an intermediate object and
# does not update the frame under pandas Copy-on-Write.
matchwise_player_stats['consistency'] = (
    matchwise_player_stats
    .groupby('player')['runs_scored']
    .transform(lambda runs: runs.rolling(window=3, min_periods=1).std())
    .fillna(0)
)

# Inputs and target for the consistency regression.
# NOTE(review): both `recent_form` and `consistency` are rolling statistics of
# `runs_scored`; confirm that using one to predict the other is intended.
consistency_target = matchwise_player_stats['consistency']
consistency_features = matchwise_player_stats[['recent_form', 'balls_faced', 'fours', 'sixes']]

# Hold out 20% of the rows with a fixed shuffle seed.
(
    X_consistency_train,
    X_consistency_test,
    y_consistency_train,
    y_consistency_test,
) = train_test_split(
    consistency_features,
    consistency_target,
    test_size=0.2,
    random_state=42,
)

# Ordinary least-squares fit on the training partition.
consistency_model = LinearRegression().fit(X_consistency_train, y_consistency_train)

# Predict the held-out rows and report the mean absolute error.
consistency_predictions = consistency_model.predict(X_consistency_test)
consistency_mae = mean_absolute_error(y_consistency_test, consistency_predictions)

print(f'Player Consistency Prediction MAE: {consistency_mae}')

Player Consistency Prediction MAE: 5.7138839272737245
