In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import glob
import os


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [3]:
def get_batting_cols(k):
    """Return the batting feature column names for a window of ``k`` matches.

    Names that carry a ``{k}`` placeholder are specialised to the requested
    window (``last_5_matches_Runs_sum`` for ``k=5``); all other names are
    returned verbatim. The order of the returned list is fixed.
    """
    name_templates = (
        # Raw counting stats: career totals next to their rolling-window twins.
        "Total_matches_played_sum",
        "cumulative_Innings Batted_sum", "last_{k}_matches_Innings Batted_sum",
        "cumulative_Runs_sum", "last_{k}_matches_Runs_sum",
        "cumulative_Fours_sum", "last_{k}_matches_Fours_sum",
        "cumulative_Sixes_sum", "last_{k}_matches_Sixes_sum",
        "cumulative_Outs_sum", "last_{k}_matches_Outs_sum",
        "cumulative_Dot Balls_sum", "last_{k}_matches_Dot Balls_sum",
        "cumulative_Balls Faced_sum", "last_{k}_matches_Balls Faced_sum",
        # Milestone counters exist only for the rolling window.
        "last_{k}_matches_centuries_sum",
        "last_{k}_matches_half_centuries_sum",
        "last_{k}_matches_duck_outs_sum",
        "last_year_avg_Runs",
        # Derived ratios over the whole career ...
        "cumulative_derived_Batting Strike Rate",
        "cumulative_derived_Batting Avg",
        "cumulative_derived_Mean Score",
        "cumulative_derived_Boundary%",
        "cumulative_derived_Mean Balls Faced",
        "cumulative_derived_Dismissal Rate",
        # ... and over the rolling window.
        "last_{k}_matches_derived_Batting Strike Rate",
        "last_{k}_matches_derived_Batting Avg",
        "last_{k}_matches_derived_Mean Score",
        "last_{k}_matches_derived_Boundary%",
        "last_{k}_matches_derived_Mean Balls Faced",
        "last_{k}_matches_derived_Dismissal Rate",
        # Context-specific run totals (venue / opposition / match type).
        "cumulative_Venue_Runs_sum", "last_{k}_matches_Venue_Runs_sum",
        "cumulative_Opposition_Runs_sum", "last_{k}_matches_Opposition_Runs_sum",
        "cumulative_match_type_Runs_sum", "last_{k}_matches_match_type_Runs_sum",
        "venue_avg_runs_sum", "league_avg_runs_sum",
    )
    return [template.format(k=k) for template in name_templates]

# Folder holding one "<k>_ODI.csv" feature file per look-back window k.
data_folder = "../data/processed/combined/"


def _window_from_path(path):
    """Return the integer k encoded as the filename prefix ("10_ODI.csv" -> 10)."""
    return int(os.path.basename(path).split("_")[0])


# Process the files in ascending k so the skip messages come out in a stable order.
csv_files = sorted(glob.glob(os.path.join(data_folder, "*_ODI.csv")), key=_window_from_path)

results = []

target_var = "batting_fantasy_points"

# Column layout of the summary table. Passing it explicitly to the DataFrame
# constructor keeps the sort below valid even when `results` is empty.
summary_columns = [
    "k",
    "Max_Correlation_Feature",
    "Max_Correlation_Value",
    "Avg_Correlation",
    "Highly_Correlated_Features",
]

for file in csv_files:
    k = _window_from_path(file)
    df = pd.read_csv(file)

    # Keep only the expected batting features that this file actually provides.
    batting_cols = get_batting_cols(k)
    numerical_cols = [col for col in batting_cols if col in df.columns]

    if target_var not in df.columns or not numerical_cols:
        print(f"Skipping k={k} due to missing columns.")
        continue

    # Absolute correlation of every feature with the target, strongest first.
    correlation_matrix = df[numerical_cols + [target_var]].corr()
    target_correlation = correlation_matrix[target_var].drop(target_var).abs().sort_values(ascending=False)

    results.append({
        "k": k,
        "Max_Correlation_Feature": target_correlation.idxmax(),
        "Max_Correlation_Value": target_correlation.max(),
        "Avg_Correlation": target_correlation.mean(),
        # Number of features whose absolute correlation exceeds 0.3.
        "Highly_Correlated_Features": (target_correlation > 0.3).sum(),
    })

results_df = pd.DataFrame(results, columns=summary_columns).sort_values(
    by="Max_Correlation_Value", ascending=False
)
print("\n**Best k values based on correlation analysis:**")
print(results_df)

if results_df.empty:
    # No file matched, or every file was skipped: previously this crashed with
    # a KeyError in sort_values; report it instead and leave best_k unset.
    best_k = None
    print("\nNo usable *_ODI.csv file found; optimal k could not be determined.")
else:
    # The top row holds the k whose single best feature correlates most strongly.
    best_k = int(results_df.iloc[0]["k"])
    print(f"\nOptimal k for batting fantasy points: {best_k}")

Skipping k=15 due to missing columns.

**Best k values based on correlation analysis:**
    k        Max_Correlation_Feature  Max_Correlation_Value  Avg_Correlation  \
4  10       last_10_matches_Runs_sum               0.375113         0.267899   
0   3  cumulative_derived_Mean Score               0.373465         0.248453   
1   4  cumulative_derived_Mean Score               0.373465         0.253559   
2   5  cumulative_derived_Mean Score               0.373465         0.257794   
3   7  cumulative_derived_Mean Score               0.373465         0.263384   

   Highly_Correlated_Features  
4                          14  
0                          10  
1                          12  
2                          13  
3                          14  

Optimal k for batting fantasy points: 10


In [5]:
# Path to the folder containing CSV files
folder_path = "../data/processed/combined/"

# Define the flexible number of top correlated features to display
TOP_N = 15

# Define column mappings: for each category, the target column and the candidate
# feature columns. "{k}" is a placeholder filled in per file with str.format(k=k).
category_cols = {
    "batting": {
        "target": "batting_fantasy_points",
        "columns": [
            "Total_matches_played_sum", "cumulative_Innings Batted_sum", "last_{k}_matches_Innings Batted_sum",
            "cumulative_Runs_sum", "last_{k}_matches_Runs_sum", "cumulative_Fours_sum", "last_{k}_matches_Fours_sum",
            "cumulative_Sixes_sum", "last_{k}_matches_Sixes_sum", "cumulative_Outs_sum", "last_{k}_matches_Outs_sum",
            "cumulative_Dot Balls_sum", "last_{k}_matches_Dot Balls_sum", "cumulative_Balls Faced_sum",
            "last_{k}_matches_Balls Faced_sum", "last_{k}_matches_centuries_sum", "last_{k}_matches_half_centuries_sum",
            "last_{k}_matches_duck_outs_sum", "last_year_avg_Runs",
            "cumulative_derived_Batting Strike Rate", "cumulative_derived_Batting Avg", "cumulative_derived_Mean Score",
            "cumulative_derived_Boundary%", "cumulative_derived_Mean Balls Faced", "cumulative_derived_Dismissal Rate",
            "last_{k}_matches_derived_Batting Strike Rate", "last_{k}_matches_derived_Batting Avg", "last_{k}_matches_derived_Mean Score",
            "last_{k}_matches_derived_Boundary%", "last_{k}_matches_derived_Mean Balls Faced", "last_{k}_matches_derived_Dismissal Rate",
            "cumulative_Venue_Runs_sum", "last_{k}_matches_Venue_Runs_sum", "cumulative_Opposition_Runs_sum",
            "last_{k}_matches_Opposition_Runs_sum", "cumulative_match_type_Runs_sum", "last_{k}_matches_match_type_Runs_sum",
            "venue_avg_runs_sum", "league_avg_runs_sum"
        ]
    },
    "bowling": {
        "target": "bowling_fantasy_points",
        "columns": [
            "Total_matches_played_sum", "cumulative_Innings Bowled_sum", "last_{k}_matches_Innings Bowled_sum",
            "cumulative_Balls Bowled_sum", "last_{k}_matches_Balls Bowled_sum", "cumulative_Wickets_sum", "last_{k}_matches_Wickets_sum",
            "cumulative_LBWs_sum", "last_{k}_matches_LBWs_sum", "cumulative_Bowleds_sum", "last_{k}_matches_Bowleds_sum",
            "cumulative_Extras_sum", "last_{k}_matches_Extras_sum", "cumulative_Maiden Overs_sum", "last_{k}_matches_Maiden Overs_sum",
            "cumulative_Runsgiven_sum", "last_{k}_matches_Runsgiven_sum", "cumulative_Dot Balls Bowled_sum",
            "last_{k}_matches_Dot Balls Bowled_sum", "cumulative_Foursgiven_sum", "last_{k}_matches_Foursgiven_sum",
            "cumulative_Sixesgiven_sum", "last_{k}_matches_Sixesgiven_sum", "last_{k}_matches_3wickets_sum",
            "last_{k}_matches_4wickets_sum", "last_{k}_matches_5wickets_sum", "last_year_avg_Wickets",
            "cumulative_derived_Economy Rate", "cumulative_derived_Bowling Dot Ball%", "cumulative_derived_Boundary Given%",
            "cumulative_derived_Bowling Avg", "cumulative_derived_Bowling Strike Rate",
            "last_{k}_matches_derived_Economy Rate", "last_{k}_matches_derived_Bowling Dot Ball%",
            "last_{k}_matches_derived_Boundary Given%", "last_{k}_matches_derived_Bowling Avg",
            "last_{k}_matches_derived_Bowling Strike Rate", "cumulative_Venue_Wickets_sum",
            "last_{k}_matches_Venue_Wickets_sum", "cumulative_Opposition_Wickets_sum", "last_{k}_matches_Opposition_Wickets_sum",
            "cumulative_match_type_Wickets_sum", "last_{k}_matches_match_type_Wickets_sum", "venue_avg_wickets_sum", "league_avg_wickets_sum"
        ]
    },
    "fielding": {
        "target": "fielding_fantasy_points",
        "columns": [
            "Total_matches_played_sum", "cumulative_Stumpings_sum", "last_{k}_matches_Stumpings_sum",
            "cumulative_Catches_sum", "last_{k}_matches_Catches_sum", "cumulative_direct run_outs_sum",
            "last_{k}_matches_direct run_outs_sum", "cumulative_indirect run_outs_sum",
            "last_{k}_matches_indirect run_outs_sum"
        ]
    }
}

# Store results (one dict per (k, category) pair that could be analysed)
results = []

# Column layout of the summary table; given explicitly so that the DataFrame
# (and the sort at the end of the cell) is still valid when `results` is empty.
result_columns = [
    "k", "Category", "Best Feature", "Best Correlation",
    "Avg Correlation", "Features > 0.3", "Top Features",
]

# Process each k_ODI.csv file
for file_path in glob.glob(os.path.join(folder_path, "*_ODI.csv")):
    k = int(os.path.basename(file_path).split("_")[0])  # Extract k from filename
    df = pd.read_csv(file_path)

    for category, details in category_cols.items():
        target_col = details["target"]
        numerical_cols = [col.format(k=k) for col in details["columns"]]
        numerical_cols = [col for col in numerical_cols if col in df.columns]  # Ensure columns exist

        # Check THIS category's target. The earlier version tested `target_var`,
        # a batting-only name defined in another cell, so a missing bowling or
        # fielding target was not caught here and the cell depended on hidden state.
        if target_col not in df.columns or not numerical_cols:
            print(f"Skipping k={k} due to missing columns.")
            continue

        # Compute absolute correlation of every feature with the target, strongest first
        corr_matrix = df[numerical_cols + [target_col]].corr()
        target_corr = corr_matrix[target_col].drop(target_col).abs().sort_values(ascending=False)

        # Get top N correlated features as a two-column frame
        top_features = target_corr.head(TOP_N).reset_index()
        top_features.columns = ["Feature", "Correlation"]

        # Store results
        results.append({
            "k": k,
            "Category": category,
            "Best Feature": top_features.iloc[0]["Feature"],
            "Best Correlation": top_features.iloc[0]["Correlation"],
            "Avg Correlation": target_corr.mean(),
            "Features > 0.3": (target_corr > 0.3).sum(),
            "Top Features": top_features
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=result_columns)
# Displayed as the cell output: best average correlation first within each category.
results_df.sort_values(by=["Category", "Avg Correlation"], ascending=[True, False])

Skipping k=15 due to missing columns.
Skipping k=15 due to missing columns.
Skipping k=15 due to missing columns.


Unnamed: 0,k,Category,Best Feature,Best Correlation,Avg Correlation,Features > 0.3,Top Features
3,10,batting,last_10_matches_Runs_sum,0.375113,0.267899,14,Feature ...
12,7,batting,cumulative_derived_Mean Score,0.373465,0.263384,14,Feature C...
0,5,batting,cumulative_derived_Mean Score,0.373465,0.257794,13,Feature C...
6,4,batting,cumulative_derived_Mean Score,0.373465,0.253559,12,Feature C...
9,3,batting,cumulative_derived_Mean Score,0.373465,0.248453,10,Feature C...
13,7,bowling,last_7_matches_Dot Balls Bowled_sum,0.501997,0.29847,17,Feature ...
4,10,bowling,last_10_matches_Dot Balls Bowled_sum,0.491068,0.298054,18,Feature ...
1,5,bowling,last_5_matches_Dot Balls Bowled_sum,0.507828,0.297182,16,Feature ...
7,4,bowling,last_4_matches_Dot Balls Bowled_sum,0.508825,0.295963,17,Feature ...
10,3,bowling,last_3_matches_Dot Balls Bowled_sum,0.507677,0.2935,16,Feature...


In [13]:
# Write the summary to CSV, ordered by category and then by descending average correlation.
(
    results_df
    .sort_values(by=["Category", "Avg Correlation"], ascending=[True, False])
    .to_csv("../data/processed/evaluation/k_correlation_analysis.csv", index=False)
)

In [18]:
# Path to the folder containing CSV files
folder_path = "../data/processed/combined/"

# Destination of the summary CSV written at the end of this cell
output_path = "../data/processed/evaluation/k_correlation_analysis2.csv"

# Define the flexible number of top correlated features to display
TOP_N = 15

# Store results (one dict per (k, category) pair that could be analysed)
results = []

# Column layout of the summary table; given explicitly so that the DataFrame
# (and the sort below) is still valid when `results` is empty.
result_columns = [
    "k", "Category", "Best Feature", "Best Correlation", "Avg Correlation",
    "Avg last_k Correlation", "Features > 0.3", "Top Features", "Top last_k Features",
]

# Process each k_ODI.csv file
for file_path in glob.glob(os.path.join(folder_path, "*_ODI.csv")):
    k = int(os.path.basename(file_path).split("_")[0])  # Extract k from filename
    df = pd.read_csv(file_path)

    for category, details in category_cols.items():
        target_col = details["target"]
        numerical_cols = [col.format(k=k) for col in details["columns"]]
        numerical_cols = [col for col in numerical_cols if col in df.columns]  # Ensure columns exist

        if target_col not in df.columns or not numerical_cols:
            print(f"Skipping k={k} due to missing columns.")
            continue

        # Compute absolute correlation of every feature with the target, strongest first
        corr_matrix = df[numerical_cols + [target_col]].corr()
        target_corr = corr_matrix[target_col].drop(target_col).abs().sort_values(ascending=False)

        # Get top N correlated features
        top_features = target_corr.head(TOP_N).reset_index()
        top_features.columns = ["Feature", "Correlation"]

        # Filter for 'last_k' features. Every windowed column template starts with
        # "last_{k}_matches_", so match that exact prefix rather than searching for
        # the substring "last_{k}" as a regular expression anywhere in the name.
        is_last_k = target_corr.index.str.startswith(f"last_{k}_matches_")
        last_k_features = target_corr[is_last_k]
        avg_last_k_corr = last_k_features.mean() if not last_k_features.empty else 0

        # Get top N 'last_k' correlated features (order inherited from target_corr)
        top_last_k_features = last_k_features.head(TOP_N).reset_index()
        top_last_k_features.columns = ["Feature", "Correlation"]

        # Store results
        results.append({
            "k": k,
            "Category": category,
            "Best Feature": top_features.iloc[0]["Feature"],
            "Best Correlation": top_features.iloc[0]["Correlation"],
            "Avg Correlation": target_corr.mean(),
            "Avg last_k Correlation": avg_last_k_corr,
            "Features > 0.3": (target_corr > 0.3).sum(),
            "Top Features": top_features,
            "Top last_k Features": top_last_k_features
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=result_columns)

# Sort based on average last_k correlation
results_df = results_df.sort_values(by=["Category", "Avg last_k Correlation"], ascending=[True, False])

# Make sure the evaluation folder exists so the export works on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# NOTE(review): "Top Features" / "Top last_k Features" hold nested DataFrames;
# presumably they end up in the CSV as their text repr. Confirm that is intended.
results_df.to_csv(output_path, index=False)


In [19]:
# Show the summary table (one row per analysed k and category) as this cell's output.
results_df

Unnamed: 0,k,Category,Best Feature,Best Correlation,Avg Correlation,Avg last_k Correlation,Features > 0.3,Top Features,Top last_k Features
42,25,batting,last_25_matches_derived_Mean Score,0.376104,0.271834,0.285173,16,Feature ...,Feature ...
21,18,batting,last_18_matches_Runs_sum,0.380775,0.271807,0.28512,15,Feature ...,Feature ...
9,20,batting,last_20_matches_Runs_sum,0.379112,0.271797,0.285098,16,Feature ...,Feature ...
39,16,batting,last_16_matches_Runs_sum,0.381028,0.271394,0.284293,15,Feature ...,Feature ...
0,15,batting,last_15_matches_Runs_sum,0.38046,0.270988,0.283481,15,Feature ...,Feature ...
3,13,batting,last_13_matches_Runs_sum,0.379705,0.270233,0.281972,15,Feature ...,Feature ...
6,12,batting,last_12_matches_Runs_sum,0.378297,0.2695,0.280504,14,Feature ...,Feature ...
18,10,batting,last_10_matches_Runs_sum,0.375113,0.267899,0.277138,14,Feature ...,Feature ...
33,9,batting,cumulative_derived_Mean Score,0.37338,0.266502,0.274509,14,Feature C...,Feature C...
12,8,batting,cumulative_derived_Mean Score,0.37338,0.265078,0.271662,14,Feature C...,Feature C...
