# T20 Cricket Data Exploration for Linear Regression

This notebook explores T20 cricket data to understand patterns and prepare for linear regression modeling.

## Objectives
1. Load and explore T20/IT20 male cricket data
2. Analyze target variable distribution (innings totals)
3. Examine feature relationships and correlations
4. Assess data quality and completeness
5. Generate insights for model development

In [None]:
# Import required libraries
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

# Import project modules
from cricket.ml.data_preparation import T20DataPreparator, load_cricket_data
from cricket.transformation.match import (
    get_current_score,
    get_wickets_fallen,
    get_overs_remaining,
)

# Notebook-wide plotting defaults. The style sheet is applied first so the
# explicit figure size below is not overwritten by the sheet's own value.
FIGURE_SIZE = (12, 8)
plt.style.use("seaborn-v0_8")
plt.rcParams["figure.figsize"] = FIGURE_SIZE
sns.set_palette("husl")

# Reaching this line means every import in the cell resolved.
print("Libraries imported successfully!")

## 1. Data Loading and Initial Exploration

In [None]:
# Relative path to the ball-level parquet file read by this notebook.
DATA_PATH = "../data/ball_level_data.parquet"

print(f"Loading data from: {DATA_PATH}")
raw_data = load_cricket_data(DATA_PATH)

# Report the frame's dimensions and column names before previewing it.
frame_shape = raw_data.shape
column_names = raw_data.columns
print(f"\nDataset shape: {frame_shape}")
print(f"Columns: {column_names}")
print("\nFirst few rows:")

# Kept as the cell's final expression so the notebook renders the preview.
raw_data.head()

In [None]:
# Basic data summary: match/ball counts plus the number of distinct matches
# per match format and per gender.
match_ids = raw_data.select("match_id")

print("Data Summary:")
print(f"Total matches: {match_ids.n_unique()}")
print(f"Total balls: {len(raw_data)}")
# The two values printed here are the smallest and largest match_id, not
# calendar dates, so the line is labelled as an ID range.
print(f"Match ID range: {match_ids.min().item()} to {match_ids.max().item()}")

# Match format distribution (distinct matches per match_type, largest first)
print("\nMatch format distribution:")
format_counts = (
    raw_data.group_by("match_type")
    .agg(pl.col("match_id").n_unique().alias("matches"))
    .sort("matches", descending=True)
)
print(format_counts)

# Gender distribution (distinct matches per gender, largest first)
print("\nGender distribution:")
gender_counts = (
    raw_data.group_by("gender")
    .agg(pl.col("match_id").n_unique().alias("matches"))
    .sort("matches", descending=True)
)
print(gender_counts)

## 2. T20 Data Filtering and Preparation

In [None]:
# Build the preparator; sample_overs lists the over marks it is configured with.
preparator = T20DataPreparator(sample_overs=[5.0, 10.0, 15.0])

# Keep only the T20/IT20 male matches.
t20_data = preparator.filter_t20_matches(raw_data)

# Report how much data survived the filter.
print("After T20/male filtering:")
remaining_matches = t20_data.select("match_id").n_unique()
print(f"Matches: {remaining_matches}")
remaining_balls = len(t20_data)
print(f"Balls: {remaining_balls}")
reduction_pct = (1 - remaining_balls / len(raw_data)) * 100
print(f"Reduction: {reduction_pct:.1f}%")

# Run the preparator's quality checks and list every entry it reports.
quality_summary = preparator.validate_data_quality(t20_data)
print("\nData Quality Summary:")
for check_name in quality_summary:
    print(f"{check_name}: {quality_summary[check_name]}")

## 3. Add Match State Features

In [None]:
# Enrich the ball-level frame with the three match-state features, one
# transformation at a time, confirming each step as it completes.
print("Adding match state features...")

feature_steps = (
    (get_current_score, "current_score"),
    (get_wickets_fallen, "wickets_fallen"),
    (get_overs_remaining, "overs_remaining"),
)
for add_feature, feature_name in feature_steps:
    # Each helper takes the frame and returns the frame to carry forward.
    t20_data = add_feature(t20_data)
    print(f"✓ Added {feature_name}")

column_total = len(t20_data.columns)
print(f"\nDataset now has {column_total} columns")
print("New feature columns: current_score, wickets_fallen, overs_remaining")

In [None]:
# Histogram of each match-state feature over all T20 balls, followed by the
# frame's describe() summary for the same columns.
feature_cols = ["current_score", "wickets_fallen", "overs_remaining"]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# One panel per feature, left to right in feature_cols order.
for panel, column in zip(axes, feature_cols):
    values = t20_data.select(column).to_pandas()[column]
    panel.hist(values, bins=50, alpha=0.7, edgecolor="black")
    panel.set_title(f"Distribution of {column}")
    panel.set_xlabel(column)
    panel.set_ylabel("Frequency")
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics for features
print("\nFeature Summary Statistics:")
feature_stats = t20_data.select(feature_cols).describe()
print(feature_stats)

## 4. Target Variable Analysis

In [None]:
# Build the innings-level target frame and summarise total_runs_innings.
target_df = preparator.create_target_variable(t20_data)

print(f"Created {len(target_df)} innings targets")
print("Target variable statistics:")
print(target_df.select("total_runs_innings").describe())

# pandas Series of innings totals; reused by later cells.
runs_data = target_df.select("total_runs_innings").to_pandas()["total_runs_innings"]
mean_runs = runs_data.mean()
median_runs = runs_data.median()

# Left panel: histogram with mean/median markers. Right panel: box plot.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 6))

hist_ax.hist(runs_data, bins=30, alpha=0.7, edgecolor="black")
reference_lines = (
    (mean_runs, "red", f"Mean: {mean_runs:.1f}"),
    (median_runs, "orange", f"Median: {median_runs:.1f}"),
)
for position, colour, caption in reference_lines:
    hist_ax.axvline(position, color=colour, linestyle="--", label=caption)
hist_ax.set_title("Distribution of T20 Innings Totals")
hist_ax.set_xlabel("Total Runs")
hist_ax.set_ylabel("Frequency")
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

box_ax.boxplot(runs_data)
box_ax.set_title("Box Plot of T20 Innings Totals")
box_ax.set_ylabel("Total Runs")
box_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary: spread of the target plus selected percentiles.
print("\nTarget Variable Analysis:")
print(f"Range: {runs_data.min():.0f} - {runs_data.max():.0f} runs")
print(f"Std Dev: {runs_data.std():.1f} runs")
print("\nPercentiles:")
percentile_levels = [10, 25, 50, 75, 90, 95]
percentile_values = np.percentile(runs_data, percentile_levels)
for level, runs in zip(percentile_levels, percentile_values):
    print(f"  {level}th: {runs:.0f} runs")

## 5. Feature Sampling and Analysis

In [None]:
# Take feature snapshots via the preparator, count them per over mark, and
# attach each innings' final total so features and target sit in one frame.
feature_samples = preparator.sample_features_at_overs(t20_data)

print(f"Created {len(feature_samples)} feature samples")
print(f"Sample points: {preparator.sample_overs}")

# Rows per sample_over value, in ascending over order.
# NOTE(review): recent Polars releases deprecate the zero-argument pl.count()
# in favour of pl.len() — switch once the project's minimum Polars is confirmed.
per_over = feature_samples.group_by("sample_over").agg(pl.count().alias("samples"))
sample_counts = per_over.sort("sample_over")
print("\nSamples per over mark:")
print(sample_counts)

# Inner join on (match_id, innings_number): only samples whose innings has a
# target row are kept.
join_keys = ["match_id", "innings_number"]
innings_totals = target_df.select([*join_keys, "total_runs_innings"])
joined_data = feature_samples.join(innings_totals, on=join_keys, how="inner")

print(f"\nJoined data shape: {joined_data.shape}")
print(f"Features available for modeling: {len(joined_data)}")

In [None]:
# 2x2 figure: a box plot per feature grouped by sample_over, plus a scatter
# of current score against the final total with one series per over mark.

# pandas copy of the joined frame; later cells reuse it for plotting.
joined_pd = joined_data.to_pandas()

feature_cols = ["current_score", "wickets_fallen", "overs_remaining"]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

# First three panels: distribution of each feature at every over mark.
for panel, column in zip(axes, feature_cols):
    sns.boxplot(data=joined_pd, x="sample_over", y=column, ax=panel)
    panel.set_title(f"{column} by Over Mark")
    panel.grid(True, alpha=0.3)

# Fourth panel: target vs current score, one scatter series per over mark.
scatter_panel = axes[3]
for over_mark in preparator.sample_overs:
    at_mark = joined_pd.loc[joined_pd["sample_over"] == over_mark]
    scatter_panel.scatter(
        at_mark["current_score"],
        at_mark["total_runs_innings"],
        label=f"After {over_mark} overs",
        alpha=0.6,
    )

scatter_panel.set_xlabel("Current Score")
scatter_panel.set_ylabel("Final Total")
scatter_panel.set_title("Current Score vs Final Total by Over Mark")
scatter_panel.legend()
scatter_panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Feature Correlation Analysis

In [None]:
# Correlation matrix over the three features and the target, drawn as a
# heatmap and then listed as text.
analysis_cols = [
    "current_score",
    "wickets_fallen",
    "overs_remaining",
    "total_runs_innings",
]
corr_data = joined_pd[analysis_cols]

# DataFrame.corr() with its default settings; later cells read this matrix.
correlation_matrix = corr_data.corr()

# Diverging colour map centred on zero so the sign of r is readable.
heatmap_options = dict(annot=True, cmap="RdBu_r", center=0, square=True, linewidths=0.5)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title("Feature Correlation Matrix", fontweight="bold")
plt.tight_layout()
plt.show()

# Feature-to-target correlations, largest first; the target's own entry is skipped.
print("Correlation with Target Variable (total_runs_innings):")
target_corr = correlation_matrix["total_runs_innings"].sort_values(ascending=False)
for name, r_value in target_corr.items():
    if name == "total_runs_innings":
        continue
    print(f"  {name:15}: {r_value:6.3f}")

# Each unordered pair of features exactly once, in list order.
print("\nFeature Inter-correlations:")
feature_cols = ["current_score", "wickets_fallen", "overs_remaining"]
feature_pairs = [
    (first, second)
    for position, first in enumerate(feature_cols)
    for second in feature_cols[position + 1 :]
]
for first, second in feature_pairs:
    pair_r = correlation_matrix.loc[first, second]
    print(f"  {first} vs {second}: {pair_r:6.3f}")

## 7. Scatter Plot Matrix

In [None]:
# 3x3 scatter-plot matrix: histograms on the diagonal, pairwise scatters
# annotated with r everywhere else.
plot_cols = ["current_score", "wickets_fallen", "overs_remaining"]

fig, axes = plt.subplots(3, 3, figsize=(15, 15))

# The row selects the y variable, the column selects the x variable.
for y_name, axes_row in zip(plot_cols, axes):
    for x_name, ax in zip(plot_cols, axes_row):
        if x_name != y_name:
            # Off-diagonal panel: scatter of the pair with its correlation.
            ax.scatter(joined_pd[x_name], joined_pd[y_name], alpha=0.5)
            ax.set_xlabel(x_name)
            ax.set_ylabel(y_name)

            pair_r = joined_pd[y_name].corr(joined_pd[x_name])
            ax.text(
                0.05,
                0.95,
                f"r = {pair_r:.3f}",
                transform=ax.transAxes,
                bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8),
            )
        else:
            # Diagonal panel: distribution of the single variable.
            ax.hist(joined_pd[y_name], bins=30, alpha=0.7, edgecolor="black")
            ax.set_title(f"Distribution of {y_name}")

        ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Target vs Features Analysis

In [None]:
# One panel per feature: scatter against the target, a fitted degree-1 trend
# line, and the correlation coefficient in the corner.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

target_values = joined_pd["total_runs_innings"]

for ax, feature in zip(axes, ("current_score", "wickets_fallen", "overs_remaining")):
    feature_values = joined_pd[feature]

    ax.scatter(feature_values, target_values, alpha=0.5)

    # Least-squares straight line, evaluated at the sorted feature values so
    # the dashed line is drawn left to right.
    line_coeffs = np.polyfit(feature_values, target_values, 1)
    x_sorted = np.sort(feature_values.to_numpy())
    ax.plot(
        x_sorted,
        np.polyval(line_coeffs, x_sorted),
        "r--",
        alpha=0.8,
        linewidth=2,
    )

    # Correlation annotation, positioned in axes coordinates.
    r_value = feature_values.corr(target_values)
    ax.text(
        0.05,
        0.95,
        f"r = {r_value:.3f}",
        transform=ax.transAxes,
        bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8),
        fontsize=12,
        fontweight="bold",
    )

    ax.set_xlabel(feature, fontsize=12)
    ax.set_ylabel("Total Runs (Target)", fontsize=12)
    ax.set_title(f"Target vs {feature}", fontsize=14, fontweight="bold")
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Data Quality Assessment

In [None]:
# Three quality checks on the modelling frame: null counts per column,
# IQR-based outlier counts, and non-null counts per over mark.
print("Missing Values Analysis:")
missing_analysis = joined_data.null_count()
print(missing_analysis)

# A value is an outlier when it lies more than 1.5 * IQR outside the quartiles.
print("\nOutlier Analysis:")
outlier_cols = [
    "current_score",
    "wickets_fallen",
    "overs_remaining",
    "total_runs_innings",
]

for column in outlier_cols:
    values = joined_pd[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)

    lower_bound = first_quartile - fence_width
    upper_bound = third_quartile + fence_width

    beyond_fences = (values < lower_bound) | (values > upper_bound)
    outliers = values[beyond_fences]
    outlier_count = len(outliers)
    outlier_pct = outlier_count / len(values) * 100

    print(f"  {column:20}: {outlier_count:4d} outliers ({outlier_pct:4.1f}%)")
    if outlier_count == 0:
        continue
    # Only columns with outliers get the observed range and fence values.
    print(
        f"                       Range: {values.min():.1f} - {values.max():.1f}"
    )
    print(f"                       Bounds: {lower_bound:.1f} - {upper_bound:.1f}")

# Per over mark: row count followed by the non-null count of each feature.
# NOTE(review): recent Polars releases deprecate the zero-argument pl.count()
# in favour of pl.len() — switch once the project's minimum Polars is confirmed.
print("\nData Completeness by Over Mark:")
completeness_exprs = [pl.count().alias("total_samples")]
completeness_exprs += [
    pl.col(name).is_not_null().sum().alias(f"{name}_complete")
    for name in ("current_score", "wickets_fallen", "overs_remaining")
]
completeness = (
    joined_data.group_by("sample_over").agg(completeness_exprs).sort("sample_over")
)

print(completeness)

## 10. Insights and Conclusions

In [None]:
# Generate insights
#
# Prints a text report: dataset summary, target distribution, feature/target
# correlations and a short readiness checklist. Relies on objects built in
# earlier cells: target_df, joined_data, preparator, runs_data and
# correlation_matrix.

# |r| cut-offs. STRONG/MODERATE drive the per-feature strength label and the
# multicollinearity warning; READINESS is the bar a feature must clear to be
# listed in the readiness check. The readiness messages quote the threshold
# instead of calling it "strong", so they cannot contradict the labels.
STRONG_CORR = 0.7
MODERATE_CORR = 0.4
READINESS_CORR = 0.5

# Declared locally so the cell does not silently depend on a list left behind
# by whichever earlier cell ran last.
feature_cols = ["current_score", "wickets_fallen", "overs_remaining"]

print("=" * 60)
print("T20 LINEAR REGRESSION - DATA EXPLORATION INSIGHTS")
print("=" * 60)

# Dataset summary
print("\n📊 DATASET SUMMARY:")
print(f"   • Total T20/IT20 matches: {target_df.select('match_id').n_unique()}")
print(f"   • Total innings analyzed: {len(target_df)}")
print(f"   • Feature samples for modeling: {len(joined_data)}")
print(f"   • Sample points: {preparator.sample_overs} overs")

# Target variable insights
print("\n🎯 TARGET VARIABLE (Total Runs):")
print(f"   • Mean: {runs_data.mean():.1f} runs")
print(f"   • Range: {runs_data.min():.0f} - {runs_data.max():.0f} runs")
print(f"   • Standard deviation: {runs_data.std():.1f} runs")
print(
    f"   • Typical range (25th-75th): {runs_data.quantile(0.25):.0f} - {runs_data.quantile(0.75):.0f} runs"
)

# Feature correlation insights: each feature's r with the target, largest
# first, tagged with a strength band and a sign.
print("\n🔗 FEATURE CORRELATIONS WITH TARGET:")
target_corrs = (
    correlation_matrix["total_runs_innings"]
    .drop("total_runs_innings")
    .sort_values(ascending=False)
)
for feature, corr in target_corrs.items():
    if abs(corr) > STRONG_CORR:
        strength = "Strong"
    elif abs(corr) > MODERATE_CORR:
        strength = "Moderate"
    else:
        strength = "Weak"
    direction = "positive" if corr > 0 else "negative"
    print(f"   • {feature}: {corr:.3f} ({strength} {direction})")

# Model readiness assessment
print("\n✅ MODEL READINESS ASSESSMENT:")

# Check linear relationships: features whose |r| with the target clears the
# readiness threshold.
strong_corrs = [f for f, c in target_corrs.items() if abs(c) > READINESS_CORR]
if strong_corrs:
    print(
        f"   ✓ Linear relationships with |r| > {READINESS_CORR} found: {', '.join(strong_corrs)}"
    )
else:
    print(
        f"   ⚠ No feature has |r| > {READINESS_CORR} with the target (may affect model performance)"
    )

# Check multicollinearity: largest |r| over every unordered feature pair.
feature_corrs = [
    abs(correlation_matrix.loc[col1, col2])
    for i, col1 in enumerate(feature_cols)
    for col2 in feature_cols[i + 1 :]
]

max_feature_corr = max(feature_corrs)
if max_feature_corr < STRONG_CORR:
    print(
        f"   ✓ Low multicollinearity (max inter-feature correlation: {max_feature_corr:.3f})"
    )
else:
    print(
        f"   ⚠ Potential multicollinearity concern (max correlation: {max_feature_corr:.3f})"
    )

# Check data quality: total number of nulls across every column of the
# modelling frame.
total_nulls = joined_data.null_count().sum_horizontal().sum()
if total_nulls == 0:
    print("   ✓ No missing values in modeling dataset")
else:
    print(f"   ⚠ {total_nulls} missing values need handling")

print("\n🚀 RECOMMENDATIONS:")
print("   • Proceed with linear regression modeling")
print("   • Consider feature scaling (StandardScaler recommended)")
print("   • Monitor for overfitting given dataset size")
print("   • Validate model assumptions (linearity, homoscedasticity)")

print("\n" + "=" * 60)
print("Exploration complete! Ready for model training.")
print("=" * 60)

## Next Steps

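Based on this exploration (these points are written ahead of time — confirm each one against the outputs of Sections 6, 9 and 10 after running the notebook on your data):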

1. **Data Quality**: ✅ Good data quality with minimal missing values
2. **Linear Relationships**: Found meaningful correlations between features and target
3. **Feature Engineering**: Current features are suitable for linear regression
4. **Model Development**: Ready to proceed with training pipeline

**Next notebook**: `t20_model_training.ipynb` for model development and training.