### **Load dataset and clean it**

In [1]:
import pandas as pd
import numpy as np

# Read the raw cricket CSV into `df`; this is the only file loaded here.
# NOTE(review): the path is absolute; consider a configurable data directory.
raw_csv_path = '/workspaces/Cricket_website_ML_project/data/cricket_data.csv'
df = pd.read_csv(raw_csv_path)

def restructure_data(df):
    """Collapse the wide per-format stat columns into one nested dict per player.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table holding the biographical columns listed in ``info_cols``
        plus one ``BATTING_<format>_<stat>`` and ``BOWLING_<format>_<stat>``
        column for every format and stat named below.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) with the biographical
        columns, a ``gender`` placeholder set to ``None``, and for each format
        two object columns named ``<format lower-cased>_batting`` and
        ``<format lower-cased>_bowling``. Each cell of those columns is a dict
        keyed by the bare stat name (``'Mat'``, ``'Ave'``, ``'SR'``, ...).

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    info_cols = ['ID', 'NAME', 'COUNTRY', 'Full name', 'Born', 'Died',
                 'Current age', 'Major teams', 'Playing role', 'Batting style',
                 'Bowling style']
    formats = ['Tests', 'ODIs', 'T20Is', 'First-class', 'List A', 'T20s']
    batting_stats = ['Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR',
                     '100', '50', '4s', '6s', 'Ct', 'St']
    bowling_stats = ['Mat', 'Inns', 'Balls', 'Runs', 'Wkts', 'BBI', 'BBM',
                     'Ave', 'Econ', 'SR', '4w', '5w', '10']

    players = df[info_cols].copy()

    # Placeholder only: nothing in this function infers gender.
    players['gender'] = None

    for fmt in formats:
        for discipline, stats in (('BATTING', batting_stats),
                                  ('BOWLING', bowling_stats)):
            prefix = f'{discipline}_{fmt}_'
            # Strip the "<DISCIPLINE>_<format>_" prefix so the dict keys are the
            # bare stat names. add_cricket_metrics reads them with
            # .get('SR'), .get('Ave'), ...; with the prefixed keys every one of
            # those lookups silently fell back to its default.
            short_names = {f'{prefix}{stat}': stat for stat in stats}
            records = (
                df[list(short_names)]
                .rename(columns=short_names)
                .to_dict('records')
            )
            players[f'{fmt.lower()}_{discipline.lower()}'] = records

    return players

# Build the nested per-player frame from the raw table in `df`.
# The stray, indented second read of the same CSV that followed this call was
# dropped: `df` is already loaded and restructure_data does not modify it, and
# the leading indent is an IndentationError when the notebook runs as a script.
restructured_df = restructure_data(df)


### **Gender Identification**

In [5]:
def identify_gender(df):
    """Label each row 'male' or 'female' using simple text heuristics.

    Every row starts as 'male' and is switched to 'female' when any of these
    rules matches (all case-insensitive, missing values never match):

    1. ``Major teams`` contains the text "women".
    2. ``DESCRIPTION`` (only if that column exists) contains the whole word
       "she" or "her".
    3. ``NAME`` contains one of a fixed list of first names as a whole word.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Major teams`` and ``NAME``; ``DESCRIPTION`` is optional.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``gender`` column is
        created or overwritten in place.
    """
    # Default assumption; overwritten below for rows that match a rule.
    df['gender'] = 'male'

    # Rule 1: team names mentioning "women".
    women_teams = df['Major teams'].str.contains('women', case=False, na=False)

    # Rule 2: female pronouns in the free-text description, when available.
    # Word boundaries are essential: a bare 'she|her' also matches inside
    # "there", "others", "Yorkshire", ... and would flag almost every row.
    female_pronouns = pd.Series(False, index=df.index)
    if 'DESCRIPTION' in df.columns:
        female_pronouns = df['DESCRIPTION'].str.contains(
            r'\b(?:she|her)\b', case=False, na=False
        )

    # Rule 3: known first names. The group is non-capturing so that
    # str.contains does not warn about unused match groups.
    female_names = df['NAME'].str.contains(
        r'\b(?:Sharmin|Mithali|Smriti|Meg|Ellyse|Sophie|Harmanpreet|Jemimah|Deepti|Poonam|Rajeshwari)\b',
        case=False, na=False
    )

    # Any single positive rule marks the player as female.
    df.loc[women_teams | female_pronouns | female_names, 'gender'] = 'female'

    return df

### **Cricket Features**

In [6]:
def _stat_value(record, stat, default):
    """Return one stat from a per-format record as a float, else ``default``.

    ``record`` is expected to be a dict; anything else yields ``default``.
    The stat is looked up under its bare name first (``'SR'``) and otherwise
    under any string key ending in ``'_<stat>'`` (``'BATTING_Tests_SR'``),
    which is the key form restructure_data stores. Values that are missing,
    NaN, or not convertible with ``float()`` (e.g. ``'-'``) yield ``default``.
    """
    if not isinstance(record, dict):
        return default
    if stat in record:
        raw = record[stat]
    else:
        suffix = f'_{stat}'
        raw = next(
            (value for key, value in record.items()
             if isinstance(key, str) and key.endswith(suffix)),
            None,
        )
    try:
        number = float(raw)
    except (TypeError, ValueError):
        return default
    return default if pd.isna(number) else number


def add_cricket_metrics(df):
    """Add impact scores, an all-rounder flag and an era label to ``df``.

    For each of the formats ``tests``, ``odis`` and ``t20is`` the frame must
    have ``<fmt>_batting`` and ``<fmt>_bowling`` columns whose cells are dicts
    of stats (non-dict cells are treated as "no data").

    Columns added (in this order):

    * ``<fmt>_batting_impact``: ``SR * Ave``; missing stats count as 0.
    * ``<fmt>_bowling_impact``: ``(100 - Econ) * Wkts``; a missing ``Econ``
      counts as 100 and missing ``Wkts`` as 0, so the score is 0.
    * ``is_allrounder``: True when, in at least one format, the batting
      record has ``Mat > 20`` and ``Ave > 25`` and the bowling record has
      ``Mat > 20`` and ``Ave < 35``.
    * ``era``: categorical label from the first four-digit number found in
      ``Born``, cut into the bins listed in the code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the per-format dict columns and a string ``Born`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place.
    """
    formats = ['tests', 'odis', 't20is']

    for fmt in formats:
        # Batting impact: strike rate weighted by average.
        df[f'{fmt}_batting_impact'] = df[f'{fmt}_batting'].apply(
            lambda rec: _stat_value(rec, 'SR', 0.0) * _stat_value(rec, 'Ave', 0.0))

        # Bowling impact: wickets weighted by how far economy sits below 100.
        df[f'{fmt}_bowling_impact'] = df[f'{fmt}_bowling'].apply(
            lambda rec: (100 - _stat_value(rec, 'Econ', 100.0))
            * _stat_value(rec, 'Wkts', 0.0))

    # All-rounders: significant batting AND bowling in the same format.
    df['is_allrounder'] = False
    for fmt in formats:
        bat_cond = df[f'{fmt}_batting'].apply(
            lambda rec: _stat_value(rec, 'Mat', 0.0) > 20
            and _stat_value(rec, 'Ave', 0.0) > 25)
        # Lower is better for a bowling average, so an unknown average must
        # default to +inf; defaulting to 0 would make it pass the "< 35" test.
        bowl_cond = df[f'{fmt}_bowling'].apply(
            lambda rec: _stat_value(rec, 'Mat', 0.0) > 20
            and _stat_value(rec, 'Ave', float('inf')) < 35)
        df.loc[bat_cond & bowl_cond, 'is_allrounder'] = True

    # Era classification from the birth year (first 4-digit run in 'Born').
    df['era'] = pd.cut(
        pd.to_numeric(df['Born'].str.extract(r'(\d{4})')[0]),
        bins=[1800, 1920, 1970, 1990, 2010, 2025],
        labels=['Pre-WWII', 'Golden Age', 'Modern', 'Contemporary', 'Current']
    )

    return df

# restructure_data leaves `gender` as a None placeholder and nothing else in
# the visible notebook fills it, so apply the gender rules here; otherwise the
# exported files carry an empty gender column. Re-running is harmless because
# identify_gender recomputes the column from scratch.
restructured_df = identify_gender(restructured_df)

# Derive impact scores, the all-rounder flag and the era label.
restructured_df = add_cricket_metrics(restructured_df)

In [8]:
# Output locations for the two exports.
structured_json_path = '/workspaces/Cricket_website_ML_project/data/structured_cricket_data.json'
flattened_csv_path = '/workspaces/Cricket_website_ML_project/data/flattened_cricket_data.csv'

# JSON keeps the nested per-format dicts as nested objects, one object per row.
restructured_df.to_json(structured_json_path, orient='records', indent=2)

# CSV needs a flat table: json_normalize expands each nested dict into
# separate columns before writing.
player_records = restructured_df.to_dict('records')
flattened = pd.json_normalize(player_records)
flattened.to_csv(flattened_csv_path, index=False)