In [None]:
import pandas as pd
import random
import re


# Load the raw commentary dataset. The code below reads its 'Commentary'
# column, so the CSV must provide a column with exactly that name.
df = pd.read_csv("Inba_Cricket_commentary11.csv")

# Delivery-type vocabulary: label -> keywords searched for in a commentary line.
# Dict order matters: extract_label() returns the first label (top to bottom)
# for which any keyword occurs as a case-insensitive substring of the comment.
# NOTE(review): because that match is case-insensitive, the upper/mixed-case
# spellings below duplicate their lowercase forms; they are kept so the lists
# stay exactly as they were.
ball_type_patterns = {
    "Slower Ball": [
        "knuckle ball", "knuckle",
        "slow", "slower", "slowest",
        "off-cutter", "leg-cutter", "off cutter", "leg cutter",
        "pace off", "change of pace",
        "slower ball", "SLOWER BALL", "Slower Ball", "Slower ball",
    ],
    "Full Toss": [
        "full toss", "waist-high full toss", "low full toss",
        "FULLTOSS", "full toss", "Full toss", "Full Toss",
    ],
    "Spin": [
        "spinner", "spinning delivery",
        "turn", "turning delivery",
        "spin", "SPIN", "Spin",
    ],
    # NOTE(review): "bouncer", "rising delivery", "hits the helmet",
    # "climbs sharply" and "short rising ball" read like short-ball phrases
    # rather than swing - confirm they are meant to be labelled "Swing".
    "Swing": [
        "seam up", "upright seam",
        "swinging delivery", "swing",
        "outswinger", "inswinger", "in-swinger", "out-swinger",
        "bouncer", "rising delivery", "hits the helmet",
        "climbs sharply", "short rising ball",
        "SWING", "Swing",
    ],
    "Wide Ball": [
        "wide", "way outside off", "too wide", "well outside",
        "Wide Ball", "wide ball", "WIDE BALL", "WIDE ball",
    ],
}


# Delivery-length vocabulary: label -> keywords searched for in a commentary line.
# Dict order matters: extract_label() returns the first label (top to bottom)
# for which any keyword occurs as a case-insensitive substring of the comment.
length_patterns = {
    "Yorker Length": [
        "yorker", "blockhole", "at the base",
        "toe crusher", "right in the blockhole",
        "yorker length",
    ],
    "Full Length": [
        "full length", "fuller", "overpitched",
        "half-volley", "pitched up", "up to the bat",
    ],
    # "just short of a length" is listed here, ahead of the bare "short"
    # keyword of the next label, so such comments resolve to "Good Length".
    "Good Length": [
        "good length", "good length ball", "just short of a length",
    ],
    "Short Length": [
        "short", "bouncer",
        "short length", "pitched short", "rising delivery",
        "back of a length", "short of good length",
        "awkward length", "in-between length",
    ],
}


# Delivery-line vocabulary: label -> keywords searched for in a commentary line.
# Dict order matters: extract_label() returns the first label (top to bottom)
# for which any keyword occurs as a case-insensitive substring of the comment.
line_patterns = {
    "Off Stump": [
        "on off stump", "hits off stump", "right on off",
        "angling into off",
        "off stump",
    ],
    # NOTE(review): this list also carries body-line phrases and outside-off
    # phrases ("outside off", "fifth stump", ...) - confirm they are really
    # meant to be labelled "Middle Stump".
    "Middle Stump": [
        "at the body", "into the ribs", "bodyline delivery", "on the rib cage",
        "outside off", "well outside off", "wide of off",
        "wide outside off stump", "fifth stump", "sixth stump",
        "on middle", "hits middle stump", "angling into middle",
        "middle of the stumps",
        "straight", "on the stumps", "targeting the stumps", "straight ball",
        "middle stump",
    ],
    # NOTE(review): the bare "leg" keyword is matched as a substring, so any
    # comment containing "leg" (e.g. "fine leg", "square leg") that reaches
    # this label is tagged "Leg Stump" - confirm that is intended.
    "Leg Stump": [
        "on leg stump", "clips leg stump", "tailing into leg stump",
        "leg",
        "outside leg", "wide of leg", "way down leg", "drifting down leg side",
        "leg stump",
    ],
}

# Wagon-wheel (shot direction) vocabulary: label -> keywords searched for in a
# commentary line.
# Dict order matters: extract_label() returns the first label (top to bottom)
# for which any keyword occurs as a case-insensitive substring of the comment.
wagonwheel_patterns = {
    "Third Man": [
        "third man", "down to third", "late cut", "guided to third man",
        "behind the wicket", "over the keeper", "upper cut", "past the keeper",
        "third man",
    ],
    "Straight": [
        "straight", "down the ground", "back past the bowler", "straight down",
        "straight",
    ],
    # NOTE(review): the bare "off" keyword is matched as a substring, so any
    # comment containing "off" (e.g. "off stump", "takes off") that reaches
    # this label is tagged "Off", and later labels only see comments without
    # it - confirm that is intended.
    "Off": [
        "to mid-off", "through mid-off", "beats mid-off", "wide of mid-off",
        "mid off",
        "off side", "off-side", "through the off",
        "off",
    ],
    "Mid On": [
        "to mid-on", "through mid-on", "beats mid-on", "wide of mid-on",
        "mid on",
    ],
    "Cover": ["cover", "extra cover", "through the covers", "cover drive"],
    "Point": ["point", "backward point", "square point", "behind point"],
    "Fine Leg": ["fine leg", "down leg side", "around fine", "around the corner"],
    "Square Leg": [
        "square leg", "backward square", "square boundary", "square region",
    ],
    "Mid Wicket": [
        "mid wicket", "through mid wicket", "wide of mid wicket",
        "mid-wicket",
    ],
}

# Shot-type vocabulary: label -> keywords searched for in a commentary line.
# Dict order matters: extract_label() returns the first label (top to bottom)
# for which any keyword occurs as a case-insensitive substring of the comment.
shot_patterns = {
    "Sweep": [
        "sweep", "swept",
        "reverse sweep", "reverse hit",
        "paddle sweep", "paddle shot",
    ],
    "Slog & Scoop": [
        "slog", "slog sweep",
        "scoop", "scooped",
        "slog & scoop",
    ],
    # NOTE(review): "push"/"punch" variants are grouped under "Pull Shot" -
    # confirm this grouping is intended.
    "Pull Shot": [
        "pull", "pulled",
        "push", "pushed",
        "punch", "punched",
        "pull shot",
    ],
    # NOTE(review): "ramp"/"ramped" and "over slips" are grouped under
    # "Cut Shot" - confirm this grouping is intended.
    "Cut Shot": [
        "cut", "cuts", "square cut",
        "ramp", "ramped",
        "upper cut", "over slips",
        "cut shot",
    ],
    "Drive": [
        "cover drive", "through cover",
        "straight drive", "down the ground",
        "square drive",
        "lofted drive", "aerial drive",
        "drive",
    ],
    "Flick": ["flick", "flicked"],
}

# Output column name -> keyword table used to label that column.
# The driver loop at the bottom of the file walks this mapping in insertion
# order, so the features are balanced in the order listed here.
feature_patterns = dict(
    ShotType=shot_patterns,
    BallType=ball_type_patterns,
    Length=length_patterns,
    Line=line_patterns,
    WagonWheel=wagonwheel_patterns,
)


def extract_label(comment, patterns):
    """Return the first label whose keywords occur in *comment*.

    Matching is a case-insensitive substring test. Labels are tried in the
    mapping's iteration order and the first label with any hit wins, so the
    order of *patterns* decides ties.

    Args:
        comment: Commentary text. Non-string values (for example the NaN that
            pandas yields for an empty CSV cell, or None) never match.
        patterns: Mapping of label -> iterable of keyword strings.

    Returns:
        The matching label, or None when nothing matches.
    """
    # Missing cells arrive as float NaN / None; treat them as "no label"
    # instead of failing on .lower().
    if not isinstance(comment, str):
        return None

    comment_lower = comment.lower()
    for label, keywords in patterns.items():
        if any(keyword.lower() in comment_lower for keyword in keywords):
            return label
    return None

def _rewrite_commentary(text, label, keyword_regexes):
    """Return *text* with the first matching keyword replaced by *label*.

    The regexes are tried in order; every occurrence of the first one that
    matches is replaced. When none match, " [label]" is appended instead.
    """
    for regex in keyword_regexes:
        # A callable replacement inserts the label literally (no template
        # processing of backslashes in the label).
        rewritten, hits = regex.subn(lambda _match: label, text)
        if hits:
            return rewritten
    return f"{text} [{label}]"


def balance_feature(df, col_name, patterns, total_rows, random_state=42):
    """Label the 'Commentary' column and equalize the number of rows per label.

    Each row is labelled with extract_label(); rows without a label are
    dropped. Every label is then brought to ``total_rows // len(patterns)``
    rows: larger groups are down-sampled, smaller groups are padded with
    duplicates drawn with replacement, whose commentary is rewritten (first
    matching keyword replaced by the label as a whole word, case-insensitively,
    or " [label]" appended when no keyword matches as a whole word).

    The input frame is not modified; a new frame is returned.

    Args:
        df: DataFrame with a 'Commentary' column.
        col_name: Name of the label column to create (or overwrite) in the
            returned frame.
        patterns: Mapping of label -> list of keyword strings.
        total_rows: Target total size; split evenly across the labels.
        random_state: Seed used for both down- and over-sampling so repeated
            runs produce the same frame.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, grouped by label in the
        order of *patterns*. Labels that match no row cannot be padded and are
        left out (a warning is emitted); the result is empty when nothing
        matches at all or *patterns* is empty.
    """
    import warnings  # local import: only needed on the no-match path

    # Work on a copy so the caller's frame does not gain the label column.
    labeled = df.copy()
    labeled[col_name] = [
        extract_label(text, patterns) for text in labeled['Commentary']
    ]
    labeled = labeled.dropna(subset=[col_name])

    labels = list(patterns)
    if not labels:
        # No labels means every row was dropped above; also avoids dividing
        # by zero below.
        return labeled.reset_index(drop=True)

    target_per_label = total_rows // len(labels)
    balanced_groups = []

    for label in labels:
        group = labeled[labeled[col_name] == label]

        if group.empty:
            # There is nothing to duplicate, so this label cannot be padded.
            warnings.warn(
                f"No rows matched label {label!r} for {col_name!r}; "
                "the label is absent from the balanced data.",
                stacklevel=2,
            )
            continue

        if len(group) > target_per_label:
            group = group.sample(target_per_label, random_state=random_state)
        elif len(group) < target_per_label:
            shortfall = target_per_label - len(group)
            # Compile once per label instead of once per duplicated row.
            keyword_regexes = [
                re.compile(rf'\b{re.escape(keyword)}\b', re.IGNORECASE)
                for keyword in patterns[label]
            ]
            extras = group.sample(
                n=shortfall, replace=True, random_state=random_state
            ).copy()
            extras['Commentary'] = [
                _rewrite_commentary(text, label, keyword_regexes)
                for text in extras['Commentary']
            ]
            group = pd.concat([group, extras], ignore_index=True)

        balanced_groups.append(group)

    if not balanced_groups:
        # pd.concat() rejects an empty list; return an empty frame that still
        # has the expected columns.
        return labeled.iloc[0:0].reset_index(drop=True)
    return pd.concat(balanced_groups, ignore_index=True)


# Balance the dataset one feature at a time, then shuffle and save it.
# NOTE(review): each pass drops and duplicates rows of the frame returned by
# the previous pass without looking at the earlier label columns, so only the
# last feature's label counts are equalized by construction in the saved file.
OUTPUT_CSV = "Inba_Cricket_commentary12.csv"
ROWS_PER_FEATURE = 11575  # total_rows target passed to every balance_feature() call

for col_name, patterns in feature_patterns.items():
    print(f"🔄 Balancing feature: {col_name}")
    df = balance_feature(df, col_name, patterns, total_rows=ROWS_PER_FEATURE)

# balance_feature() returns rows grouped by label; shuffle (fixed seed) so the
# saved file is not ordered by the last feature's labels.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.to_csv(OUTPUT_CSV, index=False)
# Report the file that was actually written (the old message named a
# different file).
print(f"✅ Done! Saved to '{OUTPUT_CSV}'")


🔄 Balancing feature: ShotType
🔄 Balancing feature: BallType
🔄 Balancing feature: Length
🔄 Balancing feature: Line
🔄 Balancing feature: WagonWheel
✅ Done! Saved to 'balanced_commentary_per_feature.csv'
