In [2]:
pip install transformers peft bitsandbytes accelerate pandas numpy torch scikit-learn

Collecting transformers
  Using cached transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting peft
  Using cached peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.45.0-py3-none-win_amd64.whl.metadata (2.9 kB)
Collecting accelerate
  Using cached accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Using cached huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safet

In [15]:
import pandas as pd
import json

# Load the raw match commentary.
# JSON is UTF-8 by specification; being explicit avoids the platform default codec
# (e.g. cp1252 on Windows) mis-decoding non-ASCII player names.
file_path = "commentarydata.json"  # Replace with your actual file path
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten each innings' ball-by-ball records and tag them with the innings number.
inning1 = pd.json_normalize(data, record_path=['commentary_innings1']).assign(inning=1)
inning2 = pd.json_normalize(data, record_path=['commentary_innings2']).assign(inning=2)

# Combine both innings into a single DataFrame with a fresh 0..n-1 index.
commentary = pd.concat([inning1, inning2], ignore_index=True)


# Function to extract runs scored
def extract_runs(comm_text):
    """Return the runs scored off one delivery, parsed from its commentary text.

    The sample row displayed in this notebook has the shape
    ``"<bowler> to <batter>, <outcome>, <description>"``. The comma-separated
    segments after the header are therefore checked first for an exact outcome
    token, so that a word such as "four" or "six" appearing later in the
    free-text description cannot override the recorded result. When no segment
    is an exact outcome token, the whole text is scanned for the same keywords.

    Args:
        comm_text: Commentary string for a single ball.

    Returns:
        int: 0, 1, 2, 3, 4 or 6. Text with no recognised outcome counts as 0.
    """
    outcome_runs = {'no run': 0, '1 run': 1, '2 runs': 2, '3 runs': 3, 'four': 4, 'six': 6}

    # Skip the "<bowler> to <batter>" header; the first later segment that is
    # exactly an outcome token wins.
    for segment in comm_text.split(',')[1:]:
        runs = outcome_runs.get(segment.strip().lower())
        if runs is not None:
            return runs

    # Fallback: loose keyword scan over the full text (boundaries before run counts).
    if "SIX" in comm_text or "six" in comm_text:
        return 6
    if "FOUR" in comm_text or "four" in comm_text:
        return 4
    for phrase, runs in (("1 run", 1), ("2 runs", 2), ("3 runs", 3)):
        if phrase in comm_text:
            return runs
    # "no run" and anything unrecognised both score nothing.
    return 0

def wicket_type(comm_text):
    """Classify the mode of dismissal described in a wicket ball's commentary.

    Matching is case-insensitive substring search, in this order:
    caught+edge, played on+bowled, caught, bowled, then the keyword table
    (run out, stumped, lbw, hit wicket).

    Args:
        comm_text: Commentary string for a ball already known to be a wicket.

    Returns:
        str: One of 'edge&caught', 'played_on', 'caught', 'bowled', 'runout',
        'stumped', 'lbw', 'hit wicket', or the generic 'wicket' when nothing matches.
    """
    text_lower = comm_text.lower()

    # Special combinations take priority over the single keywords they contain.
    if 'caught' in text_lower and 'edge' in text_lower:
        return 'edge&caught'
    if 'played on' in text_lower and 'bowled' in text_lower:
        return 'played_on'

    if 'caught' in text_lower:
        return 'caught'
    if 'bowled' in text_lower:
        return 'bowled'

    # Label -> accepted spellings; add new dismissal types here.
    # A run out is also accepted as two words or hyphenated, not only as "runout".
    wicket_keywords = {
        'runout': ('runout', 'run out', 'run-out'),
        'stumped': ('stumped',),
        'lbw': ('lbw',),
        'hit wicket': ('hit wicket',),
    }
    for label, spellings in wicket_keywords.items():
        if any(spelling in text_lower for spelling in spellings):
            return label

    return 'wicket'

def extract_shot(comm_text):
    """Guess the batting shot from commentary using case-insensitive substring matching.

    Checks run in a fixed priority: lofted cues, soft-handed pushes, defensive
    words, then the list of named shots. The first hit decides the result.

    Args:
        comm_text: Commentary string for a single ball.

    Returns:
        str | None: 'loft', 'pushed slightly', 'defend', one of the named shots,
        or None when no keyword occurs in the text.
    """
    lowered = comm_text.lower()

    def mentions(*cues):
        # True when any cue occurs anywhere in the lower-cased commentary.
        return any(cue in lowered for cue in cues)

    if mentions('goes down', 'whip'):
        return 'loft'
    if mentions('nudge', 'tap', 'darted', 'paddle', 'tuck', 'pushed',
                'knock', 'clipped', 'steer', 'dab'):
        return 'pushed slightly'
    if mentions('defend', 'defence', 'blocked'):
        return 'defend'

    named_shots = ('pull', 'cut', 'drive', 'sweep', 'flick', 'hook', 'loft', 'slog',
                   'punch', 'down the ground', 'across the line', 'scoop')
    # First named shot found in list order, else None.
    return next((shot for shot in named_shots if shot in lowered), None)

def extract_direction(comm_text):
    """Guess where the ball went from commentary via case-insensitive substring matching.

    Regions are tried in the order listed below and the first one with a
    matching alias is returned.

    Args:
        comm_text: Commentary string for a single ball.

    Returns:
        str | None: A region label such as 'cover', 'fine-leg' or 'mid-wicket',
        'blocked' for defensive/wide cues, 'keeper', or None when nothing matches.
    """
    lowered = comm_text.lower()

    # (label, aliases) pairs in priority order.
    region_aliases = (
        ('cover', ('cover',)),
        ('square', ('square',)),
        ('point', ('point',)),
        ('fine-leg', ('fine leg', 'fine-leg', 'fine')),
        ('third-man', ('third-man', 'third man')),
        ('mid-wicket', ('mid-wicket', 'midwicket', 'mid wicket')),
        ('mid-on', ('mid-on', 'mid on')),
        ('mid-off', ('mid-off', 'mid off')),
        ('long-on', ('long-on', 'long on')),
        ('long-off', ('long-off', 'long off')),
        ('blocked', ('defend', 'defence', 'blocked', 'wide')),
        ('keeper', ('keeper',)),
    )
    for region, aliases in region_aliases:
        if any(alias in lowered for alias in aliases):
            return region
    return None

def extract_length(comm_text):
    """Guess the length of a delivery from commentary via case-insensitive substring matching.

    Multi-word phrases that contain a shorter keyword are tested before that
    keyword: "full toss" before 'full', and "short of a length" /
    "back of a length" before 'short'. Otherwise the generic keyword would
    always win and the more specific label could never be returned for it.

    Args:
        comm_text: Commentary string for a single ball.

    Returns:
        str | None: 'full-toss', 'back of a length', one of the simple length
        keywords, 'length ball', or None when nothing matches.
    """
    text_lower = comm_text.lower()

    if 'full toss' in text_lower or 'full-toss' in text_lower:
        return 'full-toss'
    # Must precede the keyword scan: "short of a length" contains 'short'.
    if 'back of a length' in text_lower or 'short of a length' in text_lower:
        return 'back of a length'

    length_keywords = ('short', 'full', 'yorker', 'bouncer', 'half-volley', 'good length', 'tossed up')
    for length in length_keywords:
        if length in text_lower:
            return length

    if 'length delivery' in text_lower or 'length ball' in text_lower:
        return 'length ball'

    return None

def extract_ball_line(comm_text):
    """Guess the line of a delivery from commentary via case-insensitive substring matching.

    Explicit line phrases are tried first, in the order of the table below.
    Only if none of them matches is a mention of the pads used as a fallback
    cue for a leg-stump line.

    Args:
        comm_text: Commentary string for a single ball.

    Returns:
        str | None: 'outside-off', 'off-stump', 'middle-stump', 'leg-stump',
        'down-leg', 'wide', or None when nothing matches.
    """
    text_lower = comm_text.lower()
    line_keywords = {
        'outside-off': ['outside off', 'outside-off'],
        'off-stump': ['off stump line', 'off-stump', 'off stump', 'around off', 'middle and off'],
        'middle-stump': ['middle stump', 'middle-stump', 'on middle', 'on stump', 'around middle', 'middle and leg', 'body'],
        'leg-stump': ['leg stump', 'leg-stump', 'outside leg', 'on leg', 'leg bye', 'around leg'],
        'down-leg': ['down leg', 'down-leg'],
        'wide': ['wide']
    }

    for line, phrases in line_keywords.items():
        if any(phrase in text_lower for phrase in phrases):
            return line

    # Fallback, applied only after every explicit line phrase has been tried
    # (previously it ran inside the loop and pre-empted all lines but outside-off).
    # NOTE(review): the guard only fails when BOTH 'swing' and 'seam' are present;
    # confirm whether "neither mentioned" (and) was intended.
    if ('swing' not in text_lower or 'seam' not in text_lower) and 'pads' in text_lower:
        return 'leg-stump'
    return None

def extract_batting_control(row):
    """Rate how well the batter controlled the ball for one commentary row.

    Args:
        row: Mapping (e.g. a DataFrame row) with a 'commText' string and an
            'event' value.

    Returns:
        str: 'no-control' for wickets, misses and edges; 'beaten' when the
        text says the batter was beaten; otherwise 'in-control'.
    """
    lowered = row['commText'].lower()

    # A dismissal is never a controlled shot, whatever the text says.
    if row['event'] == 'WICKET':
        return 'no-control'

    # Miss/edge cues outrank "beaten" cues.
    if any(cue in lowered for cue in ('missed', 'misses', 'edge', 'edges')):
        return 'no-control'
    if any(cue in lowered for cue in ('beaten', 'beats')):
        return 'beaten'
    return 'in-control'



def extract_bowling_variation(comm_text):
    """Guess the bowling variation from commentary via case-insensitive matching.

    Seam movement combinations are checked first, then the keyword table in
    its listed order. The first match is returned.

    Args:
        comm_text: Commentary string for a single ball.

    Returns:
        str: A variation label such as 'seam-away', 'in-swing', 'slower' or
        'googly'; 'normal' when no keyword matches.
    """
    import re  # local import: only this function needs it

    text_lower = comm_text.lower()

    # Check for combined variations first
    if 'seam' in text_lower and 'away' in text_lower:
        return 'seam-away'
    # "in" must be matched as a whole word: as a plain substring it is always
    # found inside "seaming" itself, which made this branch fire unconditionally.
    if 'seaming' in text_lower and re.search(r'\bin\b', text_lower):
        return 'seam-in'

    variation_keywords = {
        'seam-up': ['seam up', 'seam-up'],
        'out-swing': ['swinging away', 'outswinger'],
        'in-swing': ['swinging in', 'inswinger'],
        'swing': ['swinging'],
        'googly': ['googly'],
        'slower': ['slower', 'lack of pace'],
        'cutter': ['cutter'],
        'yorker': ['yorker'],
        'bouncer': ['bouncer'],
        'off-break': ['off-break'],
        'carrom ball': ['carrom'],
        'flipper': ['flipper'],
        'leg-break': ['leg-break'],
        'off-spin': ['off-spin'],
        'leg-spin': ['leg spin'],
        'knuckle ball': ['knuckle ball'],
        'quick': ['quick'],
        'arm ball': ['arm ball'],
        'slider': ['slider'],
        'skidding': ['skidding', 'skidded']
    }

    for variation, keywords in variation_keywords.items():
        if any(keyword in text_lower for keyword in keywords):
            return variation
    return 'normal'

# Apply all transformations
def process_commentary(commentary):
    """Derive per-ball feature columns from the raw commentary frame.

    Works on a copy, so the frame passed in is left untouched and the call can
    be repeated on the same input.

    Args:
        commentary: DataFrame with at least 'commText', 'event',
            'batsman.batName', 'batsman.batId', 'bowler.bowlName',
            'bowler.bowlId' and 'batTeamName' columns.

    Returns:
        DataFrame: A new frame with the added columns runs, batsman_name,
        bowler_name, is_wicket, wicket_type, shot_played, ball_length,
        ball_line, batting_control, bowling_variation and shot_direction, and
        with the raw player/event/team columns removed.
    """
    processed = commentary.copy()
    text = processed['commText']
    wicket_mask = processed['event'] == 'WICKET'

    processed['runs'] = text.apply(extract_runs)

    # Keep readable player names; the raw id/name columns are dropped below.
    processed['batsman_name'] = processed['batsman.batName']
    processed['bowler_name'] = processed['bowler.bowlName']

    processed['is_wicket'] = wicket_mask

    # Dismissal type only makes sense on wicket balls; everything else is 'N/A'.
    processed['wicket_type'] = 'N/A'
    processed.loc[wicket_mask, 'wicket_type'] = text[wicket_mask].apply(wicket_type)

    # Shot, length, line, control and variation are all inferred from the text.
    processed['shot_played'] = text.apply(extract_shot)
    processed['ball_length'] = text.apply(extract_length)
    processed['ball_line'] = text.apply(extract_ball_line)
    # Needs the whole row because it also looks at 'event'.
    processed['batting_control'] = processed.apply(extract_batting_control, axis=1)
    processed['bowling_variation'] = text.apply(extract_bowling_variation)
    processed['shot_direction'] = text.apply(extract_direction)

    # Raw columns that are no longer needed once the features exist.
    columns_to_remove = [
        'batsman.batId',
        'batsman.batName',
        'bowler.bowlId',
        'bowler.bowlName',
        'event',
        'batTeamName'
    ]
    return processed.drop(columns=columns_to_remove)

# Derive the per-ball feature columns; `commentary` is rebound to the processed
# frame, so later cells see the derived columns rather than the raw ones.
commentary = process_commentary(commentary)
# Bare last expression: the notebook renders the first processed row as a quick check.
commentary.head(1)





Unnamed: 0,commText,overNumber,inning,runs,batsman_name,bowler_name,is_wicket,wicket_type,shot_played,ball_length,ball_line,batting_control,bowling_variation,shot_direction
0,"Joshua Little to Dhoni, 1 run, CSK finish wit...",19.6,1,1,MS Dhoni,Joshua Little,False,,pull,,,in-control,slower,square


In [16]:
def refine_commentary_fields(commentary):
    """Fill gaps in the extracted features using cricket-specific inference rules.

    The rules run in a fixed order and later rules see the values written by
    earlier ones. The input frame is not modified.

    Args:
        commentary: DataFrame with batting_control, shot_direction, wicket_type,
            shot_played, runs, bowling_variation, ball_line and ball_length columns.

    Returns:
        DataFrame: A copy of the input with the inferred values filled in.
    """
    out = commentary.copy()

    # Rule 1: an uncontrolled ball with no direction, or any stumping, went to the keeper.
    uncontrolled_no_direction = out['batting_control'].ne('in-control') & out['shot_direction'].isna()
    stumped = out['wicket_type'].eq('stumped')
    out.loc[uncontrolled_no_direction | stumped, 'shot_direction'] = 'keeper'

    # Rule 2: a known direction but unknown shot -> soft push for 0-2 runs, else 'normal'.
    needs_shot = out['shot_played'].isna() & out['shot_direction'].notna()
    few_runs = out['runs'].isin([0, 1, 2])
    out.loc[needs_shot & few_runs, 'shot_played'] = 'pushed slightly'
    out.loc[needs_shot & ~few_runs, 'shot_played'] = 'normal'

    # Rule 3: off-side shots off a ball not moving away imply an outside-off line.
    off_side = out['shot_direction'].isin(['cover', 'point', 'third-man'])
    moving_away = out['bowling_variation'].isin(['seam-away', 'out-swing'])
    out.loc[out['ball_line'].isna() & off_side & ~moving_away, 'ball_line'] = 'outside-off'

    # Rule 4: infer a missing length. The "missing" mask is taken once, so the
    # cut rule can still overwrite a 'full' written just above it.
    length_missing = out['ball_length'].isna()
    driven_outside_off = out['ball_line'].eq('outside-off') & out['shot_played'].eq('drive')
    full_length_shot = out['shot_played'].isin(['slog', 'scoop', 'sweep'])
    swinging = out['bowling_variation'].isin(['in-swing', 'out-swing'])
    cut_behind_point = out['shot_direction'].isin(['point', 'third-man']) & out['shot_played'].eq('cut')
    out.loc[length_missing & (driven_outside_off | full_length_shot | swinging), 'ball_length'] = 'full'
    out.loc[length_missing & cut_behind_point, 'ball_length'] = 'back of a length'

    # Rule 5: infer a still-missing line; all three sub-rules share one snapshot of "missing".
    line_missing = out['ball_line'].isna()

    # 5.1: a soft push straight down the off side off a non-short ball -> off stump.
    straight_push = (
        out['shot_direction'].isin(['mid-off', 'long-off'])
        & out['shot_played'].eq('pushed slightly')
        & out['ball_length'].ne('short')
    )
    out.loc[line_missing & straight_push, 'ball_line'] = 'off-stump'

    # 5.2: pull/flick/sweep to fine leg -> leg stump.
    fine_leg_shot = out['shot_direction'].eq('fine-leg') & out['shot_played'].isin(['pull', 'flick', 'sweep'])
    out.loc[line_missing & fine_leg_shot, 'ball_line'] = 'leg-stump'

    # 5.3: square of the wicket off a ball that is not short/back of a length -> leg stump.
    square_not_short = out['shot_direction'].eq('square') & ~out['ball_length'].isin(['short', 'back of a length'])
    out.loc[line_missing & square_not_short, 'ball_line'] = 'leg-stump'

    return out

# Fill in missing features with the inference rules.
refined_commentary = refine_commentary_fields(commentary)

# A row counts as fully labelled when all four of these features are present.
_required_columns = ['shot_played', 'ball_length', 'ball_line', 'shot_direction']

# Fully labelled rows before and after refinement.
commentary_before = commentary.dropna(subset=_required_columns)
commentary_after = refined_commentary.dropna(subset=_required_columns)

# Report how many usable rows each version yields.
for label, frame in (("commentary_before", commentary_before),
                     ("refined_commentary", commentary_after)):
    row_count = len(frame)
    print(f"Number of rows in {label}: {row_count}")


Number of rows in commentary_before: 9083
Number of rows in refined_commentary: 21012


Collecting evaluateNote: you may need to restart the kernel to use updated packages.

  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Using cached dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Using cached xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py312-none-any.whl.metadata (7.2 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-18.1.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill (from evaluate)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Using cached fsspec-202

In [17]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import logging
import evaluate
import gc
import os
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
    PeftModel
)

class CricketDataset(Dataset):
    """Torch dataset that turns commentary rows into tokenized question/answer pairs.

    Each row yields three templated QA pairs (shot, delivery, result); one of
    them is picked at random every time the row is fetched.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):  # Reduced max_length
        """Store the rows, the tokenizer and the fixed padded sequence length."""
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Number of commentary rows."""
        return len(self.data)

    @staticmethod
    def _is_wicket(flag):
        """Interpret the is_wicket value, which may be a bool or the string 'True'/'False'.

        A plain truthiness test is wrong for strings: 'False' is non-empty and
        therefore truthy.
        """
        if isinstance(flag, str):
            return flag.strip().lower() == 'true'
        return bool(flag)

    def create_query_answer_pairs(self, row):
        """Create multiple query-answer pairs from a single commentary row.

        Args:
            row: Mapping with batsman_name, bowler_name, shot_played,
                shot_direction, ball_length, ball_line, is_wicket, wicket_type,
                runs and batting_control.

        Returns:
            list[tuple[str, str]]: Three (question, answer) pairs.
        """
        pairs = []

        # Template 1: Shot played query
        shot_query = f"what shot did {row['batsman_name']} play?"
        shot_answer = f"{row['batsman_name']} played a {row['shot_played']} towards {row['shot_direction']}"
        pairs.append((shot_query, shot_answer))

        # Template 2: Ball details query
        ball_query = f"how did {row['bowler_name']} bowl to {row['batsman_name']}?"
        ball_answer = f"{row['bowler_name']} bowled a {row['ball_length']} length {row['ball_line']} line delivery"
        pairs.append((ball_query, ball_answer))

        # Template 3: Result query
        result_query = f"what was the result when {row['bowler_name']} bowled to {row['batsman_name']}?"
        if self._is_wicket(row['is_wicket']):
            result = f"Wicket - {row['wicket_type']}"
        else:
            runs = row['runs']
            result = f"{runs} run" if runs == 1 else f"{runs} runs"
        # batting_control is a category label (e.g. 'in-control'), not a percentage.
        result_answer = f"The result was {result} (batting control: {row['batting_control']})"
        pairs.append((result_query, result_answer))

        return pairs

    def __getitem__(self, idx):
        """Return tokenized input_ids, attention_mask and labels for one random QA pair of row ``idx``."""
        row = self.data.iloc[idx]
        pairs = self.create_query_answer_pairs(row)

        # Random template per fetch: the same index can yield different samples.
        query, answer = pairs[np.random.randint(0, len(pairs))]
        input_text = f"Q: {query}"  # Simplified prefix

        inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer(
            answer,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Mask padding in the labels with -100 so the loss ignores it;
        # compute_metrics already maps -100 back to the pad token.
        labels = targets["input_ids"].squeeze(0)
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels
        }

def prepare_cricket_data(commentary_data):
    """Prepare cricket commentary data for training.

    Rows with any missing value are dropped, then the remainder is split
    85/15 into train and validation sets with a fixed seed.

    Args:
        commentary_data: DataFrame containing an 'is_wicket' column.

    Returns:
        tuple: ``(train_data, val_data)`` DataFrames.
    """
    complete_rows = commentary_data.dropna()
    # Keep is_wicket boolean. Converting it to str would turn False into the
    # truthy string 'False', so `if row['is_wicket']` would label every ball a wicket.
    # .assign builds a new frame instead of writing into the dropna() result.
    complete_rows = complete_rows.assign(is_wicket=complete_rows['is_wicket'].astype(bool))

    train_data, val_data = train_test_split(
        complete_rows,
        test_size=0.15,
        random_state=42
    )

    return train_data, val_data

def compute_metrics(eval_pred, tokenizer):
    """Compute ROUGE-1/2/L between decoded predictions and decoded labels.

    Accepts either token ids or raw model outputs: a tuple of outputs is
    reduced to its first element (the logits), and 3-D logits are converted to
    token ids with an argmax over the vocabulary axis.

    Args:
        eval_pred: ``(predictions, labels)`` pair as passed by the trainer.
        tokenizer: Tokenizer providing ``batch_decode`` and ``pad_token_id``.

    Returns:
        dict: 'rouge1', 'rouge2' and 'rougeL' scores.
    """
    predictions, labels = eval_pred

    # Seq2seq models can return several arrays; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # (batch, seq, vocab) logits -> (batch, seq) token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 marks ignored positions and is not a valid token id; decode it as padding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = evaluate.load('rouge')
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        'rouge1': result['rouge1'],
        'rouge2': result['rouge2'],
        'rougeL': result['rougeL']
    }

def create_peft_config():
    """Build the LoRA adapter configuration used for seq2seq fine-tuning.

    Returns:
        LoraConfig: Rank-4 adapters (alpha 16, dropout 0.1) on the "q" and "v" modules.
    """
    lora_settings = {
        "task_type": TaskType.SEQ_2_SEQ_LM,
        "inference_mode": False,   # adapters are being trained, not just loaded
        "r": 4,                    # Reduced rank
        "lora_alpha": 16,
        "lora_dropout": 0.1,
        "target_modules": ["q", "v"],
    }
    return LoraConfig(**lora_settings)

def train_cricket_model(commentary_data, output_dir="./cricket_model"):
    """Fine-tune google/flan-t5-small with LoRA adapters on the commentary QA pairs.

    Args:
        commentary_data: DataFrame handed to ``prepare_cricket_data``.
        output_dir: Directory for checkpoints and the final adapter and tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` — the trained PEFT model and its tokenizer.
    """
    # Imported locally because this cell's import list does not include it.
    from transformers import BitsAndBytesConfig

    # Collect garbage first so memory released by Python can actually be
    # handed back by empty_cache().
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()

    # Initialize tokenizer and model using a smaller model
    model_name = "google/flan-t5-small"  # Much smaller than T5-base
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        # Passing load_in_8bit directly is deprecated; use a quantization config.
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto"
    )

    # Prepare model for training
    base_model = prepare_model_for_kbit_training(base_model)

    # Add LoRA adapters
    peft_config = create_peft_config()
    model = get_peft_model(base_model, peft_config)

    # Prepare data
    train_data, val_data = prepare_cricket_data(commentary_data)
    train_dataset = CricketDataset(train_data, tokenizer)
    val_dataset = CricketDataset(val_data, tokenizer)

    # Define training arguments with reduced batch sizes and gradient accumulation
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=2,  # Reduced epochs
        per_device_train_batch_size=1,  # Reduced batch size
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,  # Increased gradient accumulation
        warmup_steps=50,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        # `evaluation_strategy` was renamed; the notebook installs transformers 4.47.1.
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        # Move accumulated predictions off the GPU periodically during evaluation.
        eval_accumulation_steps=16,
        load_best_model_at_end=True,
        save_total_limit=1,  # Keep only the best model
        fp16=True,
        gradient_checkpointing=True
    )

    def _logits_to_token_ids(logits, labels):
        # Keep only the argmax token ids so evaluation does not accumulate the
        # full (batch, seq, vocab) logits for the whole validation set.
        if isinstance(logits, (tuple, list)):
            logits = logits[0]
        return logits.argmax(dim=-1)

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=lambda eval_pred: compute_metrics(eval_pred, tokenizer),
        preprocess_logits_for_metrics=_logits_to_token_ids
    )

    # Train the model
    trainer.train()

    # Save the final model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model, tokenizer

class CricketQASystem:
    """Answer cricket questions with a base seq2seq model plus a trained PEFT adapter."""

    def __init__(self, model_path, base_model_name="google/flan-t5-small"):
        """Initialize QA system with PEFT model.

        Args:
            model_path: Directory holding the saved adapter and tokenizer.
            base_model_name: Base checkpoint the adapter was trained on; must
                match the one used for training.
        """
        # Imported locally because this cell's import list does not include it.
        from transformers import BitsAndBytesConfig

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Load the base model first
        base_model = AutoModelForSeq2SeqLM.from_pretrained(
            base_model_name,
            device_map="auto",
            # Passing load_in_8bit directly is deprecated; use a quantization config.
            quantization_config=BitsAndBytesConfig(load_in_8bit=True)
        )

        # Load the PEFT model
        self.model = PeftModel.from_pretrained(base_model, model_path)
        self.model.eval()

    @torch.no_grad()
    def answer_question(self, question, max_length=50):  # Reduced max_length
        """Generate answer for a given question.

        Args:
            question: Question text; it is prefixed with "Q: " as in training.
            max_length: Maximum length passed to ``generate``.

        Returns:
            str: The decoded first generated sequence.
        """
        input_text = f"Q: {question}"

        inputs = self.tokenizer(
            input_text,
            max_length=128,  # Reduced max_length
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        # Move inputs to the same device as the model.
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        outputs = self.model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=2,  # Reduced beam size
            length_penalty=1.0,
            early_stopping=True
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

def main():
    """Train the model, reload it through CricketQASystem and print answers to sample questions."""
    try:
        # Train the model
        model, tokenizer = train_cricket_model(refined_commentary)

        # Drop the training copy before loading a second model for inference,
        # so both do not have to fit on the GPU at the same time.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Create QA system
        qa_system = CricketQASystem("./cricket_model")

        # Test questions
        test_questions = [
            "which shot does Virat Kohli play most?",
            "how does Virat Kohli play against short balls?",
            "what is the average control percentage for Rohit Sharma?",
            "describe the bowling style of Jasprit Bumrah"
        ]

        # Test the system; a failure on one question does not stop the rest.
        for question in test_questions:
            try:
                answer = qa_system.answer_question(question)
                print(f"\nQ: {question}")
                print(f"A: {answer}")
            except RuntimeError as e:
                print(f"Error processing question: {str(e)}")
                continue

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        # Also log the traceback so the failing step can be located.
        logging.exception("Training or inference failed")

if __name__ == "__main__":
    main()

An error occurred: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



In [18]:
import torch

# Report the available accelerator. get_device_name() raises when CUDA is not
# usable, so it is only called after the availability check.
cuda_available = torch.cuda.is_available()

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name() if cuda_available else "none (CUDA not available)")


device = torch.device('cuda' if cuda_available else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 3050 Ti Laptop GPU
Using device: cuda


In [14]:

import pandas as pd
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import logging
import evaluate
# from datasets import load_metric
import gc
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
    PeftModel
)
from transformers import BitsAndBytesConfig
import bitsandbytes as bnb

class CricketDataset(Dataset):
    """Torch dataset that turns commentary rows into tokenized question/answer pairs.

    Each row yields three templated QA pairs (shot, delivery, result); one of
    them is picked at random every time the row is fetched.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        """Store the rows, the tokenizer and the fixed padded sequence length."""
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Number of commentary rows."""
        return len(self.data)

    @staticmethod
    def _is_wicket(flag):
        """Interpret the is_wicket value, which may be a bool or the string 'True'/'False'.

        A plain truthiness test is wrong for strings: 'False' is non-empty and
        therefore truthy.
        """
        if isinstance(flag, str):
            return flag.strip().lower() == 'true'
        return bool(flag)

    def create_query_answer_pairs(self, row):
        """Create multiple query-answer pairs from a single commentary row.

        Args:
            row: Mapping with batsman_name, bowler_name, shot_played,
                shot_direction, ball_length, ball_line, is_wicket, wicket_type,
                runs and batting_control.

        Returns:
            list[tuple[str, str]]: Three (question, answer) pairs.
        """
        pairs = []

        # Template 1: Shot played query
        shot_query = f"what shot did {row['batsman_name']} play?"
        shot_answer = f"{row['batsman_name']} played a {row['shot_played']} towards {row['shot_direction']}"
        pairs.append((shot_query, shot_answer))

        # Template 2: Ball details query
        ball_query = f"how did {row['bowler_name']} bowl to {row['batsman_name']}?"
        ball_answer = f"{row['bowler_name']} bowled a {row['ball_length']} length {row['ball_line']} line delivery"
        pairs.append((ball_query, ball_answer))

        # Template 3: Result query
        result_query = f"what was the result when {row['bowler_name']} bowled to {row['batsman_name']}?"
        if self._is_wicket(row['is_wicket']):
            result = f"Wicket - {row['wicket_type']}"
        else:
            runs = row['runs']
            result = f"{runs} run" if runs == 1 else f"{runs} runs"
        # batting_control is a category label (e.g. 'in-control'), not a percentage.
        result_answer = f"The result was {result} (batting control: {row['batting_control']})"
        pairs.append((result_query, result_answer))

        return pairs

    def __getitem__(self, idx):
        """Return tokenized input_ids, attention_mask and labels for one random QA pair of row ``idx``."""
        row = self.data.iloc[idx]
        pairs = self.create_query_answer_pairs(row)

        # Random template per fetch: the same index can yield different samples.
        query, answer = pairs[np.random.randint(0, len(pairs))]
        input_text = f"answer cricket question: {query}"

        inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer(
            answer,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Mask padding in the labels with -100 so the loss ignores it;
        # compute_metrics already maps -100 back to the pad token.
        labels = targets["input_ids"].squeeze(0)
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels
        }

def prepare_cricket_data(commentary_data):
    """Prepare cricket commentary data for training.

    Rows with any missing value are dropped, then the remainder is split
    85/15 into train and validation sets with a fixed seed.

    Args:
        commentary_data: DataFrame containing an 'is_wicket' column.

    Returns:
        tuple: ``(train_data, val_data)`` DataFrames.
    """
    complete_rows = commentary_data.dropna()
    # Keep is_wicket boolean. Converting it to str would turn False into the
    # truthy string 'False', so `if row['is_wicket']` would label every ball a wicket.
    # .assign builds a new frame instead of writing into the dropna() result.
    complete_rows = complete_rows.assign(is_wicket=complete_rows['is_wicket'].astype(bool))

    train_data, val_data = train_test_split(
        complete_rows,
        test_size=0.15,
        random_state=42
    )

    return train_data, val_data

def compute_metrics(eval_pred, tokenizer):
    """Compute ROUGE-1/2/L between decoded predictions and decoded labels.

    Accepts either token ids or raw model outputs: a tuple of outputs is
    reduced to its first element (the logits), and 3-D logits are converted to
    token ids with an argmax over the vocabulary axis.

    Args:
        eval_pred: ``(predictions, labels)`` pair as passed by the trainer.
        tokenizer: Tokenizer providing ``batch_decode`` and ``pad_token_id``.

    Returns:
        dict: 'rouge1', 'rouge2' and 'rougeL' scores.
    """
    predictions, labels = eval_pred

    # Seq2seq models can return several arrays; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # (batch, seq, vocab) logits -> (batch, seq) token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 marks ignored positions and is not a valid token id; decode it as padding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge = evaluate.load('rouge')
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        'rouge1': result['rouge1'],
        'rouge2': result['rouge2'],
        'rougeL': result['rougeL']
    }

def create_peft_config():
    """Build the LoRA adapter configuration used for seq2seq fine-tuning.

    Returns:
        LoraConfig: Rank-8 adapters (alpha 32, dropout 0.1) on the "q" and "v" modules.
    """
    lora_settings = {
        "task_type": TaskType.SEQ_2_SEQ_LM,
        "inference_mode": False,        # adapters are being trained, not just loaded
        "r": 8,                         # Rank
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        "target_modules": ["q", "v"],   # Target q and v matrices in attention
    }
    return LoraConfig(**lora_settings)

def train_cricket_model(commentary_data, output_dir="./cricket_T5_model"):
    """Main function to train the model using PEFT/LoRA with memory optimizations.

    Loads T5-small in 4-bit NF4, attaches rank-4 LoRA adapters, trains on the
    commentary QA pairs and saves the adapter and tokenizer.

    Args:
        commentary_data: DataFrame handed to ``prepare_cricket_data``.
        output_dir: Directory for checkpoints and the final adapter and tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` — the trained PEFT model and its tokenizer.
    """
    # Collect garbage first so memory released by Python can actually be
    # handed back by empty_cache().
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()

    # Initialize quantization config with more aggressive memory savings
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True  # Enable double quantization
    )

    # Initialize tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained("T5-small")
    base_model = T5ForConditionalGeneration.from_pretrained(
        "T5-small",
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16  # Ensure model is in float16
    )

    # Prepare model for k-bit training
    base_model = prepare_model_for_kbit_training(base_model)

    # Create more memory-efficient LoRA config.
    # NOTE(review): this differs from create_peft_config() in this cell (r=8, alpha=32),
    # which is not used here — confirm which settings are intended.
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=4,  # Reduced rank
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=["q", "v"]
    )

    model = get_peft_model(base_model, peft_config)

    # Prepare data; slice train_data/val_data with .head(n) here for a quick smoke run.
    train_data, val_data = prepare_cricket_data(commentary_data)

    train_dataset = CricketDataset(train_data, tokenizer)
    val_dataset = CricketDataset(val_data, tokenizer)

    # Define training arguments with reduced memory footprint
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=1,  # Reduced batch size
        per_device_eval_batch_size=1,   # Reduced batch size
        gradient_accumulation_steps=8,   # Increased gradient accumulation
        warmup_steps=50,                # Reduced warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        # `evaluation_strategy` was renamed; the notebook installs transformers 4.47.1.
        eval_strategy="steps",
        eval_steps=500,                 # Reduced evaluation frequency
        save_steps=500,
        # Move accumulated predictions off the GPU periodically during evaluation.
        eval_accumulation_steps=16,
        load_best_model_at_end=True,
        save_total_limit=1,             # Keep only one checkpoint
        fp16=True,
        gradient_checkpointing=True,
        optim="paged_adamw_32bit",      # Memory-efficient optimizer
        max_grad_norm=0.3,              # Add gradient clipping
    )

    def _logits_to_token_ids(logits, labels):
        # Keep only the argmax token ids so evaluation does not accumulate the
        # full (batch, seq, vocab) logits for the whole validation set.
        if isinstance(logits, (tuple, list)):
            logits = logits[0]
        return logits.argmax(dim=-1)

    # Initialize trainer with memory optimizations
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=lambda eval_pred: compute_metrics(eval_pred, tokenizer),
        preprocess_logits_for_metrics=_logits_to_token_ids
    )

    # Train the model
    trainer.train()

    # Save the final model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model, tokenizer


class CricketQASystem:
    """Answer cricket questions with a PEFT-adapted T5 model.

    The tokenizer and the adapter weights are both read from ``model_path``;
    the adapter is attached to a base model loaded with ``load_in_4bit=True``.
    """

    def __init__(self, model_path):
        """Initialize QA system with PEFT model

        Args:
            model_path: Location of the saved tokenizer and PEFT adapter.
        """
        self.tokenizer = T5Tokenizer.from_pretrained(model_path)

        # PeftModel.from_pretrained() takes the base model as its first
        # argument, so the base model has to exist before the adapter is
        # attached; calling it with only a path raises TypeError.
        # NOTE(review): the canonical Hub id is lower-case "t5-small" --
        # confirm that this spelling resolves in your environment.
        base_model = T5ForConditionalGeneration.from_pretrained(
            "T5-small",
            device_map="auto",
            load_in_4bit=True
        )
        self.model = PeftModel.from_pretrained(base_model, model_path)
        self.model.eval()

    @torch.no_grad()
    def answer_question(self, question, max_length=100):
        """Generate answer for a given question

        Args:
            question: Question text; it is prefixed with the task prompt
                "answer cricket question: " before tokenization.
            max_length: Forwarded to ``generate`` as ``max_length``.

        Returns:
            The first generated sequence, decoded with special tokens skipped.
        """
        input_text = f"answer cricket question: {question}"

        inputs = self.tokenizer(
            input_text,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        # Move inputs to the same device as model
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        outputs = self.model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

def main():
    """Train the model, load it back for inference and print sample answers.

    Any exception raised along the way is reported on stdout instead of
    propagating; a RuntimeError on a single question only skips that question.
    """
    sample_questions = (
        "which shot does Virat Kohli play most?",
        "how does Virat Kohli play against short balls?",
        "what is the average control percentage for Rohit Sharma?",
        "describe the bowling style of Jasprit Bumrah",
    )

    try:
        # Fine-tune on the module-level `refined_commentary` data
        model, tokenizer = train_cricket_model(refined_commentary)

        # Reload the saved artefacts through the inference wrapper
        qa_system = CricketQASystem("./cricket_T5_model")

        for question in sample_questions:
            try:
                answer = qa_system.answer_question(question)
                print(f"\nQ: {question}")
                print(f"A: {answer}")
            except RuntimeError as question_error:
                # Report the failure and carry on with the next question
                print(f"Error processing question: {str(question_error)}")

    except Exception as failure:
        print(f"An error occurred: {str(failure)}")


if __name__ == "__main__":
    main()

An error occurred: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



In [21]:
import pandas as pd
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import gc
import os

# Set environment variable for memory management
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

class CricketDataset(Dataset):
    """Torch dataset that turns commentary rows into T5 question/answer examples.

    Each row yields one example per access: one of the templated
    question/answer pairs is picked at random, tokenized and padded to
    ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):  # Reduced max_length
        """Store the source frame, the tokenizer and the padded sequence length."""
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

    def create_query_answer_pairs(self, row):
        """Create multiple query-answer pairs from a single commentary row

        Reads the ``batsman_name``, ``shot_played``, ``shot_direction``,
        ``bowler_name``, ``ball_length`` and ``ball_line`` fields of ``row``.

        Returns:
            A list of ``(query, answer)`` string tuples.
        """
        pairs = []

        # Template 1: Shot played query
        shot_query = f"what shot did {row['batsman_name']} play?"
        shot_answer = f"{row['batsman_name']} played a {row['shot_played']} towards {row['shot_direction']}"
        pairs.append((shot_query, shot_answer))

        # Template 2: Ball details query
        ball_query = f"how did {row['bowler_name']} bowl to {row['batsman_name']}?"
        ball_answer = f"{row['bowler_name']} bowled a {row['ball_length']} length {row['ball_line']} line delivery"
        pairs.append((ball_query, ball_answer))

        return pairs

    def _encode(self, text):
        """Tokenize ``text`` to a fixed-length (padded/truncated) tensor batch of one."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``.

        The question/answer template is chosen at random on every call, so
        repeated access to the same index can return different examples.
        """
        row = self.data.iloc[idx]
        pairs = self.create_query_answer_pairs(row)

        query, answer = pairs[np.random.randint(0, len(pairs))]
        input_text = f"answer cricket question: {query}"

        inputs = self._encode(input_text)
        targets = self._encode(answer)

        # Drop only the batch axis; a bare squeeze() would also collapse a
        # sequence axis of length 1.
        labels = targets["input_ids"].squeeze(0).clone()

        # Padded label positions must be -100 so the loss ignores them;
        # otherwise the model is trained to emit pad tokens.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            labels[labels == pad_id] = -100

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels
        }

def prepare_cricket_data(commentary_data, max_samples=None):
    """Prepare cricket commentary data for training with option to limit samples

    Args:
        commentary_data: DataFrame of commentary rows; must contain an
            ``is_wicket`` column. The caller's frame is not modified.
        max_samples: Optional cap on the number of rows kept (after dropping
            rows with missing values). A falsy value means "no cap".

    Returns:
        ``(train_data, val_data)`` from an 85/15 split with a fixed seed.
    """
    # Build the cleaned frame in one expression: assigning a column into the
    # result of dropna() can trigger pandas' SettingWithCopyWarning.
    prepared = commentary_data.dropna().assign(
        is_wicket=lambda frame: frame['is_wicket'].astype(str)
    )

    if max_samples and len(prepared) > max_samples:
        prepared = prepared.sample(n=max_samples, random_state=42)

    train_data, val_data = train_test_split(
        prepared,
        test_size=0.15,
        random_state=42
    )

    return train_data, val_data

def compute_metrics(eval_pred,tokenizer):
    """Exact-match accuracy between decoded predictions and decoded labels.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, a 3-D array of logits, or a tuple whose first element
            is one of those.
        tokenizer: Provides ``pad_token_id`` and ``batch_decode``.

    Returns:
        ``{'accuracy': float}``; 0.0 when there are no predictions.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # The model can return several outputs per batch; the first is the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    if isinstance(predictions, np.ndarray):
        if predictions.ndim == 3:
            # Logits -> most likely token id at every position
            predictions = predictions.argmax(axis=-1)
            label_ids = np.asarray(labels)
            if predictions.shape == label_ids.shape:
                # Positions line up with the labels here, so blank out the
                # ones the loss ignores instead of scoring arbitrary tokens.
                predictions = np.where(label_ids != -100, predictions, pad_id)
        # -100 is not a valid token id and cannot be decoded
        predictions = np.where(predictions != -100, predictions, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    if not decoded_preds:
        return {'accuracy': 0.0}

    # Simple accuracy metric
    exact_matches = sum(p == l for p, l in zip(decoded_preds, decoded_labels))
    return {
        'accuracy': exact_matches / len(decoded_preds)
    }

def _release_cuda_memory():
    """Run the garbage collector, then drop cached CUDA blocks (no-op without CUDA)."""
    if torch.cuda.is_available():
        # Collect first so memory held by unreachable objects is returned to
        # the caching allocator before the cache itself is emptied.
        gc.collect()
        torch.cuda.empty_cache()


def train_cricket_model(commentary_data, output_dir="./cricket_t5_model", max_samples=1000):
    """Memory-optimized training function

    Fine-tunes "t5-small" on question/answer pairs built from
    ``commentary_data`` and saves the model and tokenizer to ``output_dir``.

    Args:
        commentary_data: DataFrame passed to ``prepare_cricket_data``.
        output_dir: Directory for checkpoints and the final model/tokenizer.
        max_samples: Row cap forwarded to ``prepare_cricket_data``.

    Returns:
        ``(model, tokenizer)`` after training.

    Raises:
        Exception: Whatever training or saving raised; it is reported on
            stdout and re-raised with its original traceback.
    """

    # Clear CUDA cache
    _release_cuda_memory()

    # Initialize smaller model
    tokenizer = T5Tokenizer.from_pretrained("t5-small")  # Using t5-small instead of t5-base
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    # Prepare limited data
    train_data, val_data = prepare_cricket_data(commentary_data, max_samples=max_samples)

    # Create datasets
    train_dataset = CricketDataset(train_data, tokenizer)
    val_dataset = CricketDataset(val_data, tokenizer)

    # Memory-optimized training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=2,  # Reduced batch size
        per_device_eval_batch_size=2,   # Reduced batch size
        gradient_accumulation_steps=4,   # Accumulate gradients
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=50,
        # NOTE(review): newer transformers releases name this `eval_strategy`;
        # confirm against the installed version.
        evaluation_strategy="steps",
        eval_steps=100,
        save_steps=200,
        load_best_model_at_end=True,
        save_total_limit=2,             # Keep only 2 checkpoints
        # Mixed precision needs a GPU; fall back to full precision on CPU
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=0,       # Reduce worker processes
        gradient_checkpointing=True     # Enable gradient checkpointing
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=lambda eval_pred: compute_metrics(eval_pred, tokenizer)
    )

    # Train with memory optimization
    try:
        trainer.train()
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
    except Exception as e:
        print(f"Training failed with error: {str(e)}")
        # Try to free memory
        _release_cuda_memory()
        # Bare raise keeps the original traceback intact
        raise

    return model, tokenizer

class CricketQASystem:
    """Inference wrapper around a fine-tuned T5 checkpoint.

    The model runs on the GPU when CUDA is available and less than
    ``GPU_MEMORY_FRACTION_LIMIT`` of the device memory is already allocated;
    otherwise it runs on the CPU.
    """

    # Share of total GPU memory above which the model is kept on / moved to CPU
    GPU_MEMORY_FRACTION_LIMIT = 0.8

    def __init__(self, model_path):
        """Load tokenizer and model from ``model_path`` and pick a device."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_path)
        self.model = T5ForConditionalGeneration.from_pretrained(model_path)

        # The model starts on the CPU here (no device is requested above), so
        # it has to be moved explicitly to use the GPU; only do so when there
        # is headroom.
        if torch.cuda.is_available() and not self._gpu_memory_is_tight():
            try:
                self.model = self.model.to("cuda")
            except RuntimeError:
                # e.g. out of memory during the transfer -- stay on the CPU
                self.model = self.model.cpu()

        self.model.eval()

    def _gpu_memory_is_tight(self):
        """True when CUDA is available and allocated memory exceeds the limit."""
        if not torch.cuda.is_available():
            return False
        total = torch.cuda.get_device_properties(0).total_memory
        return torch.cuda.memory_allocated() > self.GPU_MEMORY_FRACTION_LIMIT * total

    @torch.no_grad()  # Disable gradient computation during inference
    def answer_question(self, question, max_length=50):  # Reduced max_length
        """Generate an answer for ``question``.

        Args:
            question: Question text; prefixed with "answer cricket question: ".
            max_length: Forwarded to ``generate`` as ``max_length``.

        Returns:
            The first generated sequence, decoded with special tokens skipped.
        """
        input_text = f"answer cricket question: {question}"

        # Clear cache before inference
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        inputs = self.tokenizer(
            input_text,
            max_length=256,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        # Move to CPU if necessary
        if self._gpu_memory_is_tight():
            self.model = self.model.cpu()

        # Inputs must live on the same device as the model's weights
        device = self.model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        outputs = self.model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=2,  # Reduced beam size
            length_penalty=1.0,
            early_stopping=True
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Usage example with memory optimization
def main():
    """Train on a capped sample, then print answers to two sample questions.

    A RuntimeError from training or model loading is reported on stdout with
    a hint to shrink the workload; a RuntimeError on a single question only
    skips that question.
    """
    # Set smaller initial sample size
    sample_cap = 1000 # Adjust based on your GPU memory

    # Test with basic questions
    sample_questions = (
        "which shot does Virat Kohli play most?",
        "how does Virat Kohli play against short balls?",
    )

    try:
        # Train with memory optimization
        model, tokenizer = train_cricket_model(refined_commentary, max_samples=sample_cap)

        # Create QA system
        qa_system = CricketQASystem("./cricket_t5_model")

        for question in sample_questions:
            try:
                answer = qa_system.answer_question(question)
                print(f"\nQ: {question}")
                print(f"A: {answer}")
            except RuntimeError as question_error:
                # Report the failure and carry on with the next question
                print(f"Error processing question: {str(question_error)}")

    except RuntimeError as failure:
        print(f"Training failed due to memory error: {str(failure)}")
        print("Try reducing max_samples or batch size further")


if __name__ == "__main__":
    main()

Training failed due to memory error: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Try reducing max_samples or batch size further


In [5]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
Installing collected packages: click, nltk
Successfully installed click-8.1.8 nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.
