<a href="https://colab.research.google.com/github/joeky00/ink-bot-react/blob/main/Another_copy_of_Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Reasoning**:
Load the combined QA dataset from the CSV file and apply the preprocessing function to it.



# Cleaned-up Sports AI Chatbot Notebook

This notebook contains the essential code to run the Sports AI Chatbot, including data loading, QA generation, model setup, API integration, and the Gradio interface.

## 1. Extract Data from Zip Files

Extract the necessary CSV files from the uploaded zip archives.

In [None]:
import zipfile
import os
import glob # Import glob to find files

# Everything is unpacked straight into the Colab working directory.
extract_dir = '/content/'

# Only archives named "archive(<anything>).zip" are picked up by this pattern.
zip_files = glob.glob('/content/archive(*).zip')

for archive_path in zip_files:
    try:
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(extract_dir)
    except FileNotFoundError:
        print(f"Error: {archive_path} not found.")
    except zipfile.BadZipFile:
        print(f"Error: {archive_path} is a bad zip file.")
    else:
        # Reached only when the archive opened and unpacked without error.
        print(f"Successfully extracted {archive_path}")

# Show the directory contents so the extraction result can be checked by eye.
print("\nFiles in /content after extraction:")
for entry in os.listdir(extract_dir):
    print(entry)


Files in /content after extraction:
.config
sample_data


## 2. Load DataFrames

Load the extracted CSV files into pandas DataFrames. These will be used for generating QA pairs and as context for the AI.

In [None]:
import pandas as pd

def _read_csv_or_none(filename):
    """Read /content/<filename> into a DataFrame; return None if the file is missing."""
    try:
        frame = pd.read_csv(f'/content/{filename}')
    except FileNotFoundError:
        print(f"Error: {filename} not found.")
        return None
    print(f"Loaded {filename}")
    return frame


# One module-level DataFrame per source CSV; each is None when its file is absent,
# which the later QA-generation and chatbot cells check for.
df = _read_csv_or_none('football_players.csv')  # original dataset
df_top250 = _read_csv_or_none('top250-00-19.csv')
df_fifa = _read_csv_or_none('fifa_players.csv')
df_data = _read_csv_or_none('data.csv')
df_dataset = _read_csv_or_none('dataset.csv')

Error: football_players.csv not found.
Error: top250-00-19.csv not found.
Error: fifa_players.csv not found.
Error: data.csv not found.
Error: dataset.csv not found.


## 3. Generate and Combine QA Pairs

Define functions to generate QA pairs from each DataFrame and combine them into a single dataset.

In [None]:
import pandas as pd
import csv

# Original QA generation function (assuming df is loaded)
def generate_qa_pairs(df, num_samples=10):
    """Build question/answer pairs from randomly sampled rows of the original table.

    Every sampled row yields four (question, answer) tuples: club, transfer
    fee, position and nationality. Rows are drawn one at a time with
    ``df.sample(1)``, so the same row can be picked more than once.

    Args:
        df: DataFrame read through the "Player", "Origin", "To(Club)",
            "Position" and "Fee(€ mln)" columns; a column that is absent is
            rendered as "Unknown". ``None`` or a frame without rows yields
            no pairs.
        num_samples: Number of rows to sample.

    Returns:
        A list of ``(question, answer)`` string tuples, empty when no data
        is available.
    """
    qa_pairs = []
    if df is None:
        print("Original df is not loaded for QA generation.")
        return qa_pairs
    # df.sample(1) raises ValueError on a frame with no rows; stop here
    # instead of crashing the whole QA-generation cell.
    if len(df) == 0:
        print("Original df has no rows; skipping QA generation.")
        return qa_pairs

    for _ in range(num_samples):
        player = df.sample(1).iloc[0]
        name = player.get("Player", "Unknown")
        nationality = player.get("Origin", "Unknown")
        club = player.get("To(Club)", "Unknown")
        position = player.get("Position", "Unknown")
        fee = player.get("Fee(€ mln)", "Unknown")

        # Each question sits next to its answer so the pairing is explicit.
        qa_pairs.extend([
            (f"Which club does {name} play for?",
             f"{name} plays for {club}."),
            (f"What is the transfer fee for {name} in € mln?",
             f"The transfer fee for {name} was {fee} € mln."),
            (f"What position does {name} play?",
             f"{name} plays as a {position}."),
            (f"What is the nationality of {name}?",
             f"{name} is from {nationality}."),
        ])

    return qa_pairs

# QA generation function for df_top250
def generate_qa_pairs_top250(df, num_samples=10):
    """Build transfer question/answer pairs from randomly sampled top-250 rows.

    Every sampled row yields four (question, answer) tuples: destination
    club, fee, position and origin club. Rows are drawn one at a time with
    ``df.sample(1)``, so the same row can be picked more than once.

    Args:
        df: DataFrame read through the "Name", "Position", "Team_from",
            "Team_to", "Transfer_fee" and "Season" columns; a column that is
            absent is rendered as "Unknown". ``None`` or a frame without
            rows yields no pairs.
        num_samples: Number of rows to sample.

    Returns:
        A list of ``(question, answer)`` string tuples, empty when no data
        is available.
    """
    qa_pairs = []
    if df is None:
        print("df_top250 is not loaded for QA generation.")
        return qa_pairs
    # df.sample(1) raises ValueError on a frame with no rows.
    if len(df) == 0:
        print("df_top250 has no rows; skipping QA generation.")
        return qa_pairs

    for _ in range(num_samples):
        player = df.sample(1).iloc[0]
        name = player.get("Name", "Unknown")
        position = player.get("Position", "Unknown")
        team_from = player.get("Team_from", "Unknown")
        team_to = player.get("Team_to", "Unknown")
        transfer_fee = player.get("Transfer_fee", "Unknown")
        season = player.get("Season", "Unknown")

        qa_pairs.extend([
            (f"Which club did {name} transfer to in the {season} season?",
             f"{name} transferred to {team_to} in the {season} season."),
            (f"What was the transfer fee for {name} in the {season} season?",
             f"The transfer fee for {name} in the {season} season was {transfer_fee}."),
            (f"What position did {name} play?",
             f"{name} played as a {position}."),
            (f"Which club did {name} transfer from in the {season} season?",
             f"{name} transferred from {team_from} in the {season} season."),
        ])

    return qa_pairs

# QA generation function for df_fifa
def generate_qa_pairs_fifa(df, num_samples=10):
    """Build player-profile question/answer pairs from randomly sampled FIFA rows.

    Every sampled row yields five (question, answer) tuples: full name,
    overall rating, nationality, age and positions. Rows are drawn one at a
    time with ``df.sample(1)``, so the same row can be picked more than once.

    Args:
        df: DataFrame read through the "name", "full_name", "nationality",
            "overall_rating", "age" and "positions" columns; a column that
            is absent is rendered as "Unknown". ``None`` or a frame without
            rows yields no pairs.
        num_samples: Number of rows to sample.

    Returns:
        A list of ``(question, answer)`` string tuples, empty when no data
        is available.
    """
    qa_pairs = []
    if df is None:
        print("df_fifa is not loaded for QA generation.")
        return qa_pairs
    # df.sample(1) raises ValueError on a frame with no rows.
    if len(df) == 0:
        print("df_fifa has no rows; skipping QA generation.")
        return qa_pairs

    for _ in range(num_samples):
        player = df.sample(1).iloc[0]
        name = player.get("name", "Unknown")
        full_name = player.get("full_name", "Unknown")
        nationality = player.get("nationality", "Unknown")
        overall_rating = player.get("overall_rating", "Unknown")
        age = player.get("age", "Unknown")
        positions = player.get("positions", "Unknown")

        qa_pairs.extend([
            (f"What is the full name of {name}?",
             f"The full name of {name} is {full_name}."),
            (f"What is the overall rating of {name}?",
             f"The overall rating of {name} is {overall_rating}."),
            (f"What is the nationality of {name}?",
             f"{name} is from {nationality}."),
            (f"What is the age of {name}?",
             f"{name} is {age} years old."),
            (f"What positions does {name} play?",
             f"{name} plays in the {positions} positions."),
        ])

    return qa_pairs

# QA generation function for df_data
def generate_qa_pairs_data(df, num_samples=10):
    """Build match question/answer pairs from randomly sampled match-event rows.

    Every sampled row yields three (question, answer) tuples: competition,
    club and opponent. Rows are drawn one at a time with ``df.sample(1)``,
    so the same row can be picked more than once.

    Args:
        df: DataFrame read through the "Player", "Season", "Competition",
            "Club", "Opponent", "Matchday" and "Date" columns; a column that
            is absent is rendered as "Unknown". ``None`` or a frame without
            rows yields no pairs.
        num_samples: Number of rows to sample.

    Returns:
        A list of ``(question, answer)`` string tuples, empty when no data
        is available.
    """
    qa_pairs = []
    if df is None:
        print("df_data is not loaded for QA generation.")
        return qa_pairs
    # df.sample(1) raises ValueError on a frame with no rows.
    if len(df) == 0:
        print("df_data has no rows; skipping QA generation.")
        return qa_pairs

    for _ in range(num_samples):
        match_event = df.sample(1).iloc[0]
        player = match_event.get("Player", "Unknown")
        season = match_event.get("Season", "Unknown")
        competition = match_event.get("Competition", "Unknown")
        club = match_event.get("Club", "Unknown")
        opponent = match_event.get("Opponent", "Unknown")
        matchday = match_event.get("Matchday", "Unknown")
        date = match_event.get("Date", "Unknown")

        qa_pairs.extend([
            (f"What competition did {player} play in during the {season} season on matchday {matchday}?",
             f"{player} played in the {competition} competition during the {season} season on matchday {matchday}."),
            (f"Which club did {player} play for in the match against {opponent} on {date}?",
             f"{player} played for {club} in the match against {opponent} on {date}."),
            (f"What was the opponent when {player} played for {club} on {date}?",
             f"The opponent was {opponent} when {player} played for {club} on {date}."),
        ])

    return qa_pairs

# QA generation function for df_dataset
def generate_qa_pairs_dataset(df, num_samples=10):
    """Build transfer question/answer pairs from randomly sampled transfer rows.

    Every sampled row yields four (question, answer) tuples: destination
    club, fee, origin club and transfer year. Rows are drawn one at a time
    with ``df.sample(1)``, so the same row can be picked more than once.

    Args:
        df: DataFrame read through the "Player", "Year", "Fee", "From Club"
            and "To Club" columns; a column that is absent is rendered as
            "Unknown". ``None`` or a frame without rows yields no pairs.
        num_samples: Number of rows to sample.

    Returns:
        A list of ``(question, answer)`` string tuples, empty when no data
        is available.
    """
    qa_pairs = []
    if df is None:
        print("df_dataset is not loaded for QA generation.")
        return qa_pairs
    # df.sample(1) raises ValueError on a frame with no rows.
    if len(df) == 0:
        print("df_dataset has no rows; skipping QA generation.")
        return qa_pairs

    for _ in range(num_samples):
        player_transfer = df.sample(1).iloc[0]
        player = player_transfer.get("Player", "Unknown")
        year = player_transfer.get("Year", "Unknown")
        fee = player_transfer.get("Fee", "Unknown")
        from_club = player_transfer.get("From Club", "Unknown")
        to_club = player_transfer.get("To Club", "Unknown")

        qa_pairs.extend([
            (f"Which club did {player} transfer to in {year}?",
             f"{player} transferred to {to_club} in {year}."),
            (f"What was the transfer fee for {player} in {year}?",
             f"The transfer fee for {player} in {year} was {fee}."),
            (f"Which club did {player} transfer from in {year}?",
             f"{player} transferred from {from_club} in {year}."),
            (f"In what year did {player} transfer from {from_club} to {to_club}?",
             f"{player} transferred from {from_club} to {to_club} in {year}."),
        ])

    return qa_pairs


# Pair every source DataFrame with the generator that understands its columns.
qa_sources = [
    (df, generate_qa_pairs),
    (df_top250, generate_qa_pairs_top250),
    (df_fifa, generate_qa_pairs_fifa),
    (df_data, generate_qa_pairs_data),
    (df_dataset, generate_qa_pairs_dataset),
]

# Frames that failed to load are None and contribute nothing.
all_qa_data = []
for frame, make_pairs in qa_sources:
    if frame is not None:
        all_qa_data.extend(make_pairs(frame, num_samples=50))

# Write the header row followed by every (question, answer) pair.
with open("combined_qa_dataset.csv", "w", newline='', encoding="utf-8") as out_file:
    csv.writer(out_file).writerows([["question", "answer"]] + all_qa_data)

print(f"✅ Saved combined QA dataset with {len(all_qa_data)} pairs as combined_qa_dataset.csv")

✅ Saved combined QA dataset with 0 pairs as combined_qa_dataset.csv


## 4. Load Model and Tokenizer

Load the FLAN-T5 model and tokenizer for text generation (and potentially for fine-tuning if needed).

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and seq2seq model from the same checkpoint name.
# `tokenizer` is kept at module level because the preprocessing cell's
# preprocess() function reads it as a global.
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print("✅ Loaded flan-t5-small model successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ Loaded flan-t5-small model successfully!


## 5. Preprocess the Combined Dataset

Load the combined QA dataset and preprocess it for potential use with a language model.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the combined QA pairs written by the QA-generation cell.
# The rows are accessed through the "train" split further down in this cell.
combined_dataset = load_dataset('csv', data_files='combined_qa_dataset.csv')

# preprocess() below reads the module-level `tokenizer`. If the session was
# restarted and the model-loading cell has not been re-run, uncomment:
# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

# Preprocessing function
def preprocess(example):
    """Tokenize one QA row into model inputs plus training labels.

    Relies on the module-level ``tokenizer`` loaded in the model cell.

    Args:
        example: Mapping with string "question" and "answer" entries.

    Returns:
        The tokenizer output for the prompt (padded/truncated to 128 tokens)
        with an added "labels" list of 32 entries built from the answer.
    """
    input_text = "question: " + example["question"]
    target_text = example["answer"]
    tokenized_input = tokenizer(input_text, truncation=True, padding="max_length", max_length=128)
    tokenized_target = tokenizer(target_text, truncation=True, padding="max_length", max_length=32)

    # Padding positions in the labels are set to -100 so the training loss
    # ignores them; leaving the pad token id there would train the model to
    # predict padding.
    pad_id = tokenizer.pad_token_id
    tokenized_input["labels"] = [
        token_id if token_id != pad_id else -100
        for token_id in tokenized_target["input_ids"]
    ]
    return tokenized_input

# Apply preprocessing to every row of the combined dataset
tokenized_combined_dataset = combined_dataset.map(preprocess)

# Preview the first tokenized row. The QA-generation cell can legitimately
# save zero pairs (when no source CSV was found), and indexing row 0 of an
# empty split would fail, so check the size first.
print("Preview of tokenized combined dataset:")
train_split = tokenized_combined_dataset["train"]
if len(train_split) > 0:
    print(train_split[0])
else:
    print("⚠️ Combined QA dataset is empty; nothing to preview.")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Preview of tokenized combined dataset:
{'question': 'Which club does Matthijs de Ligt play for?', 'answer': 'Matthijs de Ligt plays for Juventus.', 'input_ids': [822, 10, 4073, 1886, 405, 5708, 7436, 354, 7, 20, 1414, 122, 17, 577, 21, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [5708, 74

## 6. Define API and AI Response Functions

Define the functions to interact with external APIs for news and matches, and the main AI response function that routes user queries.

In [None]:
from transformers import pipeline
import re
import requests
import pandas as pd # Import pandas for DataFrame handling

# API keys: taken from the NEWS_API_KEY / FOOTBALL_API_KEY environment
# variables when set.
# NOTE(security): the literal fallbacks below are committed with the notebook
# and should be treated as exposed - rotate them and delete the fallbacks once
# the environment variables (or Colab secrets) are configured.
import os

news_api_key = os.environ.get("NEWS_API_KEY") or "b311a02382fa4a88b9d1b4bfc74bb051"
football_api_key = os.environ.get("FOOTBALL_API_KEY") or "5e8310b5845626994bcbf672a6ff5b60"

# Extractive question-answering pipeline used for player queries.
# This is a separate model (distilbert-base-cased-distilled-squad) from the
# FLAN-T5 checkpoint loaded earlier. On any load failure the name is still
# bound (to None) so sports_ai_response can test for it.
try:
    qa_pipeline_player = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
except Exception as e:
    print(f"⚠️ Could not load question-answering pipeline: {e}")
    qa_pipeline_player = None
else:
    print("✅ Loaded question-answering pipeline for player queries successfully!")


def get_transfer_news():
    """Return a one-line summary of the newest football-transfer article from NewsAPI.

    Relies on the module-level ``news_api_key``.

    Returns:
        A display string with the newest article's title and source name, a
        "no news" notice when the response contains no articles, or an error
        notice when the request or the response decoding fails.
    """
    url = "https://newsapi.org/v2/everything"
    params = {
        "q": "football transfers",
        "language": "en",
        "sortBy": "publishedAt",
    }
    # The key travels in a header instead of the query string: requests puts
    # the full URL into its error messages, and the error text below is
    # returned to the chatbot user.
    headers = {"X-Api-Key": news_api_key}
    try:
        # A timeout keeps a stalled connection from blocking the chat forever.
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on older requests versions.
        return f"⚠️ Error fetching transfer news: {e}"

    articles = data.get("articles")
    if not articles:
        return "⚠️ No transfer news available."

    # Tolerate an article that lacks a title or source instead of raising.
    article = articles[0]
    title = article.get("title", "Untitled")
    source_name = (article.get("source") or {}).get("name", "Unknown")
    return f"📰 Latest Transfer: \"{title}\" (Source: {source_name})"


def get_next_match():
    """Return a one-line description of the next fixture from the API-Sports football API.

    Relies on the module-level ``football_api_key``.

    Returns:
        A display string naming the home and away teams, a "no matches"
        notice when the response list is empty, or an error notice when the
        request or the response decoding fails.
    """
    headers = {
        "x-rapidapi-host": "v3.football.api-sports.io",
        "x-apisports-key": football_api_key,
    }
    url = "https://v3.football.api-sports.io/fixtures"
    # Season is 2023 as per the API error suggestion for the free plan.
    params = {"league": 39, "season": 2023, "next": 1}
    try:
        # A timeout keeps a stalled connection from blocking the chat forever.
        response = requests.get(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on older requests versions.
        return f"⚠️ Error fetching next match data: {e}"

    fixtures = data.get("response")
    if not fixtures:
        return "⚠️ No upcoming matches."
    match = fixtures[0]["teams"]
    return f"⚽ Next Match: {match['home']['name']} vs {match['away']['name']}"


# AI Response function
def _build_combined_context():
    """Concatenate the text dump of every source DataFrame that is currently loaded."""
    frame_names = ("df", "df_top250", "df_fifa", "df_data", "df_dataset")
    loaded = (globals().get(name) for name in frame_names)
    return "".join(
        frame.to_string() + "\n\n"
        for frame in loaded
        if isinstance(frame, pd.DataFrame)
    )


def sports_ai_response(user_input):
    """Route a user message to the QA pipeline, the news API or the fixtures API.

    Routing is by substring match on the lower-cased message:
    1. If it contains a player keyword and both the QA pipeline and at least
       one DataFrame are available, the pipeline is asked first; a confident
       multi-word answer is returned directly.
    2. Otherwise transfer-related messages go to get_transfer_news() and
       next-match messages go to get_next_match().
    3. Anything else gets a generic fallback reply.

    Args:
        user_input: The raw message text; ``None`` or an empty string gets
            the fallback reply.

    Returns:
        The reply string to show to the user.
    """
    fallback = "⚽ I'm still learning. Try asking about a player, transfer, or upcoming match!"
    if not user_input:
        return fallback

    # Keep the original casing for the QA model (it is a cased checkpoint and
    # the DataFrame context is cased too); lower-case only for keyword routing.
    question = user_input
    user_input = user_input.lower()

    # Keywords that indicate a factual, player-specific question
    player_keywords = ["who", "what", "how", "when", "is", "was", "paid", "salary", "fee", "rating", "position", "nationality", "age", "height", "weight", "club", "team", "transfer", "season", "competition", "matchday", "opponent", "date", "plays for"]
    if any(k in user_input for k in player_keywords):
        # Stringifying every DataFrame is expensive, so it is done only when
        # the QA pipeline is actually going to be consulted.
        combined_context = _build_combined_context() if qa_pipeline_player else ""
        if qa_pipeline_player and combined_context:
            try:
                result = qa_pipeline_player(question=question, context=combined_context)
                # Accept only multi-word answers above a minimum score.
                # NOTE(review): this also rejects correct one-word answers
                # (e.g. a club name) - confirm that is intended.
                if result and result.get('answer') and len(result['answer'].split()) > 1 and result.get('score', 0) > 0.1:
                    return f"🤖 Answer: {result['answer']}"
                # No confident answer: fall through to the API checks below.
                print("QA pipeline did not find a confident answer in combined context. Checking APIs...")
            except Exception as e:
                print(f"Error during QA pipeline processing: {e}")
                return "⚠️ An error occurred while trying to answer your question."
        else:
            print("QA pipeline not loaded or no dataframes loaded for context. Checking APIs...")

    # Use API if question is about transfers (and QA didn't find a good answer)
    if "transfer" in user_input or "signed" in user_input or "latest news" in user_input:
        return get_transfer_news()

    # Use API if question is about next match (and QA didn't find a good answer)
    if any(kw in user_input for kw in ["next match", "upcoming match", "who is playing", "next premier league game"]):
        return get_next_match()

    # Catch-all fallback if no specific intent matched or QA/API failed
    return fallback

print("✅ Defined sports_ai_response and API helper functions.")

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


✅ Loaded question-answering pipeline for player queries successfully!
✅ Defined sports_ai_response and API helper functions.


## 7. Launch Gradio Interface

Set up and launch the Gradio interface to interact with the chatbot.

In [None]:
!pip install gradio --quiet
import gradio as gr

def chatbot_interface(message):
    """Gradio callback: forward the user's message to sports_ai_response and return its reply.

    The DataFrames used as QA context are read by sports_ai_response itself,
    so the data-loading cell has to be run beforehand for them to be available.
    """
    reply = sports_ai_response(message)
    return reply

# Single text-in / text-out interface backed by chatbot_interface.
demo = gr.Interface(fn=chatbot_interface,
                    inputs="text",
                    outputs="text",
                    title="⚽ Sports AI Chatbot",
                    description="Ask about football players, transfers, or next matches!")

# In Colab, share=True makes Gradio print a temporary public URL for the app.
# For deployment on platforms like Render, share=False is typical,
# and the platform handles exposing the service.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://91e99d1fb71e669547.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import zipfile
import os
import glob # Import glob to find files

# Everything is unpacked straight into the Colab working directory.
extract_dir = '/content/'

# Simpler pattern than the first extraction cell: every .zip in /content.
zip_files = glob.glob('/content/*.zip')

for archive_path in zip_files:
    try:
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(extract_dir)
    except FileNotFoundError:
        print(f"Error: {archive_path} not found.")
    except zipfile.BadZipFile:
        print(f"Error: {archive_path} is a bad zip file.")
    except Exception as e:
        # Best effort: report the failure and move on to the next archive.
        print(f"An unexpected error occurred while processing {archive_path}: {e}")
    else:
        # Reached only when the archive opened and unpacked without error.
        print(f"Successfully extracted {archive_path}")


# Show the directory contents so the extraction result can be checked by eye.
print("\nFiles in /content after extraction:")
try:
    for entry in os.listdir(extract_dir):
        print(entry)
except FileNotFoundError:
    print(f"Error: Directory {extract_dir} not found.")

In [None]:
import pandas as pd

def _read_csv_or_none(filename):
    """Read /content/<filename> into a DataFrame; return None if the file is missing."""
    try:
        frame = pd.read_csv(f'/content/{filename}')
    except FileNotFoundError:
        print(f"Error: {filename} not found.")
        return None
    print(f"Loaded {filename}")
    return frame


# One module-level DataFrame per source CSV; each is None when its file is absent.
df = _read_csv_or_none('football_players.csv')  # original dataset
df_top250 = _read_csv_or_none('top250-00-19.csv')
df_fifa = _read_csv_or_none('fifa_players.csv')
df_data = _read_csv_or_none('data.csv')
df_dataset = _read_csv_or_none('dataset.csv')