<a href="https://colab.research.google.com/github/ladkrutarth/Hybrid_RAG_FineTuned_LLM_/blob/main/RagAndLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Base URL for constructing full links
base_url = "https://www.nfldraftbuzz.com"

# List of pages to scrape (from 1 to 40)
pages = list(range(1, 42))  # Scrape pages 1 to 40

# Headers to mimic a browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

# Initialize an empty list to store all player data
all_players_data = []

# Loop through each page
for page_num in pages:
    print(f"Scraping page {page_num}...")

    # Construct the URL for the page
    main_url = f"https://www.nfldraftbuzz.com/positions/ALL/{page_num}/2025"

    # Request the main listing page
    main_response = requests.get(main_url, headers=headers)
    if main_response.status_code != 200:
        print(f"Failed to retrieve page {page_num}. Status Code: {main_response.status_code}")
        continue

    main_soup = BeautifulSoup(main_response.text, "html.parser")

    # --- Extract Player URLs from the Main Listing Page ---
    player_links = []
    for a in main_soup.find_all("a", href=True):
        href = a['href']
        if "/Player/" in href:
            # Build full URL if not already complete
            full_url = href if href.startswith("http") else base_url + href
            if full_url not in player_links:
                player_links.append(full_url)

    print(f"Found {len(player_links)} players on page {page_num}.")

    # --- Loop through each player URL and extract details ---
    for idx, url in enumerate(player_links, start=1):
        print(f"Scraping player {idx}: {url}")
        player_response = requests.get(url, headers=headers)
        if player_response.status_code != 200:
            print(f"Failed to retrieve {url}. Status Code: {player_response.status_code}")
            continue

        player_soup = BeautifulSoup(player_response.text, "html.parser")

        # Example extraction of player name and position/team details
        name_tag = player_soup.find("h1")
        name = name_tag.text.strip() if name_tag else "Name not found"

        pos_team_tag = player_soup.find("h2")
        pos_team = pos_team_tag.text.strip() if pos_team_tag else "Position/Team not found"

        # --- Extract Scouting Report Strengths ---
        strengths_section = player_soup.find(string=lambda t: t and "Scouting Report: Strengths" in t)
        strengths = []
        if strengths_section:
            ul_tag = strengths_section.find_next("ul")
            if ul_tag:
                strengths = [li.get_text(strip=True) for li in ul_tag.find_all("li")]

        # --- Extract Scouting Report Weaknesses ---
        weaknesses_section = player_soup.find(string=lambda t: t and "Scouting Report: Weaknesses" in t)
        weaknesses = []
        if weaknesses_section:
            ul_tag = weaknesses_section.find_next("ul")
            if ul_tag:
                weaknesses = [li.get_text(strip=True) for li in ul_tag.find_all("li")]

        # --- Extract Scouting Report Summary (from h5 with class 'proNegHeader') ---
        summary = ""
        summary_section = player_soup.find("h5", class_="proNegHeader", string="Scouting Report: Summary")
        if summary_section:
            # Look for the next sibling after the <h5> (which is usually a paragraph or text)
            summary_tag = summary_section.find_next_sibling("p")
            if summary_tag:
                summary = summary_tag.get_text(strip=True)
            else:
                # Sometimes the summary could be in other tags like <div>, <span>, etc.
                # So we also check for all sibling elements.
                sibling_tag = summary_section.find_next_sibling()
                if sibling_tag:
                    summary = sibling_tag.get_text(strip=True)

        # Create a dictionary for the current player
        player_data = {
            "Name": name,
            "Position/Team": pos_team,
            "Profile URL": url,
            "Strengths": "; ".join(strengths) if strengths else None,
            "Weaknesses": "; ".join(weaknesses) if weaknesses else None,
            "Summary": summary if summary else None
        }
        all_players_data.append(player_data)

        # Pause to be polite to the server
        time.sleep(1)

# --- STEP 2: Save Data to CSV using Pandas ---
df = pd.DataFrame(all_players_data)
csv_filename = "nfl_players_with_strengths_weaknesses_summary_all_pages.csv"
df.to_csv(csv_filename, index=False)
print(f"\nData for {len(df)} players saved to {csv_filename}")
print(df)


Scraping page 1...
Found 12 players on page 1.
Scraping player 1: https://www.nfldraftbuzz.com/Player/Travis-Hunter-CB-JacksonState
Scraping player 2: https://www.nfldraftbuzz.com/Player/Ashton-Jeanty-RB-BoiseState
Scraping player 3: https://www.nfldraftbuzz.com/Player/Abdul-Carter-LB-PennState
Scraping player 4: https://www.nfldraftbuzz.com/Player/Mason-Graham-DL-Michigan
Scraping player 5: https://www.nfldraftbuzz.com/Player/Will-Johnson-DB-Michigan
Scraping player 6: https://www.nfldraftbuzz.com/Player/Kelvin-BanksJr-OL-Texas
Scraping player 7: https://www.nfldraftbuzz.com/Player/James-PearceJr-DL-Tennessee
Scraping player 8: https://www.nfldraftbuzz.com/Player/Cameron-Ward-QB-IncarnateWord
Scraping player 9: https://www.nfldraftbuzz.com/Player/Tetairoa-McMillan-WR-Arizona
Scraping player 10: https://www.nfldraftbuzz.com/Player/Shedeur-Sanders-QB-JacksonState
Scraping player 11: https://www.nfldraftbuzz.com/Player/Will-Campbell-OL-LSU
Scraping player 12: https://www.nfldraftbuzz.com

In [3]:
#data = pd.read_csv("nfl_players_with_strengths_weaknesses_summary_all_pages.csv")
#data
import pandas as pd
import numpy as np

chunk_size = 10000
chunks = pd.read_csv("/content/nfl_players_with_strengths_weaknesses_summary_all_pages.csv", chunksize=chunk_size)

# Iterate over the chunks
for chunk in chunks:
    # Process the chunk (e.g., print the first few rows)
    chunk.head()
    # Or perform your desired operations on the chunk
    # ...

data = chunk.head()

In [4]:
df = pd.DataFrame(data)

# Splitting the 'Name' column into multiple parts
df[['First Name', 'Last Name', 'Position', 'Team', 'Extra']] = df['Name'].str.split(' ', n=4, expand=True)

# Removing the unwanted part after '|'
df['Extra'] = df['Extra'].str.split('|', expand=True)[0]

# Merging 'First Name' and 'Last Name' into 'Player'
df['Player'] = df['First Name'] + ' ' + df['Last Name']

# Merging 'Team' and 'Extra' into 'Team Info'
df['Team Info'] = df['Team'] + '  ' + df['Extra']

# Reordering columns to place 'Position' where desired
new_order = ['Player', 'Position','Team Info', 'Profile URL', 'Strengths', 'Weaknesses']
df = df[new_order]

In [5]:
df

Unnamed: 0,Player,Position,Team Info,Profile URL,Strengths,Weaknesses
0,Travis Hunter,CB,Colorado,https://www.nfldraftbuzz.com/Player/Travis-Hun...,Possesses rare blend of twitch and fluidity th...,Frame remains somewhat linear at 185 pounds - ...
1,Ashton Jeanty,RB,Boise State,https://www.nfldraftbuzz.com/Player/Ashton-Jea...,Elite contact balance; sheds tackles with ease...,Slightly undersized for the prototype NFL feat...
2,Abdul Carter,DE/EDGE,Penn State,https://www.nfldraftbuzz.com/Player/Abdul-Cart...,Super athletic - Made Bruce Feldman's Freak's ...,Still developing a complete pass rush plan - r...
3,Mason Graham,DT,Michigan,https://www.nfldraftbuzz.com/Player/Mason-Grah...,Explosive first step paired with wrestling-hon...,Height limitations occasionally show up when l...
4,Will Johnson,CB,Michigan,https://www.nfldraftbuzz.com/Player/Will-Johns...,Rare blend of size and fluidity - maintains hi...,Can get grabby downfield when beaten - needs t...


In [7]:
!pip install faiss-cpu transformers sentence-transformers datasets torch

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.meta

In [8]:
import pandas as pd
import faiss
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, Dataset
import numpy as np

In [9]:
# Combine relevant columns for embedding (adjust column names as needed)
if "Strengths" in df.columns and "Weaknesses" in df.columns:
    # Combine relevant columns for embedding
 texts = df["Strengths"].fillna('') + " " + df["Weaknesses"].fillna('')
elif "text" in df.columns:
    texts = df["text"].fillna('')
else:
    texts = df.iloc[:,0].fillna('')
    print("Warning: 'Strengths' and 'Weaknesses' columns not found. Using the first column for text.")

# Load embedding model for retrieval
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert text to embeddings
embeddings = embedding_model.encode(texts.tolist(), convert_to_numpy=True)

# Create FAISS index for retrieval
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# Load an LLM for fine-tuning (GPT-2, T5, Falcon, etc.)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Function to retrieve relevant documents based on a query
def retrieve_docs(query, top_k=3):
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, top_k)
    return [texts.iloc[i] for i in indices[0]]


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
# Fine-tune the LLM with research-based data (example process)
def fine_tune_llm():
    # Load a pretrained language model and tokenizer
    model_name = "facebook/bart-base"  # Example model, choose as needed
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load and preprocess the dataset
    dataset = load_dataset("scientific_papers", "arxiv")
    def preprocess_function(examples):
        return tokenizer(examples["abstract"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        per_device_train_batch_size=2,
        num_train_epochs=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(500)),
        eval_dataset=tokenized_datasets["validation"].shuffle(seed=42).select(range(100)),
    )

    trainer.train()
    trainer.save_model("./fine_tuned_model")

# Uncomment the line below to run fine-tuning (can take time)
#fine_tune_llm()

In [15]:
def generate_answer(query):
    retrieved_docs = retrieve_docs(query)
    indices = index.search(embedding_model.encode([query], convert_to_numpy=True), 3)[1][0]
    players = []
    if 'Player' in df.columns:
        players = [df['Player'].iloc[i] for i in indices]
    else:
        print("Warning: 'Player' column not found in DataFrame.")

    input_text = "\n".join(retrieved_docs) + "\nQuestion: " + query + "\nAnswer: "
    inputs = tokenizer(input_text, return_tensors="pt", return_attention_mask=True, truncation=True, max_length=512) # Get attention mask and truncate
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask # Extract attention mask
    output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=200) # Use max_new_tokens and remove max_length
    answer = tokenizer.decode(output[0], skip_spe2cial_tokens=True)
    for i, Player in enumerate(players):
        print(f"Player {i+1}: {Player}")
    return answer

query = "Tell me about players with strong upper body strength."
answer = generate_answer(query)
print("Q:", query)
print("A:", answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Player 1: Travis Hunter
Player 2: Mason Graham
Player 3: Ashton Jeanty
Q: Tell me about players with strong upper body strength.
A: Possesses rare blend of twitch and fluidity that allows him to mirror receivers in man coverage while maintaining optimal leverage through their stems and breaks; Elite ball skills translate from WR background - transforms into the aggressor at catch point with exceptional timing and body control to high-point throws; Shows advanced route recognition and mental processing speed, routinely beating receivers to their landmarks before they execute their breaks; Brings controlled violence as a tackler despite lean frame, taking smart angles and showing outstanding spatial awareness in run support; Demonstrates veteran-level patience in press coverage, varying his approach and using efficient footwork to maintain positioning without opening his hips early; Conditioning level is off the charts - played more defensive snaps than any Power 5 corner while moonlight