In [9]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import requests
import json

# Load IPL dataset
# The path is relative, so "matches.csv" is looked up in the kernel's current working directory.
# Later cells read one row per match from this frame (columns such as 'season', 'date', 'team1',
# 'team2', 'winner', ...) to build the text that gets embedded for retrieval.
ipl_data = pd.read_csv("matches.csv")

# Lowercase month names in calendar order.
month = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]

# Lookup from 1-based month number (1 = january ... 12 = december) to its name.
months = dict(enumerate(month, start=1))

# Load pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')


def _describe_match(row):
    """Render one row of the matches table as a single retrieval sentence.

    The date is written twice: once verbatim and once as "<day> of <month name> <year>",
    where the month name comes from the `months` lookup.
    """
    date_parts = row['date'].split('/')
    return (
        f"Season: {row['season']}, Date: {row['date']} or {date_parts[0]} of {months[int(date_parts[1])]} {date_parts[2]}, City: {row['city']}, Match: {row['team1']} vs {row['team2']}, "
        f"Toss Winner: {row['toss_winner']}, Decision: {row['toss_decision']}, Result: {row['result']}, Duckworth-Lewis-Stern applied: {row['dl_applied']}, "
        f"Winner: {row['winner']}, Win By Runs: {row['win_by_runs']}, Win By Wickets: {row['win_by_wickets']}, Player of the Match: {row['player_of_match']}, Venue: {row['venue']}, Umpire1: {row['umpire1']}, Umpire2: {row['umpire2']}"
    )


# Combine dataset columns for retrieval: one descriptive string per match row
ipl_data['context'] = ipl_data.apply(_describe_match, axis=1)

# Embed every match description with the Sentence-BERT model
context_texts = ipl_data['context'].tolist()
context_embeddings = model.encode(context_texts)

# Build a flat (exhaustive) L2 index sized to the embedding width,
# then load all match embeddings into it
dimension = context_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(context_embeddings)

# Function to retrieve relevant context using FAISS
def retrieve_context_faiss(question, k=5):
    """Return the context strings of the matches closest to `question`.

    Args:
        question: Natural-language question; it is embedded with the same
            model that produced the indexed match embeddings.
        k: Maximum number of nearest neighbours to request from the index.

    Returns:
        A list of at most `k` context strings taken from `ipl_data['context']`,
        in the order the index returned them.
    """
    question_vec = model.encode([question])
    _, top_indices = faiss_index.search(question_vec, k)
    # FAISS fills unused result slots with -1 when the index holds fewer than
    # k vectors. Passing -1 to iloc would silently return the last row as if
    # it were a hit, so negative labels are dropped.
    hit_rows = [int(idx) for idx in top_indices[0] if idx >= 0]
    return ipl_data.iloc[hit_rows]['context'].tolist()

# Function to query Phi3 via Ollama API
def query_phi3(prompt):
    """Send `prompt` to the local Ollama `phi3` model and return the generated text.

    The generate endpoint replies with one JSON object per line, each carrying
    a fragment of the answer under the "response" key. The fragments are
    concatenated in arrival order.

    Args:
        prompt: Full prompt text to send to the model.

    Returns:
        The concatenated model output, or a human-readable error string
        starting with "Error querying Phi3 model:" (HTTP/connection failure)
        or "Error parsing JSON response:" (malformed reply). No exception from
        the request or from JSON decoding is propagated to the caller.
    """
    url = "http://localhost:11434/api/generate"  # Ensure this is the correct endpoint
    headers = {"Content-Type": "application/json"}
    payload = {"model": "phi3", "prompt": prompt}

    try:
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()  # Handle HTTP errors

        fragments = []
        # Parse each line as its own JSON document. Stitching raw byte chunks
        # together and patching "}" into "}," corrupts the payload whenever the
        # generated text itself contains "}" or a multibyte character straddles
        # a chunk boundary.
        for raw_line in response.iter_lines():
            if not raw_line:
                continue  # skip keep-alive / blank separator lines
            message = json.loads(raw_line)
            # Status or error objects may not carry a "response" field;
            # treat them as contributing no text instead of raising KeyError.
            fragments.append(message.get("response", ""))
        return "".join(fragments)
    except requests.exceptions.RequestException as e:
        return f"Error querying Phi3 model: {e}"
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return f"Error parsing JSON response: {e}"

# RAG system function
def answer_question_rag(question):
    """Answer `question` with retrieval-augmented generation over the IPL matches.

    The question is used to retrieve the closest match descriptions, which are
    inserted together with the question into an instruction prompt that is sent
    to the Phi3 model.

    Args:
        question: Natural-language question about IPL matches.

    Returns:
        The model's answer as a string, or None when nothing was retrieved or
        the model's reply is empty / "i don't know" / "not found"
        (compared case-insensitively after stripping whitespace).

    Side effects:
        Prints the retrieved context list.
    """
    # Retrieve context from the FAISS index
    context = retrieve_context_faiss(question)
    print("The context retrieved is: ", context)
    if context:
        full_context = "\n".join(context)
        # Plain template (not an f-string): the {context} and {question}
        # placeholders are filled in by .format() below. Without that call the
        # model would receive the literal placeholder text and never see the
        # retrieved matches or the question.
        prompt_template = """
You are a cricket analytics expert specializing in IPL analysis. Analyze the provided match data and answer questions with precision.

Data Structure Understanding:
Each data chunk contains the following fields:
- Season: The IPL season year
- City: Location of the match
- Date: In DD/MM/YYYY format
- Team 1 & Team 2: The competing teams
- Toss Winner & Toss Decision: Team winning the toss and their choice
- Result: Match result type (normal/tied/no result)
- DL Applied: Whether Duckworth-Lewis method was used (0/1)
- Winner: The winning team
- Win by Runs: Margin of victory for team batting first
- Win by Wickets: Margin of victory for team batting second
- Player of the Match: Outstanding player of the game
- Venue: Specific stadium name
- Umpire 1 & Umpire 2: Match officials


Response Guidelines:

1. Match Details:
   - Always mention the specific date and season when referring to matches
   - Use full venue names as provided in the data
   - Specify city location when relevant
   - Include toss details when discussing match context

2. Team Performance:
   - Mention batting order (Team 1 vs Team 2) when relevant
   - Specify victory margins accurately (runs/wickets)
   - Include toss impact if pertinent to the query
   - Note if DL method was applied

3. Player Achievements:
   - Always mention Player of the Match awards
   - Connect player performances to match outcomes
   - Include relevant match context

4. Statistical Analysis:
   - Provide exact numbers as shown in the data
   - Include the complete context (venue, season, teams)
   - Specify if data spans multiple chunks
   - Maintain chronological order when discussing multiple matches

Error Handling:
1. If information is not found in the chunks:
   - Clearly state that the specific information is not present in the provided context
   - Do not make assumptions about missing data
   - Provide any relevant partial information from available chunks

2. For team name queries:
   - Recognize both full names and common abbreviations
   - Common mappings:
     - RCB = Royal Challengers Bangalore
     - CSK = Chennai Super Kings
     - KKR = Kolkata Knight Riders
     - MI = Mumbai Indians
     - KXIP/PBKS = Kings XI Punjab/Punjab Kings
     - DD/DC = Delhi Daredevils/Delhi Capitals
     - SRH = Sunrisers Hyderabad
     - RR = Rajasthan Royals

Context:
{context}

Question:
{question}

Answer:
"""
        # Only the template is parsed for placeholders, so braces that appear
        # inside the retrieved text or the question are inserted verbatim.
        prompt = prompt_template.format(context=full_context, question=question)

        response = query_phi3(prompt)
        # Treat empty or explicit "no answer" replies as a miss.
        if response.strip().lower() not in ["i don't know", "not found", ""]:
            return response
    return None
    
# RAG system response
# NOTE(review): `question` is not assigned in this cell; it has to exist in the
# kernel already (presumably from an earlier, unseen cell) — confirm before Run All.
print("Using RAG system...")
rag_answer = answer_question_rag(question)
fallback_message = "No answer found in the dataset."
print("RAG Answer:", rag_answer or fallback_message)


Using RAG system...
The context retrieved is:  ['Season: 2010, Date: 13/04/10 or 13 of april 10, City: Chennai, Match: Kolkata Knight Riders vs Chennai Super Kings, Toss Winner: Kolkata Knight Riders, Decision: bat, Result: normal, Duckworth-Lewis-Stern applied: 0, Winner: Chennai Super Kings, Win By Runs: 0, Win By Wickets: 9, Player of the Match: R Ashwin, Venue: MA Chidambaram Stadium, Chepauk, Umpire1: SS Hazare, Umpire2: SJA Taufel', 'Season: 2008, Date: 26/04/08 or 26 of april 08, City: Chennai, Match: Kolkata Knight Riders vs Chennai Super Kings, Toss Winner: Kolkata Knight Riders, Decision: bat, Result: normal, Duckworth-Lewis-Stern applied: 0, Winner: Chennai Super Kings, Win By Runs: 0, Win By Wickets: 9, Player of the Match: JDP Oram, Venue: MA Chidambaram Stadium, Chepauk, Umpire1: BF Bowden, Umpire2: AV Jayaprakash', 'Season: 2008, Date: 18/05/08 or 18 of may 08, City: Kolkata, Match: Kolkata Knight Riders vs Chennai Super Kings, Toss Winner: Kolkata Knight Riders, Decisio

In [10]:
# Evaluation questions for the RAG pipeline.
# Each string is handed unchanged to answer_question_rag by the loop in the next cell,
# so the leading "N." numbering is part of the text that is embedded and sent to the model.
questions = [
    "1. Which two teams played an IPL match on the 5th April 2017 ?",
    "2. Who won the toss during the match that happened on the 7th of April 2017 ?",
    "3. Who won the match between KKR and RR that happened on 20/05/08 ?",
    "4.  How many matches happened on the 9th of April 2017 ?",
    "5. In what locations did KKR and CSK play a match in the year 2010 ?",
    "6. What was the highest margin of victory (by runs) achieved in matches played at Wankhede Stadium in 2017?",
    "7. How many matches did MI play against CSK in the 2017 season, and what were the results?",
    "8. Who were the umpires for the match where the team batting second won by the smallest margin in 2017?",
    "9. Which team had the most wins in matches played at Eden Gardens across all seasons?",
    "10 . What was the total number of no-results (matches without a winner) in IPL history up to the 2017 season?",
]

In [11]:
# Run every evaluation question through the RAG pipeline and print each answer
# padded with blank lines above and below.
for question_text in questions:
    rag_answer = answer_question_rag(question_text)
    # One print with a newline separator emits the same bytes as three
    # consecutive print calls ("\n\n", the answer, "\n\n").
    print("\n\n", rag_answer, "\n\n", sep="\n")

The context retrieved is:  ['Season: 2017, Date: 9/4/2017 or 9 of april 2017, City: Hyderabad, Match: Gujarat Lions vs Sunrisers Hyderabad, Toss Winner: Sunrisers Hyderabad, Decision: field, Result: normal, Duckworth-Lewis-Stern applied: 0, Winner: Sunrisers Hyderabad, Win By Runs: 0, Win By Wickets: 9, Player of the Match: Rashid Khan, Venue: Rajiv Gandhi International Stadium, Uppal, Umpire1: A Deshmukh, Umpire2: NJ Llong', 'Season: 2017, Date: 20/04/17 or 20 of april 17, City: Indore, Match: Kings XI Punjab vs Mumbai Indians, Toss Winner: Mumbai Indians, Decision: field, Result: normal, Duckworth-Lewis-Stern applied: 0, Winner: Mumbai Indians, Win By Runs: 0, Win By Wickets: 8, Player of the Match: JC Buttler, Venue: Holkar Cricket Stadium, Umpire1: M Erasmus, Umpire2: C Shamshuddin', 'Season: 2017, Date: 5/4/2017 or 5 of april 2017, City: Hyderabad, Match: Sunrisers Hyderabad vs Royal Challengers Bangalore, Toss Winner: Royal Challengers Bangalore, Decision: field, Result: normal, 