In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
import json
ipl_data = pd.read_csv("OneDrive/Desktop/matches.csv")

month = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
months = {}

for i in range(len(month)):
    months[i+1] = month[i]

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2') 
ipl_data['context'] = ipl_data.apply(
    lambda x: f"Season: {x['season']}, Date: {x['date']} or {x['date'].split('/')[0]} of {months[int(x['date'].split('/')[1])]} {x['date'].split('/')[2]}, City: {x['city']}, Match: {x['team1']} vs {x['team2']}, "
              f"Toss Winner: {x['toss_winner']}, Decision: {x['toss_decision']}, Result: {x['result']}, Duckworth-Lewis-Stern applied: {x['dl_applied']}, "
              f"Winner: {x['winner']}, Win By Runs: {x['win_by_runs']}, Win By Wickets: {x['win_by_wickets']}, Player of the Match: {x['player_of_match']}, Venue: {x['venue']}, Umpire1: {x['umpire1']}, Umpire2: {x['umpire2']}",
    axis=1
)

tfidf_matrix = model.encode(ipl_data['context'])

def retrieve_context(question, k=5):
    question_vec = model.encode([question])
    scores = cosine_similarity(question_vec, tfidf_matrix).flatten()
    top_indices = scores.argsort()[-k:][::-1]
    return ipl_data.iloc[top_indices]['context'].tolist()

# Function to query Phi3 via Ollama API
# def query_phi3(prompt):
#     url = "http://localhost:11434/api/generate"  # Replace with your server URL if using ngrok
#     headers = {"Content-Type": "application/json"}
#     payload = {"model": "phi3", "prompt": prompt}

#     response = requests.post(url, json=payload, headers=headers)
#     if response.status_code == 200:
#         return response.json()["response"]
#     else:
#         return "Error querying Phi3 model."

# def query_phi3(prompt):
#     url = "http://localhost:11434/api/generate/"  # Replace with ngrok URL if applicable
#     headers = {"Content-Type": "application/json"}
#     payload = {"model": "phi3", "prompt": prompt}

#     try:
#         response = requests.post(url, json=payload, headers=headers)
#         response.raise_for_status()  # Raise an error for bad responses
#         return response.json().get("response", "No response from Phi3 model.")
#     except requests.exceptions.RequestException as e:
#         return f"Error querying Phi3 model: {e}"

def query_phi3(prompt):
    url = "http://localhost:11434/api/generate"  # Ensure this is the correct endpoint
    headers = {"Content-Type": "application/json"}
    payload = {"model": "phi3", "prompt": prompt}

    try:
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()  # Handle HTTP errors
        resp = "["
        # print("Raw Response:", response.text) # Log the raw response for debugging)
        for i in response:
            resp += i.decode('utf-8').replace("\n", "").replace("}", "},")
        resp = resp[:-1] + "]"
        resp = json.loads(resp)
        ans = ""
        for i in resp:
            ans += i["response"]
        return ans
    except requests.exceptions.RequestException as e:
        return f"Error querying Phi3 model: {e}"
    except ValueError as e:
        return f"Error parsing JSON response: {e}"


# Function to perform a web search
def web_search(question, num_results=3):
    results = search(question, num_results=num_results)
    return "\n".join(results)

# RAG system function
def answer_question_rag(question):
    # Retrieve context from the dataset
    context = retrieve_context(question)
    print("The context retrieved is: ", context)
    if context:
        full_context = "\n".join(context)
        prompt = f"Using the following IPL dataset context, answer the question:\n\n{full_context}\n\nQuestion: {question}\nAnswer:"
        response = query_phi3(prompt)
        if response.strip().lower() not in ["i don't know", "not found", ""]:
            return response
    return None

# CRAG system function (with web search fallback)
def answer_question_crag(question):
    # Try to answer using the RAG system
    rag_response = answer_question_rag(question)
    if rag_response:
        return rag_response
    
    # If RAG fails, use web search and ask Phi3
    web_results = web_search(question)
    if web_results:
        prompt = f"Using the following web search results, answer the question:\n\n{web_results}\n\nQuestion: {question}\nAnswer:"
        return query_phi3(prompt)
    
    return "I'm sorry, I couldn't find the answer to your question."

question = "In what locations did kolkatta knight riders and chennai super kings play a match in the year 2010?"

# RAG system response
print("Using RAG system...")
rag_answer = answer_question_rag(question)
print("RAG Answer:", rag_answer if rag_answer else "No answer found in the dataset.")

# CRAG system response
# print("\nUsing CRAG system...")
# crag_answer = answer_question_crag(question)
# print("CRAG Answer:", crag_answer)


Using RAG system...
The context retrieved is:  ['Season: 2010, Date: 13/04/10 or 13 of april 10, City: Chennai, Match: Kolkata Knight Riders vs Chennai Super Kings, Toss Winner: Kolkata Knight Riders, Decision: bat, Result: normal, Duckworth-Lewis-Stern applied: 0, Winner: Chennai Super Kings, Win By Runs: 0, Win By Wickets: 9, Player of the Match: R Ashwin, Venue: MA Chidambaram Stadium, Chepauk, Umpire1: SS Hazare, Umpire2: SJA Taufel', 'Season: 2008, Date: 26/04/08 or 26 of april 08, City: Chennai, Match: Kolkata Knight Riders vs Chennai Super Kings, Toss Winner: Kolkata Knight Riders, Decision: bat, Result: normal, Duckworth-Lewis-Stern applied: 0, Winner: Chennai Super Kings, Win By Runs: 0, Win By Wickets: 9, Player of the Match: JDP Oram, Venue: MA Chidambaram Stadium, Chepauk, Umpire1: BF Bowden, Umpire2: AV Jayaprakash', 'Season: 2008, Date: 18/05/08 or 18 of may 08, City: Kolkata, Match: Kolkata Knight Riders vs Chennai Super Kings, Toss Winner: Kolkata Knight Riders, Decisio

In [1]:
!pip install langchain openai sentence-transformers pandas scikit-learn



In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.agents import initialize_agent, Tool, AgentType
import requests
import json

# Load IPL data
ipl_data = pd.read_csv("matches.csv")

# Mapping month numbers to month names
month = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
months = {i+1: month[i] for i in range(len(month))}

# Apply context generation
ipl_data['context'] = ipl_data.apply(
    lambda x: f"Season: {x['season']}, Date: {x['date']} or {x['date'].split('/')[0]} of {months[int(x['date'].split('/')[1])]} {x['date'].split('/')[2]}, "
              f"City: {x['city']}, Match: {x['team1']} vs {x['team2']}, Toss Winner: {x['toss_winner']}, Decision: {x['toss_decision']}, Result: {x['result']}, "
              f"Duckworth-Lewis-Stern applied: {x['dl_applied']}, Winner: {x['winner']}, Win By Runs: {x['win_by_runs']}, Win By Wickets: {x['win_by_wickets']}, "
              f"Player of the Match: {x['player_of_match']}, Venue: {x['venue']}, Umpire1: {x['umpire1']}, Umpire2: {x['umpire2']}",
    axis=1
)

# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert contexts into embeddings
embeddings = model.encode(ipl_data['context'].tolist())

# Use FAISS to create a vector store from the embeddings
embedding_model = SentenceTransformerEmbeddings(model)
faiss_store = FAISS.from_documents(ipl_data['context'].tolist(), embedding_model)

# Create a retriever to fetch relevant context based on the query
retriever = faiss_store.as_retriever()

# Initialize OpenAI LLM
llm = OpenAI(temperature=0)

# Create a prompt template for generating the answer
template = """You are a helpful assistant. Given the following IPL dataset context, answer the question.
Context:
{context}
Question: {question}
Answer:"""

prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# Initialize the LLM chain
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Define function to retrieve relevant context and generate an answer
def answer_question_langchain(question):
    # Use retriever to fetch relevant contexts
    context = retriever.retrieve(question)
    
    # If context found, use LLM chain to generate the answer
    if context:
        full_context = "\n".join(context)
        response = llm_chain.run(context=full_context, question=question)
        return response
    return "No relevant context found."

# Define the function to query Phi3 (if needed)
def query_phi3(prompt):
    url = "http://localhost:11434/api/generate"  # Ensure this is the correct endpoint
    headers = {"Content-Type": "application/json"}
    payload = {"model": "phi3", "prompt": prompt}

    try:
        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()  # Handle HTTP errors
        resp = "["
        for i in response:
            resp += i.decode('utf-8').replace("\n", "").replace("}", "},")
        resp = resp[:-1] + "]"
        resp = json.loads(resp)
        ans = ""
        for i in resp:
            ans += i["response"]
        return ans
    except requests.exceptions.RequestException as e:
        return f"Error querying Phi3 model: {e}"
    except ValueError as e:
        return f"Error parsing JSON response: {e}"

# Example usage
question = "In what locations did Kolkata Knight Riders and Chennai Super Kings play a match in the year 2010?"
answer = answer_question_langchain(question)
print("Answer:", answer)


  embedding_model = SentenceTransformerEmbeddings(model)


TypeError: HuggingFaceEmbeddings.__init__() takes 1 positional argument but 2 were given

In [4]:
!pip install langchain sentence-transformers faiss-cpu requests

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [10]:
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document

ipl_data = pd.read_csv("matches.csv")

month = ["january", "february", "march", "april", "may", "june", 
         "july", "august", "september", "october", "november", "december"]
months = {i+1: month[i] for i in range(len(month))}

def create_context(row):
    return (
        f"Season: {row['season']}, Date: {row['date']} or {row['date'].split('/')[0]} of "
        f"{months[int(row['date'].split('/')[1])]} {row['date'].split('/')[2]}, "
        f"City: {row['city']}, Match: {row['team1']} vs {row['team2']}, "
        f"Toss Winner: {row['toss_winner']}, Decision: {row['toss_decision']}, "
        f"Result: {row['result']}, Duckworth-Lewis-Stern applied: {row['dl_applied']}, "
        f"Winner: {row['winner']}, Win By Runs: {row['win_by_runs']}, "
        f"Win By Wickets: {row['win_by_wickets']}, Player of the Match: {row['player_of_match']}, "
        f"Venue: {row['venue']}, Umpire1: {row['umpire1']}, Umpire2: {row['umpire2']}"
    )

documents = [
    Document(page_content=create_context(row), metadata={"index": idx})
    for idx, row in ipl_data.iterrows()
]

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

splits = text_splitter.split_documents(documents)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = FAISS.from_documents(splits, embeddings)

llm = ChatOllama(model="phi3", temperature=0)

prompt_template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

def answer_question_rag(question: str):
    """
    Answer questions using the RAG system
    """
    try:
        result = qa_chain({"query": question})
        return {
            "answer": result["result"],
            "source_documents": [doc.page_content for doc in result["source_documents"]]
        }
    except Exception as e:
        print(f"Error in RAG system: {e}")
        return {
            "answer": "I'm sorry, I encountered an error while processing your question.",
            "source_documents": []
        }

if __name__ == "__main__":
    question = ""
    
    print("Using Langchain RAG system...")
    result = answer_question_rag(question)
    
    print("\nAnswer:", result["answer"])
    print("\nSource Documents Used:")
    for idx, doc in enumerate(result["source_documents"], 1):
        print(f"\nDocument {idx}:")
        print(doc)

Using Langchain RAG system...

Answer: Yes, Chennai Super Kings (CSK) played against Royal Challengers Bangalore (RCB) on April 22nd, 2017. However, the context provided does not contain information about any other match between CSK and RCB in that particular season of 2017.

Source Documents Used:

Document 1:
Season: 2017, Date: 18/04/17 or 18 of april 17, City: Rajkot, Match: Royal Challengers Bangalore vs Gujarat Lions, Toss Winner: Gujarat Lions, Decision: field, Result: normal, Duckworth-Lewis-Stern applied: 0, Winner: Royal Challengers Bangalore, Win By Runs: 21, Win By Wickets: 0, Player of the Match: CH Gayle, Venue: Saurashtra Cricket Association Stadium, Umpire1: S Ravi, Umpire2: VK Sharma

Document 2:
Season: 2017, Date: 27/04/17 or 27 of april 17, City: Bangalore, Match: Royal Challengers Bangalore vs Gujarat Lions, Toss Winner: Gujarat Lions, Decision: field, Result: normal, Duckworth-Lewis-Stern applied: 0, Winner: Gujarat Lions, Win By Runs: 0, Win By Wickets: 7, Player