# Installing dependencies for web scraping

In [None]:
!pip install beautifulsoup4
!pip install requests




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os

# We choose 15 Indian players whose data will be used as a knowledge base

In [None]:
# Comma-separated player search terms; split on ", " below into one entry per player.
# NOTE(review): "rg Sharma" looks like a deliberate search term rather than a typo — the
# lookup output recorded in this notebook resolves it to id 34102, which the profile table
# lists as Rohit Sharma. Confirm before "correcting" it.
PLAYERS = "rg Sharma, Shubman Gill, Virat Kohli, Shreyas Iyer, KL Rahul, Ishan Kishan, Suryakumar Yadav, Hardik Pandya, Ravindra Jadeja, Axar Patel, Shardul Thakur, Jasprit Bumrah, Kuldeep Yadav, Mohammed Shami, Mohammed Siraj"

# These strings are reused later both as search queries and (with spaces replaced by "_")
# as folder and file names, so changing one changes the paths written further down.
players= PLAYERS.split(", ")
players

['rg Sharma',
 'Shubman Gill',
 'Virat Kohli',
 'Shreyas Iyer',
 'KL Rahul',
 'Ishan Kishan',
 'Suryakumar Yadav',
 'Hardik Pandya',
 'Ravindra Jadeja',
 'Axar Patel',
 'Shardul Thakur',
 'Jasprit Bumrah',
 'Kuldeep Yadav',
 'Mohammed Shami',
 'Mohammed Siraj']

Function for retrieving each player's ID from the player's name. The ID is required for accessing the player's profile on ESPNcricinfo.

In [None]:
def getPlayerID(players:list[str]):
  """Look up the ESPNcricinfo player id for every name in `players`.

  Args:
    players: Player names (or search terms) to send to the ESPNcricinfo player search.

  Returns:
    A dict mapping the player id (as a string) to the name it was looked up with.
    Note the direction: keys are ids, values are names.

  Raises:
    requests.HTTPError: If the search page answers with an error status.
    IndexError: If the search page lists no result for one of the names.
  """
  playerNameId ={}
  for player in players:
    # Let requests build the query string so every character of the name is URL-encoded
    # (spaces still become "+", exactly as the hand-built URL did).
    page = requests.get(
        "http://search.espncricinfo.com/ci/content/player/search.html",
        params={"search": player.lower(), "x": 0, "y": 0},
        timeout=30,
    )
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all(class_='ColumnistSmry')
    if not results:
      # Same exception type as the bare [0] lookup raised, but with a message naming the player.
      raise IndexError(f"No ESPNcricinfo search result for player {player!r}")
    # The id is the text between the last "/" and the first ".html" in the first hit's markup.
    player_id = str(results[0]).split('.html')[0].split('/')[-1]
    playerNameId[player_id]=player
  return playerNameId

In [None]:
playerData = getPlayerID(players)

In [None]:
print(playerData)

{'34102': 'rg Sharma', '1070173': 'Shubman Gill', '253802': 'Virat Kohli', '642519': 'Shreyas Iyer', '422108': 'KL Rahul', '720471': 'Ishan Kishan', '446507': 'Suryakumar Yadav', '625371': 'Hardik Pandya', '234675': 'Ravindra Jadeja', '554691': 'Axar Patel', '475281': 'Shardul Thakur', '625383': 'Jasprit Bumrah', '559235': 'Kuldeep Yadav', '481896': 'Mohammed Shami', '940973': 'Mohammed Siraj'}




```
{'rg Sharma': '34102',
 'Shubman Gill': '1070173',
 'Virat Kohli': '253802',
 'Shreyas Iyer': '642519',
 'KL Rahul': '422108',
 'Ishan Kishan': '720471',
 'Suryakumar Yadav': '446507',
 'Hardik Pandya': '625371',
 'Ravindra Jadeja': '234675',
 'Axar Patel': '554691',
 'Shardul Thakur': '475281',
 'Jasprit Bumrah': '625383',
 'Kuldeep Yadav': '559235',
 'Mohammed Shami': '481896',
 'Mohammed Siraj': '940973'}
```



The abbreviated field keys returned by the API need to be mapped to descriptive column headings.

In [None]:
# Translation table from the stats API's abbreviated field keys to the descriptive
# column headings used in the exported CSV files. Keys that are not listed here are
# kept unchanged by the export loop (it uses mapping.get(key, key)).
mapping = dict(
    tt='Title',
    sp='Career_Span',
    mt='Matches',
    rn='Runs',
    wk='Wickets',
    ct='Catches_taken',
    st='Stumpings',
    fw='Five_Wickets_in_inning',
    hs='Highest_Score',
    hn='Hundreds',
    bbi='Best_Bowling_in_Inning',
    bta='Batting_Average',
    bwa='Bowling_Average',
)

# This will create a folder for each player containing their statistics in CSV format

In [None]:
# Stat groups of the summary response that are exported; every other group is skipped.
data_needed = ['CAREER_AVERAGES', 'CLASS', 'OPPOSITION_TEAM', 'HOST_COUNTRY', 'HOME_OR_AWAY', 'CAPTAIN']

for player in playerData.keys():
  url=f"https://hs-consumer-api.espncricinfo.com/v1/pages/player/stats/summary?playerId={player}&recordClassId=11&type=ALLROUND"
  print(url)
  api_reply = requests.get(url, timeout=30)
  api_reply.raise_for_status()  # fail here rather than on a confusing KeyError below
  response = api_reply.json()
  player_slug = playerData[player].replace(" " , "_")
  folder_path = f'/content/playerdata/{player_slug}'  # Replace with your desired folder path
  os.makedirs(folder_path, exist_ok=True)
  for group in response['summary']['groups']:
    print(group['type'])
    if group['type'] not in data_needed:
      continue
    # One record per stats row, keyed by the descriptive heading. Missing values
    # (None or '-') become 0 in every row, including the first one, and rows are
    # aligned by key rather than by position.
    records = [
        {mapping.get(key, key): (0 if value is None or value == '-' else value)
         for key, value in stats.items()}
        for stats in group['stats']
    ]
    if not records:
      print(f"No stats returned for {group['type']}; nothing written")
      continue
    # 'pr' and 'bbad' are not exported; errors='ignore' tolerates their absence.
    df = pd.DataFrame(records).drop(columns=['pr', 'bbad'], errors='ignore')
    df['Player_Id']= player
    if group['type']=='OPPOSITION_TEAM':
      # Keep only the text after the first 'v'; splitting once keeps team names
      # that themselves contain a 'v' intact.
      df['Title'] = df['Title'].str.split('v', n=1).str[1].str.strip()
    csv_file_path = os.path.join(folder_path, f'{player_slug}_{group["type"].lower()}.csv')
    df.to_csv(csv_file_path, index=False)

## Player personal data

In [None]:
players_dataframe = pd.DataFrame()

In [None]:
base_url = 'https://hs-consumer-api.espncricinfo.com/v1/pages/player/home?playerId='


def _join_or_default(values, default):
    """Return the comma-joined `values`, or `default` when there are none."""
    return ','.join(values) if values else default


# Collect one record per player and build the frame once at the end, so re-running
# this cell rebuilds the table instead of appending duplicate rows to it.
player_records = []
for player_id in playerData.keys():
    api_reply = requests.get(base_url + str(player_id), timeout=30)
    api_reply.raise_for_status()
    response = api_reply.json()
    info = response['player']
    birth = info['dateOfBirth']
    player_records.append({
        'Player_Id': info['objectId'],
        # Every row uses 'fullName'. Previously only the first row used 'longName',
        # which made the Name column inconsistent between players.
        'Name': info['fullName'],
        # DD-MM-YYYY with zero-padded day and month.
        'DOB': '-'.join(str(birth[part]).rjust(2, '0') for part in ('date', 'month', 'year')),
        'Batting_Styles': _join_or_default(info['longBattingStyles'], "He does not bat."),
        'Bowling_Styles': _join_or_default(info['longBowlingStyles'], "He does not bowl."),
        'Wicket_Keeper_Styles': _join_or_default(info['fieldingStyles'], "He is not wicket-keeper."),
        'Playing_Roles': ','.join(info['playingRoles']),
        'Profile': ' '.join(entry['html'] for entry in response['content']['profile']['items']),
    })

players_dataframe = pd.DataFrame(
    player_records,
    columns=[
        'Player_Id', 'Name', 'DOB', 'Batting_Styles', 'Bowling_Styles',
        'Wicket_Keeper_Styles', 'Playing_Roles', 'Profile',
    ],
)

In [None]:
players_dataframe

Unnamed: 0,Player_Id,Name,DOB,Batting_Styles,Bowling_Styles,Wicket_Keeper_Styles,Playing_Roles,Profile
0,34102,Rohit Sharma,30-04-1987,right-hand bat,right-arm offbreak,He is not wicket-keeper.,top-order batter,"Languid and easy on the eye, Rohit Sharma owne..."
1,1070173,Shubman Gill,08-09-1999,right-hand bat,right-arm offbreak,He is not wicket-keeper.,opening batter,"A right-hand top-order batsman from Punjab, Sh..."
2,253802,Virat Kohli,05-11-1988,right-hand bat,right-arm medium,He is not wicket-keeper.,top-order batter,India has given to the world many a great cric...
3,642519,Shreyas Santosh Iyer,06-12-1994,right-hand bat,"right-arm offbreak,legbreak googly",He is not wicket-keeper.,top-order batter,Shreyas Iyer is an attacking top-order batter ...
4,422108,Kannaur Lokesh Rahul,18-04-1992,right-hand bat,He does not bowl.,wicketkeeper,wicketkeeper batter,"A tall, elegant right-hand batsman who can kee..."
5,720471,Ishan Pranav Kumar Pandey Kishan,18-07-1998,left-hand bat,He does not bowl.,wicketkeeper,wicketkeeper batter,A wicketkeeper and left-handed opening batsman...
6,446507,Suryakumar Ashok Yadav,14-09-1990,right-hand bat,"right-arm medium,right-arm offbreak",He is not wicket-keeper.,batter,Hard-hitting 360-degree batter Suryakumar Yada...
7,625371,Hardik Himanshu Pandya,11-10-1993,right-hand bat,right-arm medium-fast,He is not wicket-keeper.,allrounder,Allrounder Hardik Pandya's calling cards brisk...
8,234675,Ravindrasinh Anirudhsinh Jadeja,06-12-1988,left-hand bat,slow left-arm orthodox,He is not wicket-keeper.,allrounder,"Dissed when he first appeared, Ravindra Jadeja..."
9,554691,Axar Rajeshbhai Patel,20-01-1994,left-hand bat,slow left-arm orthodox,He is not wicket-keeper.,bowling allrounder,Left-arm spinner Axar Patel has been increasin...


In [None]:
players_dataframe.to_csv("/content/playerdata/players-data.csv", index=False)

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


This will create a text file for each player containing their information; these files are used as the RAG knowledge base.

In [None]:
import pandas as pd


def _read_player_stats(file_name, kind):
    """Load the `<file_name>_<kind>.csv` statistics table exported for one player."""
    return pd.read_csv(f'/content/playerdata/{file_name}/{file_name}_{kind}.csv')


def _write_stats_section(out, stats, section_title, row_heading=None):
    """Write one statistics table to `out` as "column: value" lines.

    `section_title` is written once. For each row, `row_heading(title)` is written
    first when a heading function is given, and the Title column is then left out
    of the listing. Without a heading function every column, Title included, is
    listed. A blank line follows each row.
    """
    out.write(section_title)
    for _, stat_row in stats.iterrows():
        if row_heading is not None:
            out.write(row_heading(stat_row.Title))
        for col, value in stat_row.items():
            if row_heading is not None and col == "Title":
                continue
            out.write(f"{col}: {value}\n")
        out.write("\n")


# Load player data CSV
player_data = pd.read_csv('/content/playerdata/players-data.csv')
os.makedirs('/content/playerchunks/', exist_ok=True)

# Iterate through each row in player_data
for _, row in player_data.iterrows():
    # Extract player information
    player_id = row['Player_Id']
    name = row['Name']
    dob = row['DOB']
    batting_styles = row['Batting_Styles']
    bowling_styles = row['Bowling_Styles']
    wicket_keeper_styles = row['Wicket_Keeper_Styles']
    playing_roles = row['Playing_Roles']
    profile = row['Profile']

    # Load every statistics table before opening the output file, so a missing CSV
    # does not leave a half-written text file behind.
    file_name = playerData[str(player_id)].replace(" ", "_")
    career_averages = _read_player_stats(file_name, 'career_averages')
    Match_career_summary = _read_player_stats(file_name, 'class')
    Opposition_Team = _read_player_stats(file_name, 'opposition_team')
    Home_or_Away = _read_player_stats(file_name, 'home_or_away')
    # Previously this read the home_or_away file, so the host-country section
    # repeated the home/away numbers.
    Host_country = _read_player_stats(file_name, 'host_country')
    captain = _read_player_stats(file_name, 'captain')

    # Create a text file for each player
    with open(f'/content/playerchunks/{name.replace(" ", "_")}.txt', 'w', encoding='utf-8') as file:
        file.write(f"Following is the information of {name}\n")
        file.write(f"Name: {name}\n")
        file.write(f"Date of Birth: {dob}\n")
        file.write(f"Batting Styles: {batting_styles}\n")
        file.write(f"Bowling Styles: {bowling_styles}\n")
        file.write(f"Wicket Keeper Styles: {wicket_keeper_styles}\n")
        file.write(f"Playing Roles: {playing_roles}\n")
        file.write(f"Bio: {profile}\n\n")

        _write_stats_section(
            file, career_averages,
            f"Overall International Career Summary of {name}:\n",
        )
        _write_stats_section(
            file, Match_career_summary,
            f"Match format wise career summary {name}:\n",
            lambda title: f'Performace of {name} in {title} match\n',
        )
        _write_stats_section(
            file, Opposition_Team,
            f"Performance of {name} against other teams :\n",
            lambda title: f'Performace of {name} against {title}\n',
        )
        _write_stats_section(
            file, Host_country,
            f"Performance of {name} in other host country :\n",
            lambda title: f'Performace of {name} in {title} Host Country\n',
        )
        _write_stats_section(
            file, Home_or_Away,
            f"Performance of {name} in home vs away :\n",
            lambda title: f'Performace of {name} in {title} \n',
        )
        _write_stats_section(
            file, captain,
            f"Performance of {name} when he was playing as a captain or not a captain :\n",
            lambda title: f'Performace of {name} when he  {title} \n',
        )
print("Text files created successfully.")


## neo4j vector retriever

In [None]:
!pip install langchain neo4j
!pip install "shapely<2.0.0"
!pip install google-cloud-aiplatform --upgrade

Collecting langchain
  Downloading langchain-0.0.335-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting neo4j
  Downloading neo4j-5.14.1.tar.gz (192 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.8/192.8 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.2-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.64-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Create a Google Cloud account with the same Google account you are currently using for this Colab notebook.

In [None]:
from google.colab import auth as google_auth
google_auth.authenticate_user()

In [None]:
import vertexai
from google.cloud import aiplatform
PROJECT_ID = ""  # @param {type:"string"}
vertexai.init(project=PROJECT_ID, location="us-central1")

In [None]:
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.prompts import PromptTemplate
from pydantic import BaseModel
import time
import json

Reading players text information files

In [None]:
import os

# Path to the folder containing player information files
folder_path = '/content/playerchunks'

# Every file's content is followed by a newline, a 50-dash rule and another newline.
separator = '-' * 50
file_texts = []

# os.listdir returns entries in arbitrary order; sorting makes the concatenated text,
# and therefore the chunk boundaries derived from it, reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    filepath = os.path.join(folder_path, filename)

    # Check if the path is a file (not a subdirectory)
    if os.path.isfile(filepath):
        with open(filepath, 'r', encoding='utf-8') as file:
            file_texts.append(file.read() + '\n' + separator + '\n')

# 'text' holds the concatenated content of all player files
text = ''.join(file_texts)

# Now, the 'text' variable contains the concatenated content with line breaks


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: chunk_size and chunk_overlap are measured with len(), i.e. in characters.
splitter_options = dict(chunk_size=2000, chunk_overlap=200, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the single concatenated knowledge-base string into Document chunks.
document_chunks = text_splitter.create_documents([text])

In [None]:
#create neo4j aura instance and add credentials to this
url = ""
username = ""
password = ""


In [None]:
from langchain.docstore.document import Document


In [None]:
# Instantiate Neo4j vector from documents
# NOTE(review): the store is created from a single placeholder document ("foo"), which is
# passed through the embedding model like any real document. Presumably this is only meant
# to set up the "cric-gpt" index before the real chunks are added further down; confirm
# whether the placeholder ends up as a node that retrieval can return, and remove it if so.
neo4j_vector = Neo4jVector.from_documents(
    documents=[Document(page_content="foo")],
    embedding=VertexAIEmbeddings(),
    url=url,
    username=username,
    password=password,
    database="neo4j",  # neo4j by default
    index_name="cric-gpt",  # vector by default
    node_label="Player",  # Chunk by default
    text_node_property="info",  # text by default
    embedding_node_property="vector",  # embedding by default
    create_id_index=True,  # True by default
)

In [None]:
# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    """Generator that paces its caller to at most `max_per_minute` steps per minute.

    Advance it with next() after each unit of work. The first next() only prints a
    banner and starts the clock. Every later next() measures the time since the
    previous yield and sleeps for whatever is left of the 60 / max_per_minute
    second interval, printing a "." whenever it sleeps. The generator never finishes.
    """
    min_interval = 60 / max_per_minute
    print("Waiting.....")
    while True:
        started = time.time()
        yield
        remaining = min_interval - (time.time() - started)
        if remaining > 0:
            print(".", end="")
            time.sleep(remaining)


class CustomVertexAIEmbeddings(VertexAIEmbeddings, BaseModel):
    """VertexAIEmbeddings variant whose embed_documents works in batches, paced by rate_limit()."""

    # Request budget per minute, handed to rate_limit().
    requests_per_minute: int
    # Number of texts sent in a single get_embeddings call.
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: list[str]):
        """Embed `texts` batch by batch and return one embedding vector per text, in order."""
        throttle = rate_limit(self.requests_per_minute)
        vectors = []
        remaining = list(texts)

        while remaining:
            # Working in batches because the API accepts maximum 5
            # documents per request to get embeddings
            batch = remaining[: self.num_instances_per_batch]
            remaining = remaining[self.num_instances_per_batch :]
            vectors.extend(item.values for item in self.client.get_embeddings(batch))
            next(throttle)
        print("done")
        return vectors

In [None]:
def handle_quota_errors(func, *args, max_retries=10, initial_delay=5, backoff_factor=2,
                        retry_on=(Exception,), **kwargs):
    """Call `func(*args, **kwargs)`, retrying with exponential backoff when it raises.

    Args:
        func: Callable to invoke.
        *args, **kwargs: Passed through to `func`.
        max_retries: Total number of attempts (must be at least 1).
        initial_delay: Seconds to wait before the second attempt.
        backoff_factor: Multiplier applied to the delay after every failed attempt.
        retry_on: Exception type(s) that trigger a retry; anything else propagates
            immediately. Defaults to every Exception, matching the previous behavior.
            Pass the API's quota error type to avoid retrying genuine bugs.

    Returns:
        Whatever `func` returns on its first successful attempt.

    Raises:
        ValueError: If `max_retries` is smaller than 1 (previously this silently
            returned None without ever calling `func`).
        Exception: The last exception raised by `func` once all attempts are used up.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    delay = initial_delay
    for attempt in range(1, max_retries + 1):
        try:
            return func(*args, **kwargs)
        except retry_on as e:
            # Report the real error type: not every failure caught here is a quota error.
            print(f"Call failed ({type(e).__name__}): {e}")
            if attempt == max_retries:
                print("Max retries reached. Function failed.")
                raise
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= backoff_factor

In [None]:
EMBEDDING_QPM = 20
EMBEDDING_NUM_BATCH = 5
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
)

In [None]:
#This will generate embeddings of text chunks and it will create nodes in neo4j db
# Embedding one chunk per embed_documents call creates a fresh rate limiter every time,
# and a limiter never sleeps on its first step, so the calls were not throttled or batched.
# Passing many texts per call lets CustomVertexAIEmbeddings batch and pace them as designed.
chunk_texts = [doc.page_content for doc in document_chunks]

# Roughly one minute of API budget per window, so a retry re-embeds at most one window.
window_size = EMBEDDING_QPM * EMBEDDING_NUM_BATCH
for start in range(0, len(chunk_texts), window_size):
  window_texts = chunk_texts[start:start + window_size]
  window_vectors = handle_quota_errors(embeddings.embed_documents, window_texts)
  neo4j_vector.add_embeddings(window_texts, window_vectors)

This is required if you want to create an API, as it references the Neo4j index we created earlier.

In [None]:
# Connect to the index by name instead of building it from documents. Assigning to
# neo4j_vector replaces the store object created earlier in the notebook.
neo4j_vector = Neo4jVector.from_existing_index(
    VertexAIEmbeddings(),
    url=url,
    username=username,
    password=password,
    index_name="cric-gpt",
    text_node_property="info",  # Need to define if it is not default
)

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain import PromptTemplate
from langchain.chains import RetrievalQA


Prompt-template

In [None]:
# Prompt text with three placeholders: {context} (inside <ctx>), {history} (inside <hs>)
# and {question}.
# NOTE(review): the instruction text contains typos ("helpfull", "palyers"). It is sent to
# the model verbatim, so it is left untouched here; correct it deliberately if desired.
template = """
You are a helpfull cricket coach assistant which helps in strategies and other decision of palyers, you should only give answers to the question related to cricket only and if other irrelevant questions to cricket  asked then clearly tell them i am only responsible for assisting cricket related decision or information
you should only give answers in english language
------
<ctx>
{context}
</ctx>
------
<hs>
{history}
</hs>
------
{question}
Answer:
"""
# input_variables must name exactly the placeholders used in the template above.
prompt = PromptTemplate(
    input_variables=["history", "context", "question"],
    template=template,
)

Creating Retrieval chain

In [None]:
# Conversation memory: fills the prompt's "history" slot and records the "question" input.
conversation_memory = ConversationBufferMemory(
    memory_key="history",
    input_key="question",
)

# Retriever over the Neo4j vector store, configured with k=6.
retriever = neo4j_vector.as_retriever(search_kwargs={'k': 6})

# "stuff" chain driven by the custom prompt and the memory defined above.
qa = RetrievalQA.from_chain_type(
    llm=VertexAI(),
    chain_type='stuff',
    retriever=retriever,
    verbose=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": prompt,
        "memory": conversation_memory,
    },
)

Our RAG based LLM is ready

In [None]:
query = input()
print("-"*50)
qa.run(query)