# Exploration File
This project aims to build a semantic search engine using various NLP techniques. The dataset used is a csv file downloaded from kaggle titled "English Premier League - Match Commentary"

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("D:\\programs\\labs\\semantic-search-epl\\data\\23_24_match_details.csv")
df

Unnamed: 0,id,Home,Away,Date,Stadium,Attendance,Referee,events,summary
0,93323,Bournemouth,West Ham,2023-08-12,"Vitality Stadium, Bournemouth",,Robert Jones,Hello and welcome to live coverage of the Prem...,Referee: Peter Bankes. Assistants: Dan Robatha...
1,93336,Man City,Newcastle,2023-08-19,"Etihad Stadium, Manchester",,Robert Jones,Hello everyone and welcome to live text covera...,"Referee: Robert Jones. Assistants: Ian Hussin,..."
2,93343,Brentford,Crystal Palace,2023-08-26,"Gtech Community Stadium, Brentford",16997.0,Peter Bankes,Hello and welcome to the live commentary of th...,
3,93344,Brighton,West Ham,2023-08-26,"American Express Stadium, Falmer",31508.0,Anthony Taylor,Hello and welcome to live coverage of the Prem...,Referee: Anthony Taylor. Assistants: Gary Besw...
4,93347,Everton,Wolves,2023-08-26,"Goodison Park, Liverpool",38851.0,Craig Pawson,Hello and welcome to live coverage of this Pre...,
...,...,...,...,...,...,...,...,...,...
298,93647,Man City,Luton,2024-04-13,"Etihad Stadium, Manchester",53449.0,John Brooks,Hello and welcome to live coverage of the Prem...,Manchester City moved top of the Premier Leagu...
299,93641,Arsenal,Aston Villa,2024-04-14,"Emirates Stadium, London",60350.0,David Coote,Hello and welcome to our LIVE commentary of th...,Arsenal failed to take advantage of Liverpool’...
300,93650,West Ham,Fulham,2024-04-14,"London Stadium, London",62459.0,Stuart Attwell,"Hello, and welcome to live coverage of this fi...",Andreas Pereira proved the hero with a deadly ...
301,93646,Liverpool,Crystal Palace,2024-04-14,"Anfield, Liverpool",60090.0,Chris Kavanagh,Hello and welcome to our live coverage of the ...,Eberechi Eze dealt a huge blow to Liverpool’s ...


In [3]:
df.drop(columns=["summary","Stadium","Attendance","Referee", "Date"], inplace=True)
df.dropna(subset=["events"], inplace=True)

df['id'] = range(1,len(df)+1)

df["events"] = df["events"].str.lower()

df

Unnamed: 0,id,Home,Away,events
0,1,Bournemouth,West Ham,hello and welcome to live coverage of the prem...
1,2,Man City,Newcastle,hello everyone and welcome to live text covera...
2,3,Brentford,Crystal Palace,hello and welcome to the live commentary of th...
3,4,Brighton,West Ham,hello and welcome to live coverage of the prem...
4,5,Everton,Wolves,hello and welcome to live coverage of this pre...
...,...,...,...,...
298,287,Man City,Luton,hello and welcome to live coverage of the prem...
299,288,Arsenal,Aston Villa,hello and welcome to our live commentary of th...
300,289,West Ham,Fulham,"hello, and welcome to live coverage of this fi..."
301,290,Liverpool,Crystal Palace,hello and welcome to our live coverage of the ...


In [4]:
teams = ["Manchester City", "Liverpool", "Burnley", "Brighton", "Wolves", "Tottenham", "Arsenal", "Fulham", "Aston Villa", "Newcastle United",
         "Manchester United", "Nottingham Forest", "Brentford", "Chelsea", "Crystal Palace", "Luton Town", "Bournemouth", "Everton", "Sheffield United", "West Ham"]

df["Home"] = None
df["Away"] = None

for idx, row in df.iterrows():
    
    intro = row["events"][:150].lower()
    found = []
    
    for team in teams:
        if team.lower() in intro:
            found.append(team)
            
    if len(found)>=2:
        df.loc[idx, "Home"] = found[0]
        df.loc[idx, "Away"] = found[1]

print(df[["Home","Away","events"]])

                  Home               Away  \
0      Manchester City            Burnley   
1              Arsenal  Nottingham Forest   
2          Bournemouth           West Ham   
3             Brighton         Luton Town   
4               Fulham            Everton   
..                 ...                ...   
298          Liverpool  Manchester United   
299            Chelsea   Sheffield United   
300          Tottenham  Nottingham Forest   
301             Wolves           West Ham   
302  Manchester United        Bournemouth   

                                                events  
0    hello and welcome to live coverage of the prem...  
1    hello everyone and welcome to live text covera...  
2    hello and welcome to the live commentary of th...  
3    hello and welcome to live coverage of the prem...  
4    hello and welcome to live coverage of this pre...  
..                                                 ...  
298  hello and welcome to live coverage of the prem...  
299

In [5]:
def chunk_text(text, chunk_size = 50, overlap = 10):
    words = text.split()
    chunks = []
    
    start = 0
    while start<len(words):
        end = start + chunk_size
        chunk = words[start:end]
        chunks.append(" ".join(chunk))
        start += chunk_size - overlap
        
    return chunks

In [6]:
chunks = []

for _,row in df.iterrows():

    match_id = row["id"]
    home = row["Home"]
    away = row["Away"]
    
    text_chunks = chunk_text(row["events"])
    
    for chunk in text_chunks:
        chunks.append({
            'match_id': match_id,
            'home': home,
            'away': away,
            'chunk_text': chunk
        })
        
        
df_chunks = pd.DataFrame(chunks)

df_chunks = df_chunks.reset_index(drop=True)

df_chunks
    

Unnamed: 0,match_id,home,away,chunk_text
0,1,Manchester City,Burnley,hello and welcome to live coverage of the prem...
1,1,Manchester City,Burnley,side and the reigning champions. burnley could...
2,1,Manchester City,Burnley,"tier. led by vincent kompany, a citizens' lege..."
3,1,Manchester City,Burnley,"soil. city, meanwhile, claimed a historic treb..."
4,1,Manchester City,Burnley,as riyad mahrez and ilkay gundogan leaving the...
...,...,...,...,...
23005,291,Manchester United,Bournemouth,powerful shot flying into the bottom corner fr...
23006,291,Manchester United,Bournemouth,equaliser from just a few yards out. it didn't...
23007,291,Manchester United,Bournemouth,the break with a narrow lead. in the second ha...
23008,291,Manchester United,Bournemouth,was almost late drama as bournemouth were give...


In [7]:
vec = TfidfVectorizer(stop_words="english")
X = vec.fit_transform(df_chunks["chunk_text"])

In [8]:
def semantic_search(query, top_k=20, min_similarity=0.09):
    query_vec = vec.transform([query])
    
    similarities = cosine_similarity(query_vec, X).flatten()
    
    valid_indices = np.where(similarities>=min_similarity)[0]
    
    sorted_indices = valid_indices[np.argsort(similarities[valid_indices])[::-1]]

    top_indices = sorted_indices[:top_k]    
    results = df_chunks.iloc[top_indices].copy()
    results['Similarity scores'] = similarities[top_indices]
    
    return results[['match_id','home', 'away', 'chunk_text', 'Similarity scores']]

In [10]:
results = semantic_search("foden scored a goal")
print(results)

       match_id             home               away  \
16048       206  Manchester City          Brentford   
13335       174  Manchester City            Everton   
16039       206  Manchester City          Brentford   
18136       231  Manchester City        Bournemouth   
19110       243  Manchester City  Manchester United   
19137       243  Manchester City  Manchester United   
10217       135  Manchester City        Aston Villa   
16049       206  Manchester City          Brentford   
19120       243  Manchester City  Manchester United   
1094         16  Manchester City   Newcastle United   
22301       283  Manchester City     Crystal Palace   
4141         57  Manchester City  Nottingham Forest   
13334       174  Manchester City            Everton   
19077       243  Manchester City  Manchester United   
11835       156        Liverpool  Manchester United   
766          11      Aston Villa            Everton   
767          11      Aston Villa            Everton   
10032     