In [60]:
import os
import pandas as pd
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, util


## Import Player Data

In [75]:
# Import data
player_data = pd.read_parquet("player_stats_splits/league_split/all_leagues_players_stats.parquet")
player_data

Unnamed: 0,Rnd,R2.0,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,...,D,A,FK,FD,Player,Agents,Roles,team_name,region,league_name
0,340,1.37,283.9,1.44,76%,188.5,1.00,0.21,0.16,0.10,...,237,70,54,35,florescent,"[jett, raze]",[duelist],Version1,,challengers_na
1,536,1.28,217.4,1.37,74%,146.4,0.82,0.32,0.08,0.04,...,321,172,42,24,xenom,"[omen, viper, brimstone]",[controller],TropiCaos,BR,challengers_br
2,536,1.28,217.4,1.37,74%,146.4,0.82,0.32,0.08,0.04,...,321,172,42,24,xenom,"[omen, viper, brimstone]",[controller],Stellae Gaming,BR,challengers_br
3,338,1.27,259.5,1.34,76%,164.5,0.89,0.23,0.24,0.12,...,224,78,82,39,mada,"[jett, raze, reyna]",[duelist],"Luminosity Esports, Luminosity Gaming",,challengers_na
4,338,1.27,259.5,1.34,76%,164.5,0.89,0.23,0.24,0.12,...,224,78,82,39,mada,"[jett, raze, reyna]",[duelist],"Moist Moguls, BreakThru",,challengers_na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692,332,0.79,157.2,0.74,67%,105.2,0.51,0.31,0.06,0.11,...,227,102,21,35,SouhcNi,"[sova, gekko, kayo]",[initiator],"SuperMassive Blaze Female, Papara SuperMassive...",EMEA,game_changers_emea
1693,304,0.77,144.9,0.66,65%,98.5,0.50,0.38,0.04,0.06,...,232,115,13,18,syd89,"[omen, brimstone, kayo]",[controller],Avernus Esports GC,SEA,game_changers_sea
1694,408,0.76,147.3,0.69,66%,99.8,0.54,0.21,0.06,0.08,...,316,84,26,31,Vefa,"[breach, gekko, sova]",[initiator],MYVRA,LATAM,game_changers_latam
1695,224,0.75,151.2,0.69,65%,107.1,0.50,0.27,0.07,0.07,...,162,61,16,15,dati,"[gekko, sova, breach]",[initiator],"SuperMassive Blaze Female, Papara SuperMassive...",EMEA,game_changers_emea


In [81]:
player_data['region'].value_counts()

region
NA       389
SEA      370
EMEA     242
BR       237
LATAM    211
JP        89
KR        65
INTL      45
LAS       27
VN        18
LAN        4
Name: count, dtype: int64

In [83]:
# Add long names for each region for better vector search
region_long_names = {
    'NA': 'North America',
    'SEA': 'Southeast Asia',
    'EMEA': 'Europe, Middle East, and Africa',
    'BR': 'Brazil',
    'LATAM': 'Latin America',
    'JP': 'Japan',
    'KR': 'South Korea',
    'INTL': 'International',
    'LAS': 'Latin America South',
    'VN': 'Vietnam',
    'LAN': 'Latin America North'
}

player_data['region_long'] = player_data['region'].map(region_long_names)
player_data

Unnamed: 0,Rnd,R2.0,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,...,A,FK,FD,Player,Agents,Roles,team_name,region,league_name,region_long
0,340,1.37,283.9,1.44,76%,188.5,1.00,0.21,0.16,0.10,...,70,54,35,florescent,"[jett, raze]",[duelist],Version1,,challengers na,North America
1,536,1.28,217.4,1.37,74%,146.4,0.82,0.32,0.08,0.04,...,172,42,24,xenom,"[omen, viper, brimstone]",[controller],TropiCaos,BR,challengers br,Brazil
2,536,1.28,217.4,1.37,74%,146.4,0.82,0.32,0.08,0.04,...,172,42,24,xenom,"[omen, viper, brimstone]",[controller],Stellae Gaming,BR,challengers br,Brazil
3,338,1.27,259.5,1.34,76%,164.5,0.89,0.23,0.24,0.12,...,78,82,39,mada,"[jett, raze, reyna]",[duelist],"Luminosity Esports, Luminosity Gaming",,challengers na,North America
4,338,1.27,259.5,1.34,76%,164.5,0.89,0.23,0.24,0.12,...,78,82,39,mada,"[jett, raze, reyna]",[duelist],"Moist Moguls, BreakThru",,challengers na,North America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692,332,0.79,157.2,0.74,67%,105.2,0.51,0.31,0.06,0.11,...,102,21,35,SouhcNi,"[sova, gekko, kayo]",[initiator],"SuperMassive Blaze Female, Papara SuperMassive...",EMEA,game changers emea,"Europe, Middle East, and Africa"
1693,304,0.77,144.9,0.66,65%,98.5,0.50,0.38,0.04,0.06,...,115,13,18,syd89,"[omen, brimstone, kayo]",[controller],Avernus Esports GC,SEA,game changers sea,Southeast Asia
1694,408,0.76,147.3,0.69,66%,99.8,0.54,0.21,0.06,0.08,...,84,26,31,Vefa,"[breach, gekko, sova]",[initiator],MYVRA,LATAM,game changers latam,Latin America
1695,224,0.75,151.2,0.69,65%,107.1,0.50,0.27,0.07,0.07,...,61,16,15,dati,"[gekko, sova, breach]",[initiator],"SuperMassive Blaze Female, Papara SuperMassive...",EMEA,game changers emea,"Europe, Middle East, and Africa"


In [78]:
# Clean up league names for easier tokenization
player_data['league_name'] = player_data['league_name'].str.replace("_", " ")
player_data

Unnamed: 0,Rnd,R2.0,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,...,A,FK,FD,Player,Agents,Roles,team_name,region,league_name,region_long
0,340,1.37,283.9,1.44,76%,188.5,1.00,0.21,0.16,0.10,...,70,54,35,florescent,"[jett, raze]",[duelist],Version1,,challengers na,North America
1,536,1.28,217.4,1.37,74%,146.4,0.82,0.32,0.08,0.04,...,172,42,24,xenom,"[omen, viper, brimstone]",[controller],TropiCaos,BR,challengers br,Brazil
2,536,1.28,217.4,1.37,74%,146.4,0.82,0.32,0.08,0.04,...,172,42,24,xenom,"[omen, viper, brimstone]",[controller],Stellae Gaming,BR,challengers br,Brazil
3,338,1.27,259.5,1.34,76%,164.5,0.89,0.23,0.24,0.12,...,78,82,39,mada,"[jett, raze, reyna]",[duelist],"Luminosity Esports, Luminosity Gaming",,challengers na,North America
4,338,1.27,259.5,1.34,76%,164.5,0.89,0.23,0.24,0.12,...,78,82,39,mada,"[jett, raze, reyna]",[duelist],"Moist Moguls, BreakThru",,challengers na,North America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692,332,0.79,157.2,0.74,67%,105.2,0.51,0.31,0.06,0.11,...,102,21,35,SouhcNi,"[sova, gekko, kayo]",[initiator],"SuperMassive Blaze Female, Papara SuperMassive...",EMEA,game changers emea,"Europe, Middle East, and Africa"
1693,304,0.77,144.9,0.66,65%,98.5,0.50,0.38,0.04,0.06,...,115,13,18,syd89,"[omen, brimstone, kayo]",[controller],Avernus Esports GC,SEA,game changers sea,Southeast Asia
1694,408,0.76,147.3,0.69,66%,99.8,0.54,0.21,0.06,0.08,...,84,26,31,Vefa,"[breach, gekko, sova]",[initiator],MYVRA,LATAM,game changers latam,Latin America
1695,224,0.75,151.2,0.69,65%,107.1,0.50,0.27,0.07,0.07,...,61,16,15,dati,"[gekko, sova, breach]",[initiator],"SuperMassive Blaze Female, Papara SuperMassive...",EMEA,game changers emea,"Europe, Middle East, and Africa"


In [80]:
# Load vector search model
model = SentenceTransformer("all-MiniLM-L6-v2")

### Step 1 - Query

In [131]:
query = "Make me a high performing team for the VCT Challengers tournament"

query_embedding = model.encode(query, convert_to_tensor=True)

### Step 2 - Determine if a filter needs to be applied or use the entire dataset

In [132]:
# Create options for determining what to filter for

"""
Prompt engineering to determine which category to filter by if needed

Determine which category should be filtered with pandas and return it as a single line response.

If no specific filter is needed, return a single line response of "No".

Note: The dataset has the league name and the league's region in the league_name column. The vector search should be able to differentiate
"""

""" TEMPORARY UNTIL WE REPLACE WITH LLM REPLY """

filter_col_options = ['league name', 'region']

options_embeddings = model.encode(filter_col_options, convert_to_tensor=True)    # Only filter by region or league

cosine_scores = util.pytorch_cos_sim(query_embedding, options_embeddings)[0]

max_score_index = np.argmax(cosine_scores.numpy())

best_match_col = filter_col_options[max_score_index]
best_match_confidence = cosine_scores[max_score_index].item()

best_match_col, best_match_confidence

('league name', 0.2324700951576233)

## Step 3 - Determine what filter to use on column matched

In [125]:
# Merge into string
corpus = pd.DataFrame()

corpus['text'] = player_data.apply(lambda row: ' | '.join([f"{col}: {row[col]}" for col in player_data.columns]), axis=1)

corpus

Unnamed: 0,text
0,Rnd: 340 | R2.0: 1.37 | ACS: 283.9 | K:D: 1.44...
1,Rnd: 536 | R2.0: 1.28 | ACS: 217.4 | K:D: 1.37...
2,Rnd: 536 | R2.0: 1.28 | ACS: 217.4 | K:D: 1.37...
3,Rnd: 338 | R2.0: 1.27 | ACS: 259.5 | K:D: 1.34...
4,Rnd: 338 | R2.0: 1.27 | ACS: 259.5 | K:D: 1.34...
...,...
1692,Rnd: 332 | R2.0: 0.79 | ACS: 157.2 | K:D: 0.74...
1693,Rnd: 304 | R2.0: 0.77 | ACS: 144.9 | K:D: 0.66...
1694,Rnd: 408 | R2.0: 0.76 | ACS: 147.3 | K:D: 0.69...
1695,Rnd: 224 | R2.0: 0.75 | ACS: 151.2 | K:D: 0.69...


In [126]:
# Vectorize data
stats_vectors = model.encode(corpus['text'].tolist(), convert_to_tensor=True)
stats_vectors = np.array(stats_vectors.cpu())

In [127]:
# Create vectors index

d = stats_vectors.shape[1]  
index = faiss.IndexFlatL2(d)  # l2 distance
index.add(stats_vectors)  # Add vectors to the index

In [134]:
query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()

k = 500  # Number of nearest neighbors
D, I = index.search(query_vector, k)  # Search

# Get corresponding rows from the DataFrame
results = player_data.iloc[I[0]]
results

Unnamed: 0,Rnd,R2.0,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,...,A,FK,FD,Player,Agents,Roles,team_name,region,league_name,region_long
665,277,0.95,204.7,1.00,67%,132.7,0.71,0.13,0.17,0.17,...,37,47,47,DEATHMAKER,"[jett, raze, cypher]","[duelist, sentinel]","Velocity Gaming, VLTG, RVT",INTL,challengers south asia,International
702,462,0.92,203.8,0.91,70%,135.7,0.69,0.32,0.11,0.15,...,146,53,71,Kakarot,"[jett, neon, raze]",[duelist],"Medal Esports, MDL, MEDAL ESPORTS",INTL,challengers south asia,International
703,462,0.92,203.8,0.91,70%,135.7,0.69,0.32,0.11,0.15,...,146,53,71,Kakarot,"[jett, neon, raze]",[duelist],"Medal Esports, MDL, MEDAL ESPORTS",INTL,challengers south asia,International
323,225,1.04,212.7,1.05,67%,141.7,0.76,0.25,0.12,0.09,...,56,27,20,artzin,"[yoru, gekko, killjoy]","[sentinel, duelist, initiator]","Stars Horizon, TERROR.NET",BR,challengers br,Brazil
163,276,1.10,235.0,1.13,74%,157.5,0.81,0.24,0.15,0.13,...,66,41,36,DaFt,"[raze, neon, gekko]","[duelist, initiator]",VICTORY,VN,challengers sea vn,Vietnam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352,205,1.21,218.3,1.26,76%,142.3,0.80,0.35,0.06,0.05,...,71,13,11,fluxxy,"[omen, viper, astra]",[controller],HEROIC Valkyries,EMEA,game changers emea,"Europe, Middle East, and Africa"
636,612,0.96,186.0,0.91,69%,124.1,0.63,0.39,0.09,0.10,...,238,54,60,BerLIN,"[kayo, astra, viper]",[controller],CBT Gaming,SEA,challengers sea hk and tw,Southeast Asia
585,233,0.97,180.4,0.95,62%,125.4,0.67,0.21,0.09,0.08,...,49,21,19,Less,"[cypher, killjoy, omen]","[controller, sentinel]",Jaguares Gaming,BR,challengers br,Brazil
496,578,0.99,182.8,0.91,74%,125.5,0.60,0.41,0.07,0.10,...,236,38,56,Sacy,"[fade, gekko, skye]",[initiator],Team Vikings,BR,challengers br,Brazil


In [152]:
# Check what the most common column value is (Most likely what we want to filter by)

if best_match_col == "league name":
    return_col = results['league_name']
elif best_match_col == "region":
    return_col = results['region']
else:   # Best match from LLM is just general
   return_col = None

"""
If best match from LLM is just factoring in entire dataset, please IGNORE these 

"""

return_col.value_counts().nlargest(1).index[0]

'challengers na'

All player Data Num Tokens Est. -> 172132

Shortlisted by all leagues with 50 matches -> 5177

https://token-counter.app/meta/llama-3.1

### Step 4 - Filter dataset and sort by best performing on ranking

### Run below cell if LLM believes region or league filtering

Need to add in another filter for if they want just a league or want a specific league in a specific region like challengers vs challengers NA. For now, assume the vector search filtered through everyone

In [155]:
# Filter such as making sure no other leagues are in the list

results

Unnamed: 0,Rnd,R2.0,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,...,A,FK,FD,Player,Agents,Roles,team_name,region,league_name,region_long
665,277,0.95,204.7,1.00,67%,132.7,0.71,0.13,0.17,0.17,...,37,47,47,DEATHMAKER,"[jett, raze, cypher]","[duelist, sentinel]","Velocity Gaming, VLTG, RVT",INTL,challengers south asia,International
702,462,0.92,203.8,0.91,70%,135.7,0.69,0.32,0.11,0.15,...,146,53,71,Kakarot,"[jett, neon, raze]",[duelist],"Medal Esports, MDL, MEDAL ESPORTS",INTL,challengers south asia,International
703,462,0.92,203.8,0.91,70%,135.7,0.69,0.32,0.11,0.15,...,146,53,71,Kakarot,"[jett, neon, raze]",[duelist],"Medal Esports, MDL, MEDAL ESPORTS",INTL,challengers south asia,International
323,225,1.04,212.7,1.05,67%,141.7,0.76,0.25,0.12,0.09,...,56,27,20,artzin,"[yoru, gekko, killjoy]","[sentinel, duelist, initiator]","Stars Horizon, TERROR.NET",BR,challengers br,Brazil
163,276,1.10,235.0,1.13,74%,157.5,0.81,0.24,0.15,0.13,...,66,41,36,DaFt,"[raze, neon, gekko]","[duelist, initiator]",VICTORY,VN,challengers sea vn,Vietnam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352,205,1.21,218.3,1.26,76%,142.3,0.80,0.35,0.06,0.05,...,71,13,11,fluxxy,"[omen, viper, astra]",[controller],HEROIC Valkyries,EMEA,game changers emea,"Europe, Middle East, and Africa"
636,612,0.96,186.0,0.91,69%,124.1,0.63,0.39,0.09,0.10,...,238,54,60,BerLIN,"[kayo, astra, viper]",[controller],CBT Gaming,SEA,challengers sea hk and tw,Southeast Asia
585,233,0.97,180.4,0.95,62%,125.4,0.67,0.21,0.09,0.08,...,49,21,19,Less,"[cypher, killjoy, omen]","[controller, sentinel]",Jaguares Gaming,BR,challengers br,Brazil
496,578,0.99,182.8,0.91,74%,125.5,0.60,0.41,0.07,0.10,...,236,38,56,Sacy,"[fade, gekko, skye]",[initiator],Team Vikings,BR,challengers br,Brazil


In [161]:
shortlist = results.sort_values(by='R2.0', ascending=False)
shortlist

Unnamed: 0,Rnd,R2.0,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,...,A,FK,FD,Player,Agents,Roles,team_name,region,league_name,region_long
0,340,1.37,283.9,1.44,76%,188.5,1.00,0.21,0.16,0.10,...,70,54,35,florescent,"[jett, raze]",[duelist],Version1,,challengers na,North America
1328,423,1.34,297.2,1.49,73%,198.6,1.05,0.22,0.26,0.18,...,91,108,75,Lied,"[raze, iso, gekko]","[duelist, initiator]","Fusion Esports, Fusion X, Fusion",LATAM,game changers latam,Latin America
953,423,1.34,297.2,1.49,73%,198.6,1.05,0.22,0.26,0.18,...,91,108,75,Lied,"[raze, iso, gekko]","[duelist, initiator]","Fusion Esports, Fusion X, Fusion",LATAM,game changers latam,Latin America
1,536,1.28,217.4,1.37,74%,146.4,0.82,0.32,0.08,0.04,...,172,42,24,xenom,"[omen, viper, brimstone]",[controller],TropiCaos,BR,challengers br,Brazil
2,536,1.28,217.4,1.37,74%,146.4,0.82,0.32,0.08,0.04,...,172,42,24,xenom,"[omen, viper, brimstone]",[controller],Stellae Gaming,BR,challengers br,Brazil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,221,0.75,186.1,0.77,59%,128.1,0.64,0.17,0.13,0.26,...,37,29,57,fainz,"[jett, neon, raze]",[duelist],EGN Esports,EMEA,challengers portugal,"Europe, Middle East, and Africa"
945,220,0.73,190.1,0.72,69%,122.2,0.61,0.25,0.13,0.21,...,54,28,47,RND,"[neon, jett, raze]",[duelist],"LOS GRANDES, Los Grandes",BR,challengers br,Brazil
944,220,0.73,190.1,0.72,69%,122.2,0.61,0.25,0.13,0.21,...,54,28,47,RND,"[neon, jett, raze]",[duelist],"TBK Lusa, TBK Esports, TBK",BR,challengers br,Brazil
943,220,0.73,190.1,0.72,69%,122.2,0.61,0.25,0.13,0.21,...,54,28,47,RND,"[neon, jett, raze]",[duelist],"RED Canids Kalunga, 1530RED Canids Kalunga",BR,challengers br,Brazil


Currently there are some Game Changers participants in the Challengers list, will need to refine the search for this issue to be fixed.

### Run below cell if LLM believes in factoring in all regions, leagues, and players

In [169]:
shortlist = player_data.sort_values(by='R2.0', ascending=False)
shortlist.drop_duplicates(subset='Player')

Unnamed: 0,Rnd,R2.0,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,...,A,FK,FD,Player,Agents,Roles,team_name,region,league_name,region_long
0,340,1.37,283.9,1.44,76%,188.5,1.00,0.21,0.16,0.10,...,70,54,35,florescent,"[jett, raze]",[duelist],Version1,,challengers na,North America
1325,327,1.36,308.4,1.45,75%,187.3,1.05,0.26,0.31,0.21,...,84,101,69,Miku,"[yoru, raze, jett]",[duelist],"Fusion Esports, Fusion X, Fusion",LATAM,game changers latam,Latin America
1327,423,1.34,297.2,1.49,73%,198.6,1.05,0.22,0.26,0.18,...,91,108,75,Lied,"[raze, iso, gekko]","[duelist, initiator]",Skull Cracker Quartz,LATAM,game changers latam,Latin America
954,328,1.31,287.9,1.41,74%,182.8,1.03,0.17,0.26,0.17,...,55,86,57,miNt,"[raze, jett]",[duelist],FENNEL HOTELAVA,JP,game changers jpn,Japan
955,349,1.30,227.8,1.49,76%,148.7,0.85,0.22,0.07,0.05,...,77,25,17,mmonch,"[cypher, killjoy, deadlock]","[controller, sentinel]",FiRePOWER,LATAM,game changers latam,Latin America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,221,0.75,186.1,0.77,59%,128.1,0.64,0.17,0.13,0.26,...,37,29,57,fainz,"[jett, neon, raze]",[duelist],EGN Esports,EMEA,challengers portugal,"Europe, Middle East, and Africa"
1320,224,0.75,151.2,0.69,65%,107.1,0.50,0.27,0.07,0.07,...,61,16,15,dati,"[gekko, sova, breach]",[initiator],"SuperMassive Blaze Female, Papara SuperMassive...",EMEA,game changers emea,"Europe, Middle East, and Africa"
1321,217,0.74,135.4,0.67,66%,88.7,0.52,0.35,0.02,0.04,...,76,4,9,Junki,"[cypher, omen, harbor]","[controller, sentinel]",Bon Bon Bum,LATAM,game changers latam,Latin America
943,220,0.73,190.1,0.72,69%,122.2,0.61,0.25,0.13,0.21,...,54,28,47,RND,"[neon, jett, raze]",[duelist],"RED Canids Kalunga, 1530RED Canids Kalunga",BR,challengers br,Brazil


## Note: Current configuration does not take into account team role imbalance. Need another model to determine

## Step 5 - Take Top K To Make Team

In [170]:
top_k = 5 # Typical 4 positions

shortlist.head(top_k)

Unnamed: 0,Rnd,R2.0,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,...,A,FK,FD,Player,Agents,Roles,team_name,region,league_name,region_long
0,340,1.37,283.9,1.44,76%,188.5,1.0,0.21,0.16,0.1,...,70,54,35,florescent,"[jett, raze]",[duelist],Version1,,challengers na,North America
1324,340,1.37,283.9,1.44,76%,188.5,1.0,0.21,0.16,0.1,...,70,54,35,florescent,"[jett, raze]",[duelist],Version 1,,game changers na,North America
948,340,1.37,283.9,1.44,76%,188.5,1.0,0.21,0.16,0.1,...,70,54,35,florescent,"[jett, raze]",[duelist],Shopify Rebellion,,game changers na,North America
949,340,1.37,283.9,1.44,76%,188.5,1.0,0.21,0.16,0.1,...,70,54,35,florescent,"[jett, raze]",[duelist],Version 1,,game changers na,North America
1322,340,1.37,283.9,1.44,76%,188.5,1.0,0.21,0.16,0.1,...,70,54,35,florescent,"[jett, raze]",[duelist],Shopify Rebellion,,game changers na,North America


## Step 6 - Send information to LLM with data from each player to compile report