# Fandom Riot Dump Parsing

In [468]:
from bs4 import BeautifulSoup
import pandas as pd

import mwparserfromhell as mw
import os
import re
import json

In [469]:
# Parse files with BS4

# filepath = "fandom/"
filepath = "Z:\\VCT HACK\\fandom\\"

with open(filepath+"valorant_esports_pages.xml", 'r', encoding='utf8') as file:

    val_esports_pages = BeautifulSoup(file, "xml")
    
with open(filepath+"valorant_pages.xml", 'r', encoding='utf8') as file:

    val_pages = BeautifulSoup(file, "xml")

## Get Valorant player bios

### Filter wiki entries to only be player pages

In [470]:
# Filter and keep only wiki info entries
esports_wiki_entries = []


for page_entry in val_esports_pages.find_all('page'):

    if page_entry.find('ns').text == "0":
        esports_wiki_entries.append(page_entry)


In [471]:
def parse_table(table):
    rows = re.split(r'\|-\n?', table)
    data = []

    for row in rows:
        if 'class="wikitable"' in row:
            continue
        
        columns = row.split('|')

        cleaned_columns = [
            re.sub(r'\[\[.*?\|(.*?)\]\]', r'\1', col.strip())
            .replace('{{Tick}}', 'Yes')
            .replace('{{Cross}}', 'No')
            .strip()
            for col in columns if col.strip() and not col.startswith('[[File:')]
        

        if cleaned_columns:
            data.append(cleaned_columns)

    final_data = {row[0]: row[1] for row in data if len(row) >= 2}

    return final_data


def parse_infobox(template, sections, custom_sections):

    dictionary = {}

    for row in template.params:
        dictionary[str(row.name).strip().lower()] = (row.value).strip()

        for section in sections:
                       
            try:            
                section_title = section.filter_headings()[0].title.strip().lower()

                for c in custom_sections:
                    if c in section_title:

                        processed_section = re.sub(r"==([^=]+)==", "", str(section).strip())
                        
                        if processed_section.find("wikitable") != -1:
                            dictionary[c] = parse_table(processed_section)

                        else:
                            dictionary[c] = processed_section
            except:
                continue        

    return dictionary

In [472]:
players = []
teams = []
tournaments = []

for page in esports_wiki_entries:
    
    page_text = mw.parse(page.find('text').text)

    templates = page_text.filter_templates()
    sections = page_text.get_sections()
    
    index = 0


    if not templates:
        continue

    for template in templates:

        parser = lambda a: parse_infobox(template, sections, a)

        if template.name.strip() == "Infobox Player" :
            player_dict = parser(['biography','trivia','tournament results'])
            players.append(player_dict)

        elif template.name.strip() == 'Infobox Team':
            team_dict = parser(['history', 'timeline', 'player roster', 'organization', 'tournaments'])
            teams.append(team_dict)

        elif template.name.strip() == 'Infobox Tournament':
            tournament_dict = parser(['overview', 'participants', 'result', 'schedule'])
            tournaments.append(tournament_dict)

In [473]:
players_df = pd.DataFrame(players)
teams_df = pd.DataFrame(teams)
tournaments_df = pd.DataFrame(tournaments)

In [474]:
players_df.columns

Index(['isretired', 'biography', 'trivia', 'tournament results', 'noteamhist',
       'id', 'name', 'pronoun', 'checkboxautoimage', 'checkboxshowimage',
       'country', 'residency', 'checkbox-res', 'birth_date_year',
       'birth_date_month', 'birth_date_day', 'checkbox1', 'compid1',
       'checkboxcomp', 'role', 'checkboxprev', 'checkboxissub',
       'checkboxsharesrole', 'checkboxautoteams', 'stream', 'facebook',
       'twitter', 'instagram', 'youtube', 'checkboxispersonality',
       'checkboxsuppressorgnavbox', 'checkboxsbs', 'checkboxtocr', 'checkbox2',
       'checkbox3', 'snapchat', 'reddit', 'discord', 'otherwikis', 'team',
       'teamhist1', 'teamdate1', 'isretiredplayer', 'nativename',
       'namealphabet', 'compid2', 'low_content', 'checkboxislowercase',
       'checkboxplatformprev', 'esea', '5ewin', 'b5csgo', 'steam',
       'nationality', 'sponsor', 'site', 'teamrole1', 'pronly', 'isinactive',
       'tracker', 'teamrole2', 'prev1', 'image', 'text', 'usercountry',

In [475]:
player_wiki_filtered = players_df[['biography', 'trivia', 'tournament results', 'id',
                                    'isretired', 'name', 'pronoun', 'country', 'residency', 'birth_date_year', 
                                    'role', 'stream', 'facebook', 'twitter', 'instagram', 'youtube', 'nativename']]

player_wiki_filtered = player_wiki_filtered.rename(columns={'isretired': 'is_retired', 'nativename': 'native_name'})

In [476]:
teams_df.columns

Index(['name', 'history', 'timeline', 'player roster', 'organization',
       'tournaments', 'orgcountry', 'country', 'region', 'image', 'sponsor',
       'headcoach', 'owner', 'website', 'facebook', 'twitter', 'instagram',
       'weibo', 'stream', 'youtube', 'discord', 'created', 'disbanded',
       'otherwikis', 'auto_teams', 'isdisbanded', 'snapchat', 'rosterphoto',
       'subreddit', 'special', 'partner', 'twitch-team', 'lolpros', 'tiktok',
       'irc', 'reddit', 'twitch', 'created2', 'foundedcountry', 'coach', 'vk',
       'neworg'],
      dtype='object')

In [477]:
filtered_teams_df = teams_df[['name', 'history', 'player roster', 'orgcountry', 'country', 
                              'region', 'headcoach', 'owner', 'isdisbanded', 'website', 'facebook', 'twitter', 'instagram',
                              'weibo', 'stream', 'youtube', 'discord', 'created', 'disbanded', 'tiktok','reddit', 'twitch']]

filtered_teams_df = filtered_teams_df.rename(columns={'player roster': 'player_roster', 'orgcountry': 'org_counrtry', 
                                                      'isdisbanded' : 'is_disbanded', 'disbanded' : 'date_disbanded'})

In [478]:
filtered_teams_df.columns

Index(['name', 'history', 'player_roster', 'org_counrtry', 'country', 'region',
       'headcoach', 'owner', 'is_disbanded', 'website', 'facebook', 'twitter',
       'instagram', 'weibo', 'stream', 'youtube', 'discord', 'created',
       'date_disbanded', 'tiktok', 'reddit', 'twitch'],
      dtype='object')

In [479]:
tournaments_df.columns

Index(['name', 'overview', 'participants', 'result', 'schedule', 'image',
       'organizer', 'rulebook', 'sponsor', 'type', 'region', 'prizepool',
       'format', 'date', 'sdate', 'edate', 'website', 'streams', 'linkstreams',
       'league', 'first', 'second', 'third', 'third1', 'third2', 'fourth',
       'rednotice', 'country', 'qual1', 'qual2', 'qual3', 'qual4',
       'cm_standardname', 'cm_tournamentlevel', 'cm_isqualifier',
       'cm_isplayoffs', 'cm_isofficial', 'cm_year', 'cm_standardleague',
       'cm_leagueiconkey', 'twitter', 'discord', 'organizer2', 'organizer3',
       'facebook', 'organizer4', 'youtube', 'instagram', 'split',
       'split_number', 'closest_timezone', 'tier', 'location', 'venue',
       'platform', 'game', 'mode', 'numberofteams', 'web', 'bracket',
       'storedname', 'server', 'address'],
      dtype='object')

In [480]:
# with open('output.txt', 'w') as f:
#     f.write(tournaments_df['result'].to_string(index=False))

In [481]:
filtered_tournaments_df = tournaments_df[['name', 'overview', 'participants', 'result', 'schedule', 'organizer', 'type', 'region'
                                          , 'prizepool', 'format', 'sdate', 'edate', 'league', 'first', 'second', 'third',
                                          'tier', 'location', 'venue', ]]
filtered_tournaments_df = filtered_tournaments_df.rename({'sdat': 'start_date', 'edate': 'end_date'}, axis=1)

In [482]:
# # Determine how many players from Riot data have wiki pages
# # filepath = "shortlist-model-concept/"
# # filename = "all-players-stats.json"

# riot_data = pd.read_json(filepath+filename)
# riot_data.head()

In [483]:
# # Inner join to remove rows that don't have wiki pages

# combined_fandom_riot = pd.merge(riot_data, player_wiki_filtered, on='Player', how='inner')
# # combined_fandom_riot

In [484]:
# combined_fandom_riot.describe(include='all')

## Get Valorant Game Information (Agents, Maps, Abilities, etc)
---
Give more context to the LLM to understand Valorant.

### Filter wiki entries to only be game info pages

In [485]:
def identify_duplicate_columns(df):
    duplicate_columns = [col for col in df.columns if df.columns.tolist().count(col) > 1]
    print(f"Duplicate columns: {duplicate_columns}")


def compare_duplicate_columns(df, column_name):
    duplicate_indices = [i for i, col in enumerate(df.columns) if col == column_name]
    
    if len(duplicate_indices) != 2:
        print("Found more than 2 identically named columns.")
        return None
    

    col1 = df.iloc[:, duplicate_indices[0]]
    col2 = df.iloc[:, duplicate_indices[1]]
    

    differences = col1 != col2
    

    if differences.any():
        print(f"Differences found between the two columns named '{column_name}':")
        diff_rows = df[differences]
        print(diff_rows[[column_name]])
    else:
        print(f"The two columns named '{column_name}' are identical.")
        
    return differences


def drop_duplicate_column(df, column_name):
    duplicate_indices = [i for i, col in enumerate(df.columns) if col == column_name]
    
    if len(duplicate_indices) != 2:
        print("Found more than 2 identically named columns.")
        return None
    
    df = df.drop(df.columns[duplicate_indices[1]], axis=1)

    return df


In [486]:
game_wiki_pages = []

for page_entry in val_pages.find_all('page'):

    if page_entry.find('ns').text in ['0', '10']:   
        game_wiki_pages.append(page_entry)

In [487]:
abilities = []
maps = []
weapons = []

for page in game_wiki_pages:
    
    ability_dict = {}

    page_text = mw.parse(page.find('text').text)


    templates = page_text.filter_templates()
    sections = page_text.get_sections()

    parser = lambda a: parse_infobox(template, sections, a)

    
    index = 0

    if not templates:
        continue

    for template in templates:

        if template.name.strip() == "Infobox ability" :
            ability_dict = parser(['summary', 'stats'])
            abilities.append(ability_dict)

        elif template.name.strip() == 'Infobox map':
            map_dict = parser(['features', 'trivia'])
            maps.append(map_dict)

        elif template.name.strip() == 'Infobox weapon':
            weapon_dict = parser(['summary', 'stats'])
            weapons.append(weapon_dict)

In [488]:
abilities_df = pd.DataFrame(abilities)
maps_df = pd.DataFrame(maps)
weapons_df = pd.DataFrame(weapons)

In [489]:
abilities_df.columns

Index(['title', 'stats', 'image', 'caption', 'description', 'agent', 'type',
       'function', 'friendlyfire', 'creds', 'uses', 'key', 'navigation',
       'summary', 'health', 'fulleffects', 'restock', 'points', 'mapobjects',
       'uses_per_round', 'credits', 'killfeed'],
      dtype='object')

In [490]:
reduced_abilities_df = abilities_df[['title', 'summary', 'stats','description', 'agent', 'type', 'friendlyfire', 
                             'creds', 'uses', 'type','fulleffects' ,'health', 'restock', 'points', 
                             'mapobjects', 'function', 'uses_per_round', 'credits']]

In [491]:
identify_duplicate_columns(reduced_abilities_df)

Duplicate columns: ['type', 'type']


In [492]:
compare_duplicate_columns(reduced_abilities_df,'type')

The two columns named 'type' are identical.


0      False
1      False
2      False
3      False
4      False
       ...  
97     False
98     False
99     False
100    False
101    False
Name: type, Length: 102, dtype: bool

In [493]:
reduced_abilities_df = drop_duplicate_column(reduced_abilities_df, 'type')

print(reduced_abilities_df.columns)
print(identify_duplicate_columns(reduced_abilities_df))

Index(['title', 'summary', 'stats', 'description', 'agent', 'friendlyfire',
       'creds', 'uses', 'fulleffects', 'health', 'restock', 'points',
       'mapobjects', 'function', 'uses_per_round', 'credits'],
      dtype='object')
Duplicate columns: []
None


In [494]:
maps_df.columns

Index(['title', 'features', 'trivia', 'image', 'caption', 'location',
       'elements', 'sites', 'mini-map', 'caption2', 'added', 'theme',
       'coordinates', 'codename', 'rotation', 'pages', 'teleporters', '1'],
      dtype='object')

In [495]:
maps_df.head()

Unnamed: 0,title,features,trivia,image,caption,location,elements,sites,mini-map,caption2,added,theme,coordinates,codename,rotation,pages,teleporters,1
0,Bind,\nBind's unique feature is that it doesn't con...,\n*Bind and Rabat are [[Cypher]]'s home,Loading Screen Bind.png,,"{{fi|MA}} Rabat, Rabat-Salé-Kénitra, Morocco, ...",One-way teleporters<br>One-way automatic doors,A/B,Bind minimap.png,,[[Closed Beta|Beta]],{{PAGENAME}},"34°2'A"" N 6°51'Z"" W<br>(34°2'0"" N 6°51'0"" W)",Duality,Enabled,{{*}} [[Bind/Lore]]<br>{{lorewikiarticle}},,
1,Haven,\nHaven's features include an additional third...,"\n*In A site tower, there is an empty sniper r...",Loading Screen Haven.png,,"{{fi|BT}} Thimphu, Thimphu District, Bhutan, A...",400 HP destructible panels,A/B/C,Haven minimap.png,,[[Closed Beta|Beta]],{{PAGENAME}},"27°28'A"" N 89°38'WZ"" E<br>(27°28'0"" N 89°38'30...",Triad,Enabled,{{lorewikiarticle}},,
2,Split,\nSplit is the first map to use ascenders. The...,"\n* At B Link, the backdrop is named Scuttle S...",Loading Screen Split.png,,"{{fi|JP}} Shinjuku, Tokyo, Kantō, Japan, Alpha...",Rope ascenders,A/B,Split minimap.png,,[[Closed Beta|Beta]],{{PAGENAME}},"35°41'CD"" N 139°41'WX"" E<br>(35°41'23"" N 139°4...",Bonsai,Removed,{{*}} [[Split/Quotes]]<br>{{lorewikiarticle}},,
3,Range,"{''''Speed': 'Easy {{*}} Medium {{*}} Hard', '...","\n*Previously, the entrance to the office unde...",Loading Screen Range.png,,"{{fi|IT}} Poveglia, Venice, Veneto, Italy, Alp...",,,Range minimap.png,,,{{PAGENAME}},"45°26'FF"" N 12°20'Q"" E<br>(45°26'55"" N 12°20'9...",Poveglia,,{{*}} [[Range/Lore]]<br>{{lorewikiarticle|The ...,,
4,Ascent,\nAscent's features include mechanical doors l...,\n*Ascent contains landmarks from Venice such ...,Loading Screen Ascent.png,,"{{fi|IT}} San Marco, Venice, Veneto, Italy, Al...",,A/B,Ascent minimap.png,,"June 2nd, 2020<br><small>([[Episode 01: IGNITI...",{{PAGENAME}},"45°26'BF"" N 12°20'Q"" E<br>(45°26'15"" N 12°20'9...",Ascent,Enabled,{{lorewikiarticle}},,


In [496]:
reduced_maps_df = maps_df[['title', 'features', 'trivia', 'location', 'elements', 
                           'sites', 'added', 'codename', 'rotation', 'teleporters', 'coordinates']]

In [497]:
weapons_df.columns

Index(['title', 'summary', 'image', 'icon', 'killfeed', 'type', 'length',
       'credits', 'mode', 'magazine', 'penetration', 'rate', 'run', 'equip',
       'spread', 'reload', 'function', 'zoom', 'altrate', 'move', 'altspread',
       '0-15m', '15-30m', '30-50m', 'reserve', 'notes', 'feature', 'creator',
       'stats', 'caption', '0-30m', '0-20m', '20-50m', '0-50m',
       'primarypellet', 'distance', 'pellet', '0-8m', '8-12m', '12-50m',
       '0-7m', '7-15m', '15-50m', 'burst', '0-10m', '10-15m', '1'],
      dtype='object')

In [498]:
weapons_df.head(1)

Unnamed: 0,title,summary,image,icon,killfeed,type,length,credits,mode,magazine,...,0-8m,8-12m,12-50m,0-7m,7-15m,15-50m,burst,0-10m,10-15m,1
0,Phantom,"{'rowspan=""3""': '30–50 m', 'Body': '4', 'Legs...",Phantom.png,[[File:Phantom icon.png|100x100px]],[[File:Phantom killfeed.png|50x50px]],[[Weapons#Rifles|Rifle]],121.11 cm <br> 90.59 cm without silencer,2900,Auto,30,...,,,,,,,,,,


In [499]:
reduced_weapons_df = weapons_df.drop(['image', 'icon', 'killfeed', '1'], axis=1)

Save the files to json

In [500]:
def convert_nested_dataframes(record):
    for key, value in record.items():
        if isinstance(value, pd.DataFrame):
            record[key] = value.to_dict(orient='records')  
    return record


def save_to_json(filepath, filename, df):
    df.fillna("", inplace=True)  

    records = df.to_dict(orient='records')

    nested_records = [convert_nested_dataframes(record) for record in records]

    full_path = f"{filepath}/{filename}" 
    with open(full_path, 'w') as json_file:
        json.dump(nested_records, json_file, indent=4)

In [501]:
path = "Z:\VCT HACK\Cleaned Jsons"

save_to_json(path, "fandom_players.json", player_wiki_filtered)
save_to_json(path, "fandom_teams.json", filtered_teams_df)
save_to_json(path, "fandom_tournaments.json", filtered_tournaments_df)
save_to_json(path, "fandom_maps.json", reduced_maps_df)
save_to_json(path, "fandom_abilities.json", reduced_abilities_df)
save_to_json(path, "fandom_weapons.json", reduced_weapons_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna("", inplace=True)
