# Fandom Riot Dump Parsing

In [3]:
from bs4 import BeautifulSoup
import pandas as pd

import mwparserfromhell as mw
import os
import re

In [4]:
# Parse files with BS4

# filepath = "fandom/"
filepath = "Z:\\VCT HACK\\fandom\\"

with open(filepath+"valorant_esports_pages.xml", 'r', encoding='utf8') as file:

    val_esports_pages = BeautifulSoup(file, "xml")
    
with open(filepath+"valorant_pages.xml", 'r', encoding='utf8') as file:

    val_pages = BeautifulSoup(file, "xml")

## Get Valorant player bios

### Filter wiki entries to only be player pages

In [5]:
# Filter and keep only wiki info entries
esports_wiki_entries = []


for page_entry in val_esports_pages.find_all('page'):

    if page_entry.find('ns').text == "0":
        esports_wiki_entries.append(page_entry)


In [6]:
# Further filter by main text if it has a player info box

esports_wiki_player_entries = []


for page_entry in esports_wiki_entries:

    if "Infobox Player" in page_entry.find('text').text:
        esports_wiki_player_entries.append(page_entry)


In [7]:
missing_infobox = 0

player_info_dicts = []

# Extract data from player pages
for page in esports_wiki_player_entries:
    
    # Print player page data
    player_info = {}

    # print(f"Player: {page.find('title').text}")
    # print(f"Last Updated: {page.find('timestamp').text}")

    # Add to dict
    player_info['player'] = (page.find('title').text).strip()
    player_info['last_update'] = page.find('timestamp').text

    page_text = mw.parse(page.find('text').text)

    templates = page_text.filter_templates()
    sections = page_text.get_sections()
    
    # Get infobox player data

    # Check if infobox is available
    if templates[0].name.strip() == "Infobox Player":
        player_infobox = templates[0]

        for row in player_infobox.params:

            player_info[str(row.name)] = (row.value).strip()

    else:
        # print("No infobox found")
        missing_infobox += 1

    # Check if biography and trivia exist
    for section in sections:

        try:
            section_title = section.filter_headings()[0].title.strip().lower()
            #print(section_title)

            if "biography" in section_title:
                biography = str(section).replace("== Biography ==", "").replace("==Biography==", "").strip()
                # print(f"Bio: {biography}")

            if "trivia" in section_title:
                trivia = str(section).replace("== Trivia ==", "").replace("==Trivia==", "").strip()
                # print(f"Trivia: {trivia}")
        except:
            continue

    player_info['bio'] = biography
    player_info['trivia'] = trivia


    player_info_dicts.append(player_info)
    


In [8]:
# Convert to pandas for easier manipulation
player_wiki = pd.DataFrame(player_info_dicts)

In [9]:
# Show all columns currently
player_wiki.columns

Index(['player', 'last_update', 'isretired', 'noteamhist', 'id', 'name',
       'pronoun', 'checkboxAutoImage', 'checkboxShowImage', 'country',
       'residency', 'checkbox-res', 'birth_date_year', 'birth_date_month',
       'birth_date_day', 'checkbox1', 'compID1', 'checkboxComp', 'role',
       'checkboxPrev', 'checkboxIsSub', 'checkboxSharesRole',
       'checkboxAutoTeams', 'stream', 'facebook', 'twitter', 'instagram',
       'youtube', 'checkboxIsPersonality', 'checkboxSuppressOrgNavbox',
       'checkboxSBS', 'checkboxTOCR', 'checkbox2', 'checkbox3', 'bio',
       'trivia', 'snapchat', 'reddit', 'discord', 'otherwikis', 'team',
       'teamhist1', 'teamdate1', 'isretiredplayer', 'nativename',
       'namealphabet', 'compID2', 'low_content', 'checkboxIsLowercase',
       'checkboxPlatformPrev', 'esea', '5ewin', 'b5csgo', 'steam',
       'nationality', 'sponsor', 'site', 'teamrole1', 'pronly', 'isinactive',
       'tracker', 'teamrole2', 'prev1', 'image', 'text', 'usercountry',
  

In [10]:
# Remove columns we don't need
player_wiki_filtered = player_wiki[['player', 'bio', 'trivia', 'isretired', 'name', 'pronoun', 'country', 'residency', 'birth_date_year', 'role', 'stream', 'facebook', 'twitter', 'instagram', 'youtube', 'nativename', 'last_update']]

# Rename columns to make it easier
player_wiki_filtered = player_wiki_filtered.rename(columns={'player': 'Player', 'isretired': 'is_retired', 'nativename': 'native_name'})
# player_wiki_filtered



In [11]:
# Save to json
# fielpath = "Cleaned-Jsons/"
filepath = "Z:\\VCT HACK\\Cleaned Jsons\\"

player_wiki_filtered.to_json(filepath+"players-wiki.json")

In [12]:
# Determine how many players from Riot data have wiki pages
# filepath = "shortlist-model-concept/"
# filename = "all-players-stats.json"

riot_data = pd.read_json(filepath+filename)
riot_data.head()

NameError: name 'filename' is not defined

In [64]:
# Inner join to remove rows that don't have wiki pages

combined_fandom_riot = pd.merge(riot_data, player_wiki_filtered, on='Player', how='inner')
# combined_fandom_riot

NameError: name 'riot_data' is not defined

In [63]:
# combined_fandom_riot.describe(include='all')

## Get Valorant Game Information (Agents, Maps, Abilities, etc)
---
Give more context to the LLM to understand Valorant.

### Filter wiki entries to only be game info pages

In [13]:
# Filter and keep only wiki info entries
game_wiki_pages = []

for page_entry in val_pages.find_all('page'):

    """
    Namespaces:
    0 -> All wiki entries such as Agents, Maps, About content, etc
    10 -> Tables and templates. Some pages are linked to tables and they are also in this namespace (Ex: Agent Table)
    
    Some of Namespace 10 are not useful. Look over and filter out what tables we actually want by name (Ex: Template:Agent Table)
    
    """
    if page_entry.find('ns').text in ['0', '10']:   
        game_wiki_pages.append(page_entry)

#game_wiki_pages, len(game_wiki_pages)

In [17]:
# this code is so inefficient so much repetition is actually bruns my eyes to look at but i am lazy
def parse_ability(template, sections):

    ability_dict = {}

    for row in template.params:
        ability_dict[str(row.name)] = (row.value).strip()

        for section in sections:

                try:            
                    section_title = section.filter_headings()[0].title.strip().lower()

                    if "summary" in section_title or 'description' in section_title:
                        summary = str(section).replace("== Summary ==", "").replace("==Summary==", "").replace("== Description ==", "").replace("==Description==", "").strip()
                        ability_dict['summary'] = summary
                     

                    elif "stats" in section_title in section_title:
                        stats = str(section).replace("== Stats ==", "").replace("==Stats==", "").strip()
                        ability_dict['stats'] = stats

                except:
                    continue     

    return ability_dict

def parse_map(template, sections):
     
    map_dict = {}

    for row in template.params:
        map_dict[str(row.name)] = (row.value).strip()

        for section in sections:

                try:            
                    section_title = section.filter_headings()[0].title.strip().lower()

                    if "features" in section_title:
                        features = str(section).replace("== Features ==", "").replace("==Features==", "").strip()
                        map_dict['features'] = features
                     

                    elif "trivia" in section_title:
                        trivia = str(section).replace("== Trivia ==", "").replace("==Trivia==", "").strip()
                        map_dict['trivia'] = trivia

                except:
                    continue     

    return map_dict

def parse_map(template, sections):
     
    map_dict = {}

    for row in template.params:
        map_dict[str(row.name)] = (row.value).strip()

        for section in sections:

                try:            
                    section_title = section.filter_headings()[0].title.strip().lower()

                    if "features" in section_title:
                        features = str(section).replace("== Features ==", "").replace("==Features==", "").strip()
                        map_dict['features'] = features
                     

                    elif "trivia" in section_title:
                        trivia = str(section).replace("== Trivia ==", "").replace("==Trivia==", "").strip()
                        map_dict['trivia'] = trivia

                except:
                    continue     

    return map_dict

def parse_weapon(template, sections):
     
    weapon_dict = {}

    for row in template.params:
        weapon_dict[str(row.name)] = (row.value).strip()

        for section in sections:

                try:            
                    section_title = section.filter_headings()[0].title.strip().lower()

                    if "summary" in section_title:
                        summary = str(section).replace("== Summary ==", "").replace("==Summary==", "").strip()
                        weapon_dict['summary'] = summary
                     

                    elif "stats" in section_title:
                        stats = str(section).replace("== Stats ==", "").replace("==Stats==", "").strip()
                        weapon_dict['stats'] = stats

                except:
                    continue     

    return weapon_dict

Abilities

In [18]:
abilities = []
maps = []
weapons = []

# Extract data from wiki pages
for page in game_wiki_pages:
    
    # Print player page data
    ability_dict = {}

    page_text = mw.parse(page.find('text').text)

    # print(page_text)

    templates = page_text.filter_templates()
    sections = page_text.get_sections()
    
    index = 0

    if not templates:
        continue

    for template in templates:

        if template.name.strip() == "Infobox ability" :
            ability_dict = parse_ability(template, sections)
            abilities.append(ability_dict)

        elif template.name.strip() == 'Infobox map':
            map_dict = parse_map(template, sections)
            maps.append(map_dict)

        elif template.name.strip() == 'Infobox weapon':
            weapon_dict = parse_weapon(template, sections)
            weapons.append(weapon_dict)

In [47]:
abilities_df = pd.DataFrame(abilities)
maps_df = pd.DataFrame(maps)
weapons_df = pd.DataFrame(weapons)

In [112]:
cleaned_columns = [column.strip() for column in abilities_df.columns]
abilities_df.columns = cleaned_columns


In [113]:
abilities_df.columns

Index(['title', 'summary', 'stats', 'image', 'caption', 'description', 'agent',
       'type', 'function', 'friendlyfire', 'creds', 'uses', 'key',
       'navigation', 'title', 'image', 'description', 'agent', 'type',
       'health', 'fulleffects', 'restock', 'points', 'mapobjects', 'restock',
       'function', 'uses_per_round', 'health', 'credits', 'uses_per_round',
       'caption', 'credits', 'creds', 'points', 'key', 'uses', 'killfeed'],
      dtype='object')

In [114]:
cleaned_abilities_df = abilities_df[['title', 'summary', 'stats','description', 'agent', 'type', 'function', 'friendlyfire', 
                             'creds', 'uses', 'type', 'health', 'restock', 'points', 'mapobjects', 'function', 'uses_per_round', 'creds']]

In [115]:
columns = cleaned_abilities_df

duplicate_columns = [col for col in set(columns) if list(columns).count(col) > 1]

print("Duplicate columns:", duplicate_columns)

Duplicate columns: ['uses', 'uses_per_round', 'function', 'restock', 'agent', 'creds', 'description', 'points', 'title', 'health', 'type']


In [116]:
def deduplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df


renamed_df = deduplicate_columns(cleaned_abilities_df)

renamed_df.columns

def compare_columns(column1, column2):

    identical = True
    indexes = []

    for i in range(0,len(column1)):

        if pd.isna(column1.iloc[i]) and pd.isna(column2.iloc[i]):
            continue
        
        if column1.iloc[i] != column2.iloc[i]:
            identical = False
            indexes.append(i)

    if identical:
        print("Identical columns")

    else:
        print("Different rows: ")

        for index in indexes:
            
            print(f"Column 1: {column1.iloc[index]} Column 2: {column2.iloc[index]}")

In [82]:
cleaned_abilities_df = deduplicate_columns(cleaned_abilities_df)

for column in duplicate_columns:

    print(compare_columns())

Identical columns
None
Identical columns
None
Identical columns
None


In [93]:
cleaned_abilities_df = renamed_df.drop(['function.1', 'type.1', 'creds.1'], axis=1)

In [94]:
print(cleaned_abilities_df)

           title                                            summary  \
0      Curveball  Curveball is a [[Blind]] ability that [[Phoeni...   
1            NaN  Slow Orb is a [[Crowd Control]] ability that [...   
2            NaN  Barrier Orb is primarily a [[Barrier]] ability...   
3            NaN  Healing Orb is a [[Sustain]] ability that [[Sa...   
4            NaN  Resurrection is a [[Sustain]] ability that [[S...   
..           ...                                                ...   
97     Razorvine  Razorvine is a [[Crowd Control]] ability that ...   
98         Shear  Shear is primarily an [[Intel]] ability that V...   
99      Arc Rose  Arc Rose is primarily a [[Blind]] ability that...   
100          NaN  Steel Garden is a [[Limiter]] ability that Vys...   
101  Stim Beacon                                                NaN   

                                                 stats  \
0    {| class="wikitable" style="text-align:left"\n...   
1    {| class="wikitable" style