In [1]:
import io
import os
import re

import pandas as pd
import requests

## Step 1: Enrich the existing general building data

In [2]:
def formatBuildingName(building_name):
    """Turn a CamelCase building file name into a space-separated display name.

    Example: "BasilicaDeSanMarcos.csv" -> "Basilica De San Marcos".

    Parameters
    ----------
    building_name : str
        Name written in CamelCase, optionally ending in ".csv".

    Returns
    -------
    str
        The words of the name joined by single spaces, without the ".csv"
        extension.
    """
    # Remove the extension first, and only when it really is a suffix, so a
    # ".csv" that happens to appear inside the name is left untouched.
    if building_name.endswith(".csv"):
        building_name = building_name[:-len(".csv")]
    # A word is a capital letter (ASCII A-Z) followed by non-capitals. The second
    # alternative only ever matches at the very start and keeps any text that
    # precedes the first capital, so an all-lowercase name is not reduced to "".
    return " ".join(re.findall('[A-Z][^A-Z]*|[^A-Z]+', building_name))

In [3]:
# List the per-building data files (<BuildingName>.csv) in alphabetical order.
# Only *.csv entries are kept: os.listdir also returns hidden files and
# sub-directories (e.g. .ipynb_checkpoints, .DS_Store), and every entry of this
# list is later turned into a building name and paired by position with a row
# of links.csv, so a stray entry would corrupt that pairing.
files_list = sorted(
    file for file in os.listdir("../data/buildings") if file.endswith(".csv")
)
files_list[:5]

['AgujaEspacial.csv',
 'Alcatraz.csv',
 'AlmacenDeSemillas.csv',
 'Atomium.csv',
 'BasilicaDeSanMarcos.csv']

In [4]:
# Display name for every building file, in the same order as files_list.
formated_name_list = list(map(formatBuildingName, files_list))
formated_name_list[:5]

['Aguja Espacial',
 'Alcatraz',
 'Almacen De Semillas',
 'Atomium',
 'Basilica De San Marcos']

In [5]:
# Load the links file: each data row is stripped and split on ";" into a list
# of fields. The first line of the file is the CSV header, so it is skipped.
with open("../tmp/links.csv", "r", encoding="utf-8") as links_file:
    next(links_file, None)  # discard the header row (no-op on an empty file)
    GB_list_links = [row.strip().split(";") for row in links_file]

In [6]:
# Walk both lists in parallel and extend every links row, in place, with the
# building names. Rows are paired purely by position, and zip stops at the
# shorter of the two lists.
# NOTE(review): this mutates GB_list_links, so running the cell twice on the
# same list prepends the name twice.
for spanish_name, link_row in zip(formated_name_list, GB_list_links):
    # The first field of the row is the English name: underscores become
    # spaces and every word is capitalised.
    english_name = link_row[0].replace("_", " ").title()
    # Resulting row layout: [ESP name, ING name, link]
    link_row[:1] = [spanish_name, english_name]

In [7]:
# Store the enriched building rows in a ";"-separated file: one header line
# followed by one line per building.
with open("../tmp/buildings.csv", "w", encoding="utf-8") as buildings_file:
    buildings_file.write("Building Name ESP;Building Name ING;Link\n")
    buildings_file.writelines(";".join(fields) + "\n" for fields in GB_list_links)

## Step 2: Get new complementary building data

### First run (fetch the data and create the DataFrame)

In [8]:
# Fetch the Forge of Empires wiki page that lists all the Great Buildings and
# their features.
main_url = "https://es.wiki.forgeofempires.com/index.php?title=Grandes_Edificios"
# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server; raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the wiki article.
main_request = requests.get(main_url, timeout=30)
main_request.raise_for_status()

# Parse every table found in the page into a DataFrame. The HTML is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
tables_list_df = pd.read_html(io.StringIO(main_request.text))

# The second table (index 1) is the one holding the building data; check the
# page itself if the position ever changes. Its "Unnamed: 0" column only holds
# the building image, so it is dropped.
GB_DF = tables_list_df[1].drop(["Unnamed: 0"], axis = 1)
GB_DF.columns = ["Name", "Age", "Size", "Features"]

# Save the main building information (name, age, size and ability) as a CSV
# so that later runs do not need to query the wiki again.
GB_DF.to_csv("../tmp/GBWiki.csv", sep = ";", header = True, index_label="ID")

GB_DF.head()

Unnamed: 0,Name,Age,Size,Features
0,Observatorio,Sin Era,3x3,Bienes del gremio Bonificación de unidades de ...
1,Templo de las Reliquias,Sin Era,6x6,Caza de Reliquias
2,Oráculo de Delfos,Sin Era,3x3,Producción de Suministros Felicidad
3,Torre de Babel,Edad de Bronce,4x4,Producción de productos Población
4,Estatua de Zeus,Edad de Bronce,2x3,Mejora de ejércitos


### Subsequent runs (the DataFrame has already been saved)

In [9]:
# Load the building data saved from the wiki, order it by building name so it
# can be lined up with our own building list, and keep only the columns we
# need (the name and ID columns are dropped).
# NOTE(review): the rows are sorted on the wiki's names while our list is
# sorted on file names; the two orders must agree for the later positional
# join to be correct - confirm (accents or different wording would break it).
Wiki_GB_df = (
    pd.read_csv("../tmp/GBWiki.csv", sep=";", encoding="utf-8")
    .sort_values(by=["Name"])
    [["Age", "Size", "Features"]]
)
Wiki_GB_df.head()

Unnamed: 0,Age,Size,Features
20,Edad Moderna,6x5,Producción de monedas: Felicidad
19,Era del Progreso,10x7,Unidad Penitenciaria Felicidad
31,Futuro Ártico,5x6,Producción de Suministros Manos que ayudan
21,Edad Moderna,7x6,Bienes del gremio Felicidad
10,Plena Edad Media,6x6,Producción de productos Mejora de monedas


In [10]:
# Load our own building table and keep only the two name columns.
name_columns = ["Building Name ESP", "Building Name ING"]
GB_df = pd.read_csv("../tmp/buildings.csv", sep=";", encoding="utf-8").loc[:, name_columns]
GB_df.head()

Unnamed: 0,Building Name ESP,Building Name ING
0,Aguja Espacial,Space Needle
1,Alcatraz,Alcatraz
2,Almacen De Semillas,Seed Vault
3,Atomium,Atomium
4,Basilica De San Marcos,St Mark S Basilica


In [11]:
# Put the two tables side by side to gather all the information.
# concat(axis=1) aligns rows on their index labels, so the wiki table (whose
# labels were shuffled by the sort) is renumbered with reset_index() first;
# the old labels end up in an "index" column, which is dropped again.
# NOTE(review): the pairing is purely positional - row i of our list is
# assumed to describe the same building as row i of the sorted wiki table.
final_DF = (
    pd.concat([GB_df, Wiki_GB_df.reset_index()], axis=1)
    .drop(columns="index")
    # NOTE(review): GB_df only carries the two name columns, so there appears
    # to be no "Link" column left to rename at this point.
    .rename(columns={"Link": "Link Calculator Tool"})
)
final_DF.head()

Unnamed: 0,Building Name ESP,Building Name ING,Age,Size,Features
0,Aguja Espacial,Space Needle,Edad Moderna,6x5,Producción de monedas: Felicidad
1,Alcatraz,Alcatraz,Era del Progreso,10x7,Unidad Penitenciaria Felicidad
2,Almacen De Semillas,Seed Vault,Futuro Ártico,5x6,Producción de Suministros Manos que ayudan
3,Atomium,Atomium,Edad Moderna,7x6,Bienes del gremio Felicidad
4,Basilica De San Marcos,St Mark S Basilica,Plena Edad Media,6x6,Producción de productos Mejora de monedas


In [12]:
# Persist the combined table as a ";"-separated UTF-8 file, with a header row
# and without the DataFrame index.
final_DF.to_csv(
    "../data/buildingsInfo.csv",
    sep=";",
    encoding="utf-8",
    index=False,
    header=True,
)