In [1]:
import pandas as pd
import re
import time
import requests
import unidecode
import os
import math

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [2]:
class GreatBuilding():
    """Great Building together with the scraped data of all of its levels.

    Attributes:
        name_esp: Spanish name of the building (also used to build the output file name).
        name_ing: English name of the building.
        levels: Sized collection of level objects exposing ``current_level``,
            ``goal_level``, ``total_fp``, ``owner_investment`` and ``p1`` .. ``p5``.
        dataFrame: pandas DataFrame produced by ``createDataFrame`` (``None`` until then).
    """

    # Column order of the generated DataFrame / .csv file
    COLUMNS = [
        "Current Level",
        "Goal Level",
        "Total Fps",
        "Owner Investment",
        "P1 Reward",
        "P2 Reward",
        "P3 Reward",
        "P4 Reward",
        "P5 Reward",
    ]

    def __init__(self, name_esp, name_ing, levels):
        self.name_esp = name_esp
        self.name_ing = name_ing
        self.levels = levels
        self.dataFrame = None
    
    def __str__(self):
        return f"Name ESP: {self.name_esp} --> Name ING: {self.name_ing}\nNº Levels: {len(self.levels)}"
    
    def createDataFrame(self):
        """Build ``self.dataFrame`` with one row per element of ``self.levels``."""
        # Create a list of dictionaries to store the data to create the final DataFrame
        levels_data = [
            {
                "Current Level": level.current_level,
                "Goal Level": level.goal_level,
                "Total Fps": level.total_fp,
                "Owner Investment": level.owner_investment,
                "P1 Reward": level.p1,
                "P2 Reward": level.p2,
                "P3 Reward": level.p3,
                "P4 Reward": level.p4,
                "P5 Reward": level.p5,
            }
            for level in self.levels
        ]

        # Passing the columns explicitly keeps the header even when there are no levels
        self.dataFrame = pd.DataFrame(data=levels_data, columns=self.COLUMNS)
    
    def saveDataFrame(self, output_dir="../data/buildings"):
        """Write ``self.dataFrame`` as ``<output_dir>/<NameInCamelCase>.csv`` (``;`` separated).

        Args:
            output_dir: Directory where the file is stored. It is created when missing.
        """
        # Build the DataFrame on demand instead of failing with an AttributeError on None
        if self.dataFrame is None:
            self.createDataFrame()

        # We format the text to CamelCase, drop every whitespace character and remove accents and problematic characters
        format_name = unidecode.unidecode( "".join(char for char in self.name_esp.title() if not char.isspace()) )

        os.makedirs(output_dir, exist_ok=True)
        self.dataFrame.to_csv(os.path.join(output_dir, f"{format_name}.csv"), index=False, header=True, sep=";", encoding="utf-8")

In [3]:
class Level():
    """Data of a single level of a Great Building.

    Attributes:
        current_level: Level the building is at.
        goal_level: Level the building is being raised to.
        total_fp: Total Fps shown for the level.
        owner_investment: Investment of the owner shown for the level.
        p1 .. p5: Rewards of the positions P1 to P5.
    """

    def __init__(self, current_level, goal_level, total_fp, owner_investment, p1, p2, p3, p4, p5):
        self.current_level = current_level
        self.goal_level = goal_level
        self.total_fp = total_fp
        self.owner_investment = owner_investment
        self.p1, self.p2, self.p3, self.p4, self.p5 = p1, p2, p3, p4, p5

    def __str__(self):
        # One attribute per line ("\n" before "Goal Level" used to be the invalid escape "\G")
        return f"Current Level: {self.current_level} \nGoal Level: {self.goal_level} \nTotal Fps: {self.total_fp} \nOwner Investment: {self.owner_investment} \nP1: {self.p1} \nP2: {self.p2} \nP3: {self.p3} \nP4: {self.p4} \nP5: {self.p5}"

In [4]:
def getDataLevels(min_level, max_level):
    """Scrape the rewards table for every level between ``min_level`` and ``max_level`` (both included).

    Relies on the module-level ``driver`` (browser with the building page loaded) and
    ``GB_level_input`` (the level input field of that page).

    Args:
        min_level: First level typed into the level input field.
        max_level: Last level typed into the level input field.

    Returns:
        List with one object returned by ``getDataFromRewardsTable`` per level, with its
        ``current_level`` and ``goal_level`` attributes filled in.
    """
    levels_data = []
    for level in range(min_level, max_level + 1):
        print(f"Level: {level} of {max_level}") # Showing scraping progress

        # We clear the data input field and set the value of the building level at each step of the iteration
        GB_level_input.clear() 
        GB_level_input.send_keys( str(level) )
        # We obtain the table where the building data appears and we format it in list mode
        rewardsTable = driver.find_element(By.CLASS_NAME, "gbiTable") \
            .text.split("\n")
        
        # We extract the data from the rewards table
        GB_level = getDataFromRewardsTable(rewardsTable)
        
        # We use the iteration variable to get missing attributes.
        # "goal_level" is the attribute read when the DataFrame is created; it used to be
        # left empty because only "next_level" was assigned here.
        GB_level.current_level = level - 1
        GB_level.goal_level = level
        GB_level.next_level = level # Kept as an alias for any code still reading it
        
        levels_data.append(GB_level)  
    
    return levels_data

In [5]:
def getDataFromRewardsTable(rewards_table):
    """Turn the text lines of the rewards table into a ``Level`` object.

    Args:
        rewards_table: List with the text lines of the table. The last line is read as
            the total Fps, the line before it as the owner investment and the lines at
            positions 3, 5, 7, 9 and 11 as the rewards of P1 .. P5.

    Returns:
        ``Level`` whose current and goal levels are empty strings (the caller fills them in).
    """
    def digits_to_int(text):
        # Keep only the digit characters of the line and read them as one integer
        return int( "".join( re.findall(r'\d', text) ) )

    owner_investment = digits_to_int(rewards_table[-2])
    total_fp = digits_to_int(rewards_table[-1])

    # The reward is the second space-separated token of each position line (kept as text)
    p1, p2, p3, p4, p5 = [rewards_table[row].split(" ")[1] for row in (3, 5, 7, 9, 11)]

    return Level("", "", total_fp, owner_investment, p1, p2, p3, p4, p5)

## Step 1: Extraction of the names of the buildings and links of their pages

In [6]:
# Open browser: webdriver_manager provides the path of the chromedriver binary
# NOTE(review): passing the driver path positionally is deprecated in Selenium 4 — confirm the installed version
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(chromedriver_path)
# Element lookups keep retrying for up to 10 seconds before failing
driver.implicitly_wait(10)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/jmsimonramos/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache
  


In [7]:
# We open the main page of the tool to get the data
driver.get("https://foe.tools/es/gb-investment/Space_Needle")
print("WebPage Loaded")
time.sleep(10) # Wait 10 seconds to load the page

# We wait until the cookie button is visible to dismiss the banner
try:
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "css-47sehv"))).click()
# If the cookies banner does not appear, we continue without this step (same handling as in the scraping loop of step 2)
except TimeoutException:
    pass

# We get the div element which contains the list of all buildings
GB_nameList_div = driver.find_element(By.ID, "gbListSelect")

# We select an element in the middle of the page to scroll the window there, so the button that changes the type of filter applied to the building list can be clicked
movment_element = driver.find_element(By.ID, "fieldInvestorPercentage")
webdriver.ActionChains(driver).move_to_element(movment_element).perform()
print("Movement to filter button performed")

# We click the button to change the type of filter of the buildings list
GB_nameList_div.find_element(By.CLASS_NAME, "bg-blueGray-200").click()

# We get the final list of buildings
GB_list = driver.find_element(By.TAG_NAME, "optgroup") \
                .find_elements(By.TAG_NAME, "option")

print("Obtained list of Buildings!")

WebPage Loaded
Movement to filter button performed
Obtained list of Buildings!


In [8]:
# We check that we have all the buildings (42 exist at the time of writing)
total_buildings = len(GB_list)
print(total_buildings)

42


In [9]:
# We get the link to the page of each building
base_url = "https://foe.tools/es/gb-investment/"
GB_list_links = []
for index, building in enumerate(GB_list):
    print(f"Building {index + 1} of {len(GB_list)}")
    # The value of the option is the identifier used in the URL; we read it from the browser only once
    building_id = building.get_attribute("value")
    url = base_url + building_id

    # We make a request to check if the URL is valid.
    # The timeout prevents a stalled connection from blocking the cell forever.
    try:
        request = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f"ERROR. There have a problem with the building: {building_id}\nURL: {url}. Exception: {error}")
        continue

    if request.status_code == 200: # If all is ok, we add the information to the list
        GB_list_links.append(f"{building_id};{url}")
    else: # In case of error, we show a failure message
        print(f"ERROR. There have a problem with the building: {building_id}\nURL: {url}. Status Code: {request.status_code}")

# We store the data in a .csv file to avoid repeating this step
with open ("../data/links.csv", "w", encoding="utf-8") as file:
    file.write("Building;Link\n")
    for line in GB_list_links:
        file.write(line + "\n")

print("Data of buildings and their link stored correctly!!")

Building 1 of 42
Building 2 of 42
Building 3 of 42
Building 4 of 42
Building 5 of 42
Building 6 of 42
Building 7 of 42
Building 8 of 42
Building 9 of 42
Building 10 of 42
Building 11 of 42
Building 12 of 42
Building 13 of 42
Building 14 of 42
Building 15 of 42
Building 16 of 42
Building 17 of 42
Building 18 of 42
Building 19 of 42
Building 20 of 42
Building 21 of 42
Building 22 of 42
Building 23 of 42
Building 24 of 42
Building 25 of 42
Building 26 of 42
Building 27 of 42
Building 28 of 42
Building 29 of 42
Building 30 of 42
Building 31 of 42
Building 32 of 42
Building 33 of 42
Building 34 of 42
Building 35 of 42
Building 36 of 42
Building 37 of 42
Building 38 of 42
Building 39 of 42
Building 40 of 42
Building 41 of 42
Building 42 of 42
Data of buildings and their link stored correctly!!


In [10]:
driver.quit() # We close the browser and end the WebDriver session (close() only closes the current window)

## Step 2: Extraction of the necessary Fps for each level of all buildings

In [11]:
# Open browser: webdriver_manager provides the path of the chromedriver binary
# NOTE(review): passing the driver path positionally is deprecated in Selenium 4 — confirm the installed version
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(chromedriver_path)
# Element lookups keep retrying for up to 10 seconds before failing
driver.implicitly_wait(10)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/jmsimonramos/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache
  


In [12]:
# We read the file written in step 1 ("../data/links.csv"); blank lines are skipped
# because every row must split into exactly [building, link]
with open ("../data/links.csv", "r", encoding="utf-8") as file:
    GB_list_links = [line.strip().split(";") for line in file if line.strip()]
# We remove the first item in the list (it corresponds to the header of the .csv)
GB_list_links = GB_list_links[1:]

In [13]:
for index, (building, link) in enumerate(GB_list_links):
    print(f"Building {index + 1} of {len(GB_list_links)}")
    # Load the page of the building and give it a fixed 10 seconds to render
    driver.get(link)
    time.sleep(10)

    # The cookies banner is only dismissed on the first page that is opened
    if index == 0:
        cookie_button_locator = (By.CLASS_NAME, "css-47sehv")
        try:
            cookie_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(cookie_button_locator))
            cookie_button.click()
        except TimeoutException:
            # The banner did not become clickable in time: carry on without dismissing it
            pass

    # Metadata of the building: the level input field (also used by getDataLevels) and the title
    GB_level_input = driver.find_element(By.ID, "giLevel")
    title_element = driver.find_element(By.CLASS_NAME, "title")
    building_name = title_element.text

    # The range of levels comes from the "min" / "max" attributes of the input field
    min_level = int( GB_level_input.get_attribute("min") )
    max_level = int( GB_level_input.get_attribute("max") )

    print(f"Building: {building_name} \t Min Level: {min_level} \t Max Level: {max_level}")

    # Scrape every level between min level and max level
    levels_data = getDataLevels(min_level, max_level)

    # Underscores in both names are shown as blank spaces
    name_esp = building_name.replace("_", " ")
    name_ing = building.replace("_", " ")
    GB = GreatBuilding(name_esp, name_ing, levels_data)

    # Build the DataFrame of the building and store it on disk
    print("Create DataFrame")
    GB.createDataFrame()
    print("Stored DataFrame")
    GB.saveDataFrame()
    print("================================================\n")

print("All Buildings have been completed satisfactory!!")

All Buildings have been completed satisfactory!!


In [14]:
driver.quit() # We close the browser and end the WebDriver session (close() only closes the current window)

## Step 3: Calculation of the Arc level 80 bonus for all levels

In [15]:
# We get the sorted list of all .csv files in the buildings directory (building_name.csv).
# Any other entry of the directory (e.g. hidden system files) is ignored because every listed file is later read as a .csv
files_list = sorted(file for file in os.listdir("../data/buildings") if file.endswith(".csv"))
files_list[:5]

['AgujaEspacial.csv',
 'Alcatraz.csv',
 'AlmacenDeSemillas.csv',
 'Atomium.csv',
 'BasilicaDeSanMarcos.csv']

In [16]:
# Multiplier used by this notebook for the Arc level 80 bonus
ARC_LEVEL_80_MULTIPLIER = 1.9
# Reward columns of the positions P1 .. P5
REWARD_COLUMNS = [f"P{position} Reward" for position in range(1, 6)]

for file in files_list:
    df = pd.read_csv(f"../data/buildings/{file}", sep=";", encoding="utf-8")
    # We calculate the arc level 80 bonus of every position, rounded up to the next integer
    for column in REWARD_COLUMNS:
        df[f"{column} (Arc Level 80)"] = df[column].apply(lambda x: math.ceil(x * ARC_LEVEL_80_MULTIPLIER) )
    # Save the dataframe with the new data (the original file is overwritten)
    df.to_csv(f"../data/buildings/{file}", sep=";", header=True, index=False, encoding="utf-8")

## Step 4: Creation of a single DataFrame with the levels of all buildings

In [17]:
def formatBuildingName(building_name):
    """Turn a CamelCase file name into a name with blank spaces between its words.

    Every word starts at a capital letter (A-Z) and runs until the next one; characters
    before the first capital letter are discarded. Finally ".csv" is removed.

    Args:
        building_name: File name such as "AgujaEspacial.csv".

    Returns:
        The formatted name, e.g. "Aguja Espacial".
    """
    words = [match.group(0) for match in re.finditer('[A-Z][^A-Z]*', building_name)]
    spaced_name = " ".join(words)
    return spaced_name.replace(".csv", "")

In [18]:
# We get the sorted list of all .csv files in the buildings directory (building_name.csv).
# Any other entry of the directory (e.g. hidden system files) is ignored because every listed file is later read as a .csv
files_list = sorted(file for file in os.listdir("../data/buildings") if file.endswith(".csv"))
files_list[:5]

['AgujaEspacial.csv',
 'Alcatraz.csv',
 'AlmacenDeSemillas.csv',
 'Atomium.csv',
 'BasilicaDeSanMarcos.csv']

In [19]:
# Table with the general information of every building (";" separated file)
buildings_info_path = "../data/buildingsInfo.csv"
building_df = pd.read_csv(buildings_info_path, encoding="utf-8", sep=";")
building_df.head()

Unnamed: 0,Building Name ESP,Building Name ING,Age,Size,Features
0,Aguja Espacial,Space Needle,Edad Moderna,6x5,Producción de monedas: Felicidad
1,Alcatraz,Alcatraz,Era del Progreso,10x7,Unidad Penitenciaria Felicidad
2,Almacen De Semillas,Seed Vault,Futuro Ártico,5x6,Producción de Suministros Manos que ayudan
3,Atomium,Atomium,Edad Moderna,7x6,Bienes del gremio Felicidad
4,Basilica De San Marcos,St Mark S Basilica,Plena Edad Media,6x6,Producción de productos Mejora de monedas


In [20]:
# The DataFrame of every building is collected in a list and concatenated once at the end
# (concatenating inside the loop copies all the accumulated rows at every iteration)
building_frames = []
for file in files_list:
    df = pd.read_csv(f"../data/buildings/{file}", sep=";", encoding="utf-8") # Load data

    # The Spanish name comes from the file name; the English one is looked up in building_df
    building_name_esp = formatBuildingName(file)
    matches = building_df.loc[building_df["Building Name ESP"] == building_name_esp, "Building Name ING"]
    if matches.empty:
        # Fail with an explicit message instead of an IndexError on an empty selection
        raise KeyError(f"Building '{building_name_esp}' (file '{file}') was not found in building_df")
    building_name_ing = matches.iloc[0]

    # Insert the columns with the building names at the first positions
    df.insert(0, "Building Name ESP", building_name_esp)
    df.insert(1, "Building Name ING", building_name_ing)

    building_frames.append(df)

# Concat all the DataFrames (an empty DataFrame is kept when there are no files)
joined_df = pd.concat(building_frames, axis=0) if building_frames else pd.DataFrame()

print(f"Final DataFrame dimensions: {joined_df.shape}")

Final DataFrame dimensions: (6452, 16)


In [21]:
# Store the joined DataFrame as a ";" separated file, with header and without the index column
joined_df.to_csv(
    "../data/ALLBuildingLevels.csv",
    encoding="utf-8",
    header=True,
    index=False,
    sep=";",
)