<a href="https://colab.research.google.com/github/hazem-antar/Web-Scraping-and-Data-Analysis/blob/main/Scandiweb_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gathering Data: Web Scraping

In [None]:
#Importing libraries
!pip install colorthief
from bs4 import BeautifulSoup 
import urllib.request
import pandas as pd
import numpy as np
from time import sleep
from tqdm import tqdm
import re
from google.colab import files
import io
import warnings
from colorthief import ColorThief
from sklearn.neighbors import DistanceMetric

#Silence every warning raised through warnings.warn for the rest of the session
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


warnings.warn = warn   #Swap the stdlib hook so later warnings.warn(...) calls do nothing

In [None]:
#Scraper for collecting the URL of every Pokemon source page
opener = urllib.request.build_opener()               #URL opener reused by the scraping cells below
opener.addheaders = [('User-agent', 'Mozilla/5.0')]  #Browser-like header, because some pages deny requests coming from an automated script

#Download the National Pokedex list page; the connection is closed as soon as the HTML has been read
with opener.open('https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number') as page:
  page_html = page.read().decode('utf-8')

page_soup = BeautifulSoup(page_html, 'html.parser')                 #Parse the HTML string into a tree of tags
tables_div = page_soup.find("div", {"class": "mw-parser-output"})   #Div that holds the page content
tables = tables_div.find_all('table', recursive=False)[1:-1]        #Direct child tables, without the first and the last one

Pokemon_pages = []
for table in tables:
  tables_rows = table.find_all("tr")[1:]    #Table rows (one per listed Pokemon form) without the header row
  for row in tables_rows:
    tables_a = row.find("a")                #First <a> of the row; its "href" is the path of the Pokemon page
    if tables_a is not None:
      #Strip the leading "/" of the href so the joined URL does not contain "//" after the host name
      Pokemon_pages.append("https://bulbapedia.bulbagarden.net/" + tables_a["href"].lstrip("/"))

Pokemon_pages = list(dict.fromkeys(Pokemon_pages))    #Remove duplicate URLs while keeping the original order
assert len(Pokemon_pages) == 905, "expected 905 distinct Pokemon page URLs, found " + str(len(Pokemon_pages))

In [None]:
def identify_color(Img_URL, Possible_Colors):
  """Pick the colour label that best matches a Pokemon appearance (variant) image.

  - Input: Img_URL, the URL of the image to analyse, and Possible_Colors, an iterable of
    candidate colour names to choose from.
  - Output: the candidate whose reference RGB value has the smallest euclidean distance to
    the dominant colour of the image. Ties keep the earliest candidate. Returns "" when no
    candidate is a known colour name (including an empty Possible_Colors).
  - Method: the dominant colour is extracted with the ColorThief library and compared with
    the reference RGB value of every candidate.

  Candidates that are not keys of the reference palette are skipped instead of raising KeyError.
  """
  #Reference RGB values of the main Pokemon colours
  colors_data = {'Yellow': (243,215,124), 'Black': (79,83,78), 'Purple': (162,136,188), 'White': (223,229,241), 'Blue': (0,96,156),
                 'Brown': (189,134,78), 'Red': (218,111,90), 'Green': (142,184,118), 'Gray': (154,156,177), 'Pink': (248,216,219)}

  #Download the image; the context manager closes the HTTP response once the bytes are in memory
  req = urllib.request.Request(Img_URL, headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'} )
  with urllib.request.urlopen(req) as response:
    image_file = io.BytesIO(response.read())    #Seekable in-memory file object handed to ColorThief

  dominant_color = ColorThief(image_file).get_color(quality=1)   #Dominant (theme) colour of the image as an RGB tuple

  identified_color = ""
  least = np.inf                    #Smallest distance seen so far
  for color in Possible_Colors:
    if color not in colors_data:    #Unknown colour name: nothing to compare against
      continue
    distance = np.linalg.norm(np.subtract(dominant_color, colors_data[color]))   #Euclidean distance in RGB space
    if distance < least:            #Strict comparison, so the first of several equally close colours wins
      least = distance
      identified_color = color
  return identified_color

Collecting colorthief
  Downloading colorthief-0.2.1-py2.py3-none-any.whl (6.1 kB)
Installing collected packages: colorthief
Successfully installed colorthief-0.2.1


In [None]:
 #Main Dataframe holding the Pokemons data
Pokemons_df = pd.DataFrame(
    columns=(
        ["NUMBER", "CODE", "SERIAL", "NAME"]                                  #Identification
        + ["TYPE1", "TYPE2", "COLOR"]                                         #Type/s and colour
        + ["ABILITY1", "ABILITY2", "ABILITY HIDDEN"]                          #Abilities
        + ["GENERATION", "LEGENDARY", "MYTHICAL", "MEGA_EVOLUTION"]           #Classification
        + ["HEIGHT", "WEIGHT"]                                                #Measures
        + ["HP", "ATK", "DEF", "SP_ATK", "SP_DEF", "SPD", "TOTAL"]            #Stats
        + ["MOVES"]                                                           #Per-Pokemon moves table
    )
)

In [None]:
#Pokemon Page Scraper
MOVE_COLUMNS = ["move", "type", "category", "power"]
STAT_COLUMNS = ["HP", "ATK", "DEF", "SP_ATK", "SP_DEF", "SPD", "TOTAL"]
GENERATION_LAST_NUMBERS = [151, 251, 386, 493, 649, 721, 809]   #Highest Pokemon number assigned to generations 1 to 7

#Index of the "move" cell for every supported learnset section id; type, category and power are read from the
#three cells that follow. Sections listing two indices exist in two table layouts: the second index is tried
#when reading at the first one fails.
MOVE_CELL_OFFSETS = {
  "By_leveling_up": (1, 2),
  "By_breeding": (1, 2),
  "By_a_prior_evolution": (1, 2),
  "By_events": (1, 2),
  "By_TM/TR": (2,),
  "By_TM": (2,),
  "By_tutoring": (0,),
  "By_transfer_from_another_generation": (0,),
}


def _read_move(cells, first):
  """Read move, type, category and power from the four consecutive cells starting at index `first`.

  Returns a dict with the keys of MOVE_COLUMNS, or None when the power cell holds a placeholder
  (contains "—" or "%"). Raises when a cell is missing or the power is not an integer.
  """
  move = {"move": cells[first].text.strip(),
          "type": cells[first + 1].text.strip(),
          "category": cells[first + 2].text.strip()}
  power_text = cells[first + 3].find(text=True, recursive=False).strip()
  if "—" in power_text or "%" in power_text:
    return None
  move["power"] = int(power_text)
  return move


def _parse_move_row(cells, offsets):
  """Parse one learnset row with the first layout in `offsets`, falling back to the second one (if any) on failure."""
  if len(offsets) == 1:
    return _read_move(cells, offsets[0])
  try:
    return _read_move(cells, offsets[0])
  except Exception:   #Not a bare except: Ctrl-C must still be able to stop the long scraping loop
    return _read_move(cells, offsets[1])


def _is_base_stats_heading(heading):
  """Tell whether a heading tag has no <span> or is titled "Base stats" / "Base Stats"."""
  span = heading.find("span")
  return span is None or span.text in ("Base stats", "Base Stats")


def _variant_measure(rows, variant_name):
  """Read the leading number of a height/weight table for one variant.

  `rows` alternates value rows and name rows. A value row is used when it is the only row of the table or
  when the row after it names `variant_name`. The number is taken from the second cell of the value row,
  up to the first "+" (if present) or the first space. Returns NaN when nothing matches or the value is "???".
  """
  measure = np.nan
  for index in range(0, len(rows), 2):
    if len(rows) == 1 or rows[index + 1].text.replace("\xa0"," ").strip() == variant_name:
      cell_text = rows[index].find_all("td")[1].text
      if "???" in cell_text:
        continue
      separator = "+" if "+" in cell_text else " "
      measure = float(cell_text.strip().split(separator)[0])
  return measure


for page_url in tqdm(Pokemon_pages):      #Fetching each Pokemon page
  with opener.open(page_url) as page:     #Only the download needs the connection; parsing happens after it is closed
    page_html = page.read().decode('utf-8')
  page_soup = BeautifulSoup(page_html, 'html.parser')
  content_div = page_soup.find("div", {"class" : "mw-parser-output"})
  info_table = content_div.find("table", {"class" : "roundy"}, recursive=False)
  Pok_number = int(info_table.find("th", {"class": "roundy"}).find("span").text[1:])  #Pokemon number is common to all variants
  Pok_code = 1    #Variant counter of the current Pokemon, starting at 1
  generation = 1 + sum(Pok_number > last for last in GENERATION_LAST_NUMBERS)   #Numbers above the last boundary fall in generation 8

  #Collecting the Pokemon attack moves (common for all variants)--------------------------------------------------------------------------------
  moves_records = []    #One dict per move; turned into a dataframe once per page
  for label in page_soup.find_all("h4"):
    if label.find("span", id=lambda value: "By_" not in value): continue    #Keep only <h4> whose <span> id contains "By_"
    span_id = label.find("span")["id"]
    offsets = MOVE_CELL_OFFSETS.get(span_id)
    if offsets is None: continue          #Unsupported section: skipped instead of being parsed with the tutoring layout

    #"By_TM/TR" is followed by two tables, every other section by one
    T = []
    next_table = label
    for _ in range(2 if span_id == "By_TM/TR" else 1):
      next_table = next_table.find_next_sibling("table", recursive=False)
      if next_table["class"][0] == "expandable":    #An expandable table wraps the target table
        T.append(next_table.find("table"))
      else:
        T.append(next_table)

    for t in T:
      moves_rows = t.find("tbody").find_all("tr", recursive=False)[1].find("table").find("tbody", recursive=False).find_all("tr", recursive=False)  #Attack moves rows
      for row in moves_rows:
        properties = row.find_all("td", recursive=False)    #Cells of the row
        if not properties: continue                         #Rows without <td> cells carry no move
        move = _parse_move_row(properties, offsets)
        if move is not None and move["power"] != 0:         #Moves without a usable power are not stored
          moves_records.append(move)

  moves_df = pd.DataFrame(moves_records, columns=MOVE_COLUMNS).drop_duplicates().reset_index(drop=True)
  #Attack Moves Storing Ends------------------------------------------------------------------------------------------------------------------

  #Collecting the possible colors for all Pokemon variants: the part before "-" of every category name containing "colored"
  category_names = [key.find("a").text for key in page_soup.find("div", {"id": "mw-normal-catlinks"}).find_all("li")]
  possible_colors = [category.split("-")[0] for category in category_names if "colored" in category]

  #Collecting Pokemon variants names
  variants_rows = info_table.find("table", {"class": "roundy"}).find("tbody").find("table", {"class": "roundy"}).find("tbody").find_all("tr", recursive=False)
  variants = []
  for row in variants_rows:
    if row.has_attr('style') and row["style"] == "display:none;": break   #Stop at the first hidden row
    for small in row.find_all("small"):
      variants.append(small.text.strip())
  if len(variants) == 0:
    variants.append(info_table.find("big").text)    #No variant captions: fall back to the <big> title of the info table
  variants = list(dict.fromkeys(variants))          #Drop duplicate names, keep order

  #Collecting the variant data --------------------------------------------------------------------------------------------
  for var in variants:
    if var == '': continue    #Skip garbage
    #Record of the current variant; unknown values stay NaN, flags default to 0
    variant_data = {'NUMBER': Pok_number, 'CODE': Pok_code, 'SERIAL': str(Pok_number) + str(Pok_code),
                    'NAME': var.replace("\xa0"," ").strip(),    #Some names use "\xa0" instead of a normal space
                    'TYPE1': np.nan, "TYPE2": np.nan, "COLOR": np.nan, "ABILITY1": np.nan, "ABILITY2": np.nan,
                    "ABILITY HIDDEN": np.nan, "GENERATION": generation, "LEGENDARY": 0, "MYTHICAL": 0, "MEGA_EVOLUTION": 0,
                    "HEIGHT": np.nan, "WEIGHT": np.nan, "HP": np.nan, "ATK": np.nan, "DEF": np.nan, "SP_ATK": np.nan,
                    "SP_DEF": np.nan, "SPD": np.nan, "TOTAL": np.nan, "MOVES": moves_df}
    Pok_code += 1           #Increment code value for next variant

    #Extracting the image URL for current variant (leading "/" of the href stripped to avoid "//" after the host)
    image_page_url = "https://bulbapedia.bulbagarden.net/" + info_table.find("a", {"class":"image", "title": var})["href"].lstrip("/")
    with opener.open(image_page_url) as image_page:
      image_page_soup = BeautifulSoup(image_page.read().decode('utf-8'), 'html.parser')
    Var_Image_URL = "https:" + image_page_soup.find("div", {"class":"fullImageLink"}).find("a")["href"]

    #Determining the color of the image using the implemented function "identify_color"
    if not possible_colors or possible_colors[0] == "Unknown":   #No colour category, or only the "Unknown" one
      variant_data["COLOR"] = "Unknown"
    else:
      variant_data["COLOR"] = identify_color(Var_Image_URL, possible_colors)

    #Determining the type/s of the variant
    types_table = content_div.find_all("table", recursive=False)[1].find("tbody")
    types_rows = types_table.find_all("tr", recursive=False)[1].find("td").find("table").find("tbody").find_all("tr", recursive=False)
    for row in types_rows:
      for col in row.find_all("td", recursive=False):
        if col.has_attr('style') and "display: none;" in col["style"]: continue
        name = col.find("small")
        if name is None or name.text.replace("\xa0"," ").strip() == variant_data["NAME"]:
          types = col.find_all("b")
          variant_data["TYPE1"] = types[0].text.strip()
          if len(types) == 2:
            variant_data["TYPE2"] = types[1].text.strip()

    #Determining the variant abilities
    abilities_table = info_table.find("a", {"title": "Ability"}).find_parent('b').find_parent('td').find("table", recursive=False)
    abilities_table_rows = abilities_table.find("tbody").find_all("tr", style=lambda value: value != "display:none;")
    for row in abilities_table_rows:
      for col in row.find_all("td", recursive=False):
        if col.has_attr('style') and col["style"] == "display: none": continue
        name = col.find("small")
        if name is None or name.text.replace("\xa0"," ").strip() == variant_data["NAME"]:   #Check normal ability
          abilities = col.find_all("span")
          if abilities:     #find_all returns a (possibly empty) list, never None
            variant_data["ABILITY1"] = abilities[0].text.strip()
            if len(abilities) == 2:
              variant_data["ABILITY2"] = abilities[1].text.strip()
        elif (name.text == " Hidden Ability") or (name.text.replace("\xa0"," ").strip() == variant_data["NAME"] + " Hidden Ability"):   #Check hidden ability
          hidden_ability = col.find("span")
          if hidden_ability is not None:
            variant_data["ABILITY HIDDEN"] = hidden_ability.text.strip()

    #Determining if variant is legendary / mythical (link in the first paragraph after the info table)
    legend_check = info_table.find_next_sibling("p").find("a", {"title":"Legendary Pokémon"})
    if legend_check is not None:
      variant_data["LEGENDARY"] = 1
    mythical_check = info_table.find_next_sibling("p").find("a", {"title":"Mythical Pokémon"})
    if mythical_check is not None:
      variant_data["MYTHICAL"] = 1

    #Determining if variant is Mega evoluted
    if "Mega" in var:
      variant_data["MEGA_EVOLUTION"] = 1

    #Determining the height of variant
    height_table = info_table.find("a", {"title": "List of Pokémon by height"}).find_parent('b').find_parent('td').find("table", recursive=False)
    height_table_rows = height_table.find("tbody").find_all("tr", style=lambda value: value != "display:none;")
    variant_data["HEIGHT"] = _variant_measure(height_table_rows, variant_data["NAME"])

    #Determining the weight of variant
    weight_table = info_table.find("a", {"title": "Weight"}).find_parent('b').find_parent('td').find("table", recursive=False)
    weight_table_rows = weight_table.find("tbody").find_all("tr", style=lambda value: value != "display:none;")
    variant_data["WEIGHT"] = _variant_measure(weight_table_rows, variant_data["NAME"])

    #Determining the variant stats
    stats_label = page_soup.find("span", {"class": "mw-headline","id": re.compile("Stats|Base_stats")}).find_parent(re.compile("h3|h4"))
    #c_1 collects the headings after the stats label, c_2 the headings before the next unrelated h3/h4 heading.
    #Both are unbounded on one side, so their intersection bounds the search to the per-variant stat headings.
    #Filtering builds new lists: removing items from a list while iterating over it skips elements.
    c_1 = [heading for heading in stats_label.find_next_siblings(re.compile("h5|h4")) if not _is_base_stats_heading(heading)]
    boundary_label = stats_label.find_next_sibling(re.compile("h3|h4"), text=lambda value: (value != "Base stats") and (value != "Base Stats") and (value not in variants))
    c_2 = [heading for heading in boundary_label.find_previous_siblings(re.compile("h5|h4")) if not _is_base_stats_heading(heading)]
    characteristics_labels = [value for value in c_1 if value in c_2]

    #Case 1: a stats table (optionally under a "Base stats" heading) directly follows the stats label
    after_state = stats_label.find_next().find_next()
    if (((after_state.text == "Base stats") or (after_state.text == "Base Stats")) and (after_state.find_next().find_next().name == "table" or
                                                                                        after_state.find_next().find_next().name == "ul")) or (after_state.name == "table"):
      characteristics = stats_label.find_next_sibling("table").find_all("tr", style=lambda value: value and 'background' in value)
      assert len(characteristics) == 7
      for count, row in enumerate(characteristics):
        variant_data[STAT_COLUMNS[count]] = int(row.find("div", {"style": "float:right"}).text)

    #Case 2: per-variant headings, each followed by its own table or by a sentence pointing to another variant
    for label in characteristics_labels:
      label_text = label.text.strip()
      if (label_text == variant_data["NAME"]) or ("Generation" in label_text) or ("Super Size" in label_text) or ("Male" in label_text):
        characteristic_table = label.find_next_sibling("table")
        state_tables_boundry = boundary_label.find_previous_siblings("table")
        characteristic_table = [table for table in state_tables_boundry if table == characteristic_table]   #Keep the table only if it lies inside the stats section
        if len(characteristic_table) == 0:
          text = label.find_next_sibling("p").text
          for var_name in variants[::-1]:
            if "same" in text and var_name in text.split("has")[1]:
              #Copy the stats of the referenced variant of this same Pokemon; the NUMBER filter prevents
              #picking up a form with an identical name that belongs to another Pokemon
              donor = Pokemons_df.loc[(Pokemons_df['NUMBER'] == Pok_number) & (Pokemons_df['NAME'] == var_name.replace("\xa0"," ").strip())]
              if donor.empty: continue    #Referenced variant not scraped (yet): try the next candidate name
              for stat in STAT_COLUMNS:
                variant_data[stat] = donor[stat].tolist()[0]
              break
        else:
          characteristics = characteristic_table[0].find("tbody").find_all("tr", recursive=False)[2:-1]
          for count, row in enumerate(characteristics):
            variant_data[STAT_COLUMNS[count]] = int(row.find("div", {"style": "float:right"}).text)

    #Appending the variant to the Pokemons dataframe (DataFrame.append was removed in pandas 2.0, pd.concat replaces it).
    #Every value is wrapped in a one-item list so the moves dataframe is stored as a single cell.
    df_temp = pd.DataFrame({column: [value] for column, value in variant_data.items()})
    Pokemons_df = pd.concat([Pokemons_df, df_temp], ignore_index=True)
  # variant data Ends---------------------------------------------------------------------------------

100%|██████████| 905/905 [55:18<00:00,  3.67s/it]


In [None]:
# Preview the last rows of the scraped table (tail() shows five rows by default)
Pokemons_df.tail()

Unnamed: 0,NUMBER,CODE,SERIAL,NAME,TYPE1,TYPE2,COLOR,ABILITY1,ABILITY2,ABILITY HIDDEN,GENERATION,LEGENDARY,MYTHICAL,MEGA_EVOLUTION,HEIGHT,WEIGHT,HP,ATK,DEF,SP_ATK,SP_DEF,SPD,TOTAL,MOVES
1126,902,1,9021,Basculegion,Water,Ghost,Green,Rattled,Adaptability,Mold Breaker,8,0,0,0,3.0,110.0,120,112,65,80,75,78,530,move type category power 0 ...
1127,903,1,9031,Sneasler,Fighting,Poison,Gray,Pressure,,Poison Touch,8,0,0,0,1.3,43.0,80,130,60,40,80,120,510,move type category power 0 ...
1128,904,1,9041,Overqwil,Dark,Poison,Black,Poison Point,Swift Swim,Intimidate,8,0,0,0,2.5,60.5,85,115,95,65,65,85,510,move type category power 0 ...
1129,905,1,9051,Incarnate Forme,Fairy,Flying,Pink,Healer,,Contrary,8,1,0,0,1.6,48.0,74,115,70,135,80,106,580,move type category power...
1130,905,2,9052,Therian Forme,Fairy,Flying,Pink,Overcoat,,,8,1,0,0,1.6,48.0,74,115,110,135,100,46,580,move type category power...


In [None]:
# Summarise the scraped table: column dtypes and non-null counts
Pokemons_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1131 entries, 0 to 1130
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   NUMBER          1131 non-null   object 
 1   CODE            1131 non-null   object 
 2   SERIAL          1131 non-null   object 
 3   NAME            1131 non-null   object 
 4   TYPE1           1127 non-null   object 
 5   TYPE2           1127 non-null   object 
 6   COLOR           1131 non-null   object 
 7   ABILITY1        1118 non-null   object 
 8   ABILITY2        530 non-null    object 
 9   ABILITY HIDDEN  913 non-null    object 
 10  GENERATION      1131 non-null   object 
 11  LEGENDARY       1131 non-null   object 
 12  MYTHICAL        1131 non-null   object 
 13  MEGA_EVOLUTION  1131 non-null   object 
 14  HEIGHT          1129 non-null   float64
 15  WEIGHT          1096 non-null   float64
 16  HP              1113 non-null   object 
 17  ATK             1113 non-null   o

In [None]:
# Show the moves dataframe stored in the MOVES cell of the first row
Pokemons_df.iloc[0]["MOVES"]

Unnamed: 0,move,type,category,power
0,Tackle,Normal,Physical,40
1,Vine Whip,Grass,Physical,45
2,Razor Leaf,Grass,Physical,55
3,Seed Bomb,Grass,Physical,80
4,Take Down,Normal,Physical,90
5,Double-Edge,Normal,Physical,120
6,Solar Beam,Grass,Special,120
7,Magical Leaf,Grass,Special,60
8,Snore,Normal,Special,50
9,Giga Drain,Grass,Special,75


# Analysis of Data

In [None]:
# Column dtypes and non-null counts at the start of the analysis
Pokemons_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1131 entries, 0 to 1130
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   NUMBER          1131 non-null   object 
 1   CODE            1131 non-null   object 
 2   SERIAL          1131 non-null   object 
 3   NAME            1131 non-null   object 
 4   TYPE1           1127 non-null   object 
 5   TYPE2           1127 non-null   object 
 6   COLOR           1131 non-null   object 
 7   ABILITY1        1118 non-null   object 
 8   ABILITY2        530 non-null    object 
 9   ABILITY HIDDEN  913 non-null    object 
 10  GENERATION      1131 non-null   object 
 11  LEGENDARY       1131 non-null   object 
 12  MYTHICAL        1131 non-null   object 
 13  MEGA_EVOLUTION  1131 non-null   object 
 14  HEIGHT          1129 non-null   float64
 15  WEIGHT          1096 non-null   float64
 16  HP              1113 non-null   object 
 17  ATK             1113 non-null   o

In [None]:
#LEGENDARY, MYTHICAL and MEGA_EVOLUTION are yes/no flags, so they are stored as booleans
#NOTE(review): astype('bool') maps every non-empty string (even "0") to True - this assumes the flags hold numeric 0/1, confirm
for flag_column in ["LEGENDARY", "MYTHICAL", "MEGA_EVOLUTION"]:
  Pokemons_df[flag_column] = Pokemons_df[flag_column].astype('bool')

In [None]:
# Number of Pokemons with no stat values (a missing HP is used as the indicator)
Pokemons_df['HP'].isnull().sum()

18

In [None]:
# Rows of the Pokemons whose HP (and therefore stat block) is missing
Pokemons_df.loc[Pokemons_df['HP'].isnull()]

Unnamed: 0,NUMBER,CODE,SERIAL,NAME,TYPE1,TYPE2,COLOR,ABILITY1,ABILITY2,ABILITY HIDDEN,GENERATION,LEGENDARY,MYTHICAL,MEGA_EVOLUTION,HEIGHT,WEIGHT,HP,ATK,DEF,SP_ATK,SP_DEF,SPD,TOTAL,MOVES
4,3,3,33,Gigantamax Venusaur,Grass,Poison,Green,,,Chlorophyll,1,0,0,0,24.0,,,,,,,,,move type category power...
10,6,4,64,Gigantamax Charizard,,,Red,,,Solar Power,1,0,0,0,28.0,,,,,,,,,move type category power 0 ...
15,9,3,93,Gigantamax Blastoise,Water,Unknown,Blue,,,Rain Dish,1,0,0,0,25.0,,,,,,,,,move type category power 0...
77,52,4,524,Gigantamax Meowth,,,Yellow,,,,1,0,0,0,33.0,,,,,,,,,move type category power 0...
137,94,3,943,Gigantamax Gengar,Ghost,Poison,Purple,,,,1,0,0,0,20.0,,,,,,,,,move type category power 0...
597,479,2,4792,Heat Rotom,Electric,Fire,Red,Levitate,,,4,0,0,0,0.3,0.3,,,,,,,,move type category power ...
598,479,3,4793,Wash Rotom,Electric,Water,Red,Levitate,,,4,0,0,0,0.3,0.3,,,,,,,,move type category power ...
599,479,4,4794,Frost Rotom,Electric,Ice,Red,Levitate,,,4,0,0,0,0.3,0.3,,,,,,,,move type category power ...
600,479,5,4795,Fan Rotom,Electric,Flying,Red,Levitate,,,4,0,0,0,0.3,0.3,,,,,,,,move type category power ...
601,479,6,4796,Mow Rotom,Electric,Grass,Red,Levitate,,,4,0,0,0,0.3,0.3,,,,,,,,move type category power ...


Most of the Pokemons without stat values lack them because the stats are not mentioned on their source pages. Only a few cases had a different, one-off HTML structure; these needed to be fixed manually by inserting the values rather than editing the scraper once more.

In [None]:
# A few Pokemons came back with empty stats only because their pages use a one-off HTML layout.
# Their values are entered by hand here, which is simpler than special-casing the whole scraper.
stat_columns = ["HP", "ATK", "DEF", "SP_ATK", "SP_DEF", "SPD", "TOTAL"]
manual_stats = [
    # (names to match in NAME, values in the order of stat_columns)
    (["Heat Rotom", "Wash Rotom", "Frost Rotom", "Fan Rotom", "Mow Rotom"], [50, 65, 107, 105, 107, 86, 520]),
    # NOTE(review): the flower forms receive exactly the same numbers as the Rotom forms above - confirm these are the intended stats
    (["Red Flower", "Yellow Flower", "Orange Flower", "Blue Flower", "White Flower"], [50, 65, 107, 105, 107, 86, 520]),
    (["Aegislash"], [60, 50, 150, 50, 150, 60, 520]),
]
for names, values in manual_stats:
  name_mask = Pokemons_df['NAME'].isin(names)     #every row carrying one of these names is overwritten
  for stat_column, stat_value in zip(stat_columns, values):
    Pokemons_df.loc[name_mask, [stat_column]] = stat_value

# Rows still without HP after the manual fill are dropped, then the index is renumbered from 0
Pokemons_df.dropna(subset=["HP"], inplace= True)
Pokemons_df.reset_index(drop=True, inplace= True)

In [None]:
# Sanity check: number of rows still missing HP
Pokemons_df['HP'].isnull().sum()

0

In [None]:
# Pokemon Type Chart: https://img.pokemondb.net/images/typechart.png
# Lookup is Type_Modifiers[attack_type][defender_type] -> damage multiplier:
# 2 = super effective, 1 = neutral, 0.5 = not very effective, 0 = no effect.
Type_Modifiers = {
    "Fire":{"Fire":0.5, "Water":0.5, "Grass":2, "Electric":1, "Ice":2, "Psychic":1, "Dark":1, "Dragon":0.5, "Fairy":1, "Normal":1, "Fighting":1, "Flying":1, "Ground":1, "Rock":0.5, "Bug":2, "Poison":1, "Ghost":1, "Steel":2},   #Fire is neutral against Ghost (was wrongly 0)
    "Water":{"Fire":2, "Water":0.5, "Grass":0.5, "Electric":1, "Ice":1, "Psychic":1, "Dark":1, "Dragon":0.5, "Fairy":1, "Normal":1, "Fighting":1, "Flying":1, "Ground":2, "Rock":2, "Bug":1, "Poison":1, "Ghost":1, "Steel":1},
    "Grass":{"Fire":0.5, "Water":2, "Grass":0.5, "Electric":1, "Ice":1, "Psychic":1, "Dark":1, "Dragon":0.5, "Fairy":1, "Normal":1, "Fighting":1, "Flying":0.5, "Ground":2, "Rock":2, "Bug":0.5, "Poison":0.5, "Ghost":1, "Steel":0.5},
    "Electric":{"Fire":1, "Water":2, "Grass":0.5, "Electric":0.5, "Ice":1, "Psychic":1, "Dark":1, "Dragon":0.5, "Fairy":1, "Normal":1, "Fighting":1, "Flying":2, "Ground":0, "Rock":1, "Bug":1, "Poison":1, "Ghost":1, "Steel":1},
    "Ice":{"Fire":0.5, "Water":0.5, "Grass":2, "Electric":1, "Ice":0.5, "Psychic":1, "Dark":1, "Dragon":2, "Fairy":1, "Normal":1, "Fighting":1, "Flying":2, "Ground":2, "Rock":1, "Bug":1, "Poison":1, "Ghost":1, "Steel":0.5},
    "Psychic":{"Fire":1, "Water":1, "Grass":1, "Electric":1, "Ice":1, "Psychic":0.5, "Dark":0, "Dragon":1, "Fairy":1, "Normal":1, "Fighting":2, "Flying":1, "Ground":1, "Rock":1, "Bug":1, "Poison":2, "Ghost":1, "Steel":0.5},
    "Dark":{"Fire":1, "Water":1, "Grass":1, "Electric":1, "Ice":1, "Psychic":2, "Dark":0.5, "Dragon":1, "Fairy":0.5, "Normal":1, "Fighting":0.5, "Flying":1, "Ground":1, "Rock":1, "Bug":1, "Poison":1, "Ghost":2, "Steel":1},
    "Dragon":{"Fire":1, "Water":1, "Grass":1, "Electric":1, "Ice":1, "Psychic":1, "Dark":1, "Dragon":2, "Fairy":0, "Normal":1, "Fighting":1, "Flying":1, "Ground":1, "Rock":1, "Bug":1, "Poison":1, "Ghost":1, "Steel":0.5},
    "Fairy":{"Fire":0.5, "Water":1, "Grass":1, "Electric":1, "Ice":1, "Psychic":1, "Dark":2, "Dragon":2, "Fairy":1, "Normal":1, "Fighting":2, "Flying":1, "Ground":1, "Rock":1, "Bug":1, "Poison":0.5, "Ghost":1, "Steel":0.5},
    "Normal":{"Fire":1, "Water":1, "Grass":1, "Electric":1, "Ice":1, "Psychic":1, "Dark":1, "Dragon":1, "Fairy":1, "Normal":1, "Fighting":1, "Flying":1, "Ground":1, "Rock":0.5, "Bug":1, "Poison":1, "Ghost":0, "Steel":0.5},
    "Fighting":{"Fire":1, "Water":1, "Grass":1, "Electric":1, "Ice":2, "Psychic":0.5, "Dark":2, "Dragon":1, "Fairy":0.5, "Normal":2, "Fighting":1, "Flying":0.5, "Ground":1, "Rock":2, "Bug":0.5, "Poison":0.5, "Ghost":0, "Steel":2},
    "Flying":{"Fire":1, "Water":1, "Grass":2, "Electric":0.5, "Ice":1, "Psychic":1, "Dark":1, "Dragon":1, "Fairy":1, "Normal":1, "Fighting":2, "Flying":1, "Ground":1, "Rock":0.5, "Bug":2, "Poison":1, "Ghost":1, "Steel":0.5},
    "Ground":{"Fire":2, "Water":1, "Grass":0.5, "Electric":2, "Ice":1, "Psychic":1, "Dark":1, "Dragon":1, "Fairy":1, "Normal":1, "Fighting":1, "Flying":0, "Ground":1, "Rock":2, "Bug":0.5, "Poison":2, "Ghost":1, "Steel":2},
    "Rock":{"Fire":2, "Water":1, "Grass":1, "Electric":1, "Ice":2, "Psychic":1, "Dark":1, "Dragon":1, "Fairy":1, "Normal":1, "Fighting":0.5, "Flying":2, "Ground":0.5, "Rock":1, "Bug":2, "Poison":1, "Ghost":1, "Steel":0.5},
    "Bug":{"Fire":0.5, "Water":1, "Grass":2, "Electric":1, "Ice":1, "Psychic":2, "Dark":2, "Dragon":1, "Fairy":0.5, "Normal":1, "Fighting":0.5, "Flying":0.5, "Ground":1, "Rock":1, "Bug":1, "Poison":0.5, "Ghost":0.5, "Steel":0.5},
    "Poison":{"Fire":1, "Water":1, "Grass":2, "Electric":1, "Ice":1, "Psychic":1, "Dark":1, "Dragon":1, "Fairy":2, "Normal":1, "Fighting":1, "Flying":1, "Ground":0.5, "Rock":0.5, "Bug":1, "Poison":0.5, "Ghost":0.5, "Steel":0},
    "Ghost":{"Fire":1, "Water":1, "Grass":1, "Electric":1, "Ice":1, "Psychic":2, "Dark":0.5, "Dragon":1, "Fairy":1, "Normal":0, "Fighting":1, "Flying":1, "Ground":1, "Rock":1, "Bug":1, "Poison":1, "Ghost":2, "Steel":1},
    "Steel":{"Fire":0.5, "Water":0.5, "Grass":1, "Electric":0.5, "Ice":2, "Psychic":1, "Dark":1, "Dragon":1, "Fairy":2, "Normal":1, "Fighting":1, "Flying":1, "Ground":1, "Rock":2, "Bug":1, "Poison":1, "Ghost":1, "Steel":0.5}
}


In [None]:
# Calculating the attacking strength of each Pokemon as the mean of the maximum damages it can cause to other defending Pokemons (considering all the attacker moves)
# Every Pokemon in the table (the attacker itself included) is taken as a defender.
Pokemons_df['STRENGTH_ATK'] = np.nan

# The defender columns are identical for every attacker, so they are read once here
# instead of building a row Series for each (attacker, defender) pair.
defenders = list(zip(Pokemons_df["TYPE1"], Pokemons_df["TYPE2"], Pokemons_df["DEF"], Pokemons_df["SP_DEF"]))

for i in tqdm(range(len(Pokemons_df))):
  attacker = Pokemons_df.iloc[i]
  attacks =  attacker["MOVES"]
  if len(attacks) == 0 : continue     #few Pokemon pages did not mention any attacking moves so we do not calculate attacking strength for them

  # Everything about a move that does not depend on the defender is worked out once per attacker
  usable_moves = []
  for attack_index in range(len(attacks)):
    attack = attacks.iloc[attack_index]
    attack_type = attack["type"]
    attack_category = attack["category"]
    if attack_category == "Physical":
      attack_stat, special = attacker["ATK"], False
    elif attack_category == "Special":
      attack_stat, special = attacker["SP_ATK"], True
    else:
      continue                  #no damage formula for other categories; without this guard the move would be scored with the damage left over from the preceding move
    STAB = 1                    #STAB is Same-Type Attack Bonus (STAB)
    if attacker["TYPE1"] == attack_type or attacker["TYPE2"] == attack_type:
      STAB = 1.5
    usable_moves.append((Type_Modifiers[attack_type], special, attack_stat, attack["power"], STAB))
  if len(usable_moves) == 0 : continue    #only non-damaging moves: treated like a Pokemon without attacking moves (strength stays NaN)

  damages = []                        #list of maximum damages the Pokemon can make to other Pokemons
  for defender_type1, defender_type2, defender_def, defender_sp_def in defenders:
    max_damage = 0
    for type_row, special, attack_stat, attack_power, STAB in usable_moves:
      factor_1 = type_row[defender_type1]     #factor_1 and factor_2 are type modifiers that depend on the attack move type and the defender types
      factor_2 = 1
      if defender_type2 != "Unknown":         #"Unknown" marks a Pokemon without a second type
        factor_2 = type_row[defender_type2]
      modifier_overall = factor_1 * factor_2
      defense_stat = defender_sp_def if special else defender_def     #Special moves use SP_ATK against SP_DEF, Physical moves use ATK against DEF
      #Calculation of damage using the formula mentioned in the (math.miami) website
      #NOTE(review): (200/5+2) is presumably the level term 2*level/5+2 at level 100 - confirm against the source formula
      damage = ((((((200/5+2)*attack_stat*attack_power)/defense_stat)/50)+2)*STAB)*(modifier_overall/10)
      if damage > max_damage:
        max_damage = damage        #maximum damage that the attacker can make to the defender using any of its attacking moves
    damages.append(max_damage)
  Pokemons_df.at[i, 'STRENGTH_ATK'] = sum(damages)/len(damages)   #calculating the mean of maximum damages for that attacker Pokemon

100%|██████████| 1124/1124 [1:19:00<00:00,  4.22s/it]


In [None]:
# Calculating the defending strength of each Pokemon as the mean of the maximum damages it can receive from other attacking Pokemons (considering all the attacker moves)
# Every Pokemon in the table that has usable moves (the defender itself included) is taken as an attacker.
Pokemons_df['STRENGTH_DEF'] = np.nan

# Nothing about an attacker's moves depends on who defends, so every attacker's
# damaging moves are prepared once here instead of once per defender.
prepared_attackers = []
for j in range(len(Pokemons_df)):
  attacker = Pokemons_df.iloc[j]
  attacks =  attacker["MOVES"]
  if len(attacks) == 0 : continue     #Pokemons without any listed move do not take part as attackers
  usable_moves = []
  for attack_index in range(len(attacks)):
    attack = attacks.iloc[attack_index]
    attack_type = attack["type"]
    attack_category = attack["category"]
    if attack_category == "Physical":
      attack_stat, special = attacker["ATK"], False
    elif attack_category == "Special":
      attack_stat, special = attacker["SP_ATK"], True
    else:
      continue    #no damage formula for other categories; without this guard the move would be scored with the damage left over from the preceding move
    STAB = 1      #STAB is Same-Type Attack Bonus (STAB)
    if attacker["TYPE1"] == attack_type or attacker["TYPE2"] == attack_type:
      STAB = 1.5
    usable_moves.append((Type_Modifiers[attack_type], special, attack_stat, attack["power"], STAB))
  if len(usable_moves) > 0:
    prepared_attackers.append(usable_moves)

for i in tqdm(range(len(Pokemons_df))):
  defender = Pokemons_df.iloc[i]
  defender_type1 = defender["TYPE1"]
  defender_type2 = defender["TYPE2"]
  defender_def = defender["DEF"]
  defender_sp_def = defender["SP_DEF"]
  damages = []      #list of maximum damages the Pokemon can receive from other Pokemons
  for usable_moves in prepared_attackers:
    max_damage = 0
    for type_row, special, attack_stat, attack_power, STAB in usable_moves:
      factor_1 = type_row[defender_type1]  #factor_1 and factor_2 are type modifiers that depend on the attack move type and the defender types
      factor_2 = 1
      if defender_type2 != "Unknown":      #"Unknown" marks a Pokemon without a second type
        factor_2 = type_row[defender_type2]
      modifier_overall = factor_1 * factor_2
      defense_stat = defender_sp_def if special else defender_def     #Special moves use SP_ATK against SP_DEF, Physical moves use ATK against DEF
      #Calculation of damage using the formula mentioned in the (math.miami) website
      damage = ((((((200/5+2)*attack_stat*attack_power)/defense_stat)/50)+2)*STAB)*(modifier_overall/10)
      if damage > max_damage:
        max_damage = damage   #maximum damage that the attacker can make to the defender using any of its attacking moves
    damages.append(max_damage)
  if len(damages) > 0:        #stays NaN instead of dividing by zero when no Pokemon has a usable move
    Pokemons_df.at[i, 'STRENGTH_DEF'] = sum(damages)/len(damages)  #calculating the mean of maximum damages for that defending Pokemon

100%|██████████| 1124/1124 [1:33:04<00:00,  4.97s/it]


In [None]:
# Min-max scale both strength columns to [0, 1] and average them into the overall strength.
# The defending score is flipped (1 - scaled value) so that receiving less damage scores higher.
atk_raw = Pokemons_df["STRENGTH_ATK"]
atk_low, atk_high = atk_raw.min(), atk_raw.max()
Pokemons_df["STRENGTH_ATK(norm)"] = ( atk_raw - atk_low ) / ( atk_high - atk_low )

def_raw = Pokemons_df["STRENGTH_DEF"]
def_low, def_high = def_raw.min(), def_raw.max()
Pokemons_df["STRENGTH_DEF(norm)"] = 1 - ( def_raw - def_low ) / ( def_high - def_low )

# Attack and defence carry equal weight
Pokemons_df["OVERALL_STRENGTH"] = 0.5 * Pokemons_df["STRENGTH_ATK(norm)"] + 0.5 * Pokemons_df["STRENGTH_DEF(norm)"]

In [None]:
# Rank the Pokemons from strongest to weakest, then renumber the rows so that position 0 is the strongest
Pokemons_df.sort_values(by="OVERALL_STRENGTH", ascending=False, inplace=True)
Pokemons_df.reset_index(drop=True, inplace=True)

In [None]:
# First five rows of the frame
Pokemons_df.head(5)

Unnamed: 0,NUMBER,CODE,SERIAL,NAME,TYPE1,TYPE2,COLOR,ABILITY1,ABILITY2,ABILITY HIDDEN,GENERATION,LEGENDARY,MYTHICAL,MEGA_EVOLUTION,HEIGHT,WEIGHT,HP,ATK,DEF,SP_ATK,SP_DEF,SPD,TOTAL,STRENGTH_ATK,STRENGTH_DEF,STRENGTH_ATK(norm),STRENGTH_DEF(norm),OVERALL_STRENGTH
0,150,2,1502,Mega Mewtwo X,Psychic,Fighting,Purple,Steadfast,,Unnerve,1,True,False,1,2.3,127.0,106,190,100,154,100,130,780,84.877443,16.663557,1.0,0.971443,0.985722
1,384,2,3842,Mega Rayquaza,Dragon,Flying,Green,Delta Stream,,,3,True,False,1,10.8,392.0,105,180,100,180,100,115,780,80.394007,22.583679,0.945617,0.952353,0.948985
2,383,2,3832,Primal Groudon,Ground,Fire,Red,Desolate Land,,,3,True,False,0,5.0,999.7,100,180,160,150,90,90,770,77.806006,20.712772,0.914225,0.958386,0.936306
3,214,2,2142,Mega Heracross,Bug,Fighting,Blue,Skill Link,,Moxie,2,False,False,1,1.7,62.5,80,185,115,40,105,75,600,77.563818,21.583213,0.911287,0.95558,0.933433
4,150,3,1503,Mega Mewtwo Y,Psychic,Unknown,Purple,Insomnia,,Unnerve,1,True,False,1,1.5,33.0,106,150,70,194,120,140,780,74.684465,20.587141,0.876361,0.958791,0.917576


In [None]:
# Name of the Pokemon in the first row of the frame
Pokemons_df["NAME"].iloc[0]

'Mega Mewtwo X'

The most effective companion Pokemon is Mega Mewtwo X.