In [None]:
import requests
import json
import os
import pandas as pd


# Data Fetching

In [None]:
# function for getting the species entries of all the pokemon in a generation (int)
def get_gen_pokemon(generation):
    """Return the pokemon species of one generation, ordered by species id.

    Args:
        generation: Generation number, interpolated into the PokeAPI
            ``/generation/{generation}`` URL.

    Returns:
        The ``pokemon_species`` list from the response, sorted ascending by
        the numeric id that ends each entry's ``url``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    response = requests.get(
        f"https://pokeapi.co/api/v2/generation/{generation}",
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Surface HTTP failures as such, instead of as a confusing JSON decode
    # error or KeyError further down.
    response.raise_for_status()
    species = response.json()["pokemon_species"]

    def species_id(entry):
        # The id is the last path segment of the species url, e.g. ".../25/".
        return int(entry["url"].rstrip("/").rsplit("/", 1)[-1])

    return sorted(species, key=species_id)

In [None]:
# build the flat record we keep for a single pokemon
def generate_pokemon_model(generation, name, id, pokemon_data):
    """Assemble the model dict stored for one pokemon.

    Args:
        generation: Generation the pokemon belongs to.
        name: Pokemon name.
        id: Pokemon id.
        pokemon_data: Detail payload; each item of its ``"types"`` list is
            read as ``item["type"]["name"]``.

    Returns:
        A dict with the keys ``name``, ``id``, ``generation`` and ``types``
        (the list of type names, in payload order).
    """
    type_names = [slot["type"]["name"] for slot in pokemon_data["types"]]
    return {
        "name": name,
        "id": id,
        "generation": generation,
        "types": type_names,
    }

In [None]:
# Some pokemon have broken urls. Each one we hit is recorded here so the
# maintainers of pokeapi can be told which routes are broken.
broken_endpoints = []

# Accumulator for the model of every pokemon we fetch.
pokemon_list = []

In [None]:
# now we need a function that can fetch all the generational data and use that to grab all the detailed
# pokemon data within that generation. Along the way, when we encounter a broken
# url, we'll note that and use a fallback endpoint to get that pokemon's data
def get_all_pokemon_data(gen_num):
    """Fetch every pokemon of one generation and append its model to ``pokemon_list``.

    Each pokemon is first requested by name. If that response body is not
    valid JSON, the pokemon is requested again by the numeric id taken from
    its species url, and the broken name-based endpoint is recorded in
    ``broken_endpoints``.

    Args:
        gen_num: Generation number passed to ``get_gen_pokemon``.

    Side effects:
        Appends to the module-level ``pokemon_list`` and ``broken_endpoints``
        lists and prints a summary line.

    Raises:
        requests.exceptions.JSONDecodeError: If the fallback response is not
            valid JSON either. Nothing is appended for that pokemon.
    """
    generation = get_gen_pokemon(gen_num)

    # loop through each pokemon in the generation
    for pokemon in generation:
        name = pokemon["name"]
        # first try the name-based endpoint for the pokemon's data
        url = f"https://pokeapi.co/api/v2/pokemon/{name}"
        response = requests.get(url, timeout=30)
        try:
            pokemon_data = response.json()
        except requests.exceptions.JSONDecodeError:
            # The species id is the last path segment of the species url.
            pokemon_id = pokemon["url"].rstrip("/").rsplit("/", 1)[-1]
            fallback_url = f"https://pokeapi.co/api/v2/pokemon/{pokemon_id}"
            response = requests.get(fallback_url, timeout=30)
            # If the fallback is broken too, let the error propagate.
            pokemon_data = response.json()
            broken_endpoints.append(
                {
                    "name": name,
                    "generation": gen_num,
                    "broken_url": url,
                    "fallback_url": fallback_url,
                }
            )

        # Deliberately NOT in a `finally:` block. `finally` also runs when the
        # fallback fails, which would store the previous pokemon's data under
        # this pokemon's name (or raise UnboundLocalError on the first one).
        pokemon_model = generate_pokemon_model(
            gen_num, name, pokemon_data["id"], pokemon_data
        )
        pokemon_list.append(pokemon_model)
    print(f"Done fetching all {len(generation)} pokemon data for Generation {gen_num}")

In [None]:
# now we can combine all the functions together to get all the
# pokemon data for every generation (1 through 9)

# Empty the accumulators in place first: get_all_pokemon_data only appends,
# so re-running this cell would otherwise store every pokemon twice.
pokemon_list.clear()
broken_endpoints.clear()

for gen_num in range(1, 10):
    get_all_pokemon_data(gen_num)

In [None]:
# sanity check: print how many pokemon we collected (we expect all 1017),
# then print the whole list as a markdown table
print(len(pokemon_list))
pokemon_list_df = pd.DataFrame(pokemon_list)
print(pokemon_list_df.to_markdown())

In [None]:
# print how many broken routes we recorded, then print them as a markdown table
print(len(broken_endpoints))
broken_endpoints_df = pd.DataFrame(broken_endpoints)
print(broken_endpoints_df.to_markdown())

In [None]:
# save the pokemon_list and broken_endpoints as json files

# Create the output directory on a fresh checkout; open(..., "w") does not
# create missing parent directories and would raise FileNotFoundError.
os.makedirs("./data", exist_ok=True)

with open("./data/pokemon_list.json", "w") as f:
    json.dump(pokemon_list, f, indent=4)

with open("./data/broken_endpoints.json", "w") as f:
    json.dump(broken_endpoints, f, indent=4)

# Data Clean-Up

In [None]:
# reload the saved pokemon_list.json in two forms: the plain parsed JSON
# (all_pokemon_dict) and a pandas dataframe (all_pokemon_df)
with open('./data/pokemon_list.json') as f:
    all_pokemon_dict = json.load(f)
all_pokemon_df = pd.read_json('./data/pokemon_list.json')
all_pokemon_df

In [None]:
# Inspect the dtype of every column. The "types" column was saved as a list of
# type names per pokemon (some pokemon have one type, others have two), so it
# should show up here with the generic `object` dtype rather than a scalar one.
all_pokemon_df.dtypes

In [None]:
# a list-valued column isn't ideal for the analysis we want to do later, so this
# helper flattens a pokemon's "types" list into separate 'type1' / 'type2' fields
def normalize_pokemon_model(pokemon):
    """Flatten one pokemon model so its types become two scalar fields.

    Args:
        pokemon: Mapping with the keys ``name``, ``id``, ``generation`` and
            ``types`` (a non-empty sequence of type names).

    Returns:
        A dict with ``name``, ``id`` and ``generation`` copied over, ``type1``
        set to the first type and ``type2`` set to the second type, or to an
        empty string when there is only one.
    """
    record = {field: pokemon[field] for field in ('name', 'id', 'generation')}
    type_names = pokemon['types']
    record['type1'] = type_names[0]
    record['type2'] = type_names[1] if len(type_names) > 1 else ""
    return record

In [None]:
# run every record of the original parsed JSON through our helper
# to get one flattened record per pokemon
normalized = list(map(normalize_pokemon_model, all_pokemon_dict))

In [None]:
# rebuild the dataframe from the flattened records; it should now have
# "type1" & "type2" columns instead of the "types" column
all_pokemon_df = pd.DataFrame(normalized)
all_pokemon_df

In [None]:
# Load the saved broken_endpoints.json into a dataframe as well. This rebinds
# broken_endpoints_df (previously built from the in-memory list) to the data on
# disk. Each saved record has the keys "name", "generation", "broken_url" and
# "fallback_url".
broken_endpoints_df = pd.read_json('./data/broken_endpoints.json')
broken_endpoints_df

In [None]:
# we eventually want to join the broken_endpoints data onto the all_pokemon data;
# for that we only need the name and the fallback url, so drop the other columns
broken_endpoints_df_cleaned = broken_endpoints_df.drop(columns=['generation', 'broken_url'])
broken_endpoints_df_cleaned

In [None]:
# left-join the cleaned broken-endpoint rows onto the pokemon table by name,
# so every pokemon is kept and only the broken ones get a fallback_url
merged_all_pokemon = pd.merge(
    all_pokemon_df,
    broken_endpoints_df_cleaned,
    how="left",
    on="name",
)
merged_all_pokemon

In [None]:
# we can check that the broken links data merged correctly by validating
# that there are 28 records in the dataframe with no NaN in any column
merged_all_pokemon.dropna().shape[0]

In [None]:
# now we should correct all the NaN urls: every pokemon without a recorded
# fallback gets its regular name-based endpoint instead.
# DataFrame.set_value was removed in pandas 1.0, so fill the column in one
# vectorised step rather than row by row.
default_urls = "https://pokeapi.co/api/v2/pokemon/" + merged_all_pokemon['name'].astype(str)
merged_all_pokemon['fallback_url'] = merged_all_pokemon['fallback_url'].fillna(default_urls)
merged_all_pokemon
        