In [79]:
import pandas as pd
import json

In [80]:
# Load the trainer CSV; the JSON payload lives in a single text column.
trainer_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\LeLuu\Documents\Python_Practice\Challenge_353\pokemon_trainer.csv",
)

In [81]:
# Give the raw 'name,pokemons' column the shorter name 'poke_data'.
trainer_df = trainer_df.rename(columns={'name,pokemons': 'poke_data'})

In [82]:
# Inspect the raw poke_data column before parsing it.
# Each value starts with the quoted trainer name, followed by a comma and a
# JSON array of Pokémon objects (pokemonNumber / pokemonName).
trainer_df

Unnamed: 0,poke_data
0,"""Ash"",[{""pokemonNumber"":1,""pokemonName"":""Pikac..."
1,"""Brock"",[{""pokemonNumber"":1,""pokemonName"":""Geo..."
2,"""Misty"",[{""pokemonNumber"":1,""pokemonName"":""Hor..."
3,"""Amit"",[{""pokemonNumber"":1,""pokemonName"":""Jolt..."


### Option 1: Split the text, then parse the JSON and flatten
- Step 1: Use a regex to split each row into the trainer name and the JSON text
- Step 2: Parse the JSON text with json.loads
- Step 3: Explode the parsed list so that each Pokémon object gets its own row
- Step 4: Flatten each object's key-value pairs into separate columns
- Step 5: Concatenate the flattened columns with the original data

In [83]:
# Two ways of parsing the data are shown in this notebook.
# Option 1: a regex splits each row into the quoted trainer name (group 1) and
# the remaining JSON text (group 2); the JSON text is parsed in a later step.
name_and_json = trainer_df['poke_data'].str.extract(r'^"(.*?)",(.*)$')
trainer_df['trainer_name'] = name_and_json[0]
trainer_df['pokemons'] = name_and_json[1]

In [84]:
# Check the two new columns produced by the regex split: trainer_name and pokemons.
trainer_df

Unnamed: 0,poke_data,trainer_name,pokemons
0,"""Ash"",[{""pokemonNumber"":1,""pokemonName"":""Pikac...",Ash,"[{""pokemonNumber"":1,""pokemonName"":""Pikachu""},{..."
1,"""Brock"",[{""pokemonNumber"":1,""pokemonName"":""Geo...",Brock,"[{""pokemonNumber"":1,""pokemonName"":""Geodude""},{..."
2,"""Misty"",[{""pokemonNumber"":1,""pokemonName"":""Hor...",Misty,"[{""pokemonNumber"":1,""pokemonName"":""Horsea""},{""..."
3,"""Amit"",[{""pokemonNumber"":1,""pokemonName"":""Jolt...",Amit,"[{""pokemonNumber"":1,""pokemonName"":""Jolteon""},{..."


In [85]:
# Parse each JSON array string into a Python list of dicts.
# The cell overwrites its own input column, so values that are already lists
# (i.e. the cell has been run before) are passed through untouched; without
# this guard a second run raises TypeError because json.loads gets a list.
trainer_df['pokemons'] = trainer_df['pokemons'].apply(
    lambda value: value if isinstance(value, list) else json.loads(value)
)

In [86]:
# Explode the list in the pokemons column: every Pokémon object (one dict of
# key-value pairs) moves to its own row, and the trainer's other columns are
# repeated alongside it.
normalize_pokemon_df = trainer_df.explode(column='pokemons')

In [87]:
# Turn each Pokémon dict into a row of columns, one column per key.
pokemon_dicts = normalize_pokemon_df['pokemons']
pokemon_dicts.apply(pd.Series)

Unnamed: 0,pokemonNumber,pokemonName
0,1,Pikachu
0,2,Butterfree
0,3,Pidgeotto
0,4,Bulbasaur
0,5,Charizard
0,6,Squirtle
1,1,Geodude
1,2,Chansey
1,3,Crobat
1,4,Vulpix


In [88]:
# Put the flattened Pokémon columns side by side with the exploded frame.
pokemon_columns = normalize_pokemon_df['pokemons'].apply(pd.Series)
pd.concat([normalize_pokemon_df, pokemon_columns], axis=1)

Unnamed: 0,poke_data,trainer_name,pokemons,pokemonNumber,pokemonName
0,"""Ash"",[{""pokemonNumber"":1,""pokemonName"":""Pikac...",Ash,"{'pokemonNumber': 1, 'pokemonName': 'Pikachu'}",1,Pikachu
0,"""Ash"",[{""pokemonNumber"":1,""pokemonName"":""Pikac...",Ash,"{'pokemonNumber': 2, 'pokemonName': 'Butterfree'}",2,Butterfree
0,"""Ash"",[{""pokemonNumber"":1,""pokemonName"":""Pikac...",Ash,"{'pokemonNumber': 3, 'pokemonName': 'Pidgeotto'}",3,Pidgeotto
0,"""Ash"",[{""pokemonNumber"":1,""pokemonName"":""Pikac...",Ash,"{'pokemonNumber': 4, 'pokemonName': 'Bulbasaur'}",4,Bulbasaur
0,"""Ash"",[{""pokemonNumber"":1,""pokemonName"":""Pikac...",Ash,"{'pokemonNumber': 5, 'pokemonName': 'Charizard'}",5,Charizard
0,"""Ash"",[{""pokemonNumber"":1,""pokemonName"":""Pikac...",Ash,"{'pokemonNumber': 6, 'pokemonName': 'Squirtle'}",6,Squirtle
1,"""Brock"",[{""pokemonNumber"":1,""pokemonName"":""Geo...",Brock,"{'pokemonNumber': 1, 'pokemonName': 'Geodude'}",1,Geodude
1,"""Brock"",[{""pokemonNumber"":1,""pokemonName"":""Geo...",Brock,"{'pokemonNumber': 2, 'pokemonName': 'Chansey'}",2,Chansey
1,"""Brock"",[{""pokemonNumber"":1,""pokemonName"":""Geo...",Brock,"{'pokemonNumber': 3, 'pokemonName': 'Crobat'}",3,Crobat
1,"""Brock"",[{""pokemonNumber"":1,""pokemonName"":""Geo...",Brock,"{'pokemonNumber': 4, 'pokemonName': 'Vulpix'}",4,Vulpix


### Option 2: Parse JSON and flatten data
- Step 1: Initialize an empty list to store the records
- Step 2: Iterate over each row in the column and wrap it in square brackets so that it is valid JSON
- Step 3: Parse it with json.loads into the trainer name and the list of Pokémon
- Step 4: For each parsed row, flatten every Pokémon together with the trainer name into its own record
- Step 5: Store the records as a pandas DataFrame

In [89]:
# Option 2 (noted by the author as the faster one): parse each row with
# json.loads directly. A raw row is `"name",[...]`, which is not valid JSON on
# its own, so it is wrapped in square brackets to form a two-element JSON list.
records = []

for raw_row in trainer_df['poke_data']:
    # The wrapped row parses to [trainer name, list of Pokémon dicts].
    trainer_name, pokemons = json.loads(f"[{raw_row}]")

    # One flat record per Pokémon, tagged with the trainer it belongs to.
    records.extend(
        {
            'trainer': trainer_name,
            'pokemonNumber': pokemon['pokemonNumber'],
            'pokemonName': pokemon['pokemonName'],
        }
        for pokemon in pokemons
    )

In [90]:
# Show the flattened records: one dict per (trainer, Pokémon) pair.
records

[{'trainer': 'Ash', 'pokemonNumber': 1, 'pokemonName': 'Pikachu'},
 {'trainer': 'Ash', 'pokemonNumber': 2, 'pokemonName': 'Butterfree'},
 {'trainer': 'Ash', 'pokemonNumber': 3, 'pokemonName': 'Pidgeotto'},
 {'trainer': 'Ash', 'pokemonNumber': 4, 'pokemonName': 'Bulbasaur'},
 {'trainer': 'Ash', 'pokemonNumber': 5, 'pokemonName': 'Charizard'},
 {'trainer': 'Ash', 'pokemonNumber': 6, 'pokemonName': 'Squirtle'},
 {'trainer': 'Brock', 'pokemonNumber': 1, 'pokemonName': 'Geodude'},
 {'trainer': 'Brock', 'pokemonNumber': 2, 'pokemonName': 'Chansey'},
 {'trainer': 'Brock', 'pokemonNumber': 3, 'pokemonName': 'Crobat'},
 {'trainer': 'Brock', 'pokemonNumber': 4, 'pokemonName': 'Vulpix'},
 {'trainer': 'Brock', 'pokemonNumber': 5, 'pokemonName': 'Forretress'},
 {'trainer': 'Brock', 'pokemonNumber': 6, 'pokemonName': 'Steelix'},
 {'trainer': 'Misty', 'pokemonNumber': 1, 'pokemonName': 'Horsea'},
 {'trainer': 'Misty', 'pokemonNumber': 2, 'pokemonName': 'Starmie'},
 {'trainer': 'Misty', 'pokemonNumber

In [91]:
# Build a DataFrame from the list of record dicts; dict keys become columns.
records_df = pd.DataFrame(data=records)

In [92]:
# Check the flattened frame: trainer, pokemonNumber and pokemonName columns.
records_df

Unnamed: 0,trainer,pokemonNumber,pokemonName
0,Ash,1,Pikachu
1,Ash,2,Butterfree
2,Ash,3,Pidgeotto
3,Ash,4,Bulbasaur
4,Ash,5,Charizard
5,Ash,6,Squirtle
6,Brock,1,Geodude
7,Brock,2,Chansey
8,Brock,3,Crobat
9,Brock,4,Vulpix


## Add the Pokédex dataset to look up each Pokémon's types

In [93]:
# Load the pokedex reference table used to look up each Pokémon's types.
pokedex_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\LeLuu\Documents\Python_Practice\Challenge_353\pokedex_(Update_05.20).csv",
)

In [94]:
# Preview the pokedex reference table loaded from the CSV.
pokedex_df

Unnamed: 0.1,Unnamed: 0,pokedex_number,name,german_name,japanese_name,generation,status,species,type_number,type_1,...,against_ground,against_flying,against_psychic,against_bug,against_rock,against_ghost,against_dragon,against_dark,against_steel,against_fairy
0,0,1,Bulbasaur,Bisasam,フシギダネ (Fushigidane),1,Normal,Seed Pokémon,2,Grass,...,1.0,2.0,2.0,1.00,1.00,1.0,1.0,1.0,1.0,0.5
1,1,2,Ivysaur,Bisaknosp,フシギソウ (Fushigisou),1,Normal,Seed Pokémon,2,Grass,...,1.0,2.0,2.0,1.00,1.00,1.0,1.0,1.0,1.0,0.5
2,2,3,Venusaur,Bisaflor,フシギバナ (Fushigibana),1,Normal,Seed Pokémon,2,Grass,...,1.0,2.0,2.0,1.00,1.00,1.0,1.0,1.0,1.0,0.5
3,3,3,Mega Venusaur,Bisaflor,フシギバナ (Fushigibana),1,Normal,Seed Pokémon,2,Grass,...,1.0,2.0,2.0,1.00,1.00,1.0,1.0,1.0,1.0,0.5
4,4,4,Charmander,Glumanda,ヒトカゲ (Hitokage),1,Normal,Lizard Pokémon,1,Fire,...,2.0,1.0,1.0,0.50,2.00,1.0,1.0,1.0,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023,1023,888,Zacian Hero of Many Battles,,,8,Legendary,Warrior Pokémon,1,Fairy,...,1.0,1.0,1.0,0.50,1.00,1.0,0.0,0.5,2.0,1.0
1024,1024,889,Zamazenta Crowned Shield,,,8,Legendary,Warrior Pokémon,2,Fighting,...,2.0,1.0,1.0,0.25,0.25,1.0,0.5,0.5,0.5,1.0
1025,1025,889,Zamazenta Hero of Many Battles,,,8,Legendary,Warrior Pokémon,1,Fighting,...,1.0,2.0,2.0,0.50,0.50,1.0,1.0,0.5,1.0,2.0
1026,1026,890,Eternatus,,,8,Legendary,Gigantic Pokémon,2,Poison,...,2.0,1.0,2.0,0.50,1.00,1.0,2.0,1.0,1.0,1.0


In [95]:
# Inner-join the trainers' Pokémon to the pokedex on the Pokémon name to pick
# up the type columns for each Pokémon.
# NOTE(review): an inner join drops any Pokémon whose name has no exact match
# in pokedex_df['name'] — confirm that every name matches.
merge_df = records_df.merge(
    pokedex_df,
    how='inner',
    left_on='pokemonName',
    right_on='name',
)

In [96]:
# Columns retained from the merged frame: the trainer, the Pokémon's slot
# number and name, and its two type columns.
columns_to_keep = [
    'trainer',
    'pokemonNumber',
    'pokemonName',
    'type_1',
    'type_2',
]

In [97]:
# Trim the merged frame down to the columns listed in columns_to_keep.
merge_df = merge_df.loc[:, columns_to_keep]

In [98]:
# Check the merged frame after trimming it to the trainer, Pokémon and type columns.
merge_df

Unnamed: 0,trainer,pokemonNumber,pokemonName,type_1,type_2
0,Ash,1,Pikachu,Electric,
1,Ash,2,Butterfree,Bug,Flying
2,Ash,3,Pidgeotto,Normal,Flying
3,Ash,4,Bulbasaur,Grass,Poison
4,Ash,5,Charizard,Fire,Flying
5,Ash,6,Squirtle,Water,
6,Brock,1,Geodude,Rock,Ground
7,Brock,2,Chansey,Normal,
8,Brock,3,Crobat,Poison,Flying
9,Brock,4,Vulpix,Fire,


In [99]:
# Combine type_1 and type_2 into one list per row, stored in a new 'types' column.
merge_df['types'] = merge_df[['type_1', 'type_2']].to_numpy().tolist()

In [100]:
# Check the new 'types' column; rows with no type_2 still carry a nan entry here.
merge_df

Unnamed: 0,trainer,pokemonNumber,pokemonName,type_1,type_2,types
0,Ash,1,Pikachu,Electric,,"[Electric, nan]"
1,Ash,2,Butterfree,Bug,Flying,"[Bug, Flying]"
2,Ash,3,Pidgeotto,Normal,Flying,"[Normal, Flying]"
3,Ash,4,Bulbasaur,Grass,Poison,"[Grass, Poison]"
4,Ash,5,Charizard,Fire,Flying,"[Fire, Flying]"
5,Ash,6,Squirtle,Water,,"[Water, nan]"
6,Brock,1,Geodude,Rock,Ground,"[Rock, Ground]"
7,Brock,2,Chansey,Normal,,"[Normal, nan]"
8,Brock,3,Crobat,Poison,Flying,"[Poison, Flying]"
9,Brock,4,Vulpix,Fire,,"[Fire, nan]"


In [101]:
# Drop the missing (NaN) entries from each types list so that single-type
# Pokémon end up with a one-element list.
merge_df['types'] = merge_df['types'].apply(
    lambda type_pair: list(filter(pd.notna, type_pair))
)

In [102]:
# Confirm the nan entries are gone from the 'types' lists.
merge_df

Unnamed: 0,trainer,pokemonNumber,pokemonName,type_1,type_2,types
0,Ash,1,Pikachu,Electric,,[Electric]
1,Ash,2,Butterfree,Bug,Flying,"[Bug, Flying]"
2,Ash,3,Pidgeotto,Normal,Flying,"[Normal, Flying]"
3,Ash,4,Bulbasaur,Grass,Poison,"[Grass, Poison]"
4,Ash,5,Charizard,Fire,Flying,"[Fire, Flying]"
5,Ash,6,Squirtle,Water,,[Water]
6,Brock,1,Geodude,Rock,Ground,"[Rock, Ground]"
7,Brock,2,Chansey,Normal,,[Normal]
8,Brock,3,Crobat,Poison,Flying,"[Poison, Flying]"
9,Brock,4,Vulpix,Fire,,[Fire]


In [103]:
# Collect the distinct types across each trainer's team.
# The per-Pokémon lists of a trainer are unioned straight into a set, which
# avoids concatenating the lists pairwise with sum() before de-duplicating.
pokemon_type_by_trainer = (
    merge_df.groupby('trainer')['types']
    .agg(lambda type_lists: set().union(*type_lists))
    .reset_index()
)

In [104]:
# One row per trainer with the set of distinct types across their Pokémon.
pokemon_type_by_trainer

Unnamed: 0,trainer,types
0,Amit,"{Water, Psychic, Electric, Rock, Ghost, Poison..."
1,Ash,"{Water, Bug, Electric, Poison, Normal, Flying,..."
2,Brock,"{Bug, Rock, Steel, Poison, Normal, Ground, Fly..."
3,Misty,"{Flying, Water, Psychic, Fairy}"


In [105]:
# Number of distinct types per trainer, i.e. the size of each types set.
pokemon_type_by_trainer['count_types'] = pokemon_type_by_trainer['types'].map(len)

In [106]:
# Same per-trainer table, now with the count_types column added.
pokemon_type_by_trainer

Unnamed: 0,trainer,types,count_types
0,Amit,"{Water, Psychic, Electric, Rock, Ghost, Poison...",8
1,Ash,"{Water, Bug, Electric, Poison, Normal, Flying,...",8
2,Brock,"{Bug, Rock, Steel, Poison, Normal, Ground, Fly...",8
3,Misty,"{Flying, Water, Psychic, Fairy}",4


In [107]:
# A trainer qualifies when their team covers at least 6 distinct types.
qualified_trainers = pokemon_type_by_trainer.loc[
    pokemon_type_by_trainer['count_types'].ge(6)
]

In [108]:
# Trainers that passed the count_types >= 6 filter.
qualified_trainers

Unnamed: 0,trainer,types,count_types
0,Amit,"{Water, Psychic, Electric, Rock, Ghost, Poison...",8
1,Ash,"{Water, Bug, Electric, Poison, Normal, Flying,...",8
2,Brock,"{Bug, Rock, Steel, Poison, Normal, Ground, Fly...",8


In [109]:
# Join the qualified trainers back to records_df so that only their Pokémon
# remain, then keep just the trainer, Pokémon name and slot number.
qualified_records = qualified_trainers.merge(records_df, on='trainer', how='inner')
final_df = qualified_records[['trainer', 'pokemonName', 'pokemonNumber']]

In [110]:
# Pokémon of the qualified trainers, one row per Pokémon.
final_df

Unnamed: 0,trainer,pokemonName,pokemonNumber
0,Amit,Jolteon,1
1,Amit,Alakazam,2
2,Amit,Tauros,3
3,Amit,Gyarados,4
4,Amit,Gengar,5
5,Amit,Aerodactyl,6
6,Ash,Pikachu,1
7,Ash,Butterfree,2
8,Ash,Pidgeotto,3
9,Ash,Bulbasaur,4


In [111]:
# Reshape to one row per trainer: pokemonNumber values become the columns and
# each cell holds the Pokémon name in that slot.
final_ans = final_df.pivot(
    index='trainer',
    columns='pokemonNumber',
    values='pokemonName',
)

In [112]:
# Wide layout: one row per trainer, one column per pokemonNumber.
final_ans

pokemonNumber,1,2,3,4,5,6
trainer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Amit,Jolteon,Alakazam,Tauros,Gyarados,Gengar,Aerodactyl
Ash,Pikachu,Butterfree,Pidgeotto,Bulbasaur,Charizard,Squirtle
Brock,Geodude,Chansey,Crobat,Vulpix,Forretress,Steelix


In [113]:
# Label each slot column as "Pokemon <number>" instead of the bare number.
slot_labels = []
for slot_number in final_ans.columns:
    slot_labels.append(f'Pokemon {slot_number}')
final_ans.columns = slot_labels

In [114]:
# Move the trainer index back into a regular column.
# Only reset while 'trainer' is still the index: running this cell a second
# time would otherwise insert a spurious 'index' column into the result.
if final_ans.index.name == 'trainer':
    final_ans = final_ans.reset_index()

In [115]:
# Final answer: each qualified trainer with their Pokémon in slot columns.
final_ans

Unnamed: 0,trainer,Pokemon 1,Pokemon 2,Pokemon 3,Pokemon 4,Pokemon 5,Pokemon 6
0,Amit,Jolteon,Alakazam,Tauros,Gyarados,Gengar,Aerodactyl
1,Ash,Pikachu,Butterfree,Pidgeotto,Bulbasaur,Charizard,Squirtle
2,Brock,Geodude,Chansey,Crobat,Vulpix,Forretress,Steelix
