In [81]:
import ast
import json

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [82]:
# Load the raw export; the path is relative to the notebook's working directory.
gotham_academy = pd.read_csv("data/gotham_academy.csv")

In [83]:
# Summarize the raw frame: column names, non-null counts and dtypes.
gotham_academy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   18 non-null     int64  
 1   aliases                      0 non-null      float64
 2   api_detail_url               18 non-null     object 
 3   associated_images            18 non-null     object 
 4   character_credits            18 non-null     object 
 5   character_died_in            18 non-null     object 
 6   concept_credits              18 non-null     object 
 7   cover_date                   18 non-null     object 
 8   date_added                   18 non-null     object 
 9   date_last_updated            18 non-null     object 
 10  deck                         0 non-null      float64
 11  description                  18 non-null     object 
 12  first_appearance_characters  0 non-null      float64
 13  first_appearance_conce

In [84]:
def clean_data(input_dataframe):
    """Return a copy of ``input_dataframe`` without the unused columns.

    The caller's frame is left untouched; the work is done on a copy.

    Parameters
    ----------
    input_dataframe : pandas.DataFrame
        Frame that contains every column listed in ``unused_columns`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns.

    Raises
    ------
    KeyError
        If any of the listed columns is absent from ``input_dataframe``.
    """
    unused_columns = [
        "Unnamed: 0",
        "aliases",
        "deck",
        "first_appearance_characters",
        "first_appearance_concepts",
        "first_appearance_locations",
        "first_appearance_objects",
        "first_appearance_storyarcs",
        "first_appearance_teams",
        "store_date",
    ]
    # Copy first so the returned frame never shares data with the input.
    working_copy = input_dataframe.copy()
    return working_copy.drop(columns=unused_columns)

In [85]:
# Build the working frame; clean_data operates on a copy, so gotham_academy is not modified here.
clean_df = clean_data(gotham_academy)

In [86]:
# Confirm which columns remain after the drop.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   api_detail_url     18 non-null     object
 1   associated_images  18 non-null     object
 2   character_credits  18 non-null     object
 3   character_died_in  18 non-null     object
 4   concept_credits    18 non-null     object
 5   cover_date         18 non-null     object
 6   date_added         18 non-null     object
 7   date_last_updated  18 non-null     object
 8   description        18 non-null     object
 9   has_staff_review   18 non-null     object
 10  id                 18 non-null     int64 
 11  image              18 non-null     object
 12  issue_number       18 non-null     int64 
 13  location_credits   18 non-null     object
 14  name               17 non-null     object
 15  object_credits     18 non-null     object
 16  person_credits     18 non-null     object
 17 

In [87]:
# Show a single row to see what the remaining columns contain.
clean_df.head(1)

Unnamed: 0,api_detail_url,associated_images,character_credits,character_died_in,concept_credits,cover_date,date_added,date_last_updated,description,has_staff_review,...,issue_number,location_credits,name,object_credits,person_credits,site_detail_url,story_arc_credits,team_credits,team_disbanded_in,volume
0,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,[{'api_detail_url': 'https://comicvine.gamespo...,[],[{'api_detail_url': 'https://comicvine.gamespo...,2014-12-31,2014-09-29 20:58:22,2015-03-25 22:10:19,<p><em>WELCOME TO GOTHAM ACADEMY! Gotham City’...,{'api_detail_url': 'https://comicvine.gamespot...,...,1,[{'api_detail_url': 'https://comicvine.gamespo...,Welcome to Gotham Academy,[{'api_detail_url': 'https://comicvine.gamespo...,[{'api_detail_url': 'https://comicvine.gamespo...,https://comicvine.gamespot.com/gotham-academy-...,[],[{'api_detail_url': 'https://comicvine.gamespo...,[],{'api_detail_url': 'https://comicvine.gamespot...


In [88]:
def clean_description(html_text):
    """Strip the markup from an HTML fragment and return its text content.

    The fragment is parsed with BeautifulSoup's ``html.parser`` backend and
    the text pieces are joined with a newline between them.

    Parameters
    ----------
    html_text : str
        HTML markup to convert.

    Returns
    -------
    str
        The text of the fragment, pieces separated by ``'\\n'``.
    """
    return BeautifulSoup(html_text, 'html.parser').get_text(separator='\n')

def _parse_credit_records(serialized):
    """Parse a serialized list of credit records into Python objects.

    The text is first read as a Python literal (single-quoted strings,
    ``None``/``True``/``False``), which also copes with apostrophes inside
    values. If that fails it is read as JSON, so double-quoted payloads with
    ``null``/``true``/``false`` are accepted as well.

    Raises
    ------
    json.JSONDecodeError
        If the text is neither a valid Python literal nor valid JSON.
    """
    try:
        # literal_eval only evaluates literals; it never executes code.
        return ast.literal_eval(serialized)
    except (ValueError, SyntaxError):
        return json.loads(serialized)


def _extract_credit_names(raw_credits):
    """Return the ``'name'`` of every record in a credits cell.

    Parameters
    ----------
    raw_credits : str, list, None or float NaN
        * ``str``  - serialized list of dicts, each with a ``'name'`` key.
        * ``list`` - an already-parsed cell; dict items yield their
          ``'name'``, any other item is passed through unchanged, so running
          the cleaner on an already-cleaned column is harmless.
        * ``None`` / ``NaN`` - treated as "no credits".

    Returns
    -------
    list
        The names, in their original order.

    Raises
    ------
    KeyError
        If a record has no ``'name'`` key.
    """
    if isinstance(raw_credits, list):
        return [
            entry['name'] if isinstance(entry, dict) else entry
            for entry in raw_credits
        ]
    # NaN is the only float that is not equal to itself.
    if raw_credits is None or (
        isinstance(raw_credits, float) and raw_credits != raw_credits
    ):
        return []
    return [record['name'] for record in _parse_credit_records(raw_credits)]


def clean_character_credits(json_str):
    """Return the names listed in a ``character_credits`` cell."""
    return _extract_credit_names(json_str)


def clean_concept_credits(json_str):
    """Return the names listed in a ``concept_credits`` cell."""
    return _extract_credit_names(json_str)


def clean_location_credits(json_str):
    """Return the names listed in a ``location_credits`` cell."""
    return _extract_credit_names(json_str)


def clean_object_credits(json_str):
    """Return the names listed in an ``object_credits`` cell."""
    return _extract_credit_names(json_str)


def clean_person_credits(json_str):
    """Return the names listed in a ``person_credits`` cell."""
    return _extract_credit_names(json_str)


def clean_story_arc_credits(json_str):
    """Return the names listed in a ``story_arc_credits`` cell."""
    return _extract_credit_names(json_str)


def clean_team_credits(json_str):
    """Return the names listed in a ``team_credits`` cell."""
    return _extract_credit_names(json_str)

In [89]:
# Look at the description column before cleaning; the values still contain HTML tags.
clean_df["description"]

0     <p><em>WELCOME TO GOTHAM ACADEMY! Gotham City’...
1     <p><em>Olive joins the creepy Order of the Bat...
2     <p><em>If you thought getting detention was a ...
3     <p><em>The hunt for the Ghost of Gotham Academ...
4     <p><em>This month’s assignment: Uncover the hi...
5     <p><em>Holy cow, it’s Taco Tuesday! Oh…and Oli...
6     <p><em>Special guest student Damian Wayne drop...
7     <p><em>Classes are suspended for a funeral. So...
8     <p><em>If the gang thought it was hard to keep...
9     <p><em>“Bubble, bubble, toil and trouble!” To ...
10    <p><em>The gang is going downtown! Olive and M...
11    <p><em>As Olive and Maps search for Kyle, they...
12    <p><em>A “Robin War” tie-in! With Robins fight...
13    <p><em>An all-new era of GOTHAM ACADEMY begins...
14    <p><em>It’s part two of “Gotham Academy Yearbo...
15    <p><em>It’s “Yearbook” part 3! As Olive and th...
16    <p><em>It’s the final chapter of “Yearbook”! M...
17    <p><em>As the “Gotham Academy Yearbook” st

In [90]:
def clean_dataframe(df):
    """Clean the description and credit columns of ``df`` in place.

    Each listed column is replaced by the result of applying its cleaner to
    every cell. The frame passed in is modified directly and nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``column_cleaners`` below.
    """
    # (column, cleaner) pairs, applied in this order.
    column_cleaners = (
        ("description", clean_description),
        ("character_credits", clean_character_credits),
        ("concept_credits", clean_concept_credits),
        ("location_credits", clean_location_credits),
        ("object_credits", clean_object_credits),
        ("person_credits", clean_person_credits),
        ("story_arc_credits", clean_story_arc_credits),
        ("team_credits", clean_team_credits),
    )
    for column, cleaner in column_cleaners:
        df[column] = df[column].apply(cleaner)

In [99]:
# Rewrites the description and credit columns of clean_df in place; the call returns nothing.
clean_dataframe(clean_df)

In [100]:
# Display the frame after cleaning to check the description and credit columns.
clean_df

Unnamed: 0,api_detail_url,associated_images,character_credits,character_died_in,concept_credits,cover_date,date_added,date_last_updated,description,has_staff_review,...,issue_number,location_credits,name,object_credits,person_credits,site_detail_url,story_arc_credits,team_credits,team_disbanded_in,volume
0,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Aunt Harriet, Batman, Calamity, Colton Rivera...",[],"[Batman Villains, The New 52]",2014-12-31,2014-09-29 20:58:22,2015-03-25 22:10:19,WELCOME TO GOTHAM ACADEMY! Gotham City’s most ...,{'api_detail_url': 'https://comicvine.gamespot...,...,1,"[Gotham Academy, Gotham City]",Welcome to Gotham Academy,"[Bat-Signal, Lipstick]","[Becky Cloonan, Brenden Fletcher, Dave McCaig,...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
1,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Bookworm, Colton Rivera, Heathcliff Ray, Kyle...",[],[Batman Villains],2015-01-31,2014-11-03 20:56:58,2015-09-09 19:41:51,Olive joins the creepy Order of the Bat as an ...,{'api_detail_url': 'https://comicvine.gamespot...,...,2,"[Gotham Academy, Gotham City]",The Diary of Millie Jane Cobblepot,[],"[Becky Cloonan, Brenden Fletcher, Dave McCaig,...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
2,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Aunt Harriet, Batman, Colton Rivera, Headmast...",[],[],2015-02-01,2014-12-01 20:57:29,2015-03-25 22:11:06,"If you thought getting detention was a pain, j...",{'api_detail_url': 'https://comicvine.gamespot...,...,3,"[Gotham Academy, Gotham City]",The Ghost in the North Hall,[],"[Becky Cloonan, Brenden Fletcher, Karl Kerschl...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
3,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Batman, Calamity, Coach Humphreys, Gray Ghost...",[],[Batman Villains],2015-03-25,2015-01-26 20:56:37,2015-09-09 19:42:42,The hunt for the Ghost of Gotham Academy begins!,False,...,4,"[Gotham Academy, Gotham City]",The Secret of the Symbol,[],"[Becky Cloonan, Brenden Fletcher, Dave McCaig,...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
4,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Batman, Calamity, Colton Rivera, Headmaster H...",[],[],2015-04-01,2015-02-23 20:57:22,2019-06-21 04:57:55,This month’s assignment: Uncover the hideous s...,{'api_detail_url': 'https://comicvine.gamespot...,...,5,"[Gotham Academy, Gotham City]",Save The Last Dance,[],"[Becky Cloonan, Brenden Fletcher, Karl Kerschl...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
5,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Amadeus Arkham, Batman, Calamity, Colton Rive...",[],[],2015-05-01,2015-03-23 20:56:08,2016-07-20 09:39:10,"Holy cow, it’s Taco Tuesday! Oh…and Olive batt...",{'api_detail_url': 'https://comicvine.gamespot...,...,6,"[Arkham Asylum, Gotham Academy, Gotham City]",Pizza Club,[],"[Babs Tarr, Becky Cloonan, Brenden Fletcher, K...",https://comicvine.gamespot.com/gotham-academy-...,[],"[Arkham Asylum Inmates, Gotham Academy Student...",[],{'api_detail_url': 'https://comicvine.gamespot...
6,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Batman, Bookworm, Colton Rivera, Damian Wayne...",[],[Joker 75th Anniversary Variant],2015-08-01,2015-06-08 20:57:04,2019-06-21 04:57:55,Special guest student Damian Wayne drops by th...,False,...,7,"[Gotham Academy, Gotham City]",Curse of the Inishtree Quill,[],"[Becky Cloonan, Brenden Fletcher, Craig Rousse...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
7,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Anna, Calamity, Clayface (Karlo), Coach Humph...",[{'api_detail_url': 'https://comicvine.gamespo...,[Teen Titans Go! Variant],2015-09-01,2015-07-06 20:55:25,2015-07-08 19:25:36,"Classes are suspended for a funeral. So, is it...",False,...,8,"[Gotham Academy, Gotham City]",Requiem,[],"[Becky Cloonan, Brenden Fletcher, Karl Kerschl...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
8,https://comicvine.gamespot.com/api/issue/4000-...,[],"[Calamity, Clayface (Karlo), Coach Humphreys, ...",[],[],2015-10-31,2015-08-11 00:54:48,2019-06-21 04:57:55,If the gang thought it was hard to keep up wit...,False,...,9,[Gotham Academy],Calamity,[],"[Becky Cloonan, Brenden Fletcher, Karl Kerschl...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
9,https://comicvine.gamespot.com/api/issue/4000-...,[],"[Calamity, Clayface (Karlo), Colton Rivera, Gr...",[],[],2015-11-30,2015-09-08 00:02:31,2019-06-21 04:57:54,"“Bubble, bubble, toil and trouble!” To investi...",False,...,10,[Gotham Academy],The Cursed Play,[],"[Becky Cloonan, Brenden Fletcher, Karl Kerschl...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...


In [102]:
# NOTE(review): clean_dataframe assigns columns on the frame it is given, so after this
# cell gotham_academy no longer holds the values loaded from the CSV. Confirm this is
# intended; pass a copy if the raw frame should stay untouched.
clean_dataframe(gotham_academy)

In [104]:
# Re-check the raw frame's columns and dtypes after the in-place cleaning above.
gotham_academy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   18 non-null     int64  
 1   aliases                      0 non-null      float64
 2   api_detail_url               18 non-null     object 
 3   associated_images            18 non-null     object 
 4   character_credits            18 non-null     object 
 5   character_died_in            18 non-null     object 
 6   concept_credits              18 non-null     object 
 7   cover_date                   18 non-null     object 
 8   date_added                   18 non-null     object 
 9   date_last_updated            18 non-null     object 
 10  deck                         0 non-null      float64
 11  description                  18 non-null     object 
 12  first_appearance_characters  0 non-null      float64
 13  first_appearance_conce