In [407]:
import ast
import json
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# LOADING CSV FILES

In [408]:
# Read the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv("data/final/final_dataset.csv")

In [409]:
# Inspect column names, non-null counts and dtypes of the raw frame before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3255 entries, 0 to 3254
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   3255 non-null   int64  
 1   aliases                      0 non-null      float64
 2   api_detail_url               3255 non-null   object 
 3   associated_images            3255 non-null   object 
 4   character_credits            3255 non-null   object 
 5   character_died_in            3255 non-null   object 
 6   concept_credits              3255 non-null   object 
 7   cover_date                   3255 non-null   object 
 8   date_added                   3255 non-null   object 
 9   date_last_updated            3255 non-null   object 
 10  deck                         128 non-null    object 
 11  description                  3210 non-null   object 
 12  first_appearance_characters  0 non-null      float64
 13  first_appearance_c

# PREPROCESSING & FEATURE ENGINEERING FUNCTIONS

**clean_dataframe** <br>
input: df <br>
drops various columns with too many nulls, converts dates to datetime, and fills null descriptions with "" <br>
also drops various columns I deem aren't needed for the final model <br>
output: df <br>

In [410]:
def clean_dataframe(input_dataframe):
    """Return a cleaned copy of the raw frame; the input frame is not modified.

    Steps performed:
      * drop the columns this notebook treats as too sparse or not needed
        for the final model,
      * drop rows whose ``name`` is null,
      * parse ``cover_date`` as datetime (unparseable values become ``NaT``),
      * replace null ``description`` values with ``''``.

    Parameters
    ----------
    input_dataframe : pandas.DataFrame
        Raw frame. It must contain every column listed in ``sparse_columns``
        and ``unused_columns`` below, plus ``name``, ``cover_date`` and
        ``description``.

    Returns
    -------
    pandas.DataFrame
        A new frame. The original row labels are kept (the index is not
        reset), so labels may have gaps after rows are dropped.

    Raises
    ------
    KeyError
        If any of the expected columns is missing.
    """
    sparse_columns = [
        "Unnamed: 0", "aliases", "deck", "first_appearance_characters",
        "first_appearance_concepts", "first_appearance_locations",
        "first_appearance_objects", "first_appearance_storyarcs",
        "first_appearance_teams", "store_date",
    ]
    # 'date_added' and 'date_last_updated' are discarded here, so they are
    # deliberately not parsed to datetime first.
    unused_columns = [
        'api_detail_url', 'associated_images', 'date_added',
        'date_last_updated', 'image', 'site_detail_url',
    ]

    df = (
        input_dataframe
        .drop(columns=sparse_columns + unused_columns)
        .dropna(subset=['name'])
        .copy()  # own the data so the column assignments below are safe
    )
    df['cover_date'] = pd.to_datetime(df['cover_date'], errors='coerce')
    df['description'] = df['description'].fillna('')
    return df

**clean_description**
cleans the description as well as possible
the format and contents of the descriptions are not always consistent so there are some errors

In [411]:
def clean_description(html_text):
    """Strip HTML markup from a description and return single-line plain text.

    Text fragments from different elements are joined with a space, so
    words on either side of a tag boundary or a line break do not get
    glued together. Backslashes are removed, and every run of whitespace
    (including newlines) is collapsed to a single space.

    Parameters
    ----------
    html_text : str
        Raw description, possibly containing HTML.

    Returns
    -------
    str
        Cleaned text. Non-string input (e.g. NaN) yields ``''``.
    """
    if not isinstance(html_text, str):
        return ''
    soup = BeautifulSoup(html_text, 'html.parser')
    # A space separator keeps adjacent elements apart; the previous
    # newline separator was removed again, which fused neighbouring words.
    text = soup.get_text(separator=' ').replace('\\', '')
    # split()/join collapses all whitespace runs and trims both ends.
    return ' '.join(text.split())

**get_names and get_obj** <br>
get_names takes in lists and get_obj is for when there is only one object <br>
When parsing fails, both functions return an empty list (`[]`).

In [412]:
def _parse_credit_literal(raw):
    """Parse a stringified credit value; return ``None`` when it cannot be parsed.

    The value is first read as a Python literal, which copes with
    apostrophes inside names and with ``None`` values. If that fails, the
    earlier strategy is used as a fallback: swap single quotes for double
    quotes and parse the result as JSON.
    """
    if not isinstance(raw, str):
        return None
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass
    try:
        return json.loads(raw.replace("'", '"'))
    except json.JSONDecodeError:
        return None


def get_names(json_str, index):
    """Extract the ``"name"`` of every record in a stringified list of dicts.

    Parameters
    ----------
    json_str : str
        Text form of a list of dicts, each expected to carry a ``"name"`` key.
    index : hashable
        Row label of the value. Unused; kept so existing callers still work.

    Returns
    -------
    list
        The names in their original order. An empty list is returned when
        the text cannot be parsed or does not hold a list; entries without
        a ``"name"`` key are skipped.
    """
    records = _parse_credit_literal(json_str)
    if not isinstance(records, list):
        return []
    return [
        record["name"]
        for record in records
        if isinstance(record, dict) and "name" in record
    ]


def get_obj(json_str, index):
    """Extract the ``"name"`` of a single stringified dict as a one-item list.

    Parameters
    ----------
    json_str : str
        Text form of one dict expected to carry a ``"name"`` key.
    index : hashable
        Row label of the value. Unused; kept so existing callers still work.

    Returns
    -------
    list
        ``[name]``, or an empty list when the text cannot be parsed or does
        not hold a dict with a ``"name"`` key.
    """
    record = _parse_credit_literal(json_str)
    if not isinstance(record, dict) or "name" not in record:
        return []
    # Wrapped in a list so the result has the same shape as get_names().
    return [record["name"]]

**total_clean** applies all the above functions

In [413]:
def total_clean(df):
    """Clean the description and parse every credit column, in place.

    ``description`` is passed through ``clean_description``. Each
    list-style credit column is replaced by the list that ``get_names``
    returns, and ``volume`` by the list that ``get_obj`` returns.
    The frame is modified in place and nothing is returned.
    """
    df["description"] = df["description"].apply(clean_description)

    list_columns = (
        'character_credits',
        'character_died_in',
        'concept_credits',
        'location_credits',
        'object_credits',
        'person_credits',
        'story_arc_credits',
        'team_credits',
        'team_disbanded_in',
    )
    for column in list_columns:
        # The default argument binds the current column name to the lambda.
        df[column] = df.apply(
            lambda row, column=column: get_names(row[column], row.name),
            axis=1,
        )

    # 'volume' holds a single object rather than a list of objects.
    df['volume'] = df.apply(lambda row: get_obj(row['volume'], row.name), axis=1)

**to_underscore** replaces the spaces with underscore in the credits <br>
this is in preparation for when we vectorize and to make sure that we don't separate the first and last names

In [414]:
def to_underscore(df):
    """Replace spaces with underscores in every name of the credit columns.

    Works on the list-valued credit columns and on ``volume``. The frame
    is modified in place and nothing is returned.
    """
    credit_columns = (
        'character_credits',
        'character_died_in',
        'concept_credits',
        'location_credits',
        'object_credits',
        'person_credits',
        'story_arc_credits',
        'team_credits',
        'team_disbanded_in',
        'volume',
    )

    def _underscored(names):
        # One new list per cell; each name becomes a single space-free token.
        return [name.replace(' ', '_') for name in names]

    for column in credit_columns:
        df[column] = df[column].apply(_underscored)

**to_string** converts each credits list into a single space-separated string <br>
**combined_description** combines the description col with all the credits cols to create a new feature 

In [415]:
def to_string(lst):
    """Concatenate the items of ``lst``, each preceded by one space.

    The result therefore starts with a space when ``lst`` is non-empty
    (e.g. ``['a', 'b']`` gives ``' a b'``) and is ``''`` for an empty input.
    """
    return "".join(" " + item for item in lst)

In [416]:
def combined_description(df):
    """Add a ``combined_description`` column built from text and credits.

    The new column is the ``description`` followed by the ``to_string``
    form of each credit column listed below, in that order. The frame is
    modified in place and nothing is returned.
    """
    credit_columns = (
        'character_credits',
        'concept_credits',
        'location_credits',
        'object_credits',
        'person_credits',
        'story_arc_credits',
        'team_credits',
        'volume',
    )
    combined = df['description']
    for column in credit_columns:
        # to_string prefixes every item with a space, so nothing gets fused.
        combined = combined + df[column].apply(to_string)
    df['combined_description'] = combined

# ACTUALLY DOING THE PREPROCESSING & FEATURE ENGINEERING

In [417]:
# Drop unneeded columns, convert dates to datetime, and fill null descriptions.
# NOTE: this rebinds `df`, so the raw frame is no longer reachable after this cell.
df = clean_dataframe(df)

In [418]:
# Clean the description text and parse the credit columns into lists of names
# (modifies df in place).
total_clean(df)

In [419]:
# Replace spaces with underscores inside each credit name (modifies df in place).
to_underscore(df)

In [420]:
# Combine the description and the credit columns into `combined_description`
# (modifies df in place).
combined_description(df)

In [421]:
# Spot-check three random rows of the processed frame (no seed, so rows vary per run).
df.sample(3)

Unnamed: 0,character_credits,character_died_in,concept_credits,cover_date,description,has_staff_review,id,issue_number,location_credits,name,object_credits,person_credits,story_arc_credits,team_credits,team_disbanded_in,volume,combined_description
736,"[Batman, Catwoman, Chief_Hot_Foot, Dick_Grayso...",[],"[Animal_Attacks, Batman_Villains, Golden_Age_o...",1947-08-31,“CLAWS OF THE CATWOMAN!” Catwoman escapes from...,False,122711,42,"[Gotham_City, Gotham_State_Penitentiary, Wayne...",Claws of the Catwoman!,"[Bat-Signal, Bat-Suit, Batmobile, Gotham_Gazet...","[Bill_Finger, Bob_Kane, Charles_Paris, Curt_Sw...",[],"[Batman_and_Robin, Gotham_City_Police_Department]",[],[Batman],“CLAWS OF THE CATWOMAN!” Catwoman escapes from...
301,"[Alfred_Pennyworth, Batman, Diamond_Jack_Caffe...",[],[Bronze_Age_of_Comics],1978-10-31,The Dark Knight has been kidnapped and drugged...,False,18856,304,[],To Hell With Batman... and Back!,[],"[David_V._Reed, Jim_Aparo]",[],[],[],[Batman],The Dark Knight has been kidnapped and drugged...
1407,"[Demon_John, First_of_the_Fallen, Ghant, John_...",[],[Vertigo],2004-10-24,When all intrigues are laid bare and all masks...,False,108149,199,[],"Stations Of The Cross, Part 3",[],"[Lee_Loughridge, Marcelo_Frusin, Mike_Carey, T...",[],[],[],[Hellblazer],When all intrigues are laid bare and all masks...


In [425]:
# Show the parsed character list for the row labelled 350. The lookup is by
# label, not position, because clean_dataframe keeps the original index.
df['character_credits'][350]

['Alfred_Pennyworth',
 'Arthur_Reeves',
 'Batman',
 'Battle_Cat',
 'Beast-Man',
 'Dick_Grayson',
 'He-Man',
 'James_Gordon',
 'Joker',
 'Man-At-Arms',
 'Mer-Man',
 'Olivia_Ortega',
 'Skeletor',
 'Superman',
 'The_Sorceress',
 'Vicki_Vale',
 'Zodac']

In [423]:
# Re-check columns, non-null counts and dtypes after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3221 entries, 0 to 3254
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   character_credits     3221 non-null   object        
 1   character_died_in     3221 non-null   object        
 2   concept_credits       3221 non-null   object        
 3   cover_date            3221 non-null   datetime64[ns]
 4   description           3221 non-null   object        
 5   has_staff_review      3221 non-null   object        
 6   id                    3221 non-null   int64         
 7   issue_number          3221 non-null   object        
 8   location_credits      3221 non-null   object        
 9   name                  3221 non-null   object        
 10  object_credits        3221 non-null   object        
 11  person_credits        3221 non-null   object        
 12  story_arc_credits     3221 non-null   object        
 13  team_credits          3

In [424]:
# Save the processed frame; index=False leaves the row labels out of the file.
# NOTE(review): the list-valued credit columns are presumably written as their
# string form, so they would need parsing again after reloading - confirm.
df.to_csv("data/final/final_dataset_clean.csv", index=False)