## Loading in the data

In [406]:
import pandas as pd
import re
import numpy as np

In [407]:
# Concatenate the per-genre csvs into one master dataframe

genre_list = ['action','adventure','animation','biography','comedy','crime','documentary','drama'
              ,'family','fantasy','history','horror','music','musical','mystery','romance','sci-fi'
              ,'sport','superhero','thriller','war','western']

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration.
genre_frames = [pd.read_csv('{}_series.csv'.format(genre)) for genre in genre_list]

# ignore_index=True gives every row its own unique label. Without it each file keeps
# its own row labels, so unrelated titles share a label and any later index-aligned
# step (grouping extracted cast names by row label, column assignment) mixes their values.
master_df = pd.concat(genre_frames, ignore_index=True)

## Data Cleaning & Preprocessing

In [408]:
# A title is listed once per genre file it belongs to, so keep only the first
# row seen for each IMDb ID
master_df = master_df.drop_duplicates(subset='IMDb ID', keep='first')

In [409]:
# Release Year - For TV shows we keep the year in which the show first started,
# i.e. the first 4-digit number found in each record.
# The pattern is \d{4} rather than \d{1,4}: the looser form also accepts any shorter
# run of digits that happens to appear before the year.
# expand=False returns a Series (there is a single capture group) instead of a
# one-column DataFrame. Records without a 4-digit number become NaN.
master_df['Release Year'] = master_df['Release Year'].str.extract(r'(\d{4})', expand=False)

In [410]:
# Genre - Each record holds a comma-separated genre string; convert it into a list
# of whitespace-trimmed genre names.
# A record with no genre comes out of str.split as NaN rather than a list, so it is
# passed through unchanged instead of raising TypeError inside the comprehension.
master_df['Genre'] = master_df['Genre'].str.split(',').apply(
    lambda parts: [name.strip() for name in parts] if isinstance(parts, list) else parts
)

In [411]:
# Cast - Turn directors and actors into their own columns (array of strings)

# Function to clean up the Cast column string
# Using regex patterns for director(s) and star(s), then splitting names by commas and removing white spaces

def extract_string(col, pattern):
    """Pull comma-separated names out of a string column using a regex.

    Parameters
    ----------
    col : pandas.Series
        String column to search.
    pattern : str
        Regular expression with at least one capture group; only the first
        group is used. Every match in a record is collected.

    Returns
    -------
    pandas.Series
        One entry per row label of ``col`` that produced at least one non-empty
        name. Each entry is a numpy array of str holding that row's names.
        Rows with no match are absent from the result.
    """

    def _as_str_array(row_names):
        # Collapse all names belonging to one row label into a single string array
        return np.array(row_names, dtype=str)

    # First capture group of every match; indexed by (row label, match number)
    captured = col.str.extractall(pattern)[0]

    # One name per entry: break on commas, flatten, trim surrounding whitespace
    pieces = captured.str.split(',')
    flattened = pieces.explode()
    trimmed = flattened.str.strip()

    # Discard blanks left behind by stray commas
    non_blank = trimmed[trimmed.ne("")]

    # Regroup by the original row label (level 0 of the extractall index)
    return non_blank.groupby(level=0).apply(_as_str_array)

In [412]:
# Directors: names that follow 'Director' or 'Directors' and a ':' (the pattern also
# accepts ','), up to the next '|'
directors_regex = r"(?i)Director[s]?[:,]\s*([^|]+)"

# Actors: names that follow 'Star' or 'Stars' and a ':' (the pattern also accepts ','),
# running to the end of the string
actors_regex = r"(?i)Star[s]?[:,]\s*([^:]+)$"

# Apply each pattern to the Cast column and store the result in its own column
for target_col, cast_regex in (('directors', directors_regex), ('actors', actors_regex)):
    master_df[target_col] = extract_string(master_df['Cast'], cast_regex)

In [413]:
# Strip the literal 'min' unit and surrounding whitespace so only the number of
# minutes is left (the values remain strings)
runtime_text = master_df['Runtime'].str.replace('min', '', regex=False)
master_df['Runtime'] = runtime_text.str.strip()

In [414]:
# create a column that determines if its a movie or tv show
# we use the directors column to do this: if it has a director, its a movie. if not, its a tv show
# NOTE(review): this previously tested whether Runtime was null, which contradicted the
# rule stated above and labelled every title with a runtime as a movie. Confirm the
# director-based rule is the intended one.
master_df['movie_series_ind'] = np.where(master_df['directors'].isna(), 'series', 'movie')

In [415]:
# Drop the columns that are not needed in the final data
unused_columns = ['Cast', 'Synopsis', 'Gross Revenue']
master_df = master_df.drop(columns=unused_columns)

In [416]:
# Normalise every column name to lowercase with underscores, then tag the runtime
# column with its unit (minutes)
master_df = (
    master_df
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .rename(columns={'runtime': 'runtime_mins'})
)

In [418]:
# Write the cleaned table to disk as a csv (the row index is written as the first column)
output_path = 'series_dataset.csv'
master_df.to_csv(output_path)