In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx

import matplotlib.pyplot as plt
!python3 -m spacy download en_core_web_sm
import os

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 1.2 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load spaCy's small English pipeline and keep it under the name NER
NER = spacy.load(name="en_core_web_sm")

In [3]:
# Load datasets
# Tab-separated list of characters
characters = pd.read_csv("characters_list.csv", sep='\t')
# Names of the CSV script files found in the data folder, in sorted order
scripts = sorted(
    file_name for file_name in os.listdir('data') if file_name.endswith('.csv')
)

In [5]:
# Leave only lines of dialogue 
def prepare_dataframe(df):
    """Extract speaker/dialogue pairs from a raw script DataFrame.

    The first column of ``df`` is treated as the raw script text. Its first
    space-delimited token becomes ``Character`` and everything after the first
    run of whitespace becomes ``Line``. Only rows whose first token ends with
    ``:`` are kept. The kept speaker names are then capitalized and stripped
    of every ``:``.

    The input frame is never modified, so calling this function again on the
    same raw frame (e.g. when a notebook cell is re-run) gives the same result
    instead of stripping one more word from every line.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw script frame; its first column must hold the text lines.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the last column (``Character``) moved to
        the front. Row labels of the kept rows are those of the input. ``Line``
        is missing (NaN) for rows that contain only a speaker token.
    """
    # rename() returns a new frame, so the caller's DataFrame stays untouched
    lines = df.rename(columns={list(df)[0]: 'Line'})
    raw_text = lines['Line']

    # Speaker = first token split on a single space; dialogue = text after the
    # first whitespace run (same tokenization as before, just not in place)
    lines['Character'] = raw_text.str.split(' ').str[0]
    lines['Line'] = raw_text.str.split(n=1).str[1]

    # Dialogue rows are those whose first token ends with ":" (missing -> dropped)
    is_dialogue = lines['Character'].str.endswith(':', na=False)

    # Rotate the last column to the front
    ordered_cols = lines.columns.tolist()
    ordered_cols = ordered_cols[-1:] + ordered_cols[:-1]

    # Filter + reorder in one step and take an explicit copy, so the assignment
    # below (and any column the caller adds later) never writes into a slice
    result = lines.loc[is_dialogue, ordered_cols].copy()

    # Capitalize the speaker name and drop the colon(s); regex=False makes the
    # replacement a literal one on every pandas version
    result['Character'] = (
        result['Character'].str.capitalize().str.replace(':', '', regex=False)
    )

    return result

In [6]:
# Read every season script into its own raw DataFrame, in the order of `scripts`
seasons_list = [pd.read_csv("data/"+script_name) for script_name in scripts]

In [7]:
# Prepare the first seven raw season frames and tag each one with its
# 1-based season number (stored as a string: '1' ... '7').
# assign() returns a new frame with the extra column, so the label is never
# written into a frame that pandas may consider a slice of another one (the
# situation that can raise SettingWithCopyWarning).
# Unpacking fails with ValueError if fewer than seven scripts were loaded.
season1, season2, season3, season4, season5, season6, season7 = (
    prepare_dataframe(raw_season).assign(Season=str(season_number))
    for season_number, raw_season in enumerate(seasons_list[:7], start=1)
)


In [22]:
# Stack the seven per-season frames row-wise into one frame (row labels are kept)
all_seasons = pd.concat(
    objs=[
        season1,
        season2,
        season3,
        season4,
        season5,
        season6,
        season7,
    ]
)

In [20]:
# Save the merged dialogue table to CSV.
# The path is relative to the working directory instead of an absolute
# /Users/... path that exists on only one machine.
# NOTE(review): "data" is also the folder that is listed for "*.csv" script
# files, so a later re-run of that listing will include this output file --
# confirm whether the export should live in a separate folder.
all_seasons.to_csv('data/Gilmore_Girls_Lines.csv')