# Shakespeare Character Genders

## Imports

In [1]:
from os import listdir
from bs4 import BeautifulSoup
import pandas as pd
from nltk.tokenize import word_tokenize

## Data Loading

### Load Gender Labels

In [2]:
# Read the headerless role/gender table, name its columns explicitly, and
# discard the two columns ('blank', 'initials') that are not used afterwards.
df = (
    pd.read_csv(
        'all_globe_name_id.csv',
        header=None,
        names=['role_id', 'role', 'file_name', 'blank', 'gender', 'initials'],
    )
    .drop(columns=['blank', 'initials'])
)

In [3]:
# Upper-case the gender labels so differently-cased spellings compare equal.
df = df.assign(gender=lambda frame: frame['gender'].str.upper())

In [4]:
# Add zeroed per-role counter columns; a later cell fills them in.
df = df.assign(speech_count=0, total_words=0)
df

Unnamed: 0,role_id,role,file_name,gender,speech_count,total_words
0,tim.,TIMON of Athens,Globe_Shakespeare/tim.xml,MALE,0,0
1,lucul.,LUCULLUS,Globe_Shakespeare/tim.xml,MALE,0,0
2,sem.,SEMPRONIUS,Globe_Shakespeare/tim.xml,MALE,0,0
3,ven.,VENTIDIUS,Globe_Shakespeare/tim.xml,MALE,0,0
4,alcib.,ALCIBIADES,Globe_Shakespeare/tim.xml,MALE,0,0
...,...,...,...,...,...,...
1116,serv.,Servant,Globe_Shakespeare/ham.xml,MALE,0,0
1117,sail.,Sailor,Globe_Shakespeare/ham.xml,MALE,0,0
1118,mess.,Mess.,Globe_Shakespeare/ham.xml,MALE,0,0
1119,lord.,Lord.,Globe_Shakespeare/ham.xml,MALE,0,0


## Process Play Files

In [5]:
# listdir() returns every directory entry (hidden files, subfolders, ...) in
# arbitrary order. Keep only the XML play files and sort them so the list is
# deterministic and the parsing loop never opens a non-XML entry.
files = sorted(
    f for f in listdir('Globe_Shakespeare')
    if f.lower().endswith('.xml')
)

In [6]:
# Prefix that turns a bare file name into the path form stored in
# df['file_name'] (e.g. 'Globe_Shakespeare/tim.xml').
base = 'Globe_Shakespeare/'

# Tally per (file_name, who) in plain dicts while parsing, then write the
# totals into the frame once at the end. This avoids building a full-frame
# boolean mask twice for every single speech.
speech_counts = {}
word_totals = {}

for file in files:
    file_name = base + file
    # Open as bytes so BeautifulSoup detects the document's encoding itself
    # instead of decoding with the platform's default text encoding.
    with open(file_name, 'rb') as tei:
        soup = BeautifulSoup(tei, 'lxml')

    for speech in soup.find_all('sp'):
        # Remove <stage> and <speaker> elements so their text is not tokenized.
        for tag in speech.find_all(['stage', 'speaker']):
            tag.decompose()

        text = speech.get_text()
        # Keep purely alphabetic tokens; punctuation and numbers are dropped.
        words = [word for word in word_tokenize(text) if word.isalpha()]

        key = (file_name, speech.get('who'))
        speech_counts[key] = speech_counts.get(key, 0) + 1
        word_totals[key] = word_totals.get(key, 0) + len(words)

# Assign (rather than increment) so re-running this cell reproduces the same
# numbers instead of adding to the previous run's counts. Rows whose
# (file_name, role_id) pair never appears as a speech's `who` stay at 0.
role_keys = list(zip(df['file_name'], df['role_id']))
df['speech_count'] = [speech_counts.get(key, 0) for key in role_keys]
df['total_words'] = [word_totals.get(key, 0) for key in role_keys]

In [7]:
# Mean speech length per role; roles with no counted speeches give NaN (0 / 0).
df['avg_words_per_speech'] = df['total_words'].div(df['speech_count'])

In [8]:
# Display the frame with the populated counts and the derived average column.
df

Unnamed: 0,role_id,role,file_name,gender,speech_count,total_words,avg_words_per_speech
0,tim.,TIMON of Athens,Globe_Shakespeare/tim.xml,MALE,210,6225,29.642857
1,lucul.,LUCULLUS,Globe_Shakespeare/tim.xml,MALE,5,191,38.200000
2,sem.,SEMPRONIUS,Globe_Shakespeare/tim.xml,MALE,2,204,102.000000
3,ven.,VENTIDIUS,Globe_Shakespeare/tim.xml,MALE,2,59,29.500000
4,alcib.,ALCIBIADES,Globe_Shakespeare/tim.xml,MALE,39,1167,29.923077
...,...,...,...,...,...,...,...
1116,serv.,Servant,Globe_Shakespeare/ham.xml,MALE,1,9,9.000000
1117,sail.,Sailor,Globe_Shakespeare/ham.xml,MALE,2,39,19.500000
1118,mess.,Mess.,Globe_Shakespeare/ham.xml,MALE,2,36,18.000000
1119,lord.,Lord.,Globe_Shakespeare/ham.xml,MALE,3,67,22.333333


In [9]:
# Write the per-role statistics to disk; the row index is written as well,
# as the first (unnamed) column of the CSV.
df.to_csv(path_or_buf='processed_data.csv')