# Shakespeare Character Genders

## Imports

In [1]:
from os import listdir
from bs4 import BeautifulSoup
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
from nltk.corpus import stopwords

## Data Loading

### Load Gender Labels

In [2]:
df = pd.read_csv('all_globe_name_id.csv', 
            header=None, 
            names=['role_id', 'role', 'file_name', 'blank', 'gender', 'initials'])
df = df.drop(labels=['blank', 'initials'], axis=1)

In [3]:
df['gender'] = df['gender'].str.upper()

In [4]:
df['speech_count'] = 0
df['total_words'] = 0
df

Unnamed: 0,role_id,role,file_name,gender,speech_count,total_words
0,tim.,TIMON of Athens,Globe_Shakespeare/tim.xml,MALE,0,0
1,lucul.,LUCULLUS,Globe_Shakespeare/tim.xml,MALE,0,0
2,sem.,SEMPRONIUS,Globe_Shakespeare/tim.xml,MALE,0,0
3,ven.,VENTIDIUS,Globe_Shakespeare/tim.xml,MALE,0,0
4,alcib.,ALCIBIADES,Globe_Shakespeare/tim.xml,MALE,0,0
...,...,...,...,...,...,...
1116,serv.,Servant,Globe_Shakespeare/ham.xml,MALE,0,0
1117,sail.,Sailor,Globe_Shakespeare/ham.xml,MALE,0,0
1118,mess.,Mess.,Globe_Shakespeare/ham.xml,MALE,0,0
1119,lord.,Lord.,Globe_Shakespeare/ham.xml,MALE,0,0


## Files

In [5]:
files = [f for f in listdir('Globe_Shakespeare')]

In [6]:
base = 'Globe_Shakespeare/'

for file in files:
    file_name = base + file
    with open(file_name, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')

        for speech in soup.find_all('sp'):
            for tag in speech.find_all(['stage', 'speaker']):
                tag.decompose()

            text = speech.get_text()
            words = [word for word in word_tokenize(text) if word.isalpha()]

            df.loc[(df['file_name'] == file_name) & (df['role_id'] == speech.get('who')), 'speech_count'] += 1
            df.loc[(df['file_name'] == file_name) & (df['role_id'] == speech.get('who')), 'total_words'] += len(words)

In [7]:
df['avg_words_per_speech'] = df['total_words'] / df['speech_count']
df

Unnamed: 0,role_id,role,file_name,gender,speech_count,total_words,avg_words_per_speech
0,tim.,TIMON of Athens,Globe_Shakespeare/tim.xml,MALE,210,6225,29.642857
1,lucul.,LUCULLUS,Globe_Shakespeare/tim.xml,MALE,5,191,38.200000
2,sem.,SEMPRONIUS,Globe_Shakespeare/tim.xml,MALE,2,204,102.000000
3,ven.,VENTIDIUS,Globe_Shakespeare/tim.xml,MALE,2,59,29.500000
4,alcib.,ALCIBIADES,Globe_Shakespeare/tim.xml,MALE,39,1167,29.923077
...,...,...,...,...,...,...,...
1116,serv.,Servant,Globe_Shakespeare/ham.xml,MALE,1,9,9.000000
1117,sail.,Sailor,Globe_Shakespeare/ham.xml,MALE,2,39,19.500000
1118,mess.,Mess.,Globe_Shakespeare/ham.xml,MALE,2,36,18.000000
1119,lord.,Lord.,Globe_Shakespeare/ham.xml,MALE,3,67,22.333333


In [8]:
df.to_csv('processed_data.csv')

## Read balanced data

In [9]:
balanced_data = pd.read_csv('balanced_data.csv')
balanced_data

Unnamed: 0,role_id,role,file_name,gender,speech_count,total_words,avg_words_per_speech
0,hor.,THOMAS HORNER,Globe_Shakespeare/2h6.xml,MALE,6,167,27.833333
1,ant-14,Dolabella,Globe_Shakespeare/ant.xml,MALE,23,282,12.260870
2,cor.,CORIN,Globe_Shakespeare/ayl.xml,MALE,24,550,22.916667
3,patr.,PATROCLUS,Globe_Shakespeare/tro.xml,MALE,37,401,10.837838
4,lor.,LORENZO,Globe_Shakespeare/mv.xml,MALE,47,1312,27.914894
...,...,...,...,...,...,...,...
205,jul.,JULIET,Globe_Shakespeare/rom.xml,FEMALE,118,4206,35.644068
206,isab.,ISABELLA,Globe_Shakespeare/mm.xml,FEMALE,129,2972,23.038760
207,des.,DESDEMONA,Globe_Shakespeare/oth.xml,FEMALE,165,2729,16.539394
208,ros.,ROSALIND,Globe_Shakespeare/ayl.xml,FEMALE,201,5648,28.099502


In [10]:
role_id_file_name_to_speeches = {}

for index, row in balanced_data.iterrows():
    role_id_file_name_to_speeches[(row['role_id'], row['file_name'])] = []

## Collect speeches

In [11]:
base = 'Globe_Shakespeare/'

for file in files:
    file_name = base + file
    with open(file_name, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')

        for speech in soup.find_all('sp'):
            for tag in speech.find_all(['stage', 'speaker']):
                tag.decompose()

            text = speech.get_text()
            words = [word for word in word_tokenize(text) if word.isalpha()]
            
            if speech.get('who') is not None and (speech.get('who'), file_name) in role_id_file_name_to_speeches:
                role_id_file_name_to_speeches[(speech.get('who'), file_name)].append(words)

## Create frequency matrix

In [12]:
words = set()
for role_id_file_name, speeches in role_id_file_name_to_speeches.items():
    for speech in speeches:
        for word in speech:
            words.add(word.lower())

In [13]:
role_id_file_name_to_word_to_count = {}
for role_id_file_name, speeches in role_id_file_name_to_speeches.items():
    word_to_count = {}
    
    for speech in speeches:
        for word in speech:
            if word_to_count.get(word.lower()) is None:
                word_to_count[word.lower()] = 1
            else:
                word_to_count[word.lower()] = word_to_count.get(word.lower()) + 1

    role_id_file_name_to_word_to_count[role_id_file_name] = word_to_count

In [14]:
frequency_matrix = pd.DataFrame(0, index=role_id_file_name_to_speeches.keys(), columns=words)
frequency_matrix

Unnamed: 0,Unnamed: 1,cackling,separation,inde,predecessors,heath,puissant,mirror,checker,bodiless,haunches,...,forfeiture,sparks,stepping,variable,dulzura,add,interposer,gentles,englishmen,fat
hor.,Globe_Shakespeare/2h6.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ant-14,Globe_Shakespeare/ant.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cor.,Globe_Shakespeare/ayl.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
patr.,Globe_Shakespeare/tro.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
lor.,Globe_Shakespeare/mv.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jul.,Globe_Shakespeare/rom.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
isab.,Globe_Shakespeare/mm.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
des.,Globe_Shakespeare/oth.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ros.,Globe_Shakespeare/ayl.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
for role_id_file_name, word_to_count in role_id_file_name_to_word_to_count.items():
    for word, count in word_to_count.items():
        frequency_matrix.loc[role_id_file_name, word] += count

frequency_matrix

Unnamed: 0,Unnamed: 1,cackling,separation,inde,predecessors,heath,puissant,mirror,checker,bodiless,haunches,...,forfeiture,sparks,stepping,variable,dulzura,add,interposer,gentles,englishmen,fat
hor.,Globe_Shakespeare/2h6.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ant-14,Globe_Shakespeare/ant.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cor.,Globe_Shakespeare/ayl.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
patr.,Globe_Shakespeare/tro.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
lor.,Globe_Shakespeare/mv.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jul.,Globe_Shakespeare/rom.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
isab.,Globe_Shakespeare/mm.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
des.,Globe_Shakespeare/oth.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ros.,Globe_Shakespeare/ayl.xml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Top 800 words

In [16]:
top_words = frequency_matrix.sum(axis=0).sort_values(ascending=False).index
top_words

Index(['the', 'i', 'and', 'to', 'you', 'of', 'a', 'my', 'that', 'is',
       ...
       'thisby', 'curbed', 'summons', 'remediate', 'forfeiters', 'worshipful',
       'defeatures', 'droppeth', 'breastplate', 'cackling'],
      dtype='object', length=12695)

In [17]:
stop_words = stopwords.words('english')

In [18]:
top_words = top_words[~top_words.isin(stop_words)]
top_words

Index(['thou', 'thy', 'shall', 'good', 'thee', 'come', 'would', 'lord', 'sir',
       'love',
       ...
       'thisby', 'curbed', 'summons', 'remediate', 'forfeiters', 'worshipful',
       'defeatures', 'droppeth', 'breastplate', 'cackling'],
      dtype='object', length=12561)

In [19]:
top_words = top_words[:800]
top_words

Index(['thou', 'thy', 'shall', 'good', 'thee', 'come', 'would', 'lord', 'sir',
       'love',
       ...
       'deserve', 'margaret', 'anon', 'birth', 'faults', 'pour', 'feed',
       'bitter', 'try', 'becomes'],
      dtype='object', length=800)