In [2]:
import pandas as pd
import collections
import fuzzywuzzy
from fuzzywuzzy import process
import pprint

pd.set_option('display.max_colwidth', None)

Using data from Wikipedia, I've generate two dictionaries for the categorization of Shakespeare's plays. The first dictionary `play_cat_strict` uses the traditional categorization of plays into comedy, history, and tragedy. The second dictionary `play_cat_modern` has been reorganized to better reflect the tone of the play. Histories have been collapsed into the tragedy category and tragicomedies have been grouped as romances. 

In [3]:
play_cat_strict = {'comedy' : ['The Tempest',  'The Two Gentlemen of Verona',  'The Merry Wives of Windsor',  'Measure for Measure',  'The Comedy of Errors',  'Much Ado About Nothing',  'Loves Labours Lost',  'A Midsummer Nights Dream',  'The Merchant of Venice',  'As You Like It',  'The Taming of the Shrew',  'Alls Well That Ends Well',  'Twelfth Night',  'The Winters Tale',  'Pericles, Prince of Tyre',  'The Two Noble Kinsmen'], 'history' : ['King John', 'Edward III',  'Richard II',  'Henry IV, Part 1',  'Henry IV, Part 2',  'Henry V',  'Henry VI, Part 1',  'Henry VI, Part 2',  'Henry VI, Part 3',  'Richard III',  'Henry VIII'], 'tragedy' : ['Troilus and Cressida', 'Coriolanus', 'Titus Andronicus',  'Romeo and Juliet',  'Timon of Athens',  'Julius Caesar',  'Macbeth',  'Hamlet',  'King Lear',  'Othello',  'Antony and Cleopatra',  'Cymbeline']}
print('Number of plays per category in the traditional dataset')
for cat, plays in play_cat_strict.items():
    print(cat, ':', len(plays))

play_cat_modern = {'comedy' : ['The Two Gentlemen of Verona',  'The Merry Wives of Windsor',  'Measure for Measure',  'The Comedy of Errors',  'Much Ado About Nothing',  'Loves Labours Lost',  'A Midsummer Nights Dream',  'The Merchant of Venice',  'As You Like It',  'The Taming of the Shrew',  'Alls Well That Ends Well',  'Twelfth Night'], 'romance' : ['Pericles, Prince of Tyre',  'Cymbeline',  'The Winters Tale',  'The Tempest',  'The Two Noble Kinsmen'],  'tragedy' : ['Troilus and Cressida', 'Coriolanus', 'Titus Andronicus',  'Romeo and Juliet',  'Timon of Athens',  'Julius Caesar',  'Macbeth',  'Hamlet',  'King Lear',  'Othello',  'Antony and Cleopatra', 'King John', 'Edward III',  'Richard II',  'Henry IV, Part 1',  'Henry IV, Part 2',  'Henry V',  'Henry VI, Part 1',  'Henry VI, Part 2',  'Henry VI, Part 3',  'Richard III',  'Henry VIII']}
print('Number of plays per category in the modern dataset')
for cat, plays in play_cat_modern.items():
    print(cat, ':', len(plays))

Number of plays per category in the traditional dataset
comedy : 16
history : 11
tragedy : 12
Number of plays per category in the modern dataset
comedy : 12
romance : 5
tragedy : 22


I am grouping like this to keep my options open during sentiment analysis. There may be discernable differences in tone between comedies and tragedies (light/dark, happy/sad, fun/serious). Simimaly, the histories category may be more neutral (traditional dataset) or appropriately binned as a tragedy (modern). Finally, the romance plays are are different from the rest in that they are a mixture of comedy and tradegy, and the comingling may muddy sentiment analysis of the strict dataset.

In [4]:
wiki_plays = collections.OrderedDict()

for cat, plays in play_cat_strict.items():
    for i in range(len(plays)):
        wiki_plays[plays[i]] = []
        wiki_plays[plays[i]].append(cat)

for cat, plays in play_cat_modern.items():
    for i in range(len(plays)):
        wiki_plays[plays[i]].append(cat)

In [5]:
print(wiki_plays['The Tempest'])

['comedy', 'romance']


In [6]:
wiki_df = pd.DataFrame.from_dict(wiki_plays, orient='index', columns=['Traditional', 'Modern'])
wiki_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39 entries, The Tempest to Cymbeline
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Traditional  39 non-null     object
 1   Modern       39 non-null     object
dtypes: object(2)
memory usage: 936.0+ bytes


In [7]:
wiki_df = wiki_df.reset_index()
wiki_df = wiki_df.rename(columns={'index':'Play'})


In [8]:
wiki_df['Play'] = wiki_df['Play'].str.lower()
wiki_df.sample(5)

Unnamed: 0,Play,Traditional,Modern
26,henry viii,history,tragedy
15,the two noble kinsmen,comedy,romance
13,the winters tale,comedy,romance
12,twelfth night,comedy,comedy
5,much ado about nothing,comedy,comedy


Shalespeare's plays have been generously textised and are as a csv on Kaggle: https://www.kaggle.com/kingburrito666/shakespeare-plays

In [9]:
shakespeare_plays = pd.read_csv('../data/Shakespeare_data.csv')

In [10]:
shakespeare_plays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111396 entries, 0 to 111395
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Dataline          111396 non-null  int64  
 1   Play              111396 non-null  object 
 2   PlayerLinenumber  111393 non-null  float64
 3   ActSceneLine      105153 non-null  object 
 4   Player            111389 non-null  object 
 5   PlayerLine        111396 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 5.1+ MB


Compare play names in `wiki_plays` with `shakespeare_plays`

In [11]:
shakespeare_plays['Play'] = shakespeare_plays['Play'].str.lower()
shakespeare_plays.sample(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
17372,17373,as you like it,5.0,3.3.11,TOUCHSTONE,"Understanding, it strikes a man more dead than a"
34766,34767,hamlet,137.0,3.2.372,HAMLET,Or like a whale?
62439,62440,merchant of venice,6.0,2.7.78,MOROCCO,To take a tedious leave: thus losers part.
61556,61557,merchant of venice,32.0,1.1.188,ANTONIO,To have it of my trust or for my sake.
92479,92480,the tempest,160.0,,ARIEL,Enter CALIBAN with a burden of wood. A noise of thunder heard


In [12]:
shake_uniq = shakespeare_plays.Play.unique()
wiki_uniq = wiki_df.Play.unique()

In [13]:
print('Number of plays in kaggle file:', len(shake_uniq))
print('Number of plays in wiki categeory files:', len(wiki_uniq))

Number of plays in kaggle file: 36
Number of plays in wiki categeory files: 39


In [14]:
inconsistent = set(wiki_df.Play).difference(shakespeare_plays.Play)
print(len(inconsistent), inconsistent)

14 {'henry iv, part 1', 'the merchant of venice', 'the taming of the shrew', 'edward iii', 'the winters tale', 'henry vi, part 2', 'pericles, prince of tyre', 'henry iv, part 2', 'the merry wives of windsor', 'the two noble kinsmen', 'henry vi, part 1', 'the two gentlemen of verona', 'the comedy of errors', 'henry vi, part 3'}


Three of Shakespeare's plays are not prepresented in the kaggle dataset, and there is a namining discrepency between 14 plays. Investigating further...

In [15]:
shake_uniq.sort()
shake_uniq

array(['a comedy of errors', 'a midsummer nights dream', 'a winters tale',
       'alls well that ends well', 'antony and cleopatra',
       'as you like it', 'coriolanus', 'cymbeline', 'hamlet', 'henry iv',
       'henry v', 'henry vi part 1', 'henry vi part 2', 'henry vi part 3',
       'henry viii', 'julius caesar', 'king john', 'king lear',
       'loves labours lost', 'macbeth', 'measure for measure',
       'merchant of venice', 'merry wives of windsor',
       'much ado about nothing', 'othello', 'pericles', 'richard ii',
       'richard iii', 'romeo and juliet', 'taming of the shrew',
       'the tempest', 'timon of athens', 'titus andronicus',
       'troilus and cressida', 'twelfth night', 'two gentlemen of verona'],
      dtype=object)

In [16]:
wiki_uniq.sort()
wiki_uniq

array(['a midsummer nights dream', 'alls well that ends well',
       'antony and cleopatra', 'as you like it', 'coriolanus',
       'cymbeline', 'edward iii', 'hamlet', 'henry iv, part 1',
       'henry iv, part 2', 'henry v', 'henry vi, part 1',
       'henry vi, part 2', 'henry vi, part 3', 'henry viii',
       'julius caesar', 'king john', 'king lear', 'loves labours lost',
       'macbeth', 'measure for measure', 'much ado about nothing',
       'othello', 'pericles, prince of tyre', 'richard ii', 'richard iii',
       'romeo and juliet', 'the comedy of errors',
       'the merchant of venice', 'the merry wives of windsor',
       'the taming of the shrew', 'the tempest',
       'the two gentlemen of verona', 'the two noble kinsmen',
       'the winters tale', 'timon of athens', 'titus andronicus',
       'troilus and cressida', 'twelfth night'], dtype=object)

In [17]:
fuzzy_scores = {}

for proper_name in inconsistent:
    fuzzy_scores[proper_name] = fuzzywuzzy.process.extract(proper_name, shake_uniq, limit=2, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

In [18]:
pprint.pprint(fuzzy_scores)

{'edward iii': [('henry viii', 60), ('henry iv', 44)],
 'henry iv, part 1': [('henry vi part 1', 80), ('henry vi part 2', 73)],
 'henry iv, part 2': [('henry vi part 2', 80), ('henry vi part 1', 73)],
 'henry vi, part 1': [('henry vi part 1', 100), ('henry vi part 2', 93)],
 'henry vi, part 2': [('henry vi part 2', 100), ('henry vi part 1', 93)],
 'henry vi, part 3': [('henry vi part 3', 100), ('henry vi part 1', 93)],
 'pericles, prince of tyre': [('pericles', 52), ('taming of the shrew', 48)],
 'the comedy of errors': [('a comedy of errors', 84), ('the tempest', 45)],
 'the merchant of venice': [('merchant of venice', 90),
                            ('two gentlemen of verona', 53)],
 'the merry wives of windsor': [('merry wives of windsor', 92),
                                ('merchant of venice', 41)],
 'the taming of the shrew': [('taming of the shrew', 90),
                             ('a midsummer nights dream', 38)],
 'the two gentlemen of verona': [('two gentlemen of verona

'Henry IV, Part 2', 'The Two Noble Kinsmen', and 'Edward III' are completely missing from the `shakespeare_plays` dataset. The kaggle discussion seems to indicate that Part 1 and Part 2 have been collapesed into a single play. It looks like articles ("The" and "A") have been stripped from some of the plays in `shakespeare_plays`; however, there are also several instances of an incorrect article. There is also inconsistent capitalization in that dataset as well. I can use fuzzywuzzy scores to safely replace all the 100 score matches. The next closest score of 93 with start changing around the Henry IV and Henry the VI datasets.

In [19]:
def replace_col_matches(df, column, string_to_match, min_ratio = 100):
    """This function gets the top closest matches to our input string a list of unique strings and 
    replaces all rows with matches with a fuzzy ratio = 100 matches with the input matches
    """
    strings = df[column].unique()
    matches = fuzzywuzzy.process.extract(string_to_match, strings, 
                                         limit=1, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    close_matches = [matches[0] for matches in matches if matches[1] >= min_ratio]
    rows_with_matches = df[column].isin(close_matches)
    df.loc[rows_with_matches, column] = string_to_match

In [20]:
remakespeare_plays = shakespeare_plays
type(remakespeare_plays)

pandas.core.frame.DataFrame

In [21]:
for proper_name in inconsistent:
    replace_col_matches(remakespeare_plays, 'Play', proper_name)

In [22]:
print(set(wiki_df.Play).difference(remakespeare_plays.Play))


{'henry iv, part 1', 'the merchant of venice', 'the taming of the shrew', 'edward iii', 'the winters tale', 'pericles, prince of tyre', 'henry iv, part 2', 'the merry wives of windsor', 'the two noble kinsmen', 'the two gentlemen of verona', 'the comedy of errors'}


In [23]:
print(set(remakespeare_plays.Play).difference(wiki_df.Play))


{'a comedy of errors', 'henry iv', 'merchant of venice', 'taming of the shrew', 'a winters tale', 'merry wives of windsor', 'pericles', 'two gentlemen of verona'}


<div class="alert alert-block alert-info">
As much as it pains me, I think it is prudent to remove the articles 'The' and 'A', as this seems to be an area of frequent error and/or omission.</div>

In [24]:
renaming_sk = {'a comedy of errors':'comedy of errors', 'a winters tale':'winters tale', 'a midsummer nights dream':'midsummer nights dream'}
remakespeare_plays.Play = remakespeare_plays.Play.replace(renaming_sk)

In [25]:
renaming_wi = {'the comedy of errors':'comedy of errors', 'the winters tale':'winters tale', 'pericles, prince of tyre':'pericles', 'the merchant of venice':'merchant of venice', 'the merry wives of windsor':'merry wives of windsor', 'henry iv, part 1':'henry iv', 'the two gentlemen of verona':'two gentlemen of verona', 'the taming of the shrew':'taming of the shrew', 'a midsummer nights dream':'midsummer nights dream'}
wiki_df.Play = wiki_df.Play.replace(renaming_wi)

In [26]:
print(set(remakespeare_plays.Play).difference(wiki_df.Play))

set()


<div class="alert alert-block alert-info">
Combining the shakespeare plays dataset with the wiki play categories of Traditional and Modern</div>

In [27]:
shake_df = pd.merge(remakespeare_plays, wiki_df, how='left', on='Play')
shake_df.sample(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,Traditional,Modern
64305,64306,merry wives of windsor,69.0,1.1.135,SLENDER,"never come in mine own great chamber again else, of",comedy,comedy
57085,57086,macbeth,9.0,3.4.21,MACBETH,Thou art the nonpareil.,tragedy,tragedy
27497,27498,coriolanus,11.0,4.6.25,SICINIUS,"God-den, our neighbours.",tragedy,tragedy
37749,37750,henry v,1.0,3.0.13,Chorus,"Breasting the lofty surge: O, do but think",history,tragedy
88352,88353,romeo and juliet,15.0,5.3.100,ROMEO,Than with that hand that cut thy youth in twain,tragedy,tragedy


In [28]:
shake_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111396 entries, 0 to 111395
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Dataline          111396 non-null  int64  
 1   Play              111396 non-null  object 
 2   PlayerLinenumber  111393 non-null  float64
 3   ActSceneLine      105153 non-null  object 
 4   Player            111389 non-null  object 
 5   PlayerLine        111396 non-null  object 
 6   Traditional       111396 non-null  object 
 7   Modern            111396 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 7.6+ MB


<div class="alert alert-block alert-info">
Investigation of the missing values in the `ActSceneLine` column indicates that these appear to be stage directions. Consider keeping as titles can be obtained from stage direction, such as Anthony Boucher's <i>Exeunt Murderers</i> and Sara Woods's <i>Enter a Gentlewoman</i>.</div>

In [29]:
no_asl = shake_df['ActSceneLine'].isna()
shake_df[no_asl].sample(10)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,Traditional,Modern
39279,39280,henry v,13.0,,KING HENRY V,Enter MONTJOY,history,tragedy
8679,8680,"henry vi, part 2",5.0,,CADE,Putting on SIR HUMPHREY'S brigandine,history,tragedy
110847,110848,winters tale,24.0,,PAULINA,Enter a Gentleman,comedy,romance
15744,15745,as you like it,114.0,,KING,SCENE I. Orchard of Oliver's house.,comedy,comedy
12661,12662,alls well that ends well,9.0,,KING EDWARD IV,SCENE I. Rousillon. The COUNT's palace.,comedy,comedy
63309,63310,merchant of venice,28.0,,JESSICA,ACT IV,comedy,comedy
79314,79315,richard ii,33.0,,KING RICHARD II,"Flourish. Exeunt KING RICHARD II, QUEEN, DUKE OF AUMERLE, BUSHY, GREEN, and BAGOT",history,tragedy
19599,19600,antony and cleopatra,13.0,,MARK ANTONY,Exeunt,tragedy,tragedy
28575,28576,cymbeline,22.0,,QUEEN,Exit,tragedy,romance
34931,34932,hamlet,18.0,,LORD POLONIUS,Falls and dies,tragedy,tragedy


Note to Self: I particularly like these stage directions as titles:
- shake_df.loc[shake_df['Dataline']==3066]  *Dies*
- shake_df.loc[shake_df['Dataline']==33372] *They swear*



In [30]:
shake_df['ActSceneLine'] = shake_df['ActSceneLine'].fillna(method='backfill', axis=0)

In [31]:
shake_df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,Traditional,Modern
0,1,henry iv,,1.1.1,,ACT I,history,tragedy
1,2,henry iv,,1.1.1,,SCENE I. London. The palace.,history,tragedy
2,3,henry iv,,1.1.1,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others",history,tragedy
3,4,henry iv,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",history,tragedy
4,5,henry iv,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",history,tragedy


<div class="alert alert-block alert-info">There seems to be some issues with the PlayerLinenumber column that I discivered while looking through the data for the different plays. I also con't imagine this column being of much utility, even it was accurate. I will back fill the 3 missing values for now, and likely ignore/remove the column altogether at a later time. I will also backfill the 7 missing Players in the df, since it is likely the the next person to speak can be attributed to these scene directions.
</div>

In [32]:
shake_df['PlayerLinenumber'] = shake_df['PlayerLinenumber'].fillna(method='backfill', axis=0)
shake_df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,Traditional,Modern
0,1,henry iv,1.0,1.1.1,,ACT I,history,tragedy
1,2,henry iv,1.0,1.1.1,,SCENE I. London. The palace.,history,tragedy
2,3,henry iv,1.0,1.1.1,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others",history,tragedy
3,4,henry iv,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",history,tragedy
4,5,henry iv,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",history,tragedy


In [33]:
no_player = shake_df['Player'].isna()
shake_df[no_player].head(7)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,Traditional,Modern
0,1,henry iv,1.0,1.1.1,,ACT I,history,tragedy
1,2,henry iv,1.0,1.1.1,,SCENE I. London. The palace.,history,tragedy
2,3,henry iv,1.0,1.1.1,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others",history,tragedy
2015,2016,henry iv,10.0,3.2.183,,Eastcheap. The Boar's-Head Tavern.,history,tragedy
2016,2017,henry iv,10.0,3.2.183,,Enter FALSTAFF and BARDOLPH,history,tragedy
29459,29460,cymbeline,7.0,2.2.55,,An ante-chamber adjoining Imogen's apartments.,tragedy,romance
29460,29461,cymbeline,7.0,2.2.56,,Enter CLOTEN and Lords,tragedy,romance


In [34]:
shake_df['Player'] = shake_df['Player'].fillna(method='backfill', axis=0)

In [35]:
print(shake_df[shake_df['ActSceneLine'].isna()])

        Dataline          Play  PlayerLinenumber ActSceneLine   Player  \
111395    111396  winters tale              38.0          NaN  LEONTES   

       PlayerLine Traditional   Modern  
111395     Exeunt      comedy  romance  


In [36]:
shake_df['ActSceneLine'] = shake_df['ActSceneLine'].fillna(method='ffill', axis=0)

In [37]:
rename_again = {'henry vi, part 1':'henry vi part 1', 'henry vi, part 2':'henry vi part 2', 'henry vi, part 3':'henry vi part 3', 'the tempest':'tempest'}
shake_df.Play = shake_df.Play.replace(rename_again)

In [38]:
shake_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111396 entries, 0 to 111395
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Dataline          111396 non-null  int64  
 1   Play              111396 non-null  object 
 2   PlayerLinenumber  111396 non-null  float64
 3   ActSceneLine      111396 non-null  object 
 4   Player            111396 non-null  object 
 5   PlayerLine        111396 non-null  object 
 6   Traditional       111396 non-null  object 
 7   Modern            111396 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 7.6+ MB


<div class="alert alert-block alert-info">I was curious if there were common lines, particulaly those in the stage directions (like "Exit"). For now, I'll likely keep them in- I don't think they will have much influence. </div>

In [39]:
duplicates = shake_df.PlayerLine.duplicated()
dup_df = shake_df[duplicates]

In [40]:
duplicated_lines = {}

for line in dup_df.PlayerLine:
    if line in duplicated_lines:
        duplicated_lines[line] += 1
    else:
        duplicated_lines[line] = 1

In [41]:
print({k for k, v in duplicated_lines.items() if v >= 30})

{'Exeunt', 'DON', 'Enter a Messenger', 'ACT II', 'Exit', 'ACT III', 'ACT IV', 'ACT V', 'Reads', 'Aside', 'Dies', 'ACT I', 'ANTIPHOLUS'}


<div class="alert alert-block alert-info"> The list of Shakespearean titles contains titles derived from Shakespeare's sonnets, so I thought the inclusion of the sonnets would strengthen the dataset. The text of Shakespeare's sonnets has been curated by Prooffreader and downloaded from GitHub: https://github.com/Prooffreader/Datasets/tree/master/shakespeare_complete_works_data </div>

In [42]:
shakespeare_poems = pd.read_csv('../data/poem_lines.csv', encoding='latin-1')
shakespeare_poems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6043 entries, 0 to 6042
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  6043 non-null   int64  
 1   title               6043 non-null   object 
 2   intro_or_body       3886 non-null   object 
 3   sonnet_num          2157 non-null   float64
 4   stanza_num          3873 non-null   float64
 5   quatrain_num        2295 non-null   float64
 6   quatrain_bool       6043 non-null   int64  
 7   stanzasmall_bool    6043 non-null   int64  
 8   couplet_bool        6043 non-null   int64  
 9   tercet_bool         6043 non-null   int64  
 10  dedication_bool     6043 non-null   int64  
 11  subtitle_bool       6043 non-null   int64  
 12  line                6043 non-null   object 
 13  line_number_global  6039 non-null   float64
 14  line_number         3014 non-null   float64
 15  offset              2 non-null      float64
 16  rhyme 

In [43]:
shakespeare_poems.sample(5)

Unnamed: 0,id,title,intro_or_body,sonnet_num,stanza_num,quatrain_num,quatrain_bool,stanzasmall_bool,couplet_bool,tercet_bool,dedication_bool,subtitle_bool,line,line_number_global,line_number,offset,rhyme,contains_tags
5426,5427,Venus & Adonis,Body,,97.0,,0,0,0,0,0,0,The poor fool prays her that he may depart.,584.0,287.0,,,
5158,5159,Venus & Adonis,Body,,52.0,,0,0,0,0,0,0,"She puts on outward strangeness, seems unkind,",316.0,153.0,,,
4457,4458,Sonnets,,128.0,,3.0,1,0,0,0,0,0,"And situation with those dancing chips,",1789.0,13.0,,c,
2064,2065,The Rape of Lucrece,Body,,179.0,,0,0,0,0,0,0,Through crystal walls each little mote will peep;,1255.0,,,a,
3809,3810,Sonnets,,82.0,,2.0,1,0,0,0,0,0,And therefore art enforc'd to seek anew,1141.0,11.0,,a,


<div class="alert alert-block alert-info">
This dataset contains all of Shakespeare's poems. I have parsed out only the sonnets, including their line number in order to maintain the poems' orders. I am also forward-fillling the NaN line numbers in order to keep them together.</div>

In [44]:
sonnets = shakespeare_poems[shakespeare_poems.title == 'Sonnets']
sonnets = sonnets[['id', 'title', 'sonnet_num', 'line', 'line_number']].reset_index()
sonnets.drop(columns=['index'], inplace=True)
sonnets.head()

Unnamed: 0,id,title,sonnet_num,line,line_number
0,2670,Sonnets,1.0,"From fairest creatures we desire increase,",1.0
1,2671,Sonnets,1.0,"That thereby beauty's rose might never die,",
2,2672,Sonnets,1.0,"But as the riper should by time decease,",2.0
3,2673,Sonnets,1.0,His tender heir might bear his memory:,
4,2674,Sonnets,1.0,"But thou, contracted to thine own bright eyes,",3.0


In [45]:
sonnets['line_number'] = sonnets['line_number'].fillna(method='ffill', axis=0)
sonnets.head()

Unnamed: 0,id,title,sonnet_num,line,line_number
0,2670,Sonnets,1.0,"From fairest creatures we desire increase,",1.0
1,2671,Sonnets,1.0,"That thereby beauty's rose might never die,",1.0
2,2672,Sonnets,1.0,"But as the riper should by time decease,",2.0
3,2673,Sonnets,1.0,His tender heir might bear his memory:,2.0
4,2674,Sonnets,1.0,"But thou, contracted to thine own bright eyes,",3.0


<div class="alert alert-block alert-info"> Comparings this df to the main shake_df to see if they can/should be merged. The 154 sonnets are largely concerned with love, so the Traditional/Modern catagorization for (potential) sentiment analysis won'e match 1:1.
</div>

In [46]:
sonnets.sample(5)

Unnamed: 0,id,title,sonnet_num,line,line_number
28,2698,Sonnets,3.0,"Look in thy glass and tell the face thou viewest,",1.0
1032,3702,Sonnets,74.0,"The coward conquest of a wretch's knife,",13.0
731,3401,Sonnets,53.0,"And you, but one, can every shadow lend:",2.0
826,3496,Sonnets,60.0,"Like as the waves make towards the pibbled shore,",8.0
183,2853,Sonnets,14.0,"And yet methinks I have astronomy,",8.0


In [47]:
shake_df.sample(5)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,Traditional,Modern
44107,44108,king john,71.0,2.1.320,French Herald,"Enter English Herald, with trumpet",history,tragedy
43537,43538,king john,13.0,1.1.44,QUEEN ELINOR,Enter a Sheriff,history,tragedy
58696,58697,measure for measure,7.0,1.3.41,DUKE VINCENTIO,When evil deeds have their permissive pass,comedy,comedy
64753,64754,merry wives of windsor,1.0,2.1.16,MISTRESS PAGE,"Or any kind of light,",comedy,comedy
14090,14091,alls well that ends well,9.0,3.2.25,COUNTESS,BERTRAM.,comedy,comedy


<div class="alert alert-block alert-info"> Make the sonnets lower case and combine the `title` and `sonnet_num' column. Also converting the line_number column of the sonnets into a form that coincides with the plays `ActSceneLine` column. Finally, the shake_df has an ordered column called DataLine that ends at 111396. I am starting the sonnets id column at 111397 so that I can concatenante the two dfs together
</div>

In [49]:
sonnets['title'] = 'sonnet'
sonnets['sonnet_num'] = sonnets['sonnet_num'].astype(int).astype(str)

In [50]:
sonnets_df = sonnets
sonnets_df['title'] = sonnets['title'] + ' ' + sonnets['sonnet_num']
sonnets_df.drop(columns=['sonnet_num'], inplace=True)

In [51]:
sonnets_df.head()

Unnamed: 0,id,title,line,line_number
0,2670,sonnet 1,"From fairest creatures we desire increase,",1.0
1,2671,sonnet 1,"That thereby beauty's rose might never die,",1.0
2,2672,sonnet 1,"But as the riper should by time decease,",2.0
3,2673,sonnet 1,His tender heir might bear his memory:,2.0
4,2674,sonnet 1,"But thou, contracted to thine own bright eyes,",3.0


In [52]:
sonnets_df2 = sonnets_df.copy(deep=True)

In [53]:
sonnets_df2.id = sonnets_df2.id + 108727


In [54]:
sonnets_df2.head()

Unnamed: 0,id,title,line,line_number
0,111397,sonnet 1,"From fairest creatures we desire increase,",1.0
1,111398,sonnet 1,"That thereby beauty's rose might never die,",1.0
2,111399,sonnet 1,"But as the riper should by time decease,",2.0
3,111400,sonnet 1,His tender heir might bear his memory:,2.0
4,111401,sonnet 1,"But thou, contracted to thine own bright eyes,",3.0


In [55]:
sonnets_df2 = sonnets_df2.rename(columns={'id':'Dataline', 'title':'Play', 'line':'Line', 'line_number':'ActSceneLine'})



In [56]:
sonnets_df2['ActSceneLine'] = sonnets_df2['ActSceneLine'].astype(int).astype(str)
sonnets_df2['ActSceneLine'] = '0.0.' + sonnets_df2['ActSceneLine']

In [57]:
sonnets_df2.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine
0,111397,sonnet 1,"From fairest creatures we desire increase,",0.0.1
1,111398,sonnet 1,"That thereby beauty's rose might never die,",0.0.1
2,111399,sonnet 1,"But as the riper should by time decease,",0.0.2
3,111400,sonnet 1,His tender heir might bear his memory:,0.0.2
4,111401,sonnet 1,"But thou, contracted to thine own bright eyes,",0.0.3


In [58]:
shake_df2 = shake_df[['Dataline', 'Play', 'PlayerLine', 'ActSceneLine']]

In [59]:
shake_df2 = shake_df2.rename(columns={'PlayerLine':'Line'})

In [60]:
print(shake_df2.shape)
print(sonnets_df2.shape)

(111396, 4)
(2157, 4)


In [61]:
lines_df = pd.concat([shake_df2, sonnets_df2], ignore_index=True)
print('The lines_df has', lines_df.shape[0], 'rows and', lines_df.shape[1], 'columns.')

The lines_df has 113553 rows and 4 columns.


In [62]:
lines_df.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine
0,1,henry iv,ACT I,1.1.1
1,2,henry iv,SCENE I. London. The palace.,1.1.1
2,3,henry iv,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others",1.1.1
3,4,henry iv,"So shaken as we are, so wan with care,",1.1.1
4,5,henry iv,"Find we a time for frighted peace to pant,",1.1.2


In [63]:
lines_df.tail()

Unnamed: 0,Dataline,Play,Line,ActSceneLine
113548,113549,sonnet 154,"Which from Love's fire took heat perpetual,",0.0.13
113549,113550,sonnet 154,Growing a bath and healthful remedy,0.0.13
113550,113551,sonnet 154,"For men diseas'd, but I, my mistress' thrall,",0.0.14
113551,113552,sonnet 154,"Came there for cure, and this by that I prove:",0.0.14
113552,113553,sonnet 154,"Love's fire heats water, water cools not love.",0.0.1


In [65]:
datapath = '../data'

In [67]:
import os

In [68]:
datapath_lines_df = os.path.join(datapath, 'lines_cleaned.csv')
lines_df.to_csv(datapath_lines_df, index=False)

In [69]:
datapath_shake_df = os.path.join(datapath, 'shake_plays_cleaned.csv')
shake_df.to_csv(datapath_shake_df, index=False)

In [70]:
datapath_sonnets_df2 = os.path.join(datapath, 'shake_sonnets_cleaned.csv')
sonnets_df2.to_csv(datapath_sonnets_df2, index=False)