In [1]:
import pandas as pd
import numpy as np
import requests, re
import os

pd.set_option('display.max_colwidth', None)

In [2]:
all_titles = pd.read_csv('../data/uniq_titles.csv')

In [3]:
all_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Play        1461 non-null   object
 1   Title       1461 non-null   object
 2   Occurences  1461 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 34.4+ KB


In [4]:
all_titles.sample(10)

Unnamed: 0,Play,Title,Occurences
1024,winters tale,feather in the wind,3
1033,sonnets,a summer's lease,1
1339,hamlet,country matters,8
257,king lear,out of the storm,8
400,macbeth,the curtained sleep,1
997,twelfth night,my father's house,13
1218,hamlet,but thinking makes it so,1
16,antony and cleopatra,"new heaven, new earth",1
435,macbeth,malice domestic 4,1
1112,hamlet,the winds of heaven,2


Re-index in order to make a unique identifier for `all_titles`.

In [8]:
all_titles = all_titles.reset_index()

In [9]:
all_titles.rename(columns={'index':'uniq_id'}, inplace=True)

In [10]:
all_titles.head()

Unnamed: 0,uniq_id,Play,Title,Occurences
0,0,alls well that ends well,all's well that ends well,3
1,1,alls well that ends well,jing tao hai land de ren sheng (all's well that ends well),1
2,2,alls well that ends well,all's well that ends wrong,1
3,3,alls well that ends well,all's well,4
4,4,alls well that ends well,poor caroline,1


Strip all punctuation from the titles

In [11]:
strip_titles = all_titles.copy(deep=True)

In [12]:
strip_titles['Title'] = strip_titles['Title'].str.replace(r'[^\w\s]', '')

In [13]:
strip_titles.head()

Unnamed: 0,uniq_id,Play,Title,Occurences
0,0,alls well that ends well,alls well that ends well,3
1,1,alls well that ends well,jing tao hai land de ren sheng alls well that ends well,1
2,2,alls well that ends well,alls well that ends wrong,1
3,3,alls well that ends well,alls well,4
4,4,alls well that ends well,poor caroline,1


In [16]:
plays_sonnets = pd.read_csv('../data/lines_cleaned.csv')

In [17]:
plays_sonnets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113553 entries, 0 to 113552
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Dataline      113553 non-null  int64 
 1   Play          113553 non-null  object
 2   Line          113553 non-null  object
 3   ActSceneLine  113553 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.5+ MB


In [18]:
plays_sonnets.sample(10)

Unnamed: 0,Dataline,Play,Line,ActSceneLine
83537,83538,richard iii,The precedent was full as long a-doing:,3.6.7
45543,45544,king john,"Sir, sir, impatience hath his privilege.",4.3.32
172,173,henry iv,"No, thou shalt.",1.2.59
111900,111901,sonnet 37,As a decrepit father takes delight,0.0.1
43432,43433,henry viii,"God shall be truly known, and those about her",5.5.41
17185,17186,as you like it,me out of tune.,3.2.238
34815,34816,hamlet,"For we will fetters put upon this fear,",3.3.26
11042,11043,henry vi part 3,And would you not do much to do them good?,3.2.46
69525,69526,much ado about nothing,blind Cupid.,1.1.223
33412,33413,hamlet,Drabbing: you may go so far.,2.1.28


Convert all play and sonnet lines to lower case, and strip all punctuation

In [19]:
strip_plays = plays_sonnets.copy(deep=True)

In [20]:
strip_plays['Line'] = strip_plays['Line'].str.replace(r'[^\w\s]', '')

In [21]:
strip_plays['Line'] = strip_plays['Line'].str.lower()

In [22]:
strip_plays.sample(10)

Unnamed: 0,Dataline,Play,Line,ActSceneLine
43970,43971,king john,bedlam have done,2.1.187
60132,60133,measure for measure,shore of my modesty but my brother justice have i,3.2.243
57386,57387,macbeth,music and a song black spirits and c,4.1.43
19451,19452,antony and cleopatra,he is an absolute master,2.2.199
84284,84285,richard iii,hidest thou that forehead with a golden crown,4.4.140
99397,99398,troilus and cressida,armed and gone ere ye came to ilium helen was not,1.2.50
79454,79455,richard ii,hath broke his staff resignd his stewardship,2.2.60
68658,68659,midsummer nights dream,exeunt,4.1.106
52217,52218,king lear,i feel this pin prick would i were assured,4.7.64
107513,107514,two gentlemen of verona,unless i prove false traitor to myself,4.4.107


Concatenate all lines into a single string.

In [23]:
all_lines = ''

for line in strip_plays['Line']:
    line = line + ' '
    all_lines = all_lines + line
    

In [25]:
print(len(all_lines))
print(all_lines[:205])

4255876
act i scene i london the palace enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others so shaken as we are so wan with care find we a time for frighted peace to pant a


In [26]:
strip_plays.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine
0,1,henry iv,act i,1.1.1
1,2,henry iv,scene i london the palace,1.1.1
2,3,henry iv,enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others,1.1.1
3,4,henry iv,so shaken as we are so wan with care,1.1.1
4,5,henry iv,find we a time for frighted peace to pant,1.1.2


Search `all_lines` for identical matches of the list of unique titles

In [27]:
results = []
neg_results = []

for title in strip_titles['Title']:
    if title in all_lines:
        results.append(True)
        neg_results.append(False)
    else:
        results.append(False)
        neg_results.append(True)

In [28]:
matched_titles = strip_titles[results].reset_index(drop=True)

In [29]:
matched_titles.head()

Unnamed: 0,uniq_id,Play,Title,Occurences
0,0,alls well that ends well,alls well that ends well,3
1,3,alls well that ends well,alls well,4
2,6,alls well that ends well,the devil drives,5
3,8,alls well that ends well,edge of hazard,1
4,13,alls well that ends well,a mingled yarn,3


In [30]:
unmatched_titles = strip_titles[neg_results].reset_index(drop=True)

In [31]:
unmatched_titles.head()

Unnamed: 0,uniq_id,Play,Title,Occurences
0,1,alls well that ends well,jing tao hai land de ren sheng alls well that ends well,1
1,2,alls well that ends well,alls well that ends wrong,1
2,4,alls well that ends well,poor caroline,1
3,5,alls well that ends well,surviving childhood cancer,1
4,7,alls well that ends well,when the devil drives,1


In [34]:
print('Number of unique titles directly quoted from plays:', matched_titles.shape[0])
print('The above titles comprise', matched_titles.Occurences.sum(), 'unique works')
print('Number of mismatched titles', unmatched_titles.shape[0], 'due to figurative language')

Number of unique titles directly quoted from plays: 646
The above titles comprise 1620 unique works
Number of mismatched titles 815 due to figurative language


In [35]:
matched_titles.Occurences.describe()

count    646.000000
mean       2.507740
std        4.330791
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max       81.000000
Name: Occurences, dtype: float64

In [36]:
matched_titles.nlargest(10, 'Occurences')

Unnamed: 0,uniq_id,Play,Title,Occurences
161,308,king lear,full circle,81
277,566,merchant of venice,all that glitters,39
250,500,macbeth,all our yesterdays,24
280,580,merchant of venice,the quality of mercy,23
333,711,othello,sweet revenge,19
554,1277,hamlet,outrageous fortune,17
561,1287,hamlet,perchance to dream,17
109,200,julius caesar,the evil that men do,15
171,375,macbeth,the seeds of time,14
432,1002,twelfth night,midsummer madness,14


In [37]:
unmatched_titles.sample(25)

Unnamed: 0,uniq_id,Play,Title,Occurences
717,1269,hamlet,chemistry of conjugated cyclic compounds,1
233,449,macbeth,bubble bubble toil and trouble,1
795,1420,hamlet,adams profession and its conquest by eve,1
182,345,king lear,the circle comes full,1
308,597,merry wives of windsor,the falstaff saga,1
121,252,king lear,a fathers curse and other stories,1
761,1346,hamlet,cobbetts country book,1
561,974,titus andronicus,the gentle people,4
667,1186,hamlet,an arrant knave and other plays,1
377,710,othello,murder out of turn,1


I have noticed that improper articles (swapping 'the' for 'a' and visa versa) have prevented matches in previous work. Here I am investigating the potential impact of the article switches, and will strip them if it looks promising.

In [38]:
articles = ('a ', 'an ', 'the ')
result2 = unmatched_titles.Title.str.startswith(articles)

In [39]:
unmatched_articles = unmatched_titles[result2]
print(len(unmatched_articles))
unmatched_articles.sample(5)

226


Unnamed: 0,uniq_id,Play,Title,Occurences
473,851,romeo and juliet,the romeo flag,1
330,621,midsummer nights dream,a midsummer tempest,1
44,103,comedy of errors,a comedy of terrors,3
678,1206,hamlet,the method in the madness,1
643,1134,hamlet,a smile in the minds eye,1


In [40]:
for title in unmatched_articles.Title:  
    if title.startswith('an ') == True:
        unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('an ', '', 1)
    elif title.startswith('a ') == True:
        unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('a ', '', 1)
    elif title.startswith('the ') == True:
        unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('the ', '', 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('the ', '', 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('a ', '', 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_articles['Title'] = unmatch

In [41]:
unmatched_articles.sample(5)

Unnamed: 0,uniq_id,Play,Title,Occurences
260,524,macbeth,sound and the fury,1
223,422,macbeth,chishtis,1
562,975,titus andronicus,gentle people,1
667,1186,hamlet,arrant knave and other plays,1
301,583,merchant of venice,scabbardless sword,1


In [42]:
results = []
neg_results = []

for title in unmatched_articles['Title']:
    if title in all_lines:
        results.append(True)
        neg_results.append(False)
    else:
        results.append(False)
        neg_results.append(True)

In [43]:
matched_articles = unmatched_articles[results].reset_index(drop=True)

In [44]:
print('We matched', len(matched_articles), 'new titles')
matched_articles.sample(10)

We matched 47 new titles


Unnamed: 0,uniq_id,Play,Title,Occurences
7,260,king lear,little world of man,1
24,802,richard iii,son of york,1
44,1362,hamlet,witching time,2
20,735,richard ii,womans war,1
21,736,richard ii,spotless reputation,1
27,859,romeo and juliet,falconers voice,1
34,1028,winters tale,merry heart,5
40,1151,hamlet,danger of desire,1
6,253,king lear,fathers curse,1
22,752,richard ii,other eden,1


In [45]:
matched_titles.append(matched_articles)

Unnamed: 0,uniq_id,Play,Title,Occurences
0,0,alls well that ends well,alls well that ends well,3
1,3,alls well that ends well,alls well,4
2,6,alls well that ends well,the devil drives,5
3,8,alls well that ends well,edge of hazard,1
4,13,alls well that ends well,a mingled yarn,3
...,...,...,...,...
42,1293,hamlet,mortal coil,1
43,1317,hamlet,traveller returns,1
44,1362,hamlet,witching time,2
45,1379,hamlet,counterfeit presentment,1


In [38]:
dups = matched_titles.duplicated(subset='Title', keep=False)
matched_titles[dups]

Unnamed: 0,uniq_id,Play,Title,Occurences
12,26,antony and cleopatra,the gods themselves,6
13,27,king lear,the gods themselves,6
14,28,timon of athens,the gods themselves,6
15,29,troilus and cressida,the gods themselves,6
16,30,winters tale,the gods themselves,6
17,31,hamlet,the gods themselves,6
24,44,as you like it,under the greenwood tree,1
25,45,as you like it,under the greenwood tree,4
29,49,as you like it,thereby hangs a tale,4
30,50,merry wives of windsor,thereby hangs a tale,4


In [39]:
drop_me = matched_titles[dups].index.tolist()
print(drop_me)

[12, 13, 14, 15, 16, 17, 24, 25, 29, 30, 36, 37, 59, 60, 73, 74, 89, 90, 93, 94, 135, 136, 138, 139, 140, 141, 161, 162, 188, 189, 201, 202, 216, 217, 220, 221, 244, 245, 246, 247, 335, 336, 337, 338, 346, 347, 418, 419, 427, 428, 522, 523, 556, 557, 610, 611]


In [40]:
print(matched_titles.shape[0])
matched_titles.drop(drop_me, inplace=True)
print(matched_titles.shape[0])

646
590


In [41]:
new_rows = [pd.Series([26, 'multiple', 'the gods themselves', 6], index=matched_titles.columns),
                pd.Series([44, 'as you like it', 'under the greenwood tree', 5], index=matched_titles.columns),
                pd.Series([49, 'multiple', 'thereby hangs a tale', 4], index=matched_titles.columns),
                pd.Series([56, 'multiple', 'better days', 7], index=matched_titles.columns),
                pd.Series([104, 'multiple', 'heres a villain', 2], index=matched_titles.columns),
                pd.Series([119, 'multiple', 'times fool', 6], index=matched_titles.columns),
                pd.Series([157, 'multiple', 'exeunt murderers', 2], index=matched_titles.columns),
                pd.Series([162, 'henry vi part 2', 'kill all the lawyers', 6], index=matched_titles.columns),
                pd.Series([265, 'multiple', 'this little world', 5], index=matched_titles.columns),
                pd.Series([269, 'multiple', 'in such a night as this', 2], index=matched_titles.columns),
                pd.Series([271, 'multiple', 'in such a night', 2], index=matched_titles.columns),
                pd.Series([308, 'king lear', 'full circle', 82], index=matched_titles.columns),
                pd.Series([404, 'multiple', 'to sleep no more', 3], index=matched_titles.columns),
                pd.Series([427, 'multiple', 'enter two murderers', 2], index=matched_titles.columns),
                pd.Series([450, 'macbeth', 'double double', 5], index=matched_titles.columns),
                pd.Series([458, 'macbeth', 'fire burn', 2], index=matched_titles.columns),
                pd.Series([493, 'macbeth', 'tomorrow and tomorrow and tomorrow', 3], index=matched_titles.columns),
                pd.Series([495, 'macbeth', 'tomorrow and tomorrow', 3], index=matched_titles.columns),
                pd.Series([721, 'multiple', 'better angel', 4], index=matched_titles.columns),
                pd.Series([723, 'multiple', 'the better angel', 4], index=matched_titles.columns),
                pd.Series([742, 'multiple', 'the eye of heaven', 6], index=matched_titles.columns),
                pd.Series([986, 'twelfth night', 'lovers meeting', 5], index=matched_titles.columns),
                pd.Series([997, 'twelfth night', 'my fathers house', 14], index=matched_titles.columns),
                pd.Series([1190, 'hamlet', 'more things in heaven', 2], index=matched_titles.columns),
                pd.Series([1281, 'hamlet', 'a sea of troubles', 4], index=matched_titles.columns),
                pd.Series([1394, 'hamlet', 'what is a man', 2], index=matched_titles.columns)]

In [42]:
matched_titles = matched_titles.append(new_rows, ignore_index=True)

In [43]:
matched_titles.shape[0]

616

In [44]:
matched_titles.sample(25)

Unnamed: 0,uniq_id,Play,Title,Occurences
162,410,macbeth,painted devil,1
542,1366,hamlet,when churchyards yawn,1
439,1124,hamlet,minds i,1
240,580,merchant of venice,the quality of mercy,23
86,191,julius caesar,dogs of war,2
401,1041,sonnets,my tattered loving,1
38,90,as you like it,better strangers,1
512,1299,hamlet,the laws delay,2
426,1104,hamlet,so gracious is the time,1
24,54,as you like it,i must have liberty,1


In [45]:
print('Number of unique titles directly quoted from plays:', matched_titles.shape[0])
print('The above titles comprise', matched_titles.Occurences.sum(), 'unique works')

Number of unique titles directly quoted from plays: 616
The above titles comprise 1541 unique works


Subset the original `all_titles` df with the uniq_ids of the `matched_titles` titles in order to reintroduce punctuation.

In [46]:
clean_titles = matched_titles.copy(deep=True)

In [47]:
clean_titles.rename(columns={'Title':'Drop_Me'}, inplace=True)
clean_titles.head()

Unnamed: 0,uniq_id,Play,Drop_Me,Occurences
0,0,alls well that ends well,alls well that ends well,3
1,3,alls well that ends well,alls well,4
2,6,alls well that ends well,the devil drives,5
3,8,alls well that ends well,edge of hazard,1
4,13,alls well that ends well,a mingled yarn,3


In [49]:
clean_titles = pd.merge(clean_titles, all_titles[['uniq_id', 'Title']], on='uniq_id', how='left')
clean_titles.head()

Unnamed: 0,uniq_id,Play,Drop_Me,Occurences,Title
0,0,alls well that ends well,alls well that ends well,3,all's well that ends well
1,3,alls well that ends well,alls well,4,all's well
2,6,alls well that ends well,the devil drives,5,the devil drives
3,8,alls well that ends well,edge of hazard,1,edge of hazard
4,13,alls well that ends well,a mingled yarn,3,a mingled yarn


In [50]:
clean_titles.shape

(616, 5)

In [51]:
clean_titles = clean_titles.drop(columns=['Drop_Me'])
clean_titles.head()

Unnamed: 0,uniq_id,Play,Occurences,Title
0,0,alls well that ends well,3,all's well that ends well
1,3,alls well that ends well,4,all's well
2,6,alls well that ends well,5,the devil drives
3,8,alls well that ends well,1,edge of hazard
4,13,alls well that ends well,3,a mingled yarn


In [52]:
datapath = '../data'

In [53]:
import os

In [54]:
datapath_matched_titles = os.path.join(datapath, 'matched_titles.csv')
matched_titles.to_csv(datapath_matched_titles, index=False)

In [55]:
datapath_clean_titles = os.path.join(datapath, 'clean_titles.csv')
clean_titles.to_csv(datapath_clean_titles, index=False)

In [56]:
my_text = open(r'../data/all_lines.txt','w')
my_text.write(all_lines)
my_text.close()