In [1]:
import pandas as pd
import numpy as np
import requests, re
import os

pd.set_option('display.max_colwidth', None)

In [2]:
all_titles = pd.read_csv('../data/uniq_titles.csv')

In [3]:
all_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Play        1461 non-null   object
 1   Title       1461 non-null   object
 2   Occurences  1461 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 34.4+ KB


In [4]:
all_titles.sample(10)

Unnamed: 0,Play,Title,Occurences
834,romeo and juliet,romeo's tune,1
863,romeo and juliet,a plague on both your causes,1
1067,sonnets,a joy proposed,1
1346,hamlet,cobbett's country book,1
1248,hamlet,the play's the thing,16
304,king lear,the mystery of things,1
1017,winters tale,winter's tale,1
632,midsummer nights dream,"lord, what fools",2
992,twelfth night,knitters in the sun,2
1168,hamlet,fools of nature,1


Re-index in order to make a unique identifier for `all_titles`.

In [5]:
all_titles = all_titles.reset_index()

In [6]:
all_titles.rename(columns={'index':'uniq_id'}, inplace=True)

In [7]:
all_titles.head()

Unnamed: 0,uniq_id,Play,Title,Occurences
0,0,alls well that ends well,all's well that ends well,3
1,1,alls well that ends well,jing tao hai land de ren sheng (all's well that ends well),1
2,2,alls well that ends well,all's well that ends wrong,1
3,3,alls well that ends well,all's well,4
4,4,alls well that ends well,poor caroline,1


Strip all punctuation from the titles

In [8]:
strip_titles = all_titles.copy(deep=True)

In [9]:
strip_titles['Title'] = strip_titles['Title'].str.replace(r'[^\w\s]', '')

In [10]:
strip_titles.head()

Unnamed: 0,uniq_id,Play,Title,Occurences
0,0,alls well that ends well,alls well that ends well,3
1,1,alls well that ends well,jing tao hai land de ren sheng alls well that ends well,1
2,2,alls well that ends well,alls well that ends wrong,1
3,3,alls well that ends well,alls well,4
4,4,alls well that ends well,poor caroline,1


In [11]:
plays_sonnets = pd.read_csv('../data/lines_cleaned.csv')

In [12]:
plays_sonnets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113553 entries, 0 to 113552
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Dataline      113553 non-null  int64 
 1   Play          113553 non-null  object
 2   Line          113553 non-null  object
 3   ActSceneLine  113553 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.5+ MB


In [13]:
plays_sonnets.sample(10)

Unnamed: 0,Dataline,Play,Line,ActSceneLine
48913,48914,julius caesar,Where is he?,5.4.20
87751,87752,romeo and juliet,"Things that, to hear them told, have made me tremble,",4.1.87
83756,83757,richard iii,Unto a lineal true-derived course.,3.7.202
27649,27650,coriolanus,"As you threw caps up will he tumble down,",4.6.171
42895,42896,henry viii,"Well, sir, what follows?",5.1.93
30171,30172,cymbeline,"And we will fear no poison, which attends",3.3.83
58945,58946,measure for measure,All-hallond eve.,2.1.123
91026,91027,taming of the shrew,Exeunt all but HORTENSIO,4.5.78
82381,82382,richard iii,Alas! for whose sake did I that ill deed?,1.4.201
87373,87374,romeo and juliet,"Monday, my lord,",3.4.19


Convert all play and sonnet lines to lower case, and strip all punctuation

In [14]:
strip_plays = plays_sonnets.copy(deep=True)

In [15]:
strip_plays['Line'] = strip_plays['Line'].str.replace(r'[^\w\s]', '')

In [16]:
strip_plays['Line'] = strip_plays['Line'].str.lower()

In [17]:
strip_plays.sample(10)

Unnamed: 0,Dataline,Play,Line,ActSceneLine
12576,12577,henry vi part 3,what will the aspiring blood of lancaster,5.6.61
69338,69339,much ado about nothing,faith niece you tax signior benedick too much,1.1.40
82572,82573,richard iii,and shall the same give pardon to a slave,2.1.104
23819,23820,comedy of errors,here comes my man i think he brings the money,4.4.9
73882,73883,othello,i faith i fear it has,3.3.243
77796,77797,pericles,that thought you worthy of it,4.6.83
102789,102790,troilus and cressida,son of a whore fight for a whore he tempts judgment,5.7.21
106557,106558,two gentlemen of verona,i do not seek to quench your loves hot fire,2.7.21
51031,51032,king lear,de do de do de bless thee from whirlwinds,3.4.61
5116,5117,henry vi part 1,let me persuade you take a better course,4.1.134


Concatenate all lines into a single string.

In [18]:
all_lines = ''

for line in strip_plays['Line']:
    line = line + ' '
    all_lines = all_lines + line
    

In [19]:
print(len(all_lines))
print(all_lines[:200])

4255876
act i scene i london the palace enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others so shaken as we are so wan with care find we a time for frighted peace to p


In [20]:
strip_plays.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine
0,1,henry iv,act i,1.1.1
1,2,henry iv,scene i london the palace,1.1.1
2,3,henry iv,enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others,1.1.1
3,4,henry iv,so shaken as we are so wan with care,1.1.1
4,5,henry iv,find we a time for frighted peace to pant,1.1.2


Search `all_lines` for identical matches of the list of unique titles

In [21]:
results = []
neg_results = []

for title in strip_titles['Title']:
    if title in all_lines:
        results.append(True)
        neg_results.append(False)
    else:
        results.append(False)
        neg_results.append(True)

In [22]:
matched_titles = strip_titles[results].reset_index(drop=True)

In [23]:
matched_titles.head()

Unnamed: 0,uniq_id,Play,Title,Occurences
0,0,alls well that ends well,alls well that ends well,3
1,3,alls well that ends well,alls well,4
2,6,alls well that ends well,the devil drives,5
3,8,alls well that ends well,edge of hazard,1
4,13,alls well that ends well,a mingled yarn,3


In [24]:
unmatched_titles = strip_titles[neg_results].reset_index(drop=True)

In [25]:
unmatched_titles.head()

Unnamed: 0,uniq_id,Play,Title,Occurences
0,1,alls well that ends well,jing tao hai land de ren sheng alls well that ends well,1
1,2,alls well that ends well,alls well that ends wrong,1
2,4,alls well that ends well,poor caroline,1
3,5,alls well that ends well,surviving childhood cancer,1
4,7,alls well that ends well,when the devil drives,1


In [26]:
print('Number of unique titles directly quoted from plays:', matched_titles.shape[0])
print('The above titles comprise', matched_titles.Occurences.sum(), 'unique works')
print('Number of mismatched titles', unmatched_titles.shape[0], 'due to figurative language')

Number of unique titles directly quoted from plays: 646
The above titles comprise 1620 unique works
Number of mismatched titles 815 due to figurative language


In [27]:
matched_titles.Occurences.describe()

count    646.000000
mean       2.507740
std        4.330791
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max       81.000000
Name: Occurences, dtype: float64

In [28]:
matched_titles.nlargest(10, 'Occurences')

Unnamed: 0,uniq_id,Play,Title,Occurences
161,308,king lear,full circle,81
277,566,merchant of venice,all that glitters,39
250,500,macbeth,all our yesterdays,24
280,580,merchant of venice,the quality of mercy,23
333,711,othello,sweet revenge,19
554,1277,hamlet,outrageous fortune,17
561,1287,hamlet,perchance to dream,17
109,200,julius caesar,the evil that men do,15
171,375,macbeth,the seeds of time,14
432,1002,twelfth night,midsummer madness,14


In [36]:
unmatched_titles.sample(25)

Unnamed: 0,uniq_id,Play,Title,Occurences
718,1271,hamlet,to die or not to die,2
398,751,richard ii,phantom britain,1
629,1110,hamlet,for all that lives,1
624,1099,hamlet,poor poor ophelia,1
393,739,richard ii,fire in the blood,1
653,1149,hamlet,foul deeds will arise,1
337,650,much ado about nothing,much ado about nodding,1
658,1160,hamlet,to thine own self be true,3
593,1051,sonnets,the keepers of heavens gate,1
6,10,alls well that ends well,web of life,3


I have noticed that improper articles (swapping 'the' for 'a' and visa versa) have prevented matches in previous work. Here I am investigating the potential impact of the article switches, and will strip them if it looks promising.

In [112]:
articles = ('a ', 'an ', 'the ')
result2 = unmatched_titles.Title.str.startswith(articles)

In [113]:
unmatched_articles = unmatched_titles[result2]
print(len(unmatched_articles))
unmatched_articles.sample(5)

226


Unnamed: 0,uniq_id,Play,Title,Occurences
39,87,as you like it,the queen of the night,1
269,543,merchant of venice,the merchant of menace,3
318,608,midsummer nights dream,a midsummer nights marriage,1
271,545,merchant of venice,a modern portia,1
331,627,midsummer nights dream,a girdle round the earth,2


In [114]:
for title in unmatched_articles.Title:  
    if title.startswith('an ') == True:
        unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('an ', '', 1)
    elif title.startswith('a ') == True:
        unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('a ', '', 1)
    elif title.startswith('the ') == True:
        unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('the ', '', 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('the ', '', 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_articles['Title'] = unmatched_articles['Title'].str.replace('a ', '', 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_articles['Title'] = unmatch

In [116]:
unmatched_articles.sample(5)

Unnamed: 0,uniq_id,Play,Title,Occurences
487,874,romeo and juliet,ear to ground,2
62,147,henry v,mettle of the pasture,1
472,850,romeo and juliet,romeo error,1
434,804,richard iii,wanton nymph,1
304,592,merry wives of windsor,merry wives of massachusetts,1


In [136]:
results = []
neg_results = []

for title in unmatched_articles['Title']:
    if title in all_lines:
        results.append(True)
        neg_results.append(False)
    else:
        results.append(False)
        neg_results.append(True)

In [179]:
matched_articles = unmatched_articles[results].reset_index(drop=True)

In [180]:
print('We matched', len(matched_articles), 'new titles')
matched_articles.sample(10)

We matched 47 new titles


Unnamed: 0,uniq_id,Play,Title,Occurences
12,311,king lear,full circle,1
7,260,king lear,little world of man,1
46,1443,hamlet,wounded name,1
43,1317,hamlet,traveller returns,1
19,688,othello,passing strange,1
15,538,measure for measure,giants strength,1
11,309,king lear,full circle,1
5,247,king lear,serpents tooth,4
10,283,king lear,wanton boys,1
22,752,richard ii,other eden,1


In [181]:
matched_titles.append(matched_articles)

Unnamed: 0,uniq_id,Play,Title,Occurences
0,0,alls well that ends well,alls well that ends well,3
1,3,alls well that ends well,alls well,4
2,6,alls well that ends well,the devil drives,5
3,8,alls well that ends well,edge of hazard,1
4,13,alls well that ends well,a mingled yarn,3
...,...,...,...,...
42,1293,hamlet,mortal coil,1
43,1317,hamlet,traveller returns,1
44,1362,hamlet,witching time,2
45,1379,hamlet,counterfeit presentment,1


In [182]:
dups = matched_titles.duplicated(subset='Title', keep=False)
matched_titles[dups]

Unnamed: 0,uniq_id,Play,Title,Occurences
12,26,antony and cleopatra,the gods themselves,6
13,27,king lear,the gods themselves,6
14,28,timon of athens,the gods themselves,6
15,29,troilus and cressida,the gods themselves,6
16,30,winters tale,the gods themselves,6
17,31,hamlet,the gods themselves,6
24,44,as you like it,under the greenwood tree,1
25,45,as you like it,under the greenwood tree,4
29,49,as you like it,thereby hangs a tale,4
30,50,merry wives of windsor,thereby hangs a tale,4


In [183]:
drop_me = matched_titles[dups].index.tolist()
print(drop_me)

[12, 13, 14, 15, 16, 17, 24, 25, 29, 30, 36, 37, 59, 60, 73, 74, 89, 90, 93, 94, 135, 136, 138, 139, 140, 141, 161, 162, 188, 189, 201, 202, 216, 217, 220, 221, 244, 245, 246, 247, 335, 336, 337, 338, 346, 347, 418, 419, 427, 428, 522, 523, 556, 557, 610, 611]


In [185]:
print(matched_titles.shape[0])
matched_titles.drop(drop_me, inplace=True)
print(matched_titles.shape[0])

646
590


In [187]:
new_rows = [pd.Series([26, 'multiple', 'the gods themselves', 6], index=matched_titles.columns),
                pd.Series([44, 'as you like it', 'under the greenwood tree', 5], index=matched_titles.columns),
                pd.Series([49, 'multiple', 'thereby hangs a tale', 4], index=matched_titles.columns),
                pd.Series([56, 'multiple', 'better days', 7], index=matched_titles.columns),
                pd.Series([104, 'multiple', 'heres a villain', 2], index=matched_titles.columns),
                pd.Series([119, 'multiple', 'times fool', 6], index=matched_titles.columns),
                pd.Series([157, 'multiple', 'exeunt murderers', 2], index=matched_titles.columns),
                pd.Series([162, 'henry vi part 2', 'kill all the lawyers', 6], index=matched_titles.columns),
                pd.Series([265, 'multiple', 'this little world', 5], index=matched_titles.columns),
                pd.Series([269, 'multiple', 'in such a night as this', 2], index=matched_titles.columns),
                pd.Series([271, 'multiple', 'in such a night', 2], index=matched_titles.columns),
                pd.Series([308, 'king lear', 'full circle', 82], index=matched_titles.columns),
                pd.Series([404, 'multiple', 'to sleep no more', 3], index=matched_titles.columns),
                pd.Series([427, 'multiple', 'enter two murderers', 2], index=matched_titles.columns),
                pd.Series([450, 'macbeth', 'double double', 5], index=matched_titles.columns),
                pd.Series([458, 'macbeth', 'fire burn', 2], index=matched_titles.columns),
                pd.Series([493, 'macbeth', 'tomorrow and tomorrow and tomorrow', 3], index=matched_titles.columns),
                pd.Series([495, 'macbeth', 'tomorrow and tomorrow', 3], index=matched_titles.columns),
                pd.Series([721, 'multiple', 'better angel', 4], index=matched_titles.columns),
                pd.Series([723, 'multiple', 'the better angel', 4], index=matched_titles.columns),
                pd.Series([742, 'multiple', 'the eye of heaven', 6], index=matched_titles.columns),
                pd.Series([986, 'twelfth night', 'lovers meeting', 5], index=matched_titles.columns),
                pd.Series([997, 'twelfth night', 'my fathers house', 14], index=matched_titles.columns),
                pd.Series([1190, 'hamlet', 'more things in heaven', 2], index=matched_titles.columns),
                pd.Series([1281, 'hamlet', 'a sea of troubles', 4], index=matched_titles.columns),
                pd.Series([1394, 'hamlet', 'what is a man', 2], index=matched_titles.columns)]

In [188]:
matched_titles = matched_titles.append(new_rows, ignore_index=True)

In [189]:
matched_titles.shape[0]

616

In [190]:
matched_titles.sample(50)

Unnamed: 0,uniq_id,Play,Title,Occurences
235,551,merchant of venice,shylock,3
586,1448,hamlet,flights of angels,2
11,25,antony and cleopatra,immortal longings,1
293,711,othello,sweet revenge,19
438,1123,hamlet,minds eye,1
76,179,henry vi part 3,the guilty mind,1
280,684,othello,for daws to peck at,1
520,1319,hamlet,pale cast,1
405,1046,sonnets,heavens gate,2
113,259,king lear,foul weather,1


In [191]:
print('Number of unique titles directly quoted from plays:', matched_titles.shape[0])
print('The above titles comprise', matched_titles.Occurences.sum(), 'unique works')

Number of unique titles directly quoted from plays: 616
The above titles comprise 1541 unique works


In [192]:
datapath = '../data'

In [193]:
import os

In [195]:
datapath_matched_titles = os.path.join(datapath, 'titles_cleaned.csv')
matched_titles.to_csv(datapath_matched_titles, index=False)

