The purpose of this notebook is to clean and normalise the text from the corpus, and provide a document term matrix for further NLP.

Prior to this, a frequency analysis and a list of potential OCR errors were generated (see OLD.DataCleaning.inpy). These errors were then corrected with grep / sed in the text files themselves, and the text files were then re-ingested into Python.

In [1]:
import pandas as pd
import nltk
nltk.download('punkt') 
import csv
from nltk.probability import FreqDist
import re
import string


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jakeb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Open the pickled corpus dataframe and show it as the cell output.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
import pandas as pd  # also imported in the first cell; repeating it is harmless

df = pd.read_pickle('corpus_1.pkl')
df

Unnamed: 0,Filename,Type,Date,Volume,Issue,Pages,OCR batch,Text,OCR filename
0,CWSupp007.pdf,Christian Workers’ Supplement,1891-07,,,4,1,"THE POLYTECHNIC MAGAZINE,\n\nChristian Workers...",CWSupp007.pdf.tiff.txt
1,CWSupp008.pdf,Christian Workers’ Supplement,1891-09,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street,\...",CWSupp008.pdf.tiff.txt
2,CWSupp009.pdf,Christian Workers’ Supplement,1891-10,,,4,1,"THE POLYTECHNIC MAGAZINE,\n\nChristian Workers...",CWSupp009.pdf.tiff.txt
3,CWSupp010.pdf,Christian Workers’ Supplement,1891-11,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street, ...",CWSupp010.pdf.tiff.txt
4,CWSupp011.pdf,Christian Workers’ Supplement,1891-12,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street, ...",CWSupp011.pdf.tiff.txt
...,...,...,...,...,...,...,...,...,...
1720,Quintinian005.pdf,Quintinian,1892-08,,5,6,12,"Supplement to Podytechnte Magiesine,\n\nAugust...",Quintinian005.pdf.tiff.txt
1721,Quintinian006.pdf,Quintinian,1892-09,,6,2,12,"Supplement to Polytechnic Magazine, September ...",Quintinian006.pdf.tiff.txt
1722,Quintinian007.pdf,Quintinian,1892-10,,7,8,12,"[Supplement to Polytechnic Magazine, October 6...",Quintinian007.pdf.tiff.txt
1723,Quintinian008.pdf,Quintinian,1892-11,,8,8,12,"{Supplement to Polytechute Magazine, November ...",Quintinian008.pdf.tiff.txt


In [3]:
# Sanity check: one sample cell of the Text column should hold a plain str.
type(df.at[100, 'Text'])

str

In [4]:
#function for cleaning text from https://github.com/adashofdata/nlp-in-python-tutorial/blob/master/1-Data-Cleaning.ipynb
def clean_text_round1(text):
    '''First cleaning pass over a single document.

    Lower-cases the text, removes text enclosed in square brackets, removes
    every character in ``string.punctuation`` and removes any word that
    contains a digit. Whitespace (including newlines) is left untouched.

    Parameters
    ----------
    text : str
        The raw document text.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\[ \] \w \d) out of Python's own
    # string-escape handling, which warns about unknown escapes like these.
    # '.' does not match a newline here, so a bracketed span is only removed
    # when '[' and ']' sit on the same line; otherwise just the bracket
    # characters themselves go in the punctuation step below.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def round1(x):
    '''Thin wrapper around clean_text_round1, used with Series.apply.'''
    return clean_text_round1(x)

In [5]:
# Run the first cleaning pass over every document and keep the result as a
# one-column dataframe.
data_clean = df['Text'].apply(round1).to_frame()
data_clean

Unnamed: 0,Text
0,the polytechnic magazine\n\nchristian workers’...
1,the polytechnic magazine regent street\n\nchr...
2,the polytechnic magazine\n\nchristian workers’...
3,the polytechnic magazine regent street w\n\nc...
4,the polytechnic magazine regent street w\n\nc...
...,...
1720,supplement to podytechnte magiesine\n\naugust ...
1721,supplement to polytechnic magazine september ...
1722,\n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \...
1723,supplement to polytechute magazine november \...


In [6]:
# Interactively spot-check one cleaned issue, selected by its row label.
issue = int(input('Enter an issue to check (0-1724):'))
print('Issue number:', issue)
print(data_clean.at[issue, 'Text'])


Issue number: 1700
 

 

the
poly technic
magazine

 

the lord is our strength

september 
one shilling

 

 
be snaphappy
 buy ilford

    
 

ilford selochrome pan is the all
purpose film that millions of happy
snappers choose for the pick of the

pictures this panchromatic film is the

one to use

the famous film for faces and places
 

the polytechnic
magazine

the polytechnic founded by quintin hogg
 regent street london 

vol xcix—no  

september 

price one shilling

in and around the poly

sunday sunday  october is our
services annual harvest festival and our

autumn poly family gathering
it is also the beginning of the year in which
the poly continues on its way the speaker
will be the rev a w hopkins and we
feel sure that we will receive one of mr
hopkins’ vigorous addresses we hope
that we will have as large a representative
gathering as possible from all sides of
poly life

kinnaird lecture it is hoped that the
for  kinnaird lecture for

 will be held
either on tuesday  

In [7]:

# After reading examples from throughout the run of magazines, these further
# characters were picked out for removal: ‘ ’ “ ” … — © ° » £ § and newlines.
def clean_text_round2(text):
    '''Second cleaning pass: delete typographic symbols and turn newlines into spaces.'''
    symbol_table = str.maketrans('', '', '‘’“”…—©°»£§')
    without_symbols = text.translate(symbol_table)
    # not sure about taking out the \n bit
    return without_symbols.replace('\n', ' ')


def round2(x):
    '''Thin wrapper around clean_text_round2, used with Series.apply.'''
    return clean_text_round2(x)

In [8]:
# Second cleaning pass over the already-cleaned text.
data_clean = data_clean['Text'].apply(round2).to_frame()
data_clean

Unnamed: 0,Text
0,the polytechnic magazine christian workers ...
1,the polytechnic magazine regent street chris...
2,the polytechnic magazine christian workers ...
3,the polytechnic magazine regent street w chr...
4,the polytechnic magazine regent street w chr...
...,...
1720,supplement to podytechnte magiesine august ...
1721,supplement to polytechnic magazine september ...
1722,...
1723,supplement to polytechute magazine november ...


In [9]:
# Spot-check another issue after the second cleaning pass.
issue = int(input('Enter an issue to check (0-1724):'))
print('Issue number:', issue)
print(data_clean.at[issue, 'Text'])

Issue number: 750
the polytechnic magazine  wednesday october        coming  wednesday oct pailiament in the great hall  physical development society in room  at new mem bers reception marlborough room   thursday oct german society room  at   dorcas society meets in the lecture hall welsh chapel great  castle street  mr and mrs green photographic society in room  shorthand socievy in room  at   friday oct our honorary dentist mr a ovey as on monday young women only dr lunns training class meet ings suspended for summer military pand practice foley street  grammar school titchfield street select choir meet at kings  weigh house at  reading circle the uses of biography mr scott durrant new men bers reception great hall   saturday oct football at merton hall ramblers tlarrow to pinner train leaves baker street at  our honorary dentist mr canton  baker street w can be seen between  and  am on production of members ticket  sunday oct prayer and praise meeting  am  service for young men by m

In [10]:
def clean_text_round3(text):
    '''Third cleaning pass: drop leading and trailing whitespace.'''
    return text.strip()


def round3(x):
    '''Thin wrapper around clean_text_round3, used with Series.apply.'''
    return clean_text_round3(x)

In [11]:
# Third cleaning pass: trim the whitespace left at either end of each document.
data_clean = data_clean['Text'].apply(round3).to_frame()
data_clean

Unnamed: 0,Text
0,the polytechnic magazine christian workers ...
1,the polytechnic magazine regent street chris...
2,the polytechnic magazine christian workers ...
3,the polytechnic magazine regent street w chr...
4,the polytechnic magazine regent street w chr...
...,...
1720,supplement to podytechnte magiesine august ...
1721,supplement to polytechnic magazine september ...
1722,ctober october the opening of a n...
1723,supplement to polytechute magazine november ...


In [12]:
# Spot-check an issue once more after the third cleaning pass.
issue = int(input('Enter an issue to check (0-1724):'))
print('Issue number:', issue)
print(data_clean.at[issue, 'Text'])

Issue number: 750
the polytechnic magazine  wednesday october        coming  wednesday oct pailiament in the great hall  physical development society in room  at new mem bers reception marlborough room   thursday oct german society room  at   dorcas society meets in the lecture hall welsh chapel great  castle street  mr and mrs green photographic society in room  shorthand socievy in room  at   friday oct our honorary dentist mr a ovey as on monday young women only dr lunns training class meet ings suspended for summer military pand practice foley street  grammar school titchfield street select choir meet at kings  weigh house at  reading circle the uses of biography mr scott durrant new men bers reception great hall   saturday oct football at merton hall ramblers tlarrow to pinner train leaves baker street at  our honorary dentist mr canton  baker street w can be seen between  and  am on production of members ticket  sunday oct prayer and praise meeting  am  service for young men by m

In [13]:
# Tokenise each cleaned document and store the token lists on the main dataframe.
df['tokenized_words'] = data_clean['Text'].apply(nltk.word_tokenize)

In [14]:
# Display the dataframe to inspect the tokenized_words column.
df

Unnamed: 0,Filename,Type,Date,Volume,Issue,Pages,OCR batch,Text,OCR filename,tokenized_words
0,CWSupp007.pdf,Christian Workers’ Supplement,1891-07,,,4,1,"THE POLYTECHNIC MAGAZINE,\n\nChristian Workers...",CWSupp007.pdf.tiff.txt,"[the, polytechnic, magazine, christian, worker..."
1,CWSupp008.pdf,Christian Workers’ Supplement,1891-09,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street,\...",CWSupp008.pdf.tiff.txt,"[the, polytechnic, magazine, regent, street, c..."
2,CWSupp009.pdf,Christian Workers’ Supplement,1891-10,,,4,1,"THE POLYTECHNIC MAGAZINE,\n\nChristian Workers...",CWSupp009.pdf.tiff.txt,"[the, polytechnic, magazine, christian, worker..."
3,CWSupp010.pdf,Christian Workers’ Supplement,1891-11,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street, ...",CWSupp010.pdf.tiff.txt,"[the, polytechnic, magazine, regent, street, w..."
4,CWSupp011.pdf,Christian Workers’ Supplement,1891-12,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street, ...",CWSupp011.pdf.tiff.txt,"[the, polytechnic, magazine, regent, street, w..."
...,...,...,...,...,...,...,...,...,...,...
1720,Quintinian005.pdf,Quintinian,1892-08,,5,6,12,"Supplement to Podytechnte Magiesine,\n\nAugust...",Quintinian005.pdf.tiff.txt,"[supplement, to, podytechnte, magiesine, augus..."
1721,Quintinian006.pdf,Quintinian,1892-09,,6,2,12,"Supplement to Polytechnic Magazine, September ...",Quintinian006.pdf.tiff.txt,"[supplement, to, polytechnic, magazine, septem..."
1722,Quintinian007.pdf,Quintinian,1892-10,,7,8,12,"[Supplement to Polytechnic Magazine, October 6...",Quintinian007.pdf.tiff.txt,"[ctober, october, the, opening, of, a, new, po..."
1723,Quintinian008.pdf,Quintinian,1892-11,,8,8,12,"{Supplement to Polytechute Magazine, November ...",Quintinian008.pdf.tiff.txt,"[supplement, to, polytechute, magazine, novemb..."


In [15]:
#Initial frequency analysis

# One token list per document, taken straight from the tokenized_words column
words = list(df['tokenized_words'])

# That is a list of lists, so flatten it into a single list of tokens
#see https://stackoverflow.com/questions/38666973/pandas-nltk-tokenizing-unhashable-type-list
allWords = [token for tokens in words for token in tokens]

# Count every token and print the 200 most frequent
fdist = FreqDist(allWords)
print(fdist.most_common(200))


[('the', 2046700), ('of', 961412), ('and', 953567), ('to', 807100), ('a', 684763), ('in', 543380), ('for', 310853), ('on', 262614), ('that', 252021), ('was', 251089), ('at', 249354), ('is', 248284), ('be', 241663), ('we', 208327), ('by', 204697), ('as', 198979), ('with', 187638), ('it', 182118), ('our', 170704), ('will', 170108), ('i', 158933), ('have', 150967), ('this', 143030), ('he', 139287), ('are', 139159), ('not', 133420), ('his', 131083), ('j', 123477), ('which', 122080), ('w', 121254), ('from', 115926), ('mr', 115408), ('all', 114572), ('were', 108781), ('but', 105535), ('h', 103131), ('who', 102645), ('members', 101688), ('had', 99817), ('their', 93361), ('one', 92436), ('c', 88852), ('e', 87644), ('has', 87277), ('been', 85165), ('very', 81744), ('you', 80219), ('they', 80145), ('an', 78991), ('or', 77672), ('first', 72123), ('poly', 71145), ('so', 70865), ('b', 69841), ('club', 69318), ('g', 68030), ('pass', 65181), ('f', 61833), ('there', 61016), ('us', 61005), ('time', 597

In [16]:
# Remove NLTK's English stopwords from every document's token list.

nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

df['tokenized_words'] = df['tokenized_words'].apply(
    lambda tokens: [token for token in tokens if token not in stop]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jakeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
#repeat frequency analysis to see effect of stopwords

# One token list per document, taken straight from the tokenized_words column
words = list(df['tokenized_words'])

# That is a list of lists, so flatten it into a single list of tokens
#see https://stackoverflow.com/questions/38666973/pandas-nltk-tokenizing-unhashable-type-list
allWords = [token for tokens in words for token in tokens]

# Count every token and print the 5000 most frequent
fdist = FreqDist(allWords)
print(fdist.most_common(5000))



In [18]:
#remove additional stopwords that were identified during OCR error review

# The tokens were lower-cased by the first cleaning pass, so every entry here
# must be lower-case to match anything; the original 'oS' entry could never
# match and is written as 'os'.
# NOTE(review): confirm 'os' is the token that was meant to be dropped.
# A set makes each membership test a hash lookup instead of a scan of the tuple.
stop = {'j', 'ae', 'wee', 'eee', 'te', 'st', 'roth', 'ot', 'os', 'fe', 'wes', 'ves', 'wv', 'ts', 'ws', 'oc', 'aes', 'th', 'ane', 'tes', 'ww', 'yy', 'tt', 'ase', 'vv', 'rs', 'oes', 'www', 'oa'}
df['tokenized_words'] = df['tokenized_words'].apply(lambda x: [item for item in x if item not in stop])


In [19]:
#extract words again to check them

# One token list per document, taken straight from the tokenized_words column
words = list(df['tokenized_words'])

# That is a list of lists, so flatten it into a single list of tokens
#see https://stackoverflow.com/questions/38666973/pandas-nltk-tokenizing-unhashable-type-list
allWords = [token for tokens in words for token in tokens]

In [20]:
#checking that additional stopwords have been removed
# Wrapping the flat token list in nltk.Text provides concordance(); note that
# this rebinds allWords from a plain list to an nltk.Text object.
allWords = nltk.Text(allWords)
allWords.concordance('eee')

no matches


In [21]:
#checking updated frequency distribution
# Recount the tokens in allWords and print the 5000 most frequent.
fdist = FreqDist(allWords)
print(fdist.most_common(5000))



In [22]:
#when examining the OCR there were a lot of short words that were OCR artefacts (e.g. ooo).
#  get the distinct words of fewer than 4 characters to see if any more should be added to stop words
short_words = {token for token in allWords if len(token) < 4}



In [23]:
# List the distinct short words alphabetically, one per line.
ordered_short_words = sorted(short_words)
for short_word in ordered_short_words:
    print(short_word)

aa
aaa
aab
aac
aad
aae
aaf
aag
aah
aai
aaj
aak
aal
aan
aao
aap
aar
aas
aat
aau
aav
aaw
aay
aa¥
ab
aba
abb
abc
abd
abe
abh
abi
abk
abl
abm
abn
abo
abp
abr
abs
abt
abu
abx
aby
ac
aca
acb
acc
acd
ace
acg
ach
aci
acj
ack
acl
acm
acn
aco
acp
acq
acr
acs
act
acu
acv
acw
acx
acy
acz
ac¢
ad
ada
adb
adc
add
ade
adg
adh
adi
adj
adk
adl
adm
adn
ado
adp
adr
ads
adu
adv
adw
ady
adz
ad®
adé
aea
aeb
aec
aed
aee
aef
aeg
aeh
aei
ael
aem
aen
aep
aer
aet
aeu
aev
aew
aex
aey
ae¢
af
afa
afc
afd
afe
aff
afh
afi
afk
afl
afm
afo
afp
afr
afs
aft
afu
af¢
af¥
ag
aga
agb
agd
age
agg
agh
agi
agk
agl
agm
agn
ago
agp
agr
ags
agt
agu
agw
agy
agé
ah
aha
ahb
ahd
ahe
ahg
ahh
ahi
ahk
ahl
ahn
aho
ahp
ahr
ahs
aht
ahw
ahy
ai
aia
aib
aic
aid
aie
aif
aig
aih
aii
aij
aik
ail
aim
aio
aip
air
ais
ait
aiv
aiw
aix
aiy
aj
aja
ajc
ajd
aje
ajg
aji
ajl
ajm
ajo
ajp
ajr
ajs
ajt
ajw
ak
aka
akc
ake
akf
akg
aki
akl
ako
aks
akt
aku
aky
al
ala
alb
alc
ald
ale
alf
alg
alh
ali
alj
alk
alm
aln
alo
alp
alr
als
alt
alu
alv
alw
aly
al¥
al€
ama
amb

In [24]:
#compare short words with dictionary to look for candidates to add to stop words 


#adapted from 'Natural Language Processing With Python' p60
#edited so it is including punctuation and numbers
#compare word list with corpus of English words and identify words not included to check for spelling errors


nltk.download('words')

def unusual_words (text):
    '''Return, sorted, the distinct items of `text` that are not in NLTK's English word list.

    Parameters
    ----------
    text : iterable of str
        Tokens to check. They are compared as given (no filtering of
        punctuation or numbers), so they are expected to be lower-case already.

    Returns
    -------
    list of str
        The tokens missing from the word list, in sorted order.
    '''
    text_vocab = set(text)
    # Lower-case the dictionary side, as the book's original version does, so a
    # lower-cased token is not reported as unusual merely because the word
    # list stores that entry with a capital letter.
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)

spelling = unusual_words(short_words)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\jakeb\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [25]:
# Print each short word that the dictionary comparison flagged, one per line.
for unusual_word in spelling:
    print(unusual_word)

aaa
aab
aac
aad
aae
aaf
aag
aah
aai
aaj
aak
aan
aao
aap
aar
aas
aat
aau
aav
aaw
aay
aa¥
ab
abc
abd
abe
abh
abi
abk
abl
abm
abn
abo
abp
abr
abs
abt
abx
ac
aca
acb
acc
acd
acg
aci
acj
ack
acl
acm
acn
aco
acp
acq
acr
acs
acu
acv
acw
acx
acy
acz
ac¢
ada
adb
adc
adg
adh
adi
adj
adk
adl
adm
adn
adp
adr
ads
adu
adv
adw
ad®
adé
aea
aeb
aec
aed
aee
aef
aeg
aeh
aei
ael
aem
aen
aep
aet
aeu
aev
aew
aex
aey
ae¢
af
afa
afc
afd
afe
aff
afh
afi
afk
afl
afm
afo
afp
afr
afs
afu
af¢
af¥
ag
agb
agd
agg
agh
agi
agk
agl
agm
agn
agp
agr
ags
agt
agu
agw
agé
ahb
ahd
ahe
ahg
ahh
ahi
ahk
ahl
ahn
ahp
ahr
ahs
aht
ahw
ahy
aia
aib
aic
aie
aif
aig
aih
aii
aij
aik
aio
aip
ais
aiv
aiw
aix
aiy
aj
aja
ajc
ajd
aje
ajg
aji
ajl
ajm
ajo
ajp
ajr
ajs
ajt
ajw
akc
akf
akg
aki
akl
aks
akt
aky
alc
ald
alg
alh
ali
alj
alm
alr
als
alu
alv
alw
al¥
al€
amb
amc
amd
amf
amg
aml
amm
amn
amo
amp
amr
ams
anb
anc
anf
ang
anh
anj
ank
anl
ano
anp
anq
ans
anu
anv
an¢
ané
ao
aoa
aob
aoc
aod
aoe
aof
aog
aok
aol
aon
aoo
aop
aor
aos
aot
aow
aoy
ao

In [30]:
#note, need to be cautious with this as some might be acronyms
#also look for punctuation like € to remove

#need to write the list of unusual words to file to read more easily
filename='shortUnusualWords.txt'
# The list contains non-ASCII characters (e.g. ¥ € ® é), so fix the encoding
# explicitly rather than relying on the platform default, which may not be
# able to encode every character.
with open(filename, 'w', encoding='utf-8') as f:
    f.writelines(word + '\n' for word in spelling)


In [31]:
#I think for now let's get rid of them

# spelling is a sorted list, so `item not in spelling` would scan it once for
# every token in the corpus. A set gives the same result with constant-time
# lookups, which makes this step dramatically faster.
spelling_lookup = set(spelling)
df['tokenized_words'] = df['tokenized_words'].apply(
    lambda x: [item for item in x if item not in spelling_lookup]
)

In [32]:
# Let's see what the dataframe looks like now that the unusual short words
# have been filtered out of tokenized_words.
df

Unnamed: 0,Filename,Type,Date,Volume,Issue,Pages,OCR batch,Text,OCR filename,tokenized_words
0,CWSupp007.pdf,Christian Workers’ Supplement,1891-07,,,4,1,"THE POLYTECHNIC MAGAZINE,\n\nChristian Workers...",CWSupp007.pdf.tiff.txt,"[polytechnic, magazine, christian, workers, re..."
1,CWSupp008.pdf,Christian Workers’ Supplement,1891-09,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street,\...",CWSupp008.pdf.tiff.txt,"[polytechnic, magazine, regent, street, christ..."
2,CWSupp009.pdf,Christian Workers’ Supplement,1891-10,,,4,1,"THE POLYTECHNIC MAGAZINE,\n\nChristian Workers...",CWSupp009.pdf.tiff.txt,"[polytechnic, magazine, christian, workers, re..."
3,CWSupp010.pdf,Christian Workers’ Supplement,1891-11,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street, ...",CWSupp010.pdf.tiff.txt,"[polytechnic, magazine, regent, street, w, chr..."
4,CWSupp011.pdf,Christian Workers’ Supplement,1891-12,,,4,1,"THE POLYTECHNIC MAGAZINE, 809, Regent Street, ...",CWSupp011.pdf.tiff.txt,"[polytechnic, magazine, regent, street, w, chr..."
...,...,...,...,...,...,...,...,...,...,...
1720,Quintinian005.pdf,Quintinian,1892-08,,5,6,12,"Supplement to Podytechnte Magiesine,\n\nAugust...",Quintinian005.pdf.tiff.txt,"[supplement, podytechnte, magiesine, august, p..."
1721,Quintinian006.pdf,Quintinian,1892-09,,6,2,12,"Supplement to Polytechnic Magazine, September ...",Quintinian006.pdf.tiff.txt,"[supplement, polytechnic, magazine, september,..."
1722,Quintinian007.pdf,Quintinian,1892-10,,7,8,12,"[Supplement to Polytechnic Magazine, October 6...",Quintinian007.pdf.tiff.txt,"[ctober, october, opening, new, polytechnic, s..."
1723,Quintinian008.pdf,Quintinian,1892-11,,8,8,12,"{Supplement to Polytechute Magazine, November ...",Quintinian008.pdf.tiff.txt,"[supplement, polytechute, magazine, november, ..."


In [39]:
# Show the full token list of the document with row label 1.
df.at[1, 'tokenized_words']

['polytechnic',
 'magazine',
 'regent',
 'street',
 'christian',
 'workers',
 'supplement',
 'september',
 'one',
 'penny',
 'warchworb',
 'good',
 'courage',
 'lord',
 'strength',
 'aimforward',
 'institute',
 'christ',
 'waapside',
 'hotes',
 'motto',
 'month',
 'motive',
 'love',
 'christ',
 'constraineth',
 'us',
 'cor',
 'v',
 'lord',
 'seeth',
 'man',
 'seeth',
 'man',
 'looketh',
 'outward',
 'appearance',
 'lord',
 'looketh',
 'heart',
 'sam',
 'thou',
 'thou',
 'fasteth',
 'anoint',
 'thine',
 'head',
 'wash',
 'thy',
 'face',
 'thou',
 'appear',
 'unto',
 'men',
 'fast',
 'unto',
 'thy',
 'father',
 'secret',
 'thy',
 'father',
 'seeth',
 'secret',
 'shall',
 'reward',
 'thee',
 'openly',
 'matt',
 'light',
 'body',
 'eye',
 'therefore',
 'thine',
 'eye',
 'single',
 'thy',
 'whole',
 'body',
 'shall',
 'full',
 'light',
 'matt',
 'good',
 'tree',
 'bring',
 'forth',
 'evil',
 'fruit',
 'neither',
 'corrupt',
 'tree',
 'bring',
 'forth',
 'good',
 'fruit',
 'wherefore',
 'fru

In [40]:
#looks good to me, let's pickle it as removing the short words took ages!
# Writes the dataframe, including the filtered tokenized_words column, to
# corpus_2.pkl so the cleaning does not have to be re-run.
df.to_pickle("corpus_2.pkl")