### Imports, Reading in files & Defining functions

In [9]:
import os
import re
import io

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

from termcolor import colored

def PDFtoString(filePath, 
                pdfFolder='/Users/saranmedical-smile/Desktop/Language-of-Science/articles/AJS pdf files/1946to1966'): 
    
    out = io.StringIO()
    with open(os.path.join(pdfFolder,filePath), 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, out, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    #string = out.getvalue()
    return out.getvalue() 

In [63]:
#Define the folder in which all the pdfs are in 
# - assuming the type of analysis needs to be different depending on the journal / time period

#baseFolder = '/home/ishi/Desktop/franzosi/Language-of-Science/articles/AJS pdf files/1946to1966'
pdfFolder = '/Users/saranmedical-smile/Desktop/Language-of-Science/articles/AJS pdf files/1946to1966'
txtFolder = '/Users/saranmedical-smile/Desktop/Language-of-Science/corpus/AJS_1946to1966'

txtPaths = os.listdir(txtFolder) # list of all txt files
pdfPaths = os.listdir(pdfFolder) # list of all pdf files

txtPaths.sort(), pdfPaths.sort()

for indx, f in enumerate(filePaths[:5]):
    print(colored(indx, 'green'), colored(f, 'magenta'), '\n')
    

string = PDFtoString(os.path.join(pdfPaths[35])) #Example output of PDFtoString

[32m0[0m [35mAJS_1946_51_4_Antipathy and Social Behavior.txt[0m 

[32m1[0m [35mAJS_1946_51_4_Definitions of Economic Terms in the Dictionary of Sociology.txt[0m 

[32m2[0m [35mAJS_1946_51_4_Ethnic Behavior in Industry-Sponsorship and Rejection in a New England Factory.txt[0m 

[32m3[0m [35mAJS_1946_51_4_Intermarriages Between German Pioneers and Other Nationalities in Minnesota in 1860 and 1870.txt[0m 

[32m4[0m [35mAJS_1946_51_4_Methods of Measurement of Aesthetic Folkways.txt[0m 



### Print the corpus file

In [64]:
with open(os.path.join(baseFolder, filePaths[6]), 'r') as f:
    data = f.read()
print(data[:1000])# The header looks terrible - might make sense to use pdf2text just to extract header info

Pale AM ee ee ee at eta
are now only accessible on

the Chicago Journals website at

EVR LeU
Pee ot AR eee ea aaa
are now only accessible on
the Chicago Journals website at

VESTA a
The sociological literature on class sys-
tems in American communities raises a
number of problems for research at the
childhood level. The present study is of the
general question concerning to what extent
and in what observable ways the factor of
social status affects the social development
of children. Specifically, is the social-class
position of the family a contributing factor
in determining a child’s choice of friends
or the child’s reputation among his age-
mates? If so, how does its operation vary
with the increasing age of the child?

A community was selected for which the
status structure was known and in which
families whose children would be the sub-
jects of the research had already been class-
typed. Children of two age levels were
chosen as subjects.
Group A, 2 per cent of the population, is

### Regex Method 1 - re.compile.finditer(string)

This returns the indices at which the pattern occurs

In [20]:
pattern = re.compile(r'\d[ ]? \w') #Important to start regex expression with r --> raw string
matches = pattern.finditer(string)
for match in matches:
    pass#print(match)

### Regex Method 2 - re.findall(pattern, string)

This returns the pattern itself

In [62]:
%%time
#re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)
for indx, file in enumerate(pdfPaths):
    print(indx, colored(file, 'red'))
    string = PDFtoString(os.path.join(pdfFolder, file))
    print(colored(re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)[0], 'blue'))
    print(colored('Abstract match:', 'magenta'), colored(re.search(r'ABSTRACT', string), 'magenta'))
    matches = re.finditer(r'CHART', string)
    for match in matches:
        print(colored('Chart match:', 'green'), colored(match, 'green'))

0 [31mAJS_1946_51_4_Antipathy and Social Behavior.pdf[0m
[34mAccessed: 06-06-2016 18:17 UTC[0m
[35mAbstract match:[0m [35m<re.Match object; span=(1064, 1072), match='ABSTRACT'>[0m
1 [31mAJS_1946_51_4_Definitions of Economic Terms in the Dictionary of Sociology.pdf[0m
[34mAccessed: 06-06-2016 18:25 UTC[0m
[35mAbstract match:[0m [35m<re.Match object; span=(1139, 1147), match='ABSTRACT'>[0m
[32mChart match:[0m [32m<re.Match object; span=(39031, 39036), match='CHART'>[0m
[32mChart match:[0m [32m<re.Match object; span=(39837, 39842), match='CHART'>[0m
2 [31mAJS_1946_51_4_Ethnic Behavior in Industry-Sponsorship and Rejection in a New England Factory.pdf[0m
[34mAccessed: 06-06-2016 18:19 UTC[0m
[35mAbstract match:[0m [35m<re.Match object; span=(1158, 1166), match='ABSTRACT'>[0m
[32mChart match:[0m [32m<re.Match object; span=(5933, 5938), match='CHART'>[0m
3 [31mAJS_1946_51_4_Intermarriages Between German Pioneers and Other Nationalities in Minnesota in 186

[34mAccessed: 06-06-2016 19:13 UTC[0m
[35mAbstract match:[0m [35mNone[0m
36 [31mAJS_1966_71_4_The Altruism Scale A Measure of Co-operative, Individualistic, and Competitive Interpersonal Orientation.pdf[0m
[34mAccessed: 06-06-2016 19:10 UTC[0m
[35mAbstract match:[0m [35m<re.Match object; span=(1432, 1440), match='ABSTRACT'>[0m
CPU times: user 54.7 s, sys: 792 ms, total: 55.5 s
Wall time: 59.6 s


### Split string by re.compile(pattern, string) return val

In [28]:
access_ts = r'(Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC)'
#JSTOR_msg = r'\n \nREFERENCES \nLinked references are available on JSTOR for this article:\nhttp://www.jstor.org/stable/2774936?seq=1&cid=pdf-reference#references_tab_contents \nYou may need to log in to JSTOR to access the linked references.\n \nYour use of the JSTOR archive indicates your acceptance of the Terms & Conditions of Use, available at\n\nhttp://about.jstor.org/terms\n\n \n\nJSTOR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted\n\ndigital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about\n\nJSTOR, please contact support@jstor.org.\n\nThe University of Chicago Press is collaborating with JSTOR to digitize, preserve and extend access to\nAmerican Journal of Sociology\n\n'
JSTOR_msg = r'[A-Za-z]*, [0-9]* [A-Za-z]* [0-9]* [0-9][0-9]:[0-9][0-9]:[0-9][0-9] UTC'
str_split = re.split(access_ts, string)
# 0 --> Header w/out access date; 1 --> access_ts; 2 --> Body

In [29]:
matches = re.finditer(JSTOR_msg, str_split[2])
for match in matches:
    print(match)

<re.Match object; span=(883, 912), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(4964, 4993), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(9452, 9481), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(13959, 13988), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(18628, 18657), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(21470, 21499), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(25908, 25937), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(30498, 30527), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(34398, 34427), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(38880, 38909), match='Mon, 06 Jun 2016 19:10:40 UTC'>
<re.Match object; span=(43347, 43376), match='Mon, 06 Jun 2016 19:10:40 UTC'>


In [30]:
JSTOR_msg = r'[A-Za-z]*,\s*[0-9]*\s*[A-Za-z]*\s*[0-9]*\s*[0-9][0-9]:[0-9][0-9]:[0-9][0-9]\s*UTC\s*'
matches = re.finditer(JSTOR_msg, str_split[2])
for indx, match in enumerate(matches):
    print(indx, match)

0 <re.Match object; span=(883, 913), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
1 <re.Match object; span=(4964, 4994), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
2 <re.Match object; span=(9452, 9482), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
3 <re.Match object; span=(13959, 13989), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
4 <re.Match object; span=(18628, 18658), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
5 <re.Match object; span=(21470, 21500), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
6 <re.Match object; span=(25908, 25938), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
7 <re.Match object; span=(30498, 30528), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
8 <re.Match object; span=(34398, 34428), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
9 <re.Match object; span=(38880, 38910), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>
10 <re.Match object; span=(43347, 43377), match='Mon, 06 Jun 2016 19:10:40 UTC\n'>


In [48]:
AJS_mark = r'\d*\s*THE\s*AMERICAN\s*JOURNAL\s*OF\s*SOCIOLOGY\s*\d*\s*'
AJS_matches = re.finditer(AJS_mark, string)
for indx, match in enumerate(AJS_matches):
    print(indx, match)

0 <re.Match object; span=(5388, 5428), match='408 THE AMERICAN JOURNAL OF SOCIOLOGY\n\n '>
1 <re.Match object; span=(21894, 21935), match='412 THE AMERICAN JOURNAL OF SOCIOLOGY\n\n 8'>
2 <re.Match object; span=(30922, 30962), match='414 THE AMERICAN JOURNAL OF SOCIOLOGY\n\n '>
3 <re.Match object; span=(39304, 39344), match='416 THE AMERICAN JOURNAL OF SOCIOLOGY\n\n '>


In [57]:
for match in matches:
    print(match)

In [71]:
for table in re.finditer(r'Table \d*', string):
    print(table)

<re.Match object; span=(3854, 3861), match='Table 1'>
<re.Match object; span=(4183, 4190), match='Table 1'>
<re.Match object; span=(13019, 13026), match='Table 1'>
<re.Match object; span=(15229, 15236), match='Table 1'>
<re.Match object; span=(17843, 17850), match='Table 2'>
<re.Match object; span=(19813, 19820), match='Table 3'>


In [81]:
print(table), table.group()

<re.Match object; span=(19813, 19820), match='Table 3'>


(None, 'Table 3')