### Imports, Reading in files & Defining functions

In [11]:
import os
import io
import regex as re
import numpy as np
from tqdm import tqdm
from termcolor import colored
from colorama import Back, Style

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = [word.upper() for word in stopwords.words('english')]
hard_coded_non_section_words = ['URL', 'UTC', 'JSTOR', 'AMERICAN', 'JOURNAL', 'SOCIOLOGY', 'ABSTRACT', 'TABLE', 'CHART',
                                'AMERICAN JOURNAL OF SOCIOLOGY', 'UNIVERSITY OF CHICAGO', 'AMERICA', 'JOURNAL OF SOCIOLOGY',
                                'REFERENCES', 'AMERICAN SOCIOLOGICAL REVIEW', 'AMERICAN SOCIOLOGICAL ASSOCIATION']

def PDFtoString(filePath, pdfFolder=None):
    
    out = io.StringIO()
    if pdfFolder is not None:
        filePath = os.path.join(pdfFolder, filePath)
    with open(filePath, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, out, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for indx, page in enumerate(PDFPage.create_pages(doc)):
            #if indx != 0:
            interpreter.process_page(page)
            
    return out.getvalue() 

def printMetaInfo(convertedStrings, pdfPaths, journal = 'AJS'):
    
    # Print the regex matches for access date, abstract, table/chart, and references
    for indx, string in enumerate(convertedStrings):
        print(indx, colored(pdfPaths[indx], 'red'))
        if re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string):
            print('\t', colored(re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)[0], 'blue'))
        print('\t', colored('Abstract match:', 'magenta'), colored(re.search(r'ABSTRACT', string), 'magenta'))
        
        # Chart, table, or figure matches
        matches = re.finditer(r'CHART', string)
        for match in matches:
            print('\t', colored('Chart match:', 'green'), colored(match, 'green'))
        matches = re.finditer(r'TABLE', string)
        for match in matches:
            print('\t', colored('Table match:', 'green'), colored(match, 'green'))
        matches = re.finditer(r'FIG|FIGURE|Fig', string)
        for match in matches:
            print('\t', colored('Figure match:', 'green'), colored(match, 'green'))  
            
        print('\t', colored('Reference match:', 'magenta'), colored(re.search(r'REFERENCES', string), 'magenta'))
        
        # This finds the title words which aren't stop words & aren't digits and upper cases them all
        if journal == 'AJS':
            titles = [re.findall(r'(?<=AJS_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in pdfPaths]
        elif journal == 'ASR':
            titles = [re.findall(r'(?<=ASR_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in pdfPaths]
        title_words = [i for i in titles[indx].upper().split() if i not in stop_words and not i.isdigit()]
        
        # Then adds them to stop words & other hard-coded regularly occuring words
        non_section_words = hard_coded_non_section_words + title_words + stop_words 
        non_section_words += [title.upper() for title in titles] + [title.strip('The ').upper() for title in titles]
        
        matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z]{2,}){0,}', string)
        sections = dict()
        
        for match in matches:
            if match.group() not in non_section_words and match not in sections:
                sections[match.group()] = match.span()
        for section in sections:
            print('\t', colored('Section match:', 'blue'), colored([section, sections[section]], 'blue'))

def createOutputStrings(convertedStrings, folder = None, journal = 'AJS'):
    out = list() #Output strings go here, hopefully one out per pdf s.t. len(out) == len(convertedStrings)
    for indx, string in enumerate(convertedStrings):
        # Define meta-information
        outString = '-------------\n||Meta-info||\n-------------\n'

        # The second condition is more restrictive, so check that that's not None.
        header = article = None
        if re.search(r'extend access to\n(.)+?(?=C)(.)+?(?=\w{2,})', string) is not None:
            header = string[:re.search(r'extend access to\n(.)+?(?=C)', string).end()].strip()
            if re.search(r' Your use of the JSTOR', header) is not None:
                header = header[:re.search(r' Your use of the JSTOR', header).span()[0]]
            article = string[re.search(r'extend access to\n(.)+?(?=C)(.)+?(?=\w{2,})', string).end():]
            outString += re.sub(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', '', header)

        # Add access date
        outString += '\n\n' + re.findall(r'Accessed: \d{2}-\d{2}-\d{4} \d{2}:\d{2} UTC', string)[0] + '\n'

        # Add abstract info
        abstract = re.search(r'ABSTRACT', string)
        if abstract is not None:
            outString += 'Abstract match at {}'.format(abstract.span()) + '\n'
        elif abstract is None:
            outString += 'No abstract found.\n'

        # Add chart info
        chart_matches = re.finditer(r'CHART', string)    
        if chart_matches is not None:
            for match in chart_matches:
                outString += 'Chart match at {}'.format(match.span()) + '\n'

        # Add table info 
        table_matches = re.finditer(r'TABLE', string)
        if table_matches is not None:
            for match in table_matches:
                outString += 'Chart match at {}'.format(match.span()) + '\n'
                
        # Add figure info
        figure_matches = re.finditer(r'FIG|FIGURE|Fig', string)
        if figure_matches is not None:
            for match in figure_matches:
                outString += 'Figure match at {}'.format(match.span()) + '\n'

        # Add references info
        references = re.search(r'REFERENCES', string)
        if references is not None:
            outString += 'Reference match at {}'.format(references.span()) + '\n'
        elif references is None:
            outString += 'No reference section found.\n'

        # Add section info 

        ### First, look for the title words and tokenize them.
        if journal == 'AJS':
            titles = [re.findall(r'(?<=AJS_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in folder]
        elif journal == 'ASR':
            titles = [re.findall(r'(?<=ASR_\d{4}_\d{1,3}_\d{1,2}_).*(?=.pdf)', file_name)[0] for file_name in folder]
        title_words = [i for i in titles[indx].upper().split() if i not in stop_words and not i.isdigit()]

        ### Then add them to stop words & other hard-coded regularly occuring words
        non_section_words = hard_coded_non_section_words + title_words + stop_words
        matches = re.finditer(r'[A-Z]{4,}(\s+?[A-Z0-9]{2,}){0,}', string)

        if matches is not None:
            sections = dict()
            for match in matches:
                if match.group() not in non_section_words and match not in sections:
                    sections[match.group()] = match.span()
        for section in sections:
            outString += 'Section header "{}" found at {}'.format(section, sections[section]) + '\n'

        # Add in the article itself
        outString += '\n-----------\n||Article||\n-----------\n'
        if article is not None:
            outString += article
        elif article is None: #If regex wasn't able to split it according to the JSTOR access message just put the full article
            outString += string
        # Append the newly made string to the list. Again, there should be 1 per pdf
        out.append(outString)
    return out
    
def writeOut(out, pdfFolder = None, outFolder = None):
    for indx, file in tqdm(enumerate(out)):
        writeFilePath = 'corpus/{}/{}.txt'.format(outFolder, pdfFolder[indx][:-4])
        with open(writeFilePath, 'w') as f:
            f.write(file)
    print('done!')

def highlight(pattern, text, print_output=True):
    output = text
    lookforward = 0
    for match in pattern.finditer(text):
        start, end = match.start() + lookforward, match.end() + lookforward
        output = output[:start] + Back.YELLOW + Style.BRIGHT + output[start:end] + Style.RESET_ALL + output[end:]
        lookforward = len(output) - len(text)  

    if print_output:
        print(output)
    else:
        return output

## American Journal of Sociology articles

In [2]:
%%time
# AJS articles - split into 3 periods

pre1946 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/pre1946/'
pre1946pdfs = os.listdir(pre1946) # list of all the pdf files 
pre1946pdfs.sort() # sort by year (and title)

l946to1966 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/1946to1966/'
l946to1966pdfs = os.listdir(l946to1966) # list of all the pdf files
l946to1966pdfs.sort()

post1971 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/AJS pdf files/post1971/'
post1971pdfs = os.listdir(post1971) # list of all the pdf files
post1971pdfs.sort()

# Convert the articles to strings... this is the time-consuming step
convertedStrings_pre1946 = [PDFtoString(os.path.join(pre1946, file)) for file in tqdm(pre1946pdfs) if file[-4:] == '.pdf']
convertedStrings_1946to1966 = [PDFtoString(os.path.join(l946to1966, file)) for file in tqdm(l946to1966pdfs) if file[-4:] == '.pdf']
convertedStrings_post1971 = [PDFtoString(os.path.join(post1971, file)) for file in tqdm(post1971pdfs) if file[-4:] == '.pdf']

100%|██████████| 63/63 [01:44<00:00,  1.66s/it]
100%|██████████| 37/37 [00:58<00:00,  1.58s/it]
100%|██████████| 68/68 [04:02<00:00,  3.57s/it]

CPU times: user 5min 47s, sys: 8.96 s, total: 5min 56s
Wall time: 6min 45s





### String pre-processing

In [3]:
# Remove the double lines / extra space characters & footer download / use notice
convertedStrings_pre1946 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_pre1946]
pattern = re.compile(r'This content downloaded from (.)+\n(.)+\x0c')
#highlight(pattern, convertedStrings_pre1946[4])
convertedStrings_pre1946 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_pre1946]
printMetaInfo(convertedStrings_pre1946, pdfPaths = pre1946pdfs)

0 [31mAJS_1896_1_4_Anti-Monopoly Legislation in the US.pdf[0m
	 [34mAccessed: 08-05-2016 22:57 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [35mReference match:[0m [35mNone[0m
	 [34mSection match:[0m [34m['ANTI', (32901, 32905)][0m
	 [34mSection match:[0m [34m['MONOPOLY LEGISLATION IN THE UNITED  STATES', (897, 939)][0m
	 [34mSection match:[0m [34m['RECENT', (942, 948)][0m
	 [34mSection match:[0m [34m['MONOPOLY LEGISLA TION', (32907, 32928)][0m
	 [34mSection match:[0m [34m['XXIV', (7166, 7170)][0m
	 [34mSection match:[0m [34m['MONOPOLY LEGISLATION', (9796, 9816)][0m
	 [34mSection match:[0m [34m['MONOPOL', (18830, 18837)][0m
	 [34mSection match:[0m [34m['LEGISLA TION', (28510, 28522)][0m
	 [34mSection match:[0m [34m['XIII', (16283, 16287)][0m
	 [34mSection match:[0m [34m['XVII', (16416, 16420)][0m
	 [34mSection match:[0m [34m['HALLE', (21144, 21149)][0m
	 [34mSection match:[0m [34m['MONVOPOL', (28499, 28507)][0m
	 [34mSection

	 [34mSection match:[0m [34m['AMERICAN  JOURNAL OF SOCIOLOGY  VOLUME XXVI JANUARY', (885, 936)][0m
	 [34mSection match:[0m [34m['NUMBER', (942, 948)][0m
	 [34mSection match:[0m [34m['SOCIOLOGY AND THE SOCIAL SCIENCES  ROBERT', (952, 993)][0m
	 [34mSection match:[0m [34m['PARK', (997, 1001)][0m
	 [34mSection match:[0m [34m['SOCIOLOGY AND', (1029, 1042)][0m
	 [34mSection match:[0m [34m['SCIENTIFIC', (1045, 1055)][0m
	 [34mSection match:[0m [34m['HISTORY', (40802, 40809)][0m
	 [34mSection match:[0m [34m['XVII', (10093, 10097)][0m
	 [34mSection match:[0m [34m['HISTORICAL AND SOCIOLOGICAL FACTS', (13638, 13671)][0m
	 [34mSection match:[0m [34m['HUMAN NATURE AND LAW', (30790, 30810)][0m
	 [34mSection match:[0m [34m['NATURAL HISTORY', (40811, 40826)][0m
	 [34mSection match:[0m [34m['XLIV', (43772, 43776)][0m
30 [31mAJS_1921_26_4_The Comparative Role of the Group Concept.pdf[0m
	 [34mAccessed: 10-05-2016 17:14 UTC[0m
	 [35mAbstract match:[0

	 [34mSection match:[0m [34m['AMERICAN  JOURNAL OF SOCIOLOGY  VOLUME XLI JANUARY', (931, 981)][0m
	 [34mSection match:[0m [34m['NUMBER', (987, 993)][0m
	 [34mSection match:[0m [34m['SOCIAL MOBILITY AND SOCIAL DISTANCE AMONG  HUNGARIAN IMMIGRANTS IN DETROIT', (997, 1071)][0m
	 [34mSection match:[0m [34m['ERDMANN DOANE BEYNON  ABSTRACT', (1074, 1104)][0m
	 [34mSection match:[0m [34m['HUNGARIAN IMMIGRANTS IN DETROIT', (26656, 26687)][0m
	 [34mSection match:[0m [34m['UNIVERSITY OF MICHIGAN', (31668, 31690)][0m
	 [34mSection match:[0m [34m['XIII', (32042, 32046)][0m
52 [31mAJS_1936_41_4_The Length of Time Required for the Stabilization of a Population.pdf[0m
	 [34mAccessed: 03-06-2016 20:01 UTC[0m
	 [35mAbstract match:[0m [35m<regex.Match object; span=(1007, 1015), match='ABSTRACT'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(9343, 9348), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(11074, 11079), match='TABLE'

In [4]:
convertedStrings_1946to1966 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_1946to1966]
pattern = re.compile(r'[A-Z]{4,}(\s+?[A-Z0-9]{2,}){0,}')
highlight(pattern, convertedStrings_1946to1966[10])
convertedStrings_1946to1966 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_1946to1966]
printMetaInfo(convertedStrings_1946to1966, pdfPaths = l946to1966pdfs)

      Job-Seeking and the Readjustment Allowance for Veterans
Author(s): Henry J. Meyer and  Erwin O. Smigel
Source: American Journal of Sociology, Vol. 56, No. 4 (Jan., 1951), pp. 341-347
Published by: The University of Chicago Press
Stable URL: http://www.jstor.org/stable/2771697
Accessed: 06-06-2016 18:32 UTC Your use of the [43m[1mJSTOR[0m archive indicates your acceptance of the Terms & Conditions of Use, available at http://about.jstor.org/terms [43m[1mJSTOR[0m is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted digital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about [43m[1mJSTOR[0m, please contact support@jstor.org. The University of Chicago Press is collaborating with [43m[1mJSTOR[0m to digitize, preserve and extend access to
American Journal of Sociology This content downloaded from 1

	 [34mSection match:[0m [34m['TYPES OF INTEGRATION AND THEIR MEASUREMENT  WERNER', (895, 945)][0m
	 [34mSection match:[0m [34m['LANDECKER', (949, 958)][0m
	 [34mSection match:[0m [34m['CULTURAL INTEGRATION', (7244, 7264)][0m
	 [34mSection match:[0m [34m['NORMATIVE INTEGRATION', (13672, 13693)][0m
	 [34mSection match:[0m [34m['XLVII', (15568, 15573)][0m
	 [34mSection match:[0m [34m['COMMUNICATIVE INTEGRATION', (18328, 18353)][0m
	 [34mSection match:[0m [34m['XLIX', (21928, 21932)][0m
	 [34mSection match:[0m [34m['XIII', (22137, 22141)][0m
	 [34mSection match:[0m [34m['FUNCTIONAL INTEGRATION', (27705, 27727)][0m
	 [34mSection match:[0m [34m['UNIVERSITY OF MICHIGAN', (37174, 37196)][0m
13 [31mAJS_1951_56_4_Witch Beliefs and Social Structure.pdf[0m
	 [34mAccessed: 06-06-2016 18:27 UTC[0m
	 [35mAbstract match:[0m [35m<regex.Match object; span=(1003, 1011), match='ABSTRACT'>[0m
	 [35mReference match:[0m [35mNone[0m
	 [34mSection match:[0m

	 [35mReference match:[0m [35m<regex.Match object; span=(306, 316), match='REFERENCES'>[0m
	 [34mSection match:[0m [34m['DRUGGISTS', (30205, 30214)][0m
	 [34mSection match:[0m [34m['PROBLEMS OF', (1161, 1172)][0m
	 [34mSection match:[0m [34m['MARGINAL OCCUPATION  THELMA HERMAN MCCORMACK  ABSTRACT', (1176, 1230)][0m
	 [34mSection match:[0m [34m['XXIV', (20540, 20544)][0m
	 [34mSection match:[0m [34m['COMPARISONS OF PRESTIGE RATINGS AND RANKS GIVEN BY  PHARMACY STUDENTS', (24028, 24097)][0m
	 [34mSection match:[0m [34m['NATIONAL SAMPLE AND  SUBGROUPS WITHIN NATIONAL SAMPLE', (24099, 24152)][0m
	 [34mSection match:[0m [34m['OCCUPATIONS CISTS TOTAL', (24156, 24179)][0m
	 [34mSection match:[0m [34m['PHARMA', (24203, 24209)][0m
	 [34mSection match:[0m [34m['NATIONAL SAMPLE', (29610, 29625)][0m
	 [34mSection match:[0m [34m['XIII', (25335, 25339)][0m
	 [34mSection match:[0m [34m['PRESTIGE RATINGS AND RANKS GIVEN TO SELECTED GROUP OF OCCU', (27658,

	 [34mSection match:[0m [34m['NATHAN KEYFITZ  ABSTRACT', (915, 939)][0m
	 [34mSection match:[0m [34m['XIII', (18021, 18025)][0m
	 [34mSection match:[0m [34m['UNIVERSITY OF TORONTO', (30086, 30107)][0m
29 [31mAJS_1961_66_4_Theory, Measurement, and Replication in the Social Sciences.pdf[0m
	 [34mAccessed: 06-06-2016 18:53 UTC[0m
	 [35mAbstract match:[0m [35m<regex.Match object; span=(994, 1002), match='ABSTRACT'>[0m
	 [35mReference match:[0m [35mNone[0m
	 [34mSection match:[0m [34m['THEORY', (22631, 22637)][0m
	 [34mSection match:[0m [34m['MEASUREMENT', (22639, 22650)][0m
	 [34mSection match:[0m [34m['REPLICATION  IN THE SOCIAL SCIENCES', (936, 971)][0m
	 [34mSection match:[0m [34m['BLALOCK', (980, 987)][0m
	 [34mSection match:[0m [34m['XXVI', (4402, 4406)][0m
	 [34mSection match:[0m [34m['MEASUREMENT OF MASS', (6029, 6048)][0m
	 [34mSection match:[0m [34m['XLIX', (6988, 6992)][0m
	 [34mSection match:[0m [34m['MEASUREMENT OF POWER', 

In [5]:
convertedStrings_post1971 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_post1971]
convertedStrings_post1971 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_post1971]
pattern = re.compile(r'[A-Z]{4,}(\s+?[A-Z0-9]{2,}){0,}')
highlight(pattern, convertedStrings_post1971[2])
printMetaInfo(convertedStrings_post1971, pdfPaths = post1971pdfs)

      An Empirical Study of Military-Industrial Linkages
Author(s): Stanley Lieberson
Source: American Journal of Sociology, Vol. 76, No. 4 (Jan., 1971), pp. 562-584
Published by: The University of Chicago Press
Stable URL: http://www.jstor.org/stable/2776429
Accessed: 08-06-2016 02:59 UTC Your use of the [43m[1mJSTOR[0m archive indicates your acceptance of the Terms & Conditions of Use, available at http://about.jstor.org/terms [43m[1mJSTOR[0m is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted digital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about [43m[1mJSTOR[0m, please contact support@jstor.org. The University of Chicago Press is collaborating with [43m[1mJSTOR[0m to digitize, preserve and extend access to
American Journal of Sociology  C    An Empirical Study of Military-Industrial  Link

	 [34mSection match:[0m [34m['DETERMINANTS OF FEMALE RETIREMENT', (12151, 12184)][0m
	 [34mSection match:[0m [34m['METHODOLOGY', (21681, 21692)][0m
	 [34mSection match:[0m [34m['MGLS', (56229, 56233)][0m
	 [34mSection match:[0m [34m['RESULTS', (41033, 41040)][0m
	 [34mSection match:[0m [34m['MEANS AND STANDARD DEVIATIONS OF AGE', (42786, 42822)][0m
	 [34mSection match:[0m [34m['SPECIFI', (42823, 42830)][0m
	 [34mSection match:[0m [34m['LABOR FORCE PARTICIPATION OF  WOMEN AND OF RESIDUAL RETIREMENT RATES', (42832, 42900)][0m
	 [34mSection match:[0m [34m['YEAR AND BY REGION  MEAN FEMALE  PARTICIPATION RATES', (42905, 42957)][0m
	 [34mSection match:[0m [34m['MEAN RETIREMENT RATES', (43017, 43038)][0m
	 [34mSection match:[0m [34m['NOTE', (43615, 43619)][0m
	 [34mSection match:[0m [34m['CONCLUSIONS', (56872, 56883)][0m
23 [31mAJS_1986_91_4_Economic Segmentation and Politics.pdf[0m
	 [34mAccessed: 08-06-2016 03:51 UTC[0m
	 [35mAbstract match:

	 [35mReference match:[0m [35m<regex.Match object; span=(86188, 86198), match='REFERENCES'>[0m
	 [34mSection match:[0m [34m['INTRODUCTION', (1846, 1858)][0m
	 [34mSection match:[0m [34m['GENESIS AND DEVELOPMENT OF TWO ECOLOGIES', (11135, 11175)][0m
	 [34mSection match:[0m [34m['LCRC', (36683, 36687)][0m
	 [34mSection match:[0m [34m['FIGURE AND TROPE IN THE LANGUAGE OF ECOLOGY', (38169, 38212)][0m
	 [34mSection match:[0m [34m['ASSOCIATIVE RELATIONS OF', (44709, 44733)][0m
	 [34mSection match:[0m [34m['HUMAN ECOLOGY', (44735, 44748)][0m
	 [34mSection match:[0m [34m['BOUNDARY WORK IN DISCIPLINES', (51883, 51911)][0m
	 [34mSection match:[0m [34m['SUBDISCIPLINES', (51913, 51927)][0m
	 [34mSection match:[0m [34m['SCHOOLS', (51934, 51941)][0m
	 [34mSection match:[0m [34m['DISSOLUTION OF CLASSICAL HUMAN ECOLOGY', (67196, 67234)][0m
	 [34mSection match:[0m [34m['CONCLUSION', (80436, 80446)][0m
35 [31mAJS_1996_101_4_Markets and Inequality in Trans

	 [35mAbstract match:[0m [35mNone[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(24705, 24710), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(63366, 63371), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(68817, 68822), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(77891, 77896), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(86920, 86925), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(89310, 89315), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(119771, 119776), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(126810, 126815), match='TABLE'>[0m
	 [35mReference match:[0m [35m<regex.Match object; span=(138805, 138815), match='REFERENCES'>[0m
	 [34mSection match:[0m [34m['INTRODUCTION', (2104, 2116)][0m
	 [34mSection match:[0m [34m['STANFORD PROJECT ON EMERGING COMPAN

### Writing the files out to .txt files

In [12]:
out_pre1946 = createOutputStrings(convertedStrings_pre1946, folder = pre1946pdfs)
out_1946to1966 = createOutputStrings(convertedStrings_1946to1966, folder = l946to1966pdfs)
out_post1971 = createOutputStrings(convertedStrings_post1971, folder = post1971pdfs)

writeOut(out_pre1946, pdfFolder = pre1946pdfs, outFolder = 'AJS_pre1946')
writeOut(out_1946to1966, pdfFolder = l946to1966pdfs, outFolder = 'AJS_1946to1966')
writeOut(out_post1971, pdfFolder = post1971pdfs, outFolder = 'AJS_post1971')

63it [00:00, 4426.97it/s]
37it [00:00, 4184.13it/s]
68it [00:00, 2327.09it/s]

done!
done!
done!





## American Sociological Review articles

In [7]:
ASRpre1946 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/ASR pdf files/pre1946/'
ASRpre1946pdfs = os.listdir(ASRpre1946) # list of all the pdf files 
ASRpre1946pdfs.sort() # sort by year (and title)

ASRpost1946 = '/Users/Praveens/Desktop/ishan/Language-of-Science/articles/ASR pdf files/post1946/'
ASRpost1946pdfs = os.listdir(ASRpost1946) # list of all the pdf files
ASRpost1946pdfs.sort()

# Convert the articles to strings... this is the time-consuming step
convertedStrings_ASRpre1946 = [PDFtoString(os.path.join(ASRpre1946, file)) for file in tqdm(ASRpre1946pdfs)]
convertedStrings_ASRpost1946 = [PDFtoString(os.path.join(ASRpost1946, file)) for file in tqdm(ASRpost1946pdfs)]

100%|██████████| 16/16 [00:17<00:00,  1.07s/it]
100%|██████████| 121/121 [06:13<00:00,  3.08s/it]


In [8]:
# Remove the double lines / extra space characters & footer download / use notice
convertedStrings_ASRpre1946 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_ASRpre1946]
convertedStrings_ASRpre1946 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_ASRpre1946]
printMetaInfo(convertedStrings_ASRpre1946, pdfPaths = ASRpre1946pdfs, journal = 'ASR')

0 [31mASR_1936_1_1_A Critical Study of the Criterion of Internal Consistency in Personality Scale Construction.pdf[0m
	 [34mAccessed: 14-06-2016 20:15 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [32mChart match:[0m [32m<regex.Match object; span=(10135, 10140), match='CHART'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(6174, 6179), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(8183, 8188), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(10159, 10164), match='TABLE'>[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(10201, 10206), match='TABLE'>[0m
	 [35mReference match:[0m [35m<regex.Match object; span=(332, 342), match='REFERENCES'>[0m
	 [34mSection match:[0m [34m['CRITICAL STUDY OF THE CRITERION OF\n INTERNAL CONSISTENCY IN PERSONALITY  SCALE CONSTRUCTION  RAYMOND', (1192, 1292)][0m
	 [34mSection match:[0m [34m['SLETTO', (1296, 1302)][0m
	 [34mSection match:[0m [34m['FUTURE L

In [9]:
convertedStrings_ASRpost1946 = [' '.join(re.split('\n\n+', string)) for string in convertedStrings_ASRpost1946]
convertedStrings_ASRpost1946 = [' '.join(re.split(r'This content downloaded from (.)+\n(.)+\x0c', string)) for string in convertedStrings_ASRpost1946]
printMetaInfo(convertedStrings_ASRpost1946, pdfPaths = ASRpost1946pdfs, journal = 'ASR')

0 [31mASR_1946_11_1_A Note on Consistency in Questionnaire Responses.pdf[0m
	 [34mAccessed: 16-06-2016 04:45 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [35mReference match:[0m [35mNone[0m
	 [34mSection match:[0m [34m['CONSISTENCY IN QUESTIONNAIRE RESPONSES', (8622, 8660)][0m
	 [34mSection match:[0m [34m['CONCLUSIONS', (1757, 1768)][0m
	 [34mSection match:[0m [34m['NOTE ON CONSISTENCY IN QUESTIONNAIRE  RESPONSES  JOHN', (3079, 3132)][0m
	 [34mSection match:[0m [34m['CUBER AND JOHN', (3136, 3150)][0m
	 [34mSection match:[0m [34m['GERBERICH', (3154, 3163)][0m
	 [34mSection match:[0m [34m['SOCIOLOGISTS', (3192, 3204)][0m
	 [34mSection match:[0m [34m['PROBLEM', (4257, 4264)][0m
	 [34mSection match:[0m [34m['ADMINISTRATION OF THE QUESTIONNAIRES', (4570, 4606)][0m
	 [34mSection match:[0m [34m['PREPARATION OF THE QUESTIONNAIRES', (5338, 5371)][0m
	 [34mSection match:[0m [34m['RESULTS', (6905, 6912)][0m
	 [34mSection match:[0m [34m['IMPL

	 [34mSection match:[0m [34m['DISCUSSION  REINHARD BENDIX', (937, 964)][0m
17 [31mASR_1951_16_1_Identification as the Basis for a Theory of Motivation.pdf[0m
	 [34mAccessed: 16-06-2016 05:00 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [35mReference match:[0m [35m<regex.Match object; span=(294, 304), match='REFERENCES'>[0m
	 [34mSection match:[0m [34m['IDENTIFICATION AS THE BASIS FOR', (1152, 1183)][0m
	 [34mSection match:[0m [34m['THEORY  OF MOTIVATION', (1186, 1207)][0m
	 [34mSection match:[0m [34m['NELSON', (1210, 1216)][0m
	 [34mSection match:[0m [34m['FOOTE', (1220, 1225)][0m
	 [34mSection match:[0m [34m['THEORY OF MOTIVATION', (31584, 31604)][0m
18 [31mASR_1951_16_1_Learning Theory and Socialization.pdf[0m
	 [34mAccessed: 16-06-2016 05:02 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [35mReference match:[0m [35m<regex.Match object; span=(271, 281), match='REFERENCES'>[0m
	 [34mSection match:[0m [34m['DORRIAN APPLE', (1165, 1178)

	 [34mSection match:[0m [34m['MINORITY COURSE', (15358, 15373)][0m
	 [34mSection match:[0m [34m['MORTON', (1144, 1150)][0m
	 [34mSection match:[0m [34m['KING', (1154, 1158)][0m
35 [31mASR_1956_21_1_Trends in Residential Segregation of Nonwhites in American Cities, 1940-1950.pdf[0m
	 [34mAccessed: 16-06-2016 05:14 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [32mTable match:[0m [32m<regex.Match object; span=(9902, 9907), match='TABLE'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(8705, 8708), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(13238, 13241), match='FIG'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(14210, 14213), match='Fig'>[0m
	 [35mReference match:[0m [35m<regex.Match object; span=(318, 328), match='REFERENCES'>[0m
	 [34mSection match:[0m [34m['TRENDS IN RESIDENTIAL SEGREGATION OF NONWHITES', (14081, 14127)][0m
	 [34mSection match:[0m [34m['TRENDS IN RESIDENTIAL SEGREGATION OF N

	 [34mSection match:[0m [34m['RANCOROUS CONFLICT IN COMMUNITY POLITICS  WILLIAM', (1140, 1189)][0m
	 [34mSection match:[0m [34m['GAMSON', (1193, 1199)][0m
	 [34mSection match:[0m [34m['RANCOROUS CONFLICT', (38438, 38456)][0m
	 [34mSection match:[0m [34m['STUDY DESIGN', (17848, 17860)][0m
	 [34mSection match:[0m [34m['RESULTS', (35375, 35382)][0m
	 [34mSection match:[0m [34m['RANCOROUS CONFLICT AND  POLITICAL INSTABILITY', (36307, 36352)][0m
	 [34mSection match:[0m [34m['RANCOROUS CONFLICT AND  STRUCTURAL CONDUCIVENESS', (36484, 36532)][0m
	 [34mSection match:[0m [34m['RANCOROUS CONFLICT AND CLEAVAGE', (38470, 38501)][0m
	 [34mSection match:[0m [34m['DISCUSSION', (42256, 42266)][0m
	 [34mSection match:[0m [34m['AMERICAN SOCIOLOGICAL REVIEW  TABLE', (42499, 42534)][0m
	 [34mSection match:[0m [34m['RANCOROUS CONFLICT AND POLITICAL\n INSTABILITY CONTROLLED FOR SIZE OF', (42538, 42606)][0m
	 [34mSection match:[0m [34m['DIVISION OF LABOR', (4836

	 [34mSection match:[0m [34m['RESULTS', (34574, 34581)][0m
	 [34mSection match:[0m [34m['DISCUSSION', (45482, 45492)][0m
	 [34mSection match:[0m [34m['DISABILITY AS PROCESS', (55578, 55599)][0m
	 [34mSection match:[0m [34m['DISABILITY AND DEVIANCE', (56927, 56950)][0m
	 [34mSection match:[0m [34m['NORMATIVE ADAPTATIONS\n OF ROLE BEHAVIOR', (56952, 56991)][0m
	 [34mSection match:[0m [34m['LAWRENCE', (56995, 57003)][0m
	 [34mSection match:[0m [34m['HABER', (57007, 57012)][0m
	 [34mSection match:[0m [34m['RICHARD', (57016, 57023)][0m
	 [34mSection match:[0m [34m['SMITH', (57027, 57032)][0m
61 [31mASR_1971_36_1_Some Social Implications of High Density Housing.pdf[0m
	 [34mAccessed: 21-06-2016 19:19 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(19845, 19848), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(24654, 24657), match='Fig'>[0m
	 [32mFigure match:[0m [32m

	 [34mSection match:[0m [34m['WORKING WIVES', (54849, 54862)][0m
	 [34mSection match:[0m [34m['LINDA', (925, 930)][0m
	 [34mSection match:[0m [34m['WAITE', (934, 939)][0m
	 [34mSection match:[0m [34m['INTRODUCTION', (2496, 2508)][0m
	 [34mSection match:[0m [34m['PROBLEM', (2514, 2521)][0m
	 [34mSection match:[0m [34m['MAJOR HYPOTHESES', (5552, 5568)][0m
	 [34mSection match:[0m [34m['METHOD AND DATA', (9893, 9908)][0m
	 [34mSection match:[0m [34m['MODEL AND METHODOLOGY', (10036, 10057)][0m
	 [34mSection match:[0m [34m['ECLF', (19312, 19316)][0m
	 [34mSection match:[0m [34m['RESULTS', (24095, 24102)][0m
	 [34mSection match:[0m [34m['SUMMARY', (54234, 54241)][0m
	 [34mSection match:[0m [34m['DETERMINANTS OF ADMINISTRATIVE CONTROL', (60538, 60576)][0m
	 [34mSection match:[0m [34m['TEST OF', (60581, 60588)][0m
	 [34mSection match:[0m [34m['THEORY WITH JAPANESE FACTORIES', (60591, 60621)][0m
	 [34mSection match:[0m [34m['PHELPS TRACY

	 [34mSection match:[0m [34m['ROBERT', (65444, 65450)][0m
	 [34mSection match:[0m [34m['MARE', (65454, 65458)][0m
	 [34mSection match:[0m [34m['SCHOOLING', (5934, 5943)][0m
	 [34mSection match:[0m [34m['MARRIAGE', (5945, 5953)][0m
	 [34mSection match:[0m [34m['ASSORTATIVE MATING', (5960, 5978)][0m
	 [34mSection match:[0m [34m['CHANGES IN EDUCATIONAL  ASSORTATIVE MATING', (8873, 8915)][0m
	 [34mSection match:[0m [34m['DATA AND METHODS', (16222, 16238)][0m
	 [34mSection match:[0m [34m['PUMS', (22823, 22827)][0m
	 [34mSection match:[0m [34m['RESULTS', (45710, 45717)][0m
	 [34mSection match:[0m [34m['TRENDS IN SCHOOLING AND  EDUCATIONAL ASSORTATIVE MATING', (28472, 28527)][0m
	 [34mSection match:[0m [34m['TIMING OF MARRIAGE', (45723, 45741)][0m
	 [34mSection match:[0m [34m['EDUCATIONAL ASSORTATIVE  MATING', (45793, 45824)][0m
	 [34mSection match:[0m [34m['SUMMARY AND CONCLUSION', (59064, 59086)][0m
90 [31mASR_1991_56_1_Group Differences 

	 [34mSection match:[0m [34m['KEIRETSU NETWORKS AND  CORPORATE PERFORMANCE IN JAPAN', (976, 1029)][0m
	 [34mSection match:[0m [34m['BACKGROUND', (5790, 5800)][0m
	 [34mSection match:[0m [34m['KEIRETSU NETWORKS AND CORPORATE PERFORMANCE', (82747, 82790)][0m
	 [34mSection match:[0m [34m['METHODS', (30170, 30177)][0m
	 [34mSection match:[0m [34m['NEEDS', (41814, 41819)][0m
	 [34mSection match:[0m [34m['REGRESSION ANALYSES', (47252, 47271)][0m
	 [34mSection match:[0m [34m['TSCSREG', (55748, 55755)][0m
	 [34mSection match:[0m [34m['CONCLUSIONS', (76207, 76218)][0m
	 [34mSection match:[0m [34m['NEEDS MT', (86839, 86847)][0m
102 [31mASR_1996_61_1_On Realization in Everyday Life-The Forecasting of Bad News as a Social Relation.pdf[0m
	 [34mAccessed: 22-06-2016 19:06 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [35mReference match:[0m [35m<regex.Match object; span=(93506, 93516), match='REFERENCES'>[0m
	 [34mSection match:[0m [34m['EVERYDAY LI

	 [34mAccessed: 22-06-2016 19:41 UTC[0m
	 [35mAbstract match:[0m [35mNone[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(17817, 17820), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(19449, 19452), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(49378, 49381), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(55288, 55291), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(59820, 59823), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(62259, 62262), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(65234, 65237), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(65644, 65647), match='Fig'>[0m
	 [32mFigure match:[0m [32m<regex.Match object; span=(67833, 67836), match='Fig'>[0m
	 [35mReference match:[0m [35m<regex.Match object; span=(96029, 96039), match='REFERENCES'>[0m
	 [34mSection ma

In [13]:
out_ASRpre1946 = createOutputStrings(convertedStrings_ASRpre1946, folder = ASRpre1946pdfs, journal = 'ASR')
out_ASRpost1946 = createOutputStrings(convertedStrings_ASRpost1946, folder = ASRpost1946pdfs, journal = 'ASR')

writeOut(out_ASRpre1946, pdfFolder = ASRpre1946pdfs, outFolder = 'ASR_pre1946')
writeOut(out_ASRpost1946, pdfFolder = ASRpost1946pdfs, outFolder = 'ASR_post1946')

16it [00:00, 3326.17it/s]
121it [00:00, 3567.34it/s]

done!
done!



