Author: Amy Weng

This notebook filters the cleaned EEBO CSV files by date range and by keyword.

Based in part on the 2021 Data+ team's text filtering code: https://github.com/ABeeShake/Ethical-Consumption-Before-Capitalism/blob/main/topic%20modelling/code_actualrunning_awsvm.py

In [2]:
import os
import re
import pandas as pd

# Working folder that holds the input and output CSV files. The trailing
# slash is kept on purpose: other cells build paths as `directory + name`.
directory = '/home/rapiduser/Materials/'
# Source CSV scanned by filterWord. os.path.join avoids the doubled
# separator ('Materials//restoration.csv') that plain concatenation gave.
file = os.path.join(directory, 'restoration.csv')

In [None]:
#code for adding all DATED texts within a certain date range to one CSV file
def filterYear(folder,filename,start,end):
    """Collect every dated text published between two years into one CSV.

    Every regular file in `folder` is read as a CSV that has the columns
    title, author, publisher, date and text (one text per row). The first
    four-digit number found in a row's date is used as its year; rows whose
    date contains no such number (missing dates, 'Date Not Found', ...) are
    skipped. Kept rows are written with the year as an integer date.

    Parameters
    ----------
    folder : str
        Folder holding the clean CSV files.
    filename : str
        Name of the output CSV; it is written inside the module-level
        `directory`.
    start, end : int
        First and last year to keep, both inclusive.
    """
    columns = ['title','author','publisher','date','text']
    yearPattern = re.compile(r'\d{4}')

    # os.path.join instead of a hard-coded "\\" so the paths are valid on
    # every platform (the backslash became part of the file name on Linux)
    outFileName = os.path.join(directory, filename)

    # matching rows are gathered in a list and turned into a DataFrame once;
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row
    kept = []
    count = 0

    for cleanFile in os.listdir(folder):
        path = os.path.join(folder, cleanFile)

        #skip sub-folders (e.g. notebook checkpoint folders), which cannot be read as CSV
        if not os.path.isfile(path):
            continue

        #open and read each CSV
        readFile = pd.read_csv(path)

        #read the entries of each row (each row is its own text)
        rows = zip(readFile['title'], readFile['author'], readFile['publisher'],
                   readFile['date'], readFile['text'])
        for title, author, publisher, rawDate, text in rows:

            # str() copes with numeric and missing dates alike; placeholders
            # such as 'Date Not Found' hold no four-digit number, so they
            # are excluded by the search itself
            match = yearPattern.search(str(rawDate))

            if match is not None:
                year = int(match.group(0))

                if start <= year <= end:
                    kept.append((title, author, publisher, year, text))

            # progress report every 10000 rows read
            count += 1
            if count % 10000 == 0:
                print(count)

    pd.DataFrame(kept, columns=columns).to_csv(outFileName)

In [11]:
#code for adding all texts that contain a keyword to one CSV file
def filterWord(outFile,word):
    """Copy the texts of the module-level `file` that match `word` to a CSV.

    Parameters
    ----------
    outFile : str
        Name of the output CSV; it is written inside the module-level
        `directory`.
    word : str or compiled regular expression
        Pattern searched for (with `search`, so it may match anywhere) in
        each row's `text` column.

    Rows whose text is missing are skipped instead of raising a TypeError.
    Progress is printed every 10000 rows, followed by the number of rows
    that were scanned (not the number that matched).
    """
    columns = ['title','author','publisher','date','text']
    outFileName = os.path.join(directory, outFile)

    # compile once; re.compile returns an already-compiled pattern unchanged
    pattern = re.compile(word)

    readFile = pd.read_csv(file)

    # one True/False per row; the matching rows are selected in a single
    # step afterwards (DataFrame.append was removed in pandas 2.0 and copied
    # the whole frame on every call)
    matches = []
    count = 0

    for text in readFile['text']:
        # empty cells are read back as NaN (a float), which cannot be searched
        matches.append(isinstance(text, str) and pattern.search(text) is not None)

        count += 1
        if count % 10000 == 0:
            print(count)

    print("Total number of texts:",count)

    mask = pd.Series(matches, index=readFile.index, dtype=bool)
    readFile.loc[mask, columns].reset_index(drop=True).to_csv(outFileName)

In [6]:
# Economic vocabulary (including early-modern spellings), combined into one
# alternation pattern.
# NOTE: the terms are matched as plain substrings (there are no word
# boundaries), so e.g. "land" also matches inside "England".
# NOTE(review): "eat-india company" looks like a typo of "east-india company"
# - confirm before changing, since it alters which texts are selected.
economics = re.compile("|".join((
    "economy", "economic", "money", "monies",
    "bullion", "bullionist", "mercantile", "fiscal", "taxation",
    "exchange", "corporation", "corporate", "company",
    "merchant", "joint-stock", "merchants", "currency",
    "consumption", "consume", "consuming", "consumables",
    "trade", "traffic", "traffique", "commerce", "commercial",
    "price", "prices", "spending",
    "east indies", "east india company", "east-india company",
    "eat-india company", "east-india-company",
    "monopoly", "monopolization", "monopolies", "monopolium",
    "monopolion", "monopolie", "monopolist", "monopolizer",
    "monopolize", "monopolizes", "monopolye", "monopolyes",
    "debt", "credit", "debtor", "creditor", "decoctor",
    "bank", "banks", "usury",
    "interest rate", "interest", "interest rates",
    "importation", "exportation", "coin", "employment", "austerity",
    "goods", "treasure", "commodities", "income", "commodity",
    "revenue", "land", "profitable", "unprofitable", "industry",
    "work", "usurer", "estate", "property", "substance",
    "consumer", "free trade",
)))

In [12]:
# Write every text that matches the economics pattern to economics.csv.
filterWord(outFile="economics.csv", word=economics)

10000
20000
30000
Total number of texts: 31905


In [3]:
#code for copying the texts of one CSV file that fall within a date range to another CSV file
def filterFileByYear(inFile, outFile,start,end):
    """Copy the rows of `inFile` dated between two years to `outFile`.

    Both file names are resolved inside the module-level `directory`. The
    input CSV must have the columns title, author, publisher, date and text.
    Rows whose date is missing or not numeric are skipped instead of
    raising. The number of rows written is printed at the end.

    Parameters
    ----------
    inFile : str
        Name of the CSV file to read.
    outFile : str
        Name of the CSV file to write.
    start, end : int
        First and last year to keep, both inclusive.
    """
    columns = ['title','author','publisher','date','text']
    outFileName = os.path.join(directory, outFile)

    readFile = pd.read_csv(os.path.join(directory, inFile))

    # unparseable dates become NaN, and NaN is never "between" two years
    years = pd.to_numeric(readFile['date'], errors='coerce')

    # select all matching rows at once; DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call
    selected = readFile.loc[years.between(start, end), columns].reset_index(drop=True)
    selected.to_csv(outFileName)

    # the count is taken from the frame just written rather than by reading
    # the output file back in
    print("Total number of texts is:",len(selected.index))

In [9]:
# 1st phase: texts dated 1660-1678 (both years included)
filterFileByYear(inFile="economics.csv", outFile="1st phase.csv", start=1660, end=1678)

Total number of texts is: 10460


In [11]:
# 2nd phase: texts dated 1679-1685 (both years included)
filterFileByYear(inFile="economics.csv", outFile="2nd phase.csv", start=1679, end=1685)

Total number of texts is: 6986


In [4]:
# 3rd phase: texts dated 1688-1694 (both years included)
# NOTE(review): the 2nd phase ends at 1685, so 1686-1687 belong to no phase
# - confirm that this gap is intentional.
filterFileByYear(inFile="economics.csv", outFile="3rd phase.csv", start=1688, end=1694)

Total number of texts is: 6697


In [5]:
# post-restoration: texts dated 1695-1700 (both years included)
filterFileByYear(inFile="economics.csv", outFile="post-restoration.csv", start=1695, end=1700)

Total number of texts is: 5419
