Author: Amy Weng

This notebook filters the cleaned EEBO CSV files by date range and by keyword.

Based in part on the 2021 Data+ team's text filtering code: https://github.com/ABeeShake/Ethical-Consumption-Before-Capitalism/blob/main/topic%20modelling/code_actualrunning_awsvm.py

In [2]:
import os
import re
import pandas as pd

# Root folder that holds the project data.
directory = '/home/rapiduser/Materials'
# CSV that the single-file filters below read from.
file = f'{directory}/restoration.csv'
# Output folder for filtered CSVs; the trailing slash matters because
# the filters build output paths by plain string concatenation.
texts = f'{directory}/Texts/'

In [1]:
#code for adding all DATED texts within a certain date range to one CSV file
def filterYear(folder,filename,start,end):
    """Gather every dated text published between two years into one CSV.

    Each regular file in ``folder`` is read as a CSV in which every row is
    one text. The first run of four digits found in a row's ``date`` value
    is taken as its year; rows whose year lies in ``[start, end]``
    (both inclusive) are kept. Rows with no four-digit year (for example
    ``'Date Not Found'``, blank or NaN dates) are skipped.

    Parameters
    ----------
    folder : str
        Directory containing the cleaned CSV files.
    filename : str
        Name of the output CSV, written inside the module-level
        ``directory``.
    start, end : int
        First and last year to keep (inclusive).

    Returns
    -------
    None
        The result is written to disk (with the pandas index column).
    """
    # os.path.join picks the right separator; the previous hard-coded "\\"
    # produced a file literally named "Materials\<filename>" on POSIX.
    outFileName = os.path.join(directory, filename)
    base_columns = ['title', 'author', 'publisher', 'date', 'text']
    year_pattern = re.compile(r'\d{4}')

    def _in_range(value):
        # str() handles strings, ints, floats and NaN uniformly; values
        # without four consecutive digits simply do not match.
        found = year_pattern.search(str(value))
        return found is not None and start <= int(found.group(0)) <= end

    kept = []
    for cleanFile in os.listdir(folder):
        path = os.path.join(folder, cleanFile)
        # Skip sub-directories such as .ipynb_checkpoints.
        if not os.path.isfile(path):
            continue

        readFile = pd.read_csv(path)
        mask = readFile['date'].map(_in_range).astype(bool)
        rows = readFile.loc[mask]
        if len(rows.index):
            kept.append(rows)

    # Build the result once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x and was quadratic).
    if kept:
        result = pd.concat(kept, ignore_index=True, sort=False)
    else:
        result = pd.DataFrame(columns=base_columns)

    # Keep the standard columns first, followed by any extra input columns.
    extras = [c for c in result.columns if c not in base_columns]
    result = result.reindex(columns=base_columns + extras)
    result.to_csv(outFileName)

In [3]:
#code for adding all DATED texts w/ keywords to one CSV file
def filterWord(outFile,infile,word):
    """Copy every text that matches a keyword pattern into a new CSV.

    Reads ``infile`` (one text per row), keeps the rows whose ``text``
    value matches ``word`` anywhere (``re.search``), writes them to
    ``outFile`` inside the module-level ``texts`` folder, and prints how
    many rows were kept. Rows whose ``text`` is not a string (e.g. NaN
    for an empty cell) are treated as non-matching.

    Parameters
    ----------
    outFile : str
        Name of the output CSV; appended directly to ``texts``.
    infile : str
        Path of the CSV to filter.
    word : str or compiled pattern
        Regular expression to search for in each text.

    Returns
    -------
    None
    """
    outFileName = texts + outFile
    base_columns = ['title', 'author', 'publisher', 'date', 'text']

    readFile = pd.read_csv(infile)

    def _mentions(value):
        # Empty cells are read back as float NaN, which re.search rejects.
        return isinstance(value, str) and re.search(word, value) is not None

    # Select all matching rows at once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x and was quadratic).
    mask = readFile['text'].map(_mentions).astype(bool)
    matched = readFile.loc[mask].reset_index(drop=True)

    # Standard columns first, then any extra input columns; with no
    # matches only the standard columns are written.
    extras = [c for c in matched.columns if c not in base_columns] if len(matched.index) else []
    matched = matched.reindex(columns=base_columns + extras)
    matched.to_csv(outFileName)

    # The count equals the number of rows just written, so there is no
    # need to read the file back.
    print("Total number of texts is:",len(matched.index))
    

In [4]:
# Spelling variants of "East India Company" to search for.
# NOTE(review): the "eat india" forms presumably catch transcriptions that
# dropped a long s; confirm against the cleaned texts.
eic = re.compile(
    'east india company'
    '|east-india company'
    '|eat-india company'
    '|eat india company'
    '|east-india-company'
)

In [4]:
# Spelling variants of "monopoly" and its derivatives, one alternative per
# line. The alternatives are matched as substrings (no word boundaries), and
# 'monopolion' is listed twice, as in the original pattern.
monopoly = re.compile(
    'monopolie'
    '|monopolies'
    '|monopolion'
    '|monopolist'
    '|monopolium'
    '|monopolization'
    '|monopolize'
    '|monopolizer'
    '|monopolizes'
    '|monopoly'
    '|monopolion'
    '|monopolye'
    '|monopolyes'
    '|monoply'
    '|monopolise'
    '|monopolising'
    '|monopolists'
    '|monopolizers'
    '|monopolised'
    '|monoopolies'
    '|monopolits'
    '|monopolers'
)

In [6]:
# Keep the rows of restoration.csv whose text matches the monopoly pattern;
# the result is written to monopoly.csv in the Texts folder.
filterWord("monopoly.csv",file,monopoly)

Total number of texts is: 1194


In [5]:
# Narrow eic.csv down to the texts that also match the monopoly pattern.
# NOTE(review): no cell visible in this notebook writes eic.csv — presumably
# it came from an earlier filterWord("eic.csv", file, eic) run; confirm so
# that Restart & Run All reproduces this step.
filterWord("eic_monopoly.csv",'/home/rapiduser/Materials/Texts/eic.csv',monopoly)

Total number of texts is: 106


In [5]:
#code for adding all DATED texts within a certain date range to one CSV file
def filterFileByYear(outFile,start,end):
    """Copy the texts dated between two years from the main CSV into a new CSV.

    Reads the module-level ``file`` (one text per row), keeps the rows whose
    ``date`` converts to an integer year in ``[start, end]`` (both
    inclusive), writes them to ``outFile`` inside the module-level ``texts``
    folder, and prints how many rows were kept. Rows whose date cannot be
    converted with ``int()`` (NaN, blank or non-numeric values) are skipped
    rather than aborting the whole run.

    Parameters
    ----------
    outFile : str
        Name of the output CSV; appended directly to ``texts``.
    start, end : int
        First and last year to keep (inclusive).

    Returns
    -------
    None
    """
    outFileName = texts + outFile
    base_columns = ['title', 'author', 'publisher', 'date', 'text']

    readFile = pd.read_csv(file)

    def _in_range(value):
        try:
            year = int(value)
        except (TypeError, ValueError, OverflowError):
            # Undated or malformed entry: leave it out.
            return False
        return start <= year <= end

    # Select all matching rows at once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x and was quadratic).
    mask = readFile['date'].map(_in_range).astype(bool)
    matched = readFile.loc[mask].reset_index(drop=True)

    # Standard columns first, then any extra input columns; with no
    # matches only the standard columns are written.
    extras = [c for c in matched.columns if c not in base_columns] if len(matched.index) else []
    matched = matched.reindex(columns=base_columns + extras)
    matched.to_csv(outFileName)

    # The count equals the number of rows just written, so there is no
    # need to read the file back.
    print("Total number of texts is:",len(matched.index))

In [12]:
# 1st phase: texts dated 1660-1678 (both years inclusive)
filterFileByYear("1st phase.csv",1660,1678)

Total number of texts is: 10790


In [13]:
# 2nd phase: texts dated 1679-1685 (both years inclusive)
filterFileByYear("2nd phase.csv",1679,1685)

Total number of texts is: 7321


In [14]:
# 3rd phase: texts dated 1688-1694 (both years inclusive)
# NOTE(review): the 2nd phase ends at 1685, so 1686-1687 belong to no phase —
# confirm the gap is intentional.
filterFileByYear("3rd phase.csv",1688,1694)

Total number of texts is: 7025


In [6]:
# Post-restoration: texts dated 1695-1714 (both years inclusive)
filterFileByYear("post-restoration.csv",1695,1714)

Total number of texts is: 5665
