<h1> Parsing, cleaning, and storing PDFs </h1>

This notebook shows how I parsed the PDFs of the CRS reports, did some preliminary cleaning, and then put them into a MongoDB collection.

In [1]:
from pymongo import MongoClient
import pickle
import numpy as np

Opening up a connection to MongoDB

In [2]:
# Client for the MongoDB server at localhost, port 27017.
client = MongoClient('mongodb://localhost:27017/')

In [3]:
# Handle to the `project_4_database` database; the collections used below live here.
db = client.project_4_database

These are the locations of all of the PDFs from the CRS reports.

In [4]:
# Glob patterns for the downloaded CRS PDFs: the top-level `crs` folder first,
# then one pattern per topic sub-folder. All patterns share the same local root.
pdf_locs = [
    '/Users/jonathanjramirez/Documents/metis_project_4/fas.org/sgp/crs/' + pattern
    for pattern in (
        '*.pdf',
        'terror/*.pdf',
        'homesec/*.pdf',
        'intel/*.pdf',
        'mideast/*.pdf',
        'misc/*.pdf',
        'row/*.pdf',
        'secrecy/*.pdf',
        'space/*.pdf',
        'weapons/*.pdf',
    )
]

In [5]:
# Topic labels, seeded with the top-level 'crs' folder; sub-folder names are
# appended while the PDF paths are collected in the next cell.
topics = ['crs']

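The code below collects the file paths of the PDFs in the topic sub-folders listed above (the first entry, the top-level `crs` folder, is skipped via `pdf_locs[1:]`), keeping only paths that contain an `R`. It also records each sub-folder name as a topic.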

In [6]:
import glob

# Collect the PDF paths for every topic sub-folder and record the sub-folder
# name as a topic.
# NOTE(review): pdf_locs[0] (the top-level crs folder) is skipped here -
# confirm that this is intentional.
total_pdf_file_paths = []

for folder_pattern in pdf_locs[1:]:
    # Keep only paths containing an upper-case 'R'.
    total_pdf_file_paths.extend(
        candidate for candidate in glob.glob(folder_pattern) if 'R' in candidate
    )

    # The topic is the text between 'crs/' and the trailing '/*.pdf' wildcard.
    topic_start = folder_pattern.index('crs') + len('crs/')
    topic_end = folder_pattern.index('/*')
    topics.append(folder_pattern[topic_start:topic_end])

Saving the URLs for later Flask App use.

In [7]:
# Strip the local download directory from each path so that only the
# 'fas.org/sgp/crs/...' part remains. The cut point is located from the
# 'fas.org' marker instead of a hard-coded character count (the original 50 is
# the length of one specific user's directory prefix), so the cell keeps
# working if the download folder moves. str.index raises ValueError for a path
# without the marker rather than silently producing a garbage URL.
urls = [pdf_path[pdf_path.index('fas.org'):] for pdf_path in total_pdf_file_paths]

In [8]:
# Persist the URL list to urls.pkl in the current working directory.
with open("urls.pkl", "wb") as output_file:
    pickle.dump(urls, output_file)

Now to find the file names (all starting with R).

In [9]:
import re

In [10]:
# Report name for each path: the first run of word characters that starts
# with an upper-case 'R'.
file_names = [
    re.search(r'R[\w]*', pdf_path)[0]
    for pdf_path in total_pdf_file_paths
]
    

Textract is an awesome PDF parsing library that will make data collection a lot easier.

In [11]:
import textract
import string

In [12]:
class crs_parser():
    """Extract text from CRS report PDFs and keep it next to the report names.

    Attributes:
        articles: extracted text of every successfully parsed PDF.
        titles: report name for each entry of ``articles`` (same index).
        failed: ``(path, error)`` pairs for PDFs that were skipped.
        art_tups: ``(title, article)`` pairs built by ``article_tups``.
    """

    def __init__(self):
        self.articles = []
        self.titles = []
        self.failed = []
        # Initialised here so get_art_tups() works before article_tups() runs.
        self.art_tups = []

    def convert_to_string(self, pdfs):
        """Parse each PDF path in ``pdfs`` and append its text and report name.

        The text has newlines, '.htm' and '.gov' replaced by spaces, and every
        non-printable character and digit removed. The report name is the
        first ``R\\w*`` match in the path.

        Paths are processed in the order given and ``pdfs`` is not modified.
        A PDF that cannot be named or parsed is skipped and recorded in
        ``self.failed``; ``articles`` and ``titles`` always stay the same
        length.

        Args:
            pdfs: iterable of PDF file paths.
        """
        # Iterate over a snapshot: the caller's list is left untouched.
        for pdf in list(pdfs):

            try:
                # Resolve the title first, so an article is never stored
                # without its matching title.
                name_match = re.search(r'R[\w]*', pdf)
                if name_match is None:
                    raise ValueError('no report name (R...) found in path')

                text = (textract.
                        process(pdf).
                        decode('utf-8').
                        replace('\n', ' ').
                        replace('.htm', ' ').
                        replace('.gov', ' '))

            except Exception as error:
                # Best effort: remember the failure and carry on. Catching
                # Exception (not a bare except) keeps Ctrl-C working.
                self.failed.append((pdf, repr(error)))
                continue

            text = ''.join(ch for ch in text
                           if ch in string.printable and not ch.isdigit())

            self.articles.append(text)
            self.titles.append(name_match[0])

    def article_tups(self):
        """Rebuild ``self.art_tups`` as a list of ``(title, article)`` pairs."""
        self.art_tups = list(zip(self.titles, self.articles))

    def load_articles(self, articles):
        """Parse the PDF paths in ``articles`` and refresh ``self.art_tups``."""
        self.convert_to_string(articles)

        self.article_tups()

    def to_mongo(self, db_client, collection):
        """Insert one ``{title: text}`` document per parsed article.

        Args:
            db_client: database handle supporting ``db_client[name]`` lookup.
            collection: collection name, or a collection object exposing
                ``insert_one``.
        """
        # Honour the `collection` argument: attribute access on db_client
        # would always target a collection literally named "collection".
        if isinstance(collection, str):
            target = db_client[collection]
        else:
            target = collection

        for title, text in zip(self.titles, self.articles):
            target.insert_one({title: text})

    def get_art_tups(self):
        """Return the ``(title, article)`` pairs built by ``article_tups``."""
        return self.art_tups

In [13]:
# Fresh parser instance; nothing is read from disk until load_articles is called.
crs = crs_parser()

In [16]:
# Extract the text of every collected PDF; the results accumulate on `crs`.
crs.load_articles(total_pdf_file_paths)

<h1> Investigate duplicate texts </h1>

Here, let's just do our due diligence and make sure that there weren't any duplicate texts.

In [17]:
# Look URLs up by report name rather than by list position: the parser skips
# PDFs it cannot read, so index i in crs.articles is not guaranteed to
# correspond to index i in urls. The name is extracted from the URL with the
# same pattern the parser applies to the file path.
url_by_title = {re.search(r'R[\w]*', url)[0]: url for url in urls}

# One record per report name: {'name', 'text', 'url'}.
data_dict = {}

for title, article in zip(crs.titles, crs.articles):

    data_dict[title] = {'name': title, 'text': article, 'url': url_by_title[title]}

In [18]:
import pandas as pd

# One row per parsed report, in the order of crs.titles, with columns
# name / text / url. Building the frame in a single call from the list of
# records avoids DataFrame.append (removed in pandas 2.0) and the quadratic
# cost of growing a frame row by row; it also yields an empty frame, rather
# than an IndexError, when nothing was parsed.
crs_df = pd.DataFrame(
    [data_dict[title] for title in crs.titles],
    columns=['name', 'text', 'url'],
)


In [19]:
# All report texts as a NumPy array, in DataFrame row order.
total_txts = np.array(crs_df['text'])

In [20]:
def find_uniques(df):
    """Return the name of the first report for every distinct text.

    Args:
        df: DataFrame with at least the columns 'name' and 'text'.

    Returns:
        List of 'name' values, one per distinct 'text', in order of first
        appearance.
    """
    # drop_duplicates keeps the first row for each distinct text in a single
    # pass, instead of re-scanning the whole column once per unique text, and
    # does not fail on missing text values.
    return df.drop_duplicates(subset='text', keep='first')['name'].tolist()

In [21]:
# Names of the reports that remain after de-duplicating on text.
uniques = find_uniques(crs_df)

In [22]:
# Persist the de-duplicated report names to uniques.pkl in the working directory.
with open("uniques.pkl", "wb") as output_file:
    pickle.dump(uniques, output_file)

In [24]:
# Insert one document per unique report into the raw_pdfs collection, shaped
# {report name: {'name': ..., 'text': ..., 'url': ...}}.
for pdf in uniques:

     db.raw_pdfs.insert_one({pdf:data_dict[pdf]})

In [25]:
# Pickle the whole parser object (its articles and titles included) to crs_pdfs.pkl.
with open("crs_pdfs.pkl", "wb") as output_file:
    pickle.dump(crs, output_file)

<h1> Cleaning and Storing</h1>

In this section, we do some basic cleaning: removing stop words, symbols, digits, and runs of periods (ellipses).

In [26]:
import pickle

# Reload the parser pickled above. The context manager closes the file handle,
# which a bare open() passed to pickle.load never does.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
with open('crs_pdfs.pkl', 'rb') as input_file:
    crs = pickle.load(input_file)

In [27]:
import spacy



In [28]:
class cleaner():
    """Basic text cleaning for the parsed reports.

    Attributes:
        articles: list of documents to clean; ``clean_articles`` replaces
            each entry in place with its cleaned text.
        abbreviations: three-letter upper-case words found while cleaning
            (a list until ``clean_articles`` runs, a set afterwards).
        user_det_stopwords: extra stop words supplied by the user.
    """

    def __init__(self, articles, user_det_stopwords = None):
        """
        Args:
            articles: list of dicts with a 'text' key (plain strings are
                accepted too).
            user_det_stopwords: optional list of extra stop words.
        """
        self.articles = articles
        self.abbreviations = []
        # Copy instead of sharing a mutable default / the caller's list, so
        # add_stopwords never leaks into other instances.
        self.user_det_stopwords = list(user_det_stopwords) if user_det_stopwords else []

    def doc_cleaner(self, doc):
        """Return ``doc`` reduced to ASCII letters and whitespace.

        Steps: '%' becomes ' percent ', '):' and '.gov' become a space, every
        stop word is removed as a plain substring, and finally all
        non-printable characters, digits and punctuation are dropped. Any
        three-letter upper-case word left in the result is recorded in
        ``self.abbreviations``.

        Args:
            doc: raw text of one article.

        Returns:
            The cleaned text.
        """
        alt_stop_words = self.user_det_stopwords + ['republican', 'Republican', 'democrat', 'Democrat']
        cleaned = (doc.
                   replace('%', ' percent ').
                   replace('):', ' ').
                   replace('.gov', ' '))

        # Plain substring removal: also strips the word from inside longer
        # words (e.g. 'democrat' from 'democratic').
        for word in alt_stop_words:

            cleaned = cleaned.replace(word, '')

        # Keep printable characters that are neither digits nor punctuation.
        # The test is per character (testing the whole text against
        # string.punctuation never removes anything). Periods and parentheses
        # are punctuation, so they need no separate regex pass.
        kept_chars = set(string.printable) - set(string.digits) - set(string.punctuation)
        text = ''.join(ch for ch in cleaned if ch in kept_chars)

        found = re.findall(r"\b[A-Z]{3}\b", text)
        # abbreviations is a set once clean_articles has run.
        if isinstance(self.abbreviations, set):
            self.abbreviations.update(found)
        else:
            self.abbreviations.extend(found)

        return text

    def clean_articles(self):
        """Clean every article in place, then strip all collected abbreviations.

        Each entry of ``self.articles`` is replaced by its cleaned text.
        Afterwards ``self.abbreviations`` is a set, and every abbreviation in
        it is removed from every article where it occurs as a whole word.
        """
        for i, article in enumerate(self.articles):

            # Entries are dicts on the first run and plain strings after it;
            # accepting both makes this method safe to call again.
            raw_text = article['text'] if isinstance(article, dict) else article
            self.articles[i] = self.doc_cleaner(doc = raw_text)

        self.abbreviations = set(self.abbreviations)

        if not self.abbreviations:
            return

        # A single pattern removes every abbreviation. Substituting one word
        # at a time on the unchanged loop variable keeps only the last
        # substitution.
        abbreviation_re = re.compile(
            r'\b(?:' + '|'.join(re.escape(word) for word in sorted(self.abbreviations)) + r')\b'
        )

        for i, article in enumerate(self.articles):

            self.articles[i] = abbreviation_re.sub('', article)

    def add_stopwords(self, stop_words):
        """Append ``stop_words`` (an iterable of strings) to the user stop words."""
        self.user_det_stopwords += list(stop_words)

In [29]:
# Read raw_pdfs once and index the stored records by report name. Each stored
# document has the shape {report name: record} plus Mongo's own '_id' field.
# Matching by name instead of by cursor position avoids one query per report
# and stays correct when the collection holds documents from an earlier run.
raw_by_title = {}
for raw_doc in db.raw_pdfs.find({}):
    for key, value in raw_doc.items():
        if key != '_id':
            raw_by_title[key] = value

cl = cleaner(articles = [raw_by_title[unique] for unique in uniques])

In [30]:
# Clean every article; cl.articles is overwritten with the cleaned strings.
cl.clean_articles()

In [32]:
# Store each cleaned text in the cleaned_pdfs collection as {report name: text}.
# Relies on cl.articles being in the same order as uniques.
for i, article in enumerate(uniques):
    
    db.cleaned_pdfs.insert_one({article: cl.articles[i]})