# Early game running start

### Source - https://medium.com/@rqaiserr/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f

In [None]:
import pandas as pd
import numpy as np
import regex as re
import requests
import nltk
import PyPDF2 
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#nltk.download('punkt') # necessary to tokenize (only needs to be run once)
# for downloading pdf's

## Step 1: Clean Main dataset and Download State PDF Contracts Remote->Local

In [None]:
# Load the raw Kaggle contracts export.
data = pd.read_csv('../raw/contracts.csv', encoding='utf-8')
# Normalise the headers in one pass: spaces become underscores, then everything is lowercased.
data.columns = data.columns.str.replace(' ', '_').str.lower()
# The 'contract_pdf' column is renamed to 'contract_url'.
data = data.rename(columns={'contract_pdf': 'contract_url'})
# Preview the first 5 rows.
data.head()

In [None]:
# Number of rows before any filtering is applied.
data.shape[0]

In [None]:
# Keep only the contracts that have an attached url.
# The replaced column is assigned back rather than calling
# replace(..., inplace=True) on data['contract_url']: that chained in-place
# call operates on a temporary Series and does not update `data` when pandas
# Copy-on-Write is active, which would leave the empty strings in place.
data['contract_url'] = data['contract_url'].replace('', np.nan)
# Drop every row whose contract_url is missing (NaN).
data = data.dropna(subset=['contract_url'])
# How many rows? (after filter should  be 45,775)
len(data.index)

In [None]:
# Preview the first 5 rows again to see where we're at after filtering.
data.head(n=5)

In [None]:
# Now use regex to filter and change column to have JUST the url itself.
# The pattern expects cells shaped like {'url': '<link>'}; group 1 is the link.
remove_junk = re.compile(r"{'url':\s'(.*)'}", re.IGNORECASE)


def _extract_url(raw_value):
    """Return the url captured by remove_junk, or raw_value unchanged when it does not match."""
    found = remove_junk.search(raw_value)
    # A cell that does not match (e.g. this notebook cell was already run once
    # and the column now holds bare urls) is passed through untouched instead
    # of crashing with AttributeError on `None.group(1)`.
    return found.group(1) if found else raw_value


data['contract_url'] = data['contract_url'].apply(_extract_url)

In [None]:
# At this point for each row we'll download the pdf and store the pdf locally.
    
def download_pdf(row):
    """Download the contract PDF referenced by one row and store it locally.

    The row's 'contract_url' points at an HTML viewer page; the real PDF link
    is taken from the `src` attribute of that page's <iframe>. The PDF bytes
    are written to
    '../raw/chicago_pdfs/<specification_number>-<vendor_id>.pdf'.

    Parameters:
        row: a mapping/Series providing 'contract_url',
            'specification_number' and 'vendor_id'.

    Returns:
        None. The only effect is the file written to disk.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status from either request.
        ValueError: if the viewer page contains no matching <iframe>.
    """
    url = row['contract_url']
    # Without a timeout a single stalled server would hang the whole run.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    # Use regex to pull the link out....Let's hope all chicago contracts follow same html format ;)
    found = re.search(r'<iframe src="(.*)"\sname=', response.text, re.IGNORECASE)
    if found is None:
        raise ValueError('No PDF iframe found in viewer page: ' + str(url))
    new_link = found.group(1)  # REAL pdf link

    # The whole body is read through .content right away, so streaming is not needed.
    r = requests.get(new_link, allow_redirects=True, timeout=60)
    # Do not save an HTML error page under a .pdf name.
    r.raise_for_status()

    target = '../raw/chicago_pdfs/' + str(row['specification_number']) + '-' + str(row['vendor_id']) + '.pdf'
    with open(target, 'wb') as f:
        f.write(r.content)

# Download pdf for each row using its contract_url.
# axis=1 makes apply call download_pdf once per row; every call performs HTTP
# requests and writes one file under ../raw/chicago_pdfs/.
data.apply(download_pdf, axis=1)



# VERIFIED EXTRACTION CODE TO DOWNLOAD PDF FROM STATE SITES (pain in the butt, took me way too long...)

# url = 'http://ecm.cityofchicago.org/eSMARTContracts/service/DPSWebDocumentViewer?sid=ESMART&id={2488393F-CCF9-476E-808A-9FBF3C25E0D6}'
# response = requests.get(url) # We'll grab the response.text (html output of page), grab real pdf link and download.
# # Use regex to pull the link out....Let's hope all chicago contracts follow same html format ;)
# pdf_regex = re.compile(r'<iframe src="(.*)"\sname=', re.IGNORECASE) # find link; this regex should be standard on all state pages.
# new_link = re.search(pdf_regex, response.text).group(1)
# r = requests.get(new_link, allow_redirects=True, stream=True)
# with open('test.pdf', 'wb') as f:
#     f.write(r.content)

In [None]:
# This function says: for each row we have an associated pdf, extract text from it. (see what happens!)
def pdf_to_text(row):
    """Build the local PDF path that belongs to *row*.

    The path is '../raw/chicago_pdfs/<specification_number>-<vendor_id>.pdf'.

    NOTE(review): the path is computed but nothing is read from it and nothing
    is returned, so every call yields None. The text extraction step still has
    to be written.
    """
    spec_number = str(row['specification_number'])
    vendor = str(row['vendor_id'])
    # Grab what the filename should be
    filename = '../raw/chicago_pdfs/' + '-'.join((spec_number, vendor)) + '.pdf'
        
# For each row take text from downloaded pdf associated file/delete it?
# Stores whatever pdf_to_text returns for each row (axis=1) in a new 'text_list' column.
# NOTE(review): pdf_to_text has no return statement yet, so this column is filled with None.
data['text_list'] = data.apply(pdf_to_text, axis=1)

## STEP 2: Local_PDF -> List of strings representing OCR output

In [None]:
# UPDATE: Was having a bad time getting this to work on small_example.pdf, try this tomorrow 
# https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/

# PDF->TEXT (sample/test)
filename = '../raw/chicago_pdfs/small_example.pdf'
# First attempt (currently disabled): read the text with PyPDF2.
#open allows you to read the file
# pdfFileObj = open(filename,'rb')
# #The pdfReader variable is a readable object that will be parsed
# pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# #discerning the number of pages will allow us to parse through all #the pages
# num_pages = pdfReader.numPages
# count = 0
text = ""
# #The while loop will read each page
# while count < num_pages:
#     pageObj = pdfReader.getPage(count)
#     count +=1
#     text += pageObj.extractText()
# Check if PyPDF could acquire the text easily. If not, it's most likely a
# scanned image put in a pdf file...This has potential problems...the logic that is.
if not text:
    # If here, the PyPDF extract produced nothing and we fall back to textract.
    # Other options tried: method='tesseract', encoding='ascii', language='eng' / method='pdfminer', language='eng'
    extracted = textract.process(filename)
    # textract hands back encoded bytes, while word_tokenize below needs a str;
    # decode here so tokenizing does not fail with a bytes/str TypeError.
    if isinstance(extracted, bytes):
        extracted = extracted.decode('utf-8', errors='replace')
    text = extracted

# `text` now holds everything derived from the PDF file. It likely contains a
# lot of spaces and possibly junk such as '\n'.
print(text)
# Now, we will clean our text variable, and return it as a list of keywords.
#----------------SEPARATE PARSED INTO LIST
# word_tokenize() breaks the text into individual words.
tokens = word_tokenize(text)

# Punctuation tokens we wish to drop.
punctuations = {'(', ')', ';', ':', '[', ']', ','}

# Stop words are words like "The", "I", "and", etc. that don't hold much value
# as keywords. A set gives fast membership tests.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are NOT stop words and NOT punctuation. The NLTK
# list is lowercase, so compare case-insensitively; otherwise capitalised
# words such as "The" or "I" slip through.
keywords = [
    word for word in tokens
    if word.lower() not in stop_words and word not in punctuations
]

In [None]:
keywords # Display the keyword list built above (tokens with stop words and the listed punctuation removed); further parsing is still to be done on it.

In [None]:
# https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/

In [None]:
# First objective: Create column that contains text from pdf->text (tesseract?)
# (Or better option than tesseract for python if exists...)

# Late game

In [None]:
# Second objective: Make a column utilizing web scraping on 

In [None]:
# Third objective: Create truth column that deduces whether or not a contract
# was ACCEPTED/DENIED utilizing regex and looking at column created above.


In [None]:
# Fourth objective: Create a list of prioritized vendors to imitate based on:
# -How many contracts they've acquired
# -Value of contracts (ie: give higher weight/importance of imitation to
    # high-paying contract obtainers)

In [None]:
# Once curated dataset created, upload to Kaggle as a kernel and call TIDY_contract_data.csv/xlsx


In [None]:
# Create a text classifier that finds and verifies a piece of text exists in
# all contracts (ie: a standard or substandard that is followed but sometimes forgotten)

In [None]:
# OR create a feature bayesian inference to discern acceptance or denial of contract