# Data Preparation and Basic Text Analysis - Topic Modeling Pt. 1

### #1. Zotero API + PDF Text Miner
We need this so we don't have to bring down all of the pdfs in our library to our local machines, and we can always run our data prep on the most up-to-date corpus.

In [1]:
import fitz
from PIL import Image
import pytesseract
from nltk.stem import WordNetLemmatizer
from pdf2image import convert_from_path
import ftfy
import nltk.corpus
from nltk.collocations import *
from nltk.corpus import stopwords
from gensim import models, corpora
import pandas as pd
import logging
import requests
import pprint
import datetime
import glob
import urllib.request
import os
import re
import time
import math
import json

In [2]:
def zoteroCrawler(key, groupid):
    multiplier = 0
    zoterogroup = []
    # get number of items in group
    headers = {"Zotero-API-Version":"3",'Connection':'close', "Zotero-API-Key":key}
    checkurl = "https://api.zotero.org/groups/" +groupid
    rcheck = requests.get(checkurl, headers=headers)
    items = rcheck.json()['meta']['numItems']
    print(items)
    pages = math.ceil((items/100))
    while pages > 0:
        url = "https://api.zotero.org/groups/" +groupid + "/items" + "?limit=100&start=" + str(multiplier)
        print(url)
        r = requests.get(url, headers=headers)
        rj = r.json() #jsonified version of our Zotero group
        print(len(rj))
        for i in rj:
            zoterogroup.append(i)
        multiplier = multiplier + 100
        pages = pages - 1
    return zoterogroup

In [3]:
api_key = "N6yPwqH9VQFt8ZKBKCAFf8KV"
group_id = "2808857"

In [4]:
rj= zoteroCrawler(api_key, group_id)

1172
https://api.zotero.org/groups/2808857/items?limit=100&start=0
100
https://api.zotero.org/groups/2808857/items?limit=100&start=100
100
https://api.zotero.org/groups/2808857/items?limit=100&start=200
100
https://api.zotero.org/groups/2808857/items?limit=100&start=300
100
https://api.zotero.org/groups/2808857/items?limit=100&start=400
100
https://api.zotero.org/groups/2808857/items?limit=100&start=500
100
https://api.zotero.org/groups/2808857/items?limit=100&start=600
100
https://api.zotero.org/groups/2808857/items?limit=100&start=700
100
https://api.zotero.org/groups/2808857/items?limit=100&start=800
100
https://api.zotero.org/groups/2808857/items?limit=100&start=900
100
https://api.zotero.org/groups/2808857/items?limit=100&start=1000
100
https://api.zotero.org/groups/2808857/items?limit=100&start=1100
20


In [5]:
def download_file(download_url, filename):
    response = urllib.request.urlopen(download_url)    
    file = open(filename + ".pdf", 'wb')
    file.write(response.read())
    file.close()

In [6]:
# now what? get the attachments
# inputs are jsonified version of your zotero group, base url to zotero group items, and api key
# going to have to loop through and run this for every page I think
def attachmentGrabber(rj, url, key):
    counter = 0
    to_extract = []
    citation_list = []
    for i in rj:
        if counter < 100:
            try:
                item_key = rj[counter]["key"]
                headers = {"Zotero-API-Version":"3",'Connection':'close', "Zotero-API-Key":key}
                attach = requests.get(url + item_key + "/file", headers=headers)
                if "200" in str(attach):
                    download_file(attach.url, f'Document_{item_key}')
                    to_extract.append(f'Document_{item_key}.pdf')
                    print("GOOD" + " 1 " + str(item_key))
                else:
                    attach = requests.get(rj[counter]['links']['attachment']['href'] + "/file", headers=headers)
                    print(attach)
                    if "200" in str(attach):
                        download_file(attach.url, f'Document_{item_key}')
                        to_extract.append(f'Document_{item_key}.pdf')
                        print("GOOD" + " " + str(item_key))
                    else:
                        print("that didn't work" + " " + attach.url)
            except Exception as e:
                print(e)
            counter +=1
    return to_extract

In [7]:
url = "https://api.zotero.org/groups/2808857/items/"
to_extract  = attachmentGrabber(rj, url, api_key)

GOOD 1 47GCMTBN
<Response [200]>
GOOD 3X676VED
GOOD 1 QCQS4ST2
<Response [200]>
GOOD JBHVWUZE
<Response [200]>
GOOD JI4B2NX5
GOOD 1 P3ZBU43W
<Response [200]>
GOOD JFQB362K
GOOD 1 DUDUGGU3
<Response [200]>
GOOD TMZ5ADAT
GOOD 1 TF4SN283
<Response [200]>
GOOD RU3DU96U
GOOD 1 M22IF67B
<Response [200]>
GOOD XRPE4C5V
GOOD 1 DSCUFYAG
<Response [200]>
GOOD A72X8Q5P
GOOD 1 DHKVRINE
<Response [200]>
GOOD UW8IXCF9
GOOD 1 6ES6XHAP
<Response [200]>
GOOD KW9E2UTE
GOOD 1 MWQEBJBU
<Response [200]>
GOOD 79SRZNQ3
GOOD 1 FPBF5IKX
<Response [200]>
GOOD J7HBXYBQ
GOOD 1 2LRB28HQ
GOOD 1 SX39863M
<Response [200]>
GOOD 9JV4R9EW
GOOD 1 Q73YL96D
GOOD 1 XVX4XMR2
<Response [200]>
GOOD PKB9CB7I
<Response [200]>
GOOD 7TIFU594
GOOD 1 TAAJRZEP
GOOD 1 RH5JP7XX
GOOD 1 LTZN6E4P
<Response [200]>
GOOD D6YVIFBC
GOOD 1 3E578E54
GOOD 1 IB7PWZVG
GOOD 1 TMFT472T
GOOD 1 YJRJLQSA
<Response [200]>
GOOD UTLT344I
<Response [200]>
GOOD DMKRZS8E
<Response [200]>
GOOD ZFPUQZNE
<Response [200]>
HTTP Error 403: Forbidden
<Response [200]>

In [8]:
#with open('to_extract_backup.txt', 'w') as f:
 #   f.write(json.dumps(to_extract))

In [9]:
#with open('to_extract_backup.txt', 'r', encoding='utf-8') as f:
 #   data = f.read()

In [10]:
#to_extract = json.loads(data)

The attachments have been procured. Let's do something with them. Thanks to [PDF Text Miner](https://github.com/prldc/pdf_text_miner) for this function.

In [11]:
def extract_pdfs(list):  # You can easily extract a list from a .csv with pandas.
    d = {'file_name': ['dummy'], 'file_text': ['dummy'], 'ocr': [False]}
    df = pd.DataFrame(d, columns=['file_name', 'file_text', 'ocr'])
    count = 1
    for pdf in list:
        try:
            ext = os.path.splitext(pdf)[1][1:].strip()  # Gets file extension.
            if ext == 'pdf':  # Guarantees that the file is a .pdf, otherwise the program will crash when extracting text.
                ocr = False
                name = pdf.split('.pdf')[0]
                doc = fitz.open(f"{name}.pdf")
                text_file = open(f"{name}.txt", 'w')
                number_of_pages = doc.page_count
                for page_n in range(number_of_pages):  # Extracts text from each page.
                    page = doc.load_page(page_n)
                    page_content = page.get_text("text")
                    text_file.write(page_content)
                if os.stat(
                        f"{name}.txt").st_size < 2000:  # Assumes file lacks OCR based on .txt file size, starts Tesseract.
                    ocr = True
                    os.remove(f"{name}.txt")  # Removes the previously scraped .txt.
                    tess_file = f"{name}.pdf"
                    pages = convert_from_path(tess_file, 500)
                    image_counter = 1
                    for page in pages:  # Converts the PDF to image.
                        filename = f"{name}page_{str(image_counter)}.jpg"
                        page.save(filename, 'JPEG')
                        image_counter = image_counter + 1
                    filelimit = image_counter - 1
                    outfile = f"{name}.txt"
                    f = open(outfile, "a")
                    for i in range(1, filelimit + 1):  # Applies OCR to each image, saves text file.
                        filename = f"{name}page_{str(i)}.jpg"
                        text = str((pytesseract.image_to_string(Image.open(filename), lang="por")))
                        text = text.replace('-\n', '')
                        f.write(text)
                    f.close()
                text = open(f"{name}.txt", 'r')
                txt = " ".join(text.readlines())
                df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)    
                end = datetime.datetime.now()
                print(
                    f"Finished {name} at {end}. OCR = {ocr}. {count} files read. {round(count * 100 / len(list), 2)}% done.")
        except Exception as e:
            print(f'Did not finish {pdf}... check out that one.')
            print(e)
        count = count + 1

    return df

In [12]:
out = extract_pdfs(to_extract) # look at the frame.append method and change to concat

  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_47GCMTBN at 2023-03-12 21:17:38.149271. OCR = False. 1 files read. 1.05% done.
Finished Document_3X676VED at 2023-03-12 21:17:38.225014. OCR = False. 2 files read. 2.11% done.
Finished Document_QCQS4ST2 at 2023-03-12 21:17:38.348600. OCR = False. 3 files read. 3.16% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_JBHVWUZE at 2023-03-12 21:17:38.431438. OCR = False. 4 files read. 4.21% done.
Finished Document_JI4B2NX5 at 2023-03-12 21:17:38.481473. OCR = False. 5 files read. 5.26% done.
Finished Document_P3ZBU43W at 2023-03-12 21:17:38.553210. OCR = False. 6 files read. 6.32% done.
Finished Document_JFQB362K at 2023-03-12 21:17:38.616059. OCR = False. 7 files read. 7.37% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_DUDUGGU3 at 2023-03-12 21:17:38.701947. OCR = False. 8 files read. 8.42% done.
Finished Document_TMZ5ADAT at 2023-03-12 21:17:38.780565. OCR = False. 9 files read. 9.47% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_TF4SN283 at 2023-03-12 21:17:39.103099. OCR = False. 10 files read. 10.53% done.
Finished Document_RU3DU96U at 2023-03-12 21:17:39.237380. OCR = False. 11 files read. 11.58% done.
Finished Document_M22IF67B at 2023-03-12 21:17:39.297298. OCR = False. 12 files read. 12.63% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_XRPE4C5V at 2023-03-12 21:17:39.359673. OCR = False. 13 files read. 13.68% done.
Finished Document_DSCUFYAG at 2023-03-12 21:17:39.446093. OCR = False. 14 files read. 14.74% done.
Finished Document_A72X8Q5P at 2023-03-12 21:17:39.549448. OCR = False. 15 files read. 15.79% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_DHKVRINE at 2023-03-12 21:17:39.636784. OCR = False. 16 files read. 16.84% done.
Finished Document_UW8IXCF9 at 2023-03-12 21:17:39.732200. OCR = False. 17 files read. 17.89% done.
Finished Document_6ES6XHAP at 2023-03-12 21:17:39.814228. OCR = False. 18 files read. 18.95% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_KW9E2UTE at 2023-03-12 21:17:39.899996. OCR = False. 19 files read. 20.0% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_MWQEBJBU at 2023-03-12 21:17:40.220044. OCR = False. 20 files read. 21.05% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_79SRZNQ3 at 2023-03-12 21:17:40.516874. OCR = False. 21 files read. 22.11% done.
Finished Document_FPBF5IKX at 2023-03-12 21:17:40.660017. OCR = False. 22 files read. 23.16% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_J7HBXYBQ at 2023-03-12 21:17:40.800869. OCR = False. 23 files read. 24.21% done.
Did not finish Document_2LRB28HQ.pdf... check out that one.
cannot open broken document


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_SX39863M at 2023-03-12 21:17:41.007799. OCR = False. 25 files read. 26.32% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_9JV4R9EW at 2023-03-12 21:17:41.232924. OCR = False. 26 files read. 27.37% done.
Did not finish Document_Q73YL96D.pdf... check out that one.
cannot open broken document
Finished Document_XVX4XMR2 at 2023-03-12 21:17:41.374884. OCR = False. 28 files read. 29.47% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_PKB9CB7I at 2023-03-12 21:17:41.447570. OCR = False. 29 files read. 30.53% done.
Finished Document_7TIFU594 at 2023-03-12 21:17:41.506311. OCR = False. 30 files read. 31.58% done.
Finished Document_TAAJRZEP at 2023-03-12 21:17:41.561036. OCR = False. 31 files read. 32.63% done.
Did not finish Document_RH5JP7XX.pdf... check out that one.
cannot open broken document
Finished Document_LTZN6E4P at 2023-03-12 21:17:41.643907. OCR = False. 33 files read. 34.74% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_D6YVIFBC at 2023-03-12 21:17:41.728373. OCR = False. 34 files read. 35.79% done.
Did not finish Document_3E578E54.pdf... check out that one.
cannot open broken document
Finished Document_IB7PWZVG at 2023-03-12 21:17:41.792076. OCR = False. 36 files read. 37.89% done.
Did not finish Document_TMFT472T.pdf... check out that one.
cannot open broken document
Finished Document_YJRJLQSA at 2023-03-12 21:17:41.927337. OCR = False. 38 files read. 40.0% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_UTLT344I at 2023-03-12 21:17:42.066421. OCR = False. 39 files read. 41.05% done.
Finished Document_DMKRZS8E at 2023-03-12 21:17:42.158436. OCR = False. 40 files read. 42.11% done.
Finished Document_ZFPUQZNE at 2023-03-12 21:17:42.256831. OCR = False. 41 files read. 43.16% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_G4923VSV at 2023-03-12 21:17:42.317216. OCR = False. 42 files read. 44.21% done.
Finished Document_EV273D9P at 2023-03-12 21:17:42.370145. OCR = False. 43 files read. 45.26% done.
Finished Document_PDUA89RL at 2023-03-12 21:17:42.414755. OCR = False. 44 files read. 46.32% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_VZGACXNN at 2023-03-12 21:17:42.544722. OCR = False. 45 files read. 47.37% done.
Finished Document_YGFJAA5F at 2023-03-12 21:17:42.684419. OCR = False. 46 files read. 48.42% done.
Finished Document_KN8U5AXF at 2023-03-12 21:17:42.744806. OCR = False. 47 files read. 49.47% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_NDH25D7X at 2023-03-12 21:17:42.800990. OCR = False. 48 files read. 50.53% done.
Finished Document_BM6L3JUC at 2023-03-12 21:17:42.864686. OCR = False. 49 files read. 51.58% done.
Finished Document_TJR42XC5 at 2023-03-12 21:17:42.918728. OCR = False. 50 files read. 52.63% done.
Finished Document_RSB2STPR at 2023-03-12 21:17:42.981317. OCR = False. 51 files read. 53.68% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_3JVVCGH6 at 2023-03-12 21:17:43.058086. OCR = False. 52 files read. 54.74% done.
Finished Document_334IKY7D at 2023-03-12 21:17:43.113379. OCR = False. 53 files read. 55.79% done.
Finished Document_NDTF8ZQG at 2023-03-12 21:17:43.179179. OCR = False. 54 files read. 56.84% done.
Finished Document_UR68UWWC at 2023-03-12 21:17:43.241170. OCR = False. 55 files read. 57.89% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_P7ZQXA49 at 2023-03-12 21:17:43.337091. OCR = False. 56 files read. 58.95% done.
Finished Document_HE9X7NKT at 2023-03-12 21:17:43.416657. OCR = False. 57 files read. 60.0% done.
Finished Document_4S7CU23C at 2023-03-12 21:17:43.473366. OCR = False. 58 files read. 61.05% done.
Finished Document_QKHD29TS at 2023-03-12 21:17:43.530887. OCR = False. 59 files read. 62.11% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_NYRUWCRI at 2023-03-12 21:17:43.595361. OCR = False. 60 files read. 63.16% done.
Finished Document_5H6TVKRX at 2023-03-12 21:17:43.776182. OCR = False. 61 files read. 64.21% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_WSFZDI5V at 2023-03-12 21:17:43.905069. OCR = False. 62 files read. 65.26% done.
Finished Document_J5WD5TBV at 2023-03-12 21:17:43.959896. OCR = False. 63 files read. 66.32% done.
Finished Document_ZH6G29C7 at 2023-03-12 21:17:44.023463. OCR = False. 64 files read. 67.37% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_PGPVWJGT at 2023-03-12 21:17:44.121116. OCR = False. 65 files read. 68.42% done.
Finished Document_8MUZLDEB at 2023-03-12 21:17:44.179747. OCR = False. 66 files read. 69.47% done.
Finished Document_REYPV3ZC at 2023-03-12 21:17:44.230858. OCR = False. 67 files read. 70.53% done.
Finished Document_J2HMCSYG at 2023-03-12 21:17:44.307111. OCR = False. 68 files read. 71.58% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_WE83RECQ at 2023-03-12 21:17:44.362950. OCR = False. 69 files read. 72.63% done.
Finished Document_P6HD9YXC at 2023-03-12 21:17:44.423685. OCR = False. 70 files read. 73.68% done.
Finished Document_CP48TTDC at 2023-03-12 21:17:44.470523. OCR = False. 71 files read. 74.74% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_CI6ZIKFF at 2023-03-12 21:17:44.613780. OCR = False. 72 files read. 75.79% done.
Finished Document_ZLF6SLHS at 2023-03-12 21:17:44.674856. OCR = False. 73 files read. 76.84% done.
Finished Document_AB4GLPL8 at 2023-03-12 21:17:44.733078. OCR = False. 74 files read. 77.89% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_AP5DVV5Z at 2023-03-12 21:17:45.152205. OCR = False. 75 files read. 78.95% done.
Finished Document_I64NHKAU at 2023-03-12 21:17:45.308597. OCR = False. 76 files read. 80.0% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_R5MQWTF6 at 2023-03-12 21:17:45.403587. OCR = False. 77 files read. 81.05% done.
Finished Document_AM87JDAZ at 2023-03-12 21:17:45.526290. OCR = False. 78 files read. 82.11% done.
Finished Document_9TTYWUNW at 2023-03-12 21:17:45.575020. OCR = False. 79 files read. 83.16% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_H5FD5NJN at 2023-03-12 21:17:45.650747. OCR = False. 80 files read. 84.21% done.
Finished Document_YJ2Z9CDU at 2023-03-12 21:17:45.738824. OCR = False. 81 files read. 85.26% done.
Finished Document_K92VUF7U at 2023-03-12 21:17:45.812661. OCR = False. 82 files read. 86.32% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_NBEFP27R at 2023-03-12 21:17:45.862703. OCR = False. 83 files read. 87.37% done.
Finished Document_XJZTU57V at 2023-03-12 21:17:45.926522. OCR = False. 84 files read. 88.42% done.
Finished Document_C8WB7LEN at 2023-03-12 21:17:46.000662. OCR = False. 85 files read. 89.47% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_2JGSDMU8 at 2023-03-12 21:17:46.106056. OCR = False. 86 files read. 90.53% done.
Finished Document_6LZN3M3H at 2023-03-12 21:17:46.204442. OCR = False. 87 files read. 91.58% done.
Finished Document_ECWZZCYD at 2023-03-12 21:17:46.270155. OCR = False. 88 files read. 92.63% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_DI5M2UVW at 2023-03-12 21:17:46.335924. OCR = False. 89 files read. 93.68% done.
Finished Document_NA63NB35 at 2023-03-12 21:17:46.442449. OCR = False. 90 files read. 94.74% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


Finished Document_QIHZDINR at 2023-03-12 21:17:46.568144. OCR = False. 91 files read. 95.79% done.
Finished Document_266KRB6Z at 2023-03-12 21:17:46.631380. OCR = False. 92 files read. 96.84% done.
Finished Document_QWHY5CPT at 2023-03-12 21:17:46.692921. OCR = False. 93 files read. 97.89% done.
Finished Document_RK8DMZHV at 2023-03-12 21:17:46.846676. OCR = False. 94 files read. 98.95% done.
Finished Document_76RTMKVS at 2023-03-12 21:17:46.891274. OCR = False. 95 files read. 100.0% done.


  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)
  df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)


In [13]:
out = out.drop_duplicates(subset='file_text')
out # we are able to scrape the vast majority of the articles without a problem

Unnamed: 0,file_name,file_text,ocr
0,dummy,dummy,False
1,Document_47GCMTBN,Managing future urbanization\n growth patterns...,False
3,Document_QCQS4ST2,Smart city re-imagined: City planning and GeoA...,False
5,Document_JI4B2NX5,...,False
6,Document_P3ZBU43W,Using Natural Language\n Processing to Read Pl...,False
...,...,...,...
86,Document_QIHZDINR,Accelerating the world's research.\n The End o...,False
87,Document_266KRB6Z,Futures 36 (2004) 1077–1094\n www.elsevier.com...,False
88,Document_QWHY5CPT,The Development of Optimization Methods in Gen...,False
89,Document_RK8DMZHV,REVIEW\n Open Access\n The core academic and s...,False


In [39]:
#for rapid topic modeling/cleaning prototyping:
#out.to_csv(r"extracted_text.csv")

### #2. Cleaning up the text
Using strategies based on [this article](https://monkeylearn.com/blog/text-cleaning/).

Here, before we do any other cleaning tasks, this is where we need to match and remove references with the information we got from the Scholarcy API above.

#### Resources:
- [String comparison in Python](https://note.nkmk.me/en/python-str-compare/)
- [Potentially useful example on StackOverflow](https://stackoverflow.com/questions/39551029/if-else-statement-for-finding-the-index-of-a-character-in-a-string)

In [310]:
def referenceChecker(extracted_df):
    ref_excluded = {}
    counter = 0
    string3=""
    matches=0
    nomatch=0
    for name in extracted_df['file_name']:
        if "dummy" not in name:
            text_file = open(str(name) +".txt", "r")
            data = text_file.read()      # Read whole file to a string
            text_file.close()         # Close file
            string1 = data.replace('\n'," ")
            string2 = string1.replace('\t',"")
            coverp_s = string2.find("Electronic Delivery Cover Sheet")
            coverp_e = string2.find("Part 201.14")
            if coverp_s:
                string3  = string2.replace(string2[coverp_s:coverp_e], "")
            else:
                string3 = string2
            starti = string3.find("References")
            startc = string3.find("REFERENCES")
            if starti:
                string4 = string3.replace(string3[starti:], "")
            elif startc:
                string4 = string3.replace(string3[startc:], "")
            else:
                string4=string3
            ref_excluded[name] = string4
    return ref_excluded

In [311]:
ref_ex = referenceChecker(out) #could be refined

### Cleaning Tasks
- Case Normalization
- Remove Unicode Characters
    - In the future, we may want to experiment with using [ftfy](https://github.com/rspeer/python-ftfy), which fixes text encoding issues, in this pipeline. We may also be interested in exploring [scrubadub](https://scrubadub.readthedocs.io/en/stable/index.html), which redacts potential PII from text.
- Remove Stopwords
- Lemmatize

In [312]:
def textCleaner(ref_ex_dict):
    full_corpus = []
    stop = stopwords.words('english')
    stop.extend(['et','al',"chicago","university","press","copyrighted","unauthorized","could", 'u','x','fig','eg'])
    for i in ref_ex_dict.keys():
        data = ref_ex_dict[i]
        da = data.lower()
        d = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", da)
        all_words = nltk.word_tokenize(d)
        words = [w for w in all_words if w not in stop]
        words = [w for w in words if w.isalpha()]
        lemmatizer = WordNetLemmatizer()
        word_out = []
        for word in words:
            a = lemmatizer.lemmatize(word)
            word_out.append(a)
        ref_ex_dict[i] = word_out
        full_corpus.append(word_out)
    return ref_ex_dict, full_corpus

In [313]:
a = textCleaner(ref_ex)

In [314]:
corpusdict = a[0]
corpus = a[1] # corpus in the same way that gensim defines it
full_corpus = [x for xs in corpus for x in xs] #flattened

## Basic analysis

#### A quick frequency distribution of the *most common words* in the corpus, and the *most common two and three word collocations* in the corpus.

In [315]:
word_fd = nltk.FreqDist(full_corpus)
word_fd

FreqDist({'urban': 3007, 'data': 2331, 'city': 1969, 'ai': 1744, 'planning': 1345, 'model': 1167, 'system': 945, 'technology': 799, 'study': 796, 'process': 765, ...})

In [316]:
bigram_fd = nltk.FreqDist(nltk.bigrams(full_corpus))
bigram_fd

FreqDist({('urban', 'planning'): 377, ('big', 'data'): 272, ('smart', 'city'): 232, ('urban', 'design'): 170, ('machine', 'learning'): 166, ('social', 'medium'): 162, ('land', 'use'): 161, ('articial', 'intelligence'): 149, ('computer', 'vision'): 123, ('deep', 'learning'): 119, ...})

In [317]:
trigram_fd = nltk.FreqDist(nltk.trigrams(full_corpus))
trigram_fd

FreqDist({('smart', 'sustainable', 'city'): 61, ('datadriven', 'smart', 'sustainable'): 61, ('urban', 'land', 'use'): 46, ('treatment', 'eff', 'ect'): 45, ('urban', 'technological', 'innovation'): 42, ('smart', 'sustainable', 'urbanism'): 41, ('land', 'use', 'planning'): 37, ('reading', 'material', 'published'): 36, ('material', 'published', 'posting'): 36, ('published', 'posting', 'copying'): 36, ...})

#### Using some of nltk's built in functions to get more information about the collocation scores according to association measures.
See more information about the nltk collocation methodology [here](https://www.nltk.org/api/nltk.collocations.html)

In [318]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder(word_fd, bigram_fd)
finder.score_ngrams(bigram_measures.raw_freq)

[(('urban', 'planning'), 0.0018434847240151782),
 (('big', 'data'), 0.001330047334037476),
 (('smart', 'city'), 0.0011344521378554944),
 (('urban', 'design'), 0.0008312795837734225),
 (('machine', 'learning'), 0.0008117200641552244),
 (('social', 'medium'), 0.0007921605445370261),
 (('land', 'use'), 0.0007872706646324766),
 (('articial', 'intelligence'), 0.0007285921057778821),
 (('computer', 'vision'), 0.0006014552282595939),
 (('deep', 'learning'), 0.0005818957086413958),
 (('smart', 'sustainable'), 0.0005574463091186481),
 (('ai', 'technology'), 0.000547666549309549),
 (('urban', 'development'), 0.0005427766694049994),
 (('local', 'government'), 0.0005134373899777022),
 (('neural', 'network'), 0.0004840981105504049),
 (('artificial', 'intelligence'), 0.0004645385909322067),
 (('eff', 'ect'), 0.0004547588311231076),
 (('urban', 'data'), 0.00044986895121855805),
 (('walking', 'satisfaction'), 0.00044986895121855805),
 (('urban', 'service'), 0.000440089191409459),
 (('sustainable', 'ci

### Starting some topic modeling work in earnest
#### `gensim` first!

In [304]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [319]:
#some gensim-specific preprocessing
dictionary = corpora.Dictionary(corpus) #unflattened corpus
dictionary.filter_extremes(no_below=2, no_above=0.50)
corpus = [dictionary.doc2bow(text) for text in corpus] # vectorizing

2023-03-12 23:07:59,235 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-03-12 23:07:59,438 : INFO : built Dictionary<17102 unique tokens: ['abandonment', 'ability', 'abortion', 'absence', 'absorb']...> from 74 documents (total 204504 corpus positions)
2023-03-12 23:07:59,439 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<17102 unique tokens: ['abandonment', 'ability', 'abortion', 'absence', 'absorb']...> from 74 documents (total 204504 corpus positions)", 'datetime': '2023-03-12T23:07:59.438992', 'gensim': '4.3.0', 'python': '3.10.4 (main, Mar 31 2022, 03:38:35) [Clang 12.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2023-03-12 23:07:59,461 : INFO : discarding 9872 tokens: [('abortion', 1), ('abudayyeh', 1), ('abukhader', 1), ('aburumman', 1), ('accommodates', 1), ('adaptative', 1), ('afterward', 1), ('ahmadabad', 1), ('alhadidi', 1), ('alnsour', 1)]...
2023-03-12 23:07:59,462 : INFO : keeping 7230 tokens which were in no le

In [320]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 7230
Number of documents: 74


In [321]:
# Set training parameters.
num_topics = 7
chunksize = 2000
passes = 20
iterations = 400
#save these params with no below 2 no above .5

In [322]:
#run model
model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
    )

2023-03-12 23:09:01,906 : INFO : using autotuned alpha, starting with [0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715]
2023-03-12 23:09:01,914 : INFO : using serial LDA version on this node
2023-03-12 23:09:01,922 : INFO : running online (multi-pass) LDA training, 7 topics, 20 passes over the supplied corpus of 74 documents, updating model once every 74 documents, evaluating perplexity every 74 documents, iterating 400x with a convergence threshold of 0.001000
2023-03-12 23:09:03,075 : INFO : -9.675 per-word bound, 817.2 perplexity estimate based on a held-out corpus of 74 documents with 153230 words
2023-03-12 23:09:03,076 : INFO : PROGRESS: pass 0, at document #74/74
2023-03-12 23:09:03,961 : INFO : optimized alpha [0.14779961, 0.155903, 0.16828609, 0.1411346, 0.13582204, 0.17002149, 0.13924336]
2023-03-12 23:09:03,970 : INFO : topic #4 (0.136): 0.019*"ai" + 0.007*"computer" + 0.006*"design" + 0.005*"science" + 0.005*"human" + 0.004*"sustainable" +

2023-03-12 23:09:07,822 : INFO : topic diff=0.300259, rho=0.408248
2023-03-12 23:09:08,491 : INFO : -7.733 per-word bound, 212.7 perplexity estimate based on a held-out corpus of 74 documents with 153230 words
2023-03-12 23:09:08,492 : INFO : PROGRESS: pass 5, at document #74/74
2023-03-12 23:09:08,850 : INFO : optimized alpha [0.06695892, 0.068093024, 0.07820139, 0.06727493, 0.062190406, 0.07787726, 0.053132262]
2023-03-12 23:09:08,858 : INFO : topic #6 (0.053): 0.062*"ai" + 0.012*"service" + 0.009*"local" + 0.008*"government" + 0.008*"participant" + 0.007*"challenge" + 0.005*"nature" + 0.005*"people" + 0.005*"resource" + 0.005*"table"
2023-03-12 23:09:08,860 : INFO : topic #4 (0.062): 0.017*"science" + 0.014*"smart" + 0.013*"sustainable" + 0.010*"sustainability" + 0.009*"design" + 0.008*"urbanism" + 0.007*"computer" + 0.007*"ai" + 0.006*"datadriven" + 0.006*"big"
2023-03-12 23:09:08,867 : INFO : topic #1 (0.068): 0.008*"goal" + 0.008*"space" + 0.008*"land" + 0.006*"spatial" + 0.005*"

2023-03-12 23:09:13,682 : INFO : topic #6 (0.044): 0.069*"ai" + 0.013*"service" + 0.009*"local" + 0.009*"government" + 0.008*"participant" + 0.007*"challenge" + 0.005*"disaster" + 0.005*"perception" + 0.005*"nature" + 0.005*"people"
2023-03-12 23:09:13,683 : INFO : topic #0 (0.053): 0.017*"ai" + 0.008*"human" + 0.008*"intelligence" + 0.007*"virtual" + 0.007*"space" + 0.006*"articial" + 0.006*"world" + 0.005*"condition" + 0.005*"medium" + 0.005*"community"
2023-03-12 23:09:13,687 : INFO : topic #1 (0.057): 0.009*"goal" + 0.008*"land" + 0.008*"space" + 0.006*"spatial" + 0.005*"figure" + 0.005*"growth" + 0.004*"noise" + 0.004*"value" + 0.004*"human" + 0.004*"ai"
2023-03-12 23:09:13,692 : INFO : topic #2 (0.061): 0.012*"image" + 0.008*"value" + 0.008*"street" + 0.008*"walking" + 0.008*"algorithm" + 0.006*"satisfaction" + 0.006*"figure" + 0.005*"point" + 0.005*"proportion" + 0.005*"performance"
2023-03-12 23:09:13,695 : INFO : topic #5 (0.063): 0.009*"planner" + 0.007*"service" + 0.007*"rob

2023-03-12 23:09:17,811 : INFO : topic #1 (0.052): 0.009*"goal" + 0.008*"land" + 0.008*"space" + 0.006*"spatial" + 0.005*"figure" + 0.005*"growth" + 0.005*"noise" + 0.005*"value" + 0.004*"human" + 0.004*"ai"
2023-03-12 23:09:17,832 : INFO : topic #2 (0.053): 0.013*"image" + 0.009*"value" + 0.008*"street" + 0.008*"walking" + 0.008*"algorithm" + 0.006*"satisfaction" + 0.006*"figure" + 0.005*"point" + 0.005*"proportion" + 0.005*"performance"
2023-03-12 23:09:17,844 : INFO : topic #5 (0.057): 0.009*"planner" + 0.007*"robot" + 0.007*"role" + 0.007*"service" + 0.007*"innovation" + 0.006*"automated" + 0.006*"automation" + 0.005*"different" + 0.005*"governance" + 0.005*"management"
2023-03-12 23:09:17,851 : INFO : topic diff=0.032406, rho=0.242536
2023-03-12 23:09:18,420 : INFO : -7.710 per-word bound, 209.3 perplexity estimate based on a held-out corpus of 74 documents with 153230 words
2023-03-12 23:09:18,421 : INFO : PROGRESS: pass 16, at document #74/74
2023-03-12 23:09:18,616 : INFO : opt

In [323]:
#print it out
pp = pprint.PrettyPrinter()
pp.pprint(model.print_topics(10))

2023-03-12 23:09:21,317 : INFO : topic #0 (0.046): 0.014*"ai" + 0.008*"human" + 0.008*"intelligence" + 0.008*"space" + 0.007*"virtual" + 0.006*"articial" + 0.006*"world" + 0.006*"design" + 0.006*"condition" + 0.005*"medium"
2023-03-12 23:09:21,323 : INFO : topic #1 (0.051): 0.009*"goal" + 0.009*"land" + 0.008*"space" + 0.006*"spatial" + 0.005*"growth" + 0.005*"figure" + 0.005*"value" + 0.005*"noise" + 0.004*"human" + 0.004*"ai"
2023-03-12 23:09:21,326 : INFO : topic #2 (0.049): 0.013*"image" + 0.009*"value" + 0.008*"street" + 0.008*"walking" + 0.008*"algorithm" + 0.006*"satisfaction" + 0.006*"figure" + 0.005*"point" + 0.005*"proportion" + 0.005*"performance"
2023-03-12 23:09:21,328 : INFO : topic #3 (0.047): 0.011*"learning" + 0.006*"paper" + 0.006*"clustering" + 0.005*"ml" + 0.005*"anns" + 0.005*"prediction" + 0.005*"deep" + 0.004*"unsupervised" + 0.004*"review" + 0.004*"trafc"
2023-03-12 23:09:21,330 : INFO : topic #4 (0.051): 0.017*"science" + 0.017*"smart" + 0.012*"sustainable" + 0

[(0,
  '0.014*"ai" + 0.008*"human" + 0.008*"intelligence" + 0.008*"space" + '
  '0.007*"virtual" + 0.006*"articial" + 0.006*"world" + 0.006*"design" + '
  '0.006*"condition" + 0.005*"medium"'),
 (1,
  '0.009*"goal" + 0.009*"land" + 0.008*"space" + 0.006*"spatial" + '
  '0.005*"growth" + 0.005*"figure" + 0.005*"value" + 0.005*"noise" + '
  '0.004*"human" + 0.004*"ai"'),
 (2,
  '0.013*"image" + 0.009*"value" + 0.008*"street" + 0.008*"walking" + '
  '0.008*"algorithm" + 0.006*"satisfaction" + 0.006*"figure" + 0.005*"point" + '
  '0.005*"proportion" + 0.005*"performance"'),
 (3,
  '0.011*"learning" + 0.006*"paper" + 0.006*"clustering" + 0.005*"ml" + '
  '0.005*"anns" + 0.005*"prediction" + 0.005*"deep" + 0.004*"unsupervised" + '
  '0.004*"review" + 0.004*"trafc"'),
 (4,
  '0.017*"science" + 0.017*"smart" + 0.012*"sustainable" + 0.011*"design" + '
  '0.009*"sustainability" + 0.008*"urbanism" + 0.007*"computer" + 0.007*"big" '
  '+ 0.006*"datadriven" + 0.006*"field"'),
 (5,
  '0.010*"planner

### The results are getting better with a larger corpus + higher # of passes + smaller number of topics
- Try bigrams.
- Let's see if scikit-learn does any better, and if not, what we might do to tune our data and parameters to highlight the sorts of patterns that might tell us something *new* about the corpus

### Optional File Cleanup

In [2]:
for f in glob.glob("Document_*"):
    os.remove(f)