# Data Preparation and Basic Text Analysis - Topic Modeling Pt. 1

### #1. Zotero API + PDF Text Miner
We need this so we don't have to bring down all of the pdfs in our library to our local machines, and we can always run our data prep on the most up-to-date corpus.

In [1]:
import fitz
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import ftfy
import nltk.corpus
from nltk.collocations import *
from nltk.corpus import stopwords
import pandas as pd
import datetime
import glob
import requests
import urllib.request
import os
import re
import time

In [2]:
# We're going to grab the first page of results (25) for testing, so we won't specify a header for pagination.
# The pagination thing is something we will have to implement to make this fully operational.
headers = {"Zotero-API-Version":"3", "Zotero-API-Key":"N6yPwqH9VQFt8ZKBKCAFf8KV"}
url = "https://api.zotero.org/groups/2808857/items/"
r = requests.get(url, headers=headers)
rj = r.json() #jsonified version of our Zotero group 

In [3]:
def download_file(download_url, filename):
    response = urllib.request.urlopen(download_url)    
    file = open(filename + ".pdf", 'wb')
    file.write(response.read())
    file.close()

In [4]:
# now what? get the attachments
# inputs are jsonified version of your zotero group, base url to zotero group items, and headers
# going to have to loop through and run this for every page I think
def attachmentGrabber(rj, url, headers):
    counter = 0
    to_extract = []
    citation_list = []
    for i in rj:
        try:
            item_key = rj[counter]["key"]
            attach = requests.get(url + item_key + "/file", headers=headers)
            download_file(attach.url, f'Document_{item_key}')
            to_extract.append(f'Document_{item_key}.pdf')
            print("GOOD" + " " + str(item_key))
        except Exception as e: 
            try:
                attach = requests.get(rj[counter]['links']['attachment']['href'] + "/file", headers=headers)
                download_file(attach.url, f'Document_{item_key}')
                to_extract.append(f'Document_{item_key}.pdf')
                print("GOOD" + " " + str(item_key))
            except Exception as e:
                print(str(e) + " " + "that didn't work" + " " + attach.url)
        counter +=1
    return to_extract

In [5]:
to_extract  = attachmentGrabber(rj, url, headers)

GOOD HE9X7NKT
GOOD IKPJMQZG
GOOD FJ9SMXH4
GOOD 82CHIVGP
GOOD ZFPUQZNE
GOOD SYN9YTMS
GOOD G4923VSV
GOOD VZGACXNN
GOOD YGFJAA5F
GOOD KN8U5AXF
GOOD NDH25D7X
GOOD V7ZLSYZU
GOOD BM6L3JUC
GOOD LD9QKVIN
GOOD 5I39ZRDQ
GOOD PLBJZCVX
GOOD 334IKY7D
GOOD KEUSLH5T
GOOD 4S7CU23C
GOOD QKHD29TS
GOOD QKT4INVU
GOOD NYRUWCRI
GOOD 5H6TVKRX
GOOD WSFZDI5V
GOOD ZH6G29C7


Before we extract the PDFs, this is where we should use the Scholarcy API to grab their references. https://ref.scholarcy.com/api/ We should play around with different return types to see what will be easiest. Part of me thinks it will be easiest to isolate references and compare them with the text in the PDF if we use the ```/extract``` endpoint to output in JSON.

In [6]:
scholarcy_headers = {"accept":"application/json","Authorization":"Bearer ", "Content-Type": 'multipart/form-data'}
alt_headers = {"accept":"application/json","Authorization":"Bearer ", "Content-Type": 'multipart/form-data',"engine": "v2"}
url = "https://ref.scholarcy.com/api/references/extract"
refdict = {}
for i in to_extract:
    time.sleep(2)
    print("Working on " + i + "...")
    try:
        file = {'file': open(i, 'rb')}
        r = requests.post(url, files=file, data=scholarcy_headers)
        print(r)
        refdict[i] = r.json()
        print(i + " " + "is good to go")
    except Exception as e:
        print(str(e) + " " + "Trying API v2 instead...")
        try:
            r2 = requests.post(url, files=file,data=alt_headers)
            print(r2)
            refdict[i] = r2.json()
            print(i + " " + "is good to go")
        except Exception as e:
            print("That didn't work either...")


Working on Document_HE9X7NKT.pdf...
<Response [200]>
Document_HE9X7NKT.pdf is good to go
Working on Document_IKPJMQZG.pdf...
<Response [200]>
Document_IKPJMQZG.pdf is good to go
Working on Document_FJ9SMXH4.pdf...
<Response [200]>
Document_FJ9SMXH4.pdf is good to go
Working on Document_82CHIVGP.pdf...
<Response [200]>
Document_82CHIVGP.pdf is good to go
Working on Document_ZFPUQZNE.pdf...
<Response [200]>
Document_ZFPUQZNE.pdf is good to go
Working on Document_SYN9YTMS.pdf...
<Response [503]>
Expecting value: line 1 column 1 (char 0) Trying API v2 instead...
<Response [200]>
Document_SYN9YTMS.pdf is good to go
Working on Document_G4923VSV.pdf...
<Response [200]>
Document_G4923VSV.pdf is good to go
Working on Document_VZGACXNN.pdf...
<Response [200]>
Document_VZGACXNN.pdf is good to go
Working on Document_YGFJAA5F.pdf...
<Response [200]>
Document_YGFJAA5F.pdf is good to go
Working on Document_KN8U5AXF.pdf...
<Response [200]>
Document_KN8U5AXF.pdf is good to go
Working on Document_NDH25D

In [7]:
refdict["Document_PLBJZCVX.pdf"] #example of what an output from scholarcy looks like

{'filename': 'Document_PLBJZCVX.pdf',
 'metadata': {'arxiv': None,
  'doi': '10.1016/j.eiar.2020.106515',
  'isbn': None,
  'date': 2021},
 'references': ['Abdo, J.B., Zeadally, S., 2020. Multi-Utility Market: Framework for a Blockchain Exchange Platform for Sustainable Development. arXiv preprint arXiv:2007.07096. Jul 14.',
  'Aina, Y.A., Wafer, A., Ahmed, F., Alshuwaikhat, H.M., 2019. Top-down sustainable urban development? Urban governance transformation in Saudi Arabia. Cities. 90, 272–281. Jul 1.',
  'Ali, R., Kuriqi, A., Abubaker, S., Kisi, O., 2019. Hydrologic alteration at the upper and middle part of the yangtze river, China: towards sustainable water resource management under increasing water exploitation. Sustainability 11 (19), 5176.',
  'Bazan-Krzywoszanska, A., Mrowczynska, M., Tront, S., 2019. GIS technology, 3D models and mathematical models as a tool for assessing development capabilities of flood risk land to make arrangements of municipal planning documents. J. Ecol.

The attachments have been procured. Let's do something with them. Thanks to [PDF Text Miner](https://github.com/prldc/pdf_text_miner) for this function.

In [8]:
def extract_pdfs(list):  # You can easily extract a list from a .csv with pandas.
    d = {'file_name': ['dummy'], 'file_text': ['dummy'], 'ocr': [False]}
    df = pd.DataFrame(d, columns=['file_name', 'file_text', 'ocr'])
    count = 1
    for pdf in list:
        try:
            ext = os.path.splitext(pdf)[1][1:].strip()  # Gets file extension.
            if ext == 'pdf':  # Guarantees that the file is a .pdf, otherwise the program will crash when extracting text.
                ocr = False
                name = pdf.split('.pdf')[0]
                doc = fitz.open(f"{name}.pdf")
                text_file = open(f"{name}.txt", 'w')
                number_of_pages = doc.pageCount
                for page_n in range(number_of_pages):  # Extracts text from each page.
                    page = doc.load_page(page_n)
                    page_content = page.get_text("text")
                    text_file.write(page_content)
                if os.stat(
                        f"{name}.txt").st_size < 2000:  # Assumes file lacks OCR based on .txt file size, starts Tesseract.
                    ocr = True
                    os.remove(f"{name}.txt")  # Removes the previously scraped .txt.
                    tess_file = f"{name}.pdf"
                    pages = convert_from_path(tess_file, 500)
                    image_counter = 1
                    for page in pages:  # Converts the PDF to image.
                        filename = f"{name}page_{str(image_counter)}.jpg"
                        page.save(filename, 'JPEG')
                        image_counter = image_counter + 1
                    filelimit = image_counter - 1
                    outfile = f"{name}.txt"
                    f = open(outfile, "a")
                    for i in range(1, filelimit + 1):  # Applies OCR to each image, saves text file.
                        filename = f"{name}page_{str(i)}.jpg"
                        text = str((pytesseract.image_to_string(Image.open(filename), lang="por")))
                        text = text.replace('-\n', '')
                        f.write(text)
                    f.close()
                text = open(f"{name}.txt", 'r')
                txt = " ".join(text.readlines())
                df = df.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr}, ignore_index=True)    
                end = datetime.datetime.now()
                print(
                    f"Finished {name} at {end}. OCR = {ocr}. {count} files read. {round(count * 100 / len(list), 2)}% done.")
        except:
            print(f'Did not finish {pdf}... check out that one.')
        count = count + 1

    return df

In [9]:
out = extract_pdfs(to_extract)

Finished Document_HE9X7NKT at 2022-07-19 20:36:04.548840. OCR = False. 1 files read. 4.0% done.
Finished Document_IKPJMQZG at 2022-07-19 20:36:04.645023. OCR = False. 2 files read. 8.0% done.
Finished Document_FJ9SMXH4 at 2022-07-19 20:36:04.711377. OCR = False. 3 files read. 12.0% done.
Finished Document_82CHIVGP at 2022-07-19 20:36:04.795743. OCR = False. 4 files read. 16.0% done.
Finished Document_ZFPUQZNE at 2022-07-19 20:36:04.866921. OCR = False. 5 files read. 20.0% done.
Finished Document_SYN9YTMS at 2022-07-19 20:36:04.936928. OCR = False. 6 files read. 24.0% done.
Finished Document_G4923VSV at 2022-07-19 20:36:04.973285. OCR = False. 7 files read. 28.0% done.
Finished Document_VZGACXNN at 2022-07-19 20:36:05.074430. OCR = False. 8 files read. 32.0% done.
Finished Document_YGFJAA5F at 2022-07-19 20:36:05.193254. OCR = False. 9 files read. 36.0% done.
Finished Document_KN8U5AXF at 2022-07-19 20:36:05.245416. OCR = False. 10 files read. 40.0% done.
Finished Document_NDH25D7X at 2

In [10]:
out

Unnamed: 0,file_name,file_text,ocr
0,dummy,dummy,False
1,Document_HE9X7NKT,"439\n Copyright © 2016, IGI Global. Copying or...",False
2,Document_IKPJMQZG,Chapter 7\n A Smart Disaster Management System...,False
3,Document_FJ9SMXH4,"439\n Copyright © 2016, IGI Global. Copying or...",False
4,Document_82CHIVGP,"1\n Copyright © 2015, IGI Global. Copying or d...",False
5,Document_ZFPUQZNE,Will You Accept an Imperfect AI? Exploring Des...,False
6,Document_SYN9YTMS,When (ish) is My Bus? User-centered Visualizat...,False
7,Document_G4923VSV,What are Data Insights to Professional Visuali...,False
8,Document_VZGACXNN,Visual Analytics in Urban Computing:\n An Over...,False
9,Document_YGFJAA5F,Visual Analytics in Deep Learning: An\n Interr...,False


### #2. Cleaning up the text
Using strategies based on [this article](https://monkeylearn.com/blog/text-cleaning/).

Here, before we do any other cleaning tasks, this is where we need to match and remove references with the information we got from the Scholarcy API above.

#### Resources:
- [String comparison in Python](https://note.nkmk.me/en/python-str-compare/)
- [Potentially useful example on StackOverflow](https://stackoverflow.com/questions/39551029/if-else-statement-for-finding-the-index-of-a-character-in-a-string)

In [11]:
def referenceChecker(extracted_df):
    ref_excluded = {}
    counter = 0
    for name in extracted_df['file_name']:
        if "dummy" not in name:
            text_file = open(str(name) +".txt", "r")
            data = text_file.read()      # Read whole file to a string
            text_file.close()         # Close file
            string1 = data.replace('\n',"")
            string2 = string1.replace('\t',"")
            for i in refdict[str(name) + ".pdf"]["references"]:
                if i[0:24] in string2:
                    starti =string2.find(i[0:24])
                    endi = int(starti) + len(i)
              #      print("Match Found: " + string2[starti:endi])
                    string3 = string2.replace(string2[starti:endi], "")
                else:
                    pass
             #       print("Match Not Found: " + str(i))
            ref_excluded[name] = string3
    return ref_excluded

In [12]:
ref_ex = referenceChecker(out)

### Cleaning Tasks
- Case Normalization
- Remove Unicode Characters
    - In the future, we may want to experiment with using [ftfy](https://github.com/rspeer/python-ftfy), which fixes text encoding issues, in this pipeline. We may also be interested in exploring [scrubadub](https://scrubadub.readthedocs.io/en/stable/index.html), which redacts potential PII from text.
- Remove Stopwords
- Lemmatize

In [13]:
def textCleaner(ref_ex_dict):
    full_corpus = []
    for i in ref_ex_dict.keys():
        data = ref_ex_dict[i]
        da = data.lower()
        d = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", da)
        stop = stopwords.words('english')
        all_words = nltk.word_tokenize(d)
        words = [w for w in all_words if w not in stop]
        words = [w for w in words if w.isalpha()]
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        word_out = []
        for word in words:
            a = lemmatizer.lemmatize(word)
            word_out.append(a)
        ref_ex_dict[i] = word_out
        full_corpus.append(word_out)
    return ref_ex_dict, full_corpus

In [14]:
a = textCleaner(ref_ex)

In [16]:
corpusdict = a[0]
corpus = a[1]
full_corpus = [x for xs in corpus for x in xs] #flattened
full_corpus

['igi',
 'global',
 'copying',
 'distributing',
 'print',
 'electronic',
 'form',
 'without',
 'written',
 'permission',
 'igi',
 'global',
 'prohibitedchapter',
 'planning',
 'recent',
 'development',
 'web',
 'urban',
 'planningabstractthis',
 'chapter',
 'address',
 'challenge',
 'changing',
 'technology',
 'pose',
 'urban',
 'planning',
 'urban',
 'planning',
 'continues',
 'influenced',
 'emerging',
 'creativity',
 'knowledgesharing',
 'culture',
 'inherent',
 'connection',
 'digital',
 'transformation',
 'technology',
 'certainly',
 'play',
 'important',
 'role',
 'production',
 'content',
 'distribution',
 'transformation',
 'giving',
 'urban',
 'planning',
 'new',
 'look',
 'depicted',
 'concept',
 'urban',
 'planning',
 'urban',
 'planning',
 'chapter',
 'paradigm',
 'shift',
 'explained',
 'illustrated',
 'special',
 'view',
 'identifying',
 'way',
 'second',
 'third',
 'generation',
 'web',
 'affect',
 'urban',
 'planning',
 'plethora',
 'pilot',
 'project',
 'new',
 'practi

## Basic analysis

#### A quick frequency distribution of the *most common words* in the corpus, and the *most common two and three word collocations* in the corpus.

In [17]:
word_fd = nltk.FreqDist(full_corpus)
word_fd

FreqDist({'data': 1689, 'urban': 1647, 'planning': 1230, 'city': 1155, 'system': 1106, 'et': 731, 'al': 695, 'model': 629, 'network': 563, 'technology': 561, ...})

In [18]:
bigram_fd = nltk.FreqDist(nltk.bigrams(full_corpus))
bigram_fd

FreqDist({('et', 'al'): 652, ('urban', 'planning'): 274, ('visual', 'analytics'): 162, ('smart', 'city'): 156, ('deep', 'learning'): 149, ('neural', 'network'): 133, ('new', 'york'): 114, ('machine', 'learning'): 107, ('urban', 'design'): 105, ('vol', 'pp'): 98, ...})

In [19]:
trigram_fd = nltk.FreqDist(nltk.trigrams(full_corpus))
trigram_fd

FreqDist({('planning', 'support', 'system'): 58, ('transaction', 'intelligent', 'system'): 57, ('intelligent', 'system', 'technology'): 57, ('system', 'technology', 'vol'): 55, ('technology', 'vol', 'article'): 55, ('vol', 'article', 'publication'): 55, ('article', 'publication', 'date'): 55, ('publication', 'date', 'september'): 55, ('licensed', 'use', 'limited'): 51, ('use', 'limited', 'ieeexplore'): 51, ...})

#### Using some of nltk's built in functions to get more information about the collocation scores according to association measures.
See more information about the nltk collocation methodology [here](https://www.nltk.org/api/nltk.collocations.html)

In [20]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder(word_fd, bigram_fd)
finder.score_ngrams(bigram_measures.raw_freq)

[(('et', 'al'), 0.004491440774291324),
 (('urban', 'planning'), 0.0018875073192573969),
 (('visual', 'analytics'), 0.0011159714807288258),
 (('smart', 'city'), 0.0010746392036647952),
 (('deep', 'learning'), 0.0010264182137567596),
 (('neural', 'network'), 0.000916198808252678),
 (('new', 'york'), 0.0007853132642165812),
 (('machine', 'learning'), 0.0007370922743085455),
 (('urban', 'design'), 0.0007233148486205352),
 (('vol', 'pp'), 0.0006750938587124995),
 (('big', 'data'), 0.0006613164330244893),
 (('urban', 'computing'), 0.0006613164330244893),
 (('urban', 'strategy'), 0.0005786518788964282),
 (('expert', 'system'), 0.0005442083146764027),
 (('international', 'conference'), 0.0005442083146764027),
 (('urban', 'data'), 0.0005373196018323976),
 (('scheduling', 'assistant'), 0.0005235421761443874),
 (('zheng', 'et'), 0.0005097647504563773),
 (('support', 'system'), 0.0004959873247683671),
 (('planning', 'support'), 0.000489098611924362),
 (('water', 'resource'), 0.00047532118623635176

### Optional File Cleanup

In [None]:
for f in glob.glob("Document_*"):
    os.remove(f)