# Data Preparation and Basic Text Analysis - Topic Modeling Pt. 1

### #1. Zotero API + PDF Text Miner
We need this so we don't have to download all of the PDFs in our library to our local machines by hand, and so we can always run our data prep on the most up-to-date corpus.

In [None]:
import fitz
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import ftfy
import nltk.corpus
from nltk.collocations import *
from nltk.corpus import stopwords
import pandas as pd
import datetime
import glob
import requests
import urllib.request
import os
import re
import time
import math
import json

In [None]:
def zoteroCrawler(key, groupid):
    """Fetch every item of a Zotero group library through the Zotero web API.

    The group's metadata is requested first to learn how many items it holds;
    the ``/items`` endpoint is then read in pages of 100, one request per
    offset, until that many items have been asked for.

    Args:
        key: Zotero API key, sent in the ``Zotero-API-Key`` header.
        groupid: Zotero group id as a string (it is concatenated into the URL).

    Returns:
        A list with the decoded JSON entry of every item the API returned, in
        the order the pages were served.

    Raises:
        requests.HTTPError: if any request comes back with an error status
            (e.g. a rejected key), instead of failing later on a confusing
            KeyError / JSON decoding error or silently collecting garbage.
    """
    page_size = 100
    headers = {"Zotero-API-Version":"3",'Connection':'close', "Zotero-API-Key":key}
    group_url = "https://api.zotero.org/groups/" + groupid

    # get number of items in group
    rcheck = requests.get(group_url, headers=headers)
    rcheck.raise_for_status()
    items = rcheck.json()['meta']['numItems']
    print(items)

    zoterogroup = []
    # One request per page offset: 0, 100, 200, ... (no request when the group is empty).
    for start in range(0, items, page_size):
        url = group_url + "/items" + "?limit=" + str(page_size) + "&start=" + str(start)
        print(url)
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        rj = r.json() #jsonified version of our Zotero group
        print(len(rj))
        zoterogroup.extend(rj)
    return zoterogroup

In [None]:
# Prefer the ZOTERO_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the literal below is used only when it is unset.
api_key = os.environ.get("ZOTERO_API_KEY", "***")
# Id of the Zotero group library this notebook works on.
group_id = "2808857"

In [None]:
# Pull the full item list of the group; `rj` holds one decoded JSON entry per item.
rj= zoteroCrawler(api_key, group_id)

In [None]:
def download_file(download_url, filename):
    """Download ``download_url`` and store the bytes as ``<filename>.pdf``.

    Args:
        download_url: URL to fetch with ``urllib.request.urlopen``.
        filename: Target path *without* extension; ``".pdf"`` is appended.

    Returns:
        None. The only effect is the file written to disk.
    """
    # Both handles are context-managed so the connection and the output file
    # are released even when reading or writing fails half-way.
    # The URL is opened first, so a failing request never leaves an empty file.
    with urllib.request.urlopen(download_url) as response, \
            open(filename + ".pdf", 'wb') as file:
        file.write(response.read())

In [None]:
# now what? get the attachments
# inputs are jsonified version of your zotero group, base url to zotero group items, and api key
# going to have to loop through and run this for every page I think
def attachmentGrabber(rj, url, key):
    """Download the file attached to each Zotero item and list the saved PDFs.

    For every item, ``<url><item key>/file`` is requested first. When that
    does not answer with status 200, the item's ``links.attachment.href`` is
    tried instead (again with ``/file`` appended). Every successful download
    is written to ``Document_<item key>.pdf`` in the working directory.

    Args:
        rj: Iterable of item entries as returned by the Zotero API; each entry
            must provide ``"key"``.
        url: Base URL of the group's items endpoint, ending with ``/``.
        key: Zotero API key, sent in the ``Zotero-API-Key`` header.

    Returns:
        List of the PDF file names that were written, in item order.
    """
    headers = {"Zotero-API-Version":"3",'Connection':'close', "Zotero-API-Key":key}
    to_extract = []
    for item in rj:
        item_key = item["key"]
        attach = requests.get(url + item_key + "/file", headers=headers)
        if attach.status_code == 200:
            to_extract.append(_save_attachment(attach, item_key))
            print("GOOD" + " 1 " + str(item_key))
            continue
        # Fallback: follow the attachment link of the item, if it has one.
        # Best effort on purpose - one bad item must not abort the whole crawl.
        try:
            attach = requests.get(item['links']['attachment']['href'] + "/file", headers=headers)
            print(attach)
            if attach.status_code == 200:
                to_extract.append(_save_attachment(attach, item_key))
                print("GOOD" + " " + str(item_key))
            else:
                print("that didn't work" + " " + attach.url)
        except Exception as e:
            print(str(item_key) + ": " + repr(e))
    return to_extract


def _save_attachment(response, item_key):
    """Write an already-downloaded response body to ``Document_<item_key>.pdf``.

    The body is taken from ``response.content`` so the file is not requested a
    second time. Returns the file name that was written.
    """
    filename = f'Document_{item_key}.pdf'
    with open(filename, 'wb') as pdf_file:
        pdf_file.write(response.content)
    return filename

In [None]:
# Base URL of the group's items endpoint. It is built from `group_id` so it
# always points at the same library that was crawled into `rj`.
url = "https://api.zotero.org/groups/" + group_id + "/items/"
# Download every reachable attachment; `to_extract` lists the saved PDF names.
to_extract  = attachmentGrabber(rj, url, api_key)

Before we extract text from the PDFs, we use the Scholarcy API (https://ref.scholarcy.com/api/) to grab their references. We should experiment with the different return types to see which is easiest to work with. Part of me thinks it will be easiest to isolate the references and compare them with the text in the PDF if we use the `/extract` endpoint with JSON output.

In [None]:
# NOTE(review): both dicts are passed to requests through `data=`, so they are
# sent as multipart form fields next to the file, not as HTTP headers. That is
# kept as-is here; confirm against the Scholarcy API docs whether
# accept/Authorization should move to `headers=` (if so, drop the hand-written
# Content-Type, which requests must generate itself for multipart uploads).
scholarcy_headers = {"accept":"application/json","Authorization":"Bearer ", "Content-Type": 'multipart/form-data','Connection':'close' }
alt_headers = {"accept":"application/json","Authorization":"Bearer ", "Content-Type": 'multipart/form-data',"engine": "v2",'Connection':'close'}
url = "https://ref.scholarcy.com/api/references/extract"
# Maps each PDF file name to the decoded JSON answer of the reference extractor.
refdict = {}
for i in to_extract:
    time.sleep(2)  # pause between uploads
    print("Working on " + i + "...")
    # First try the default form fields, then the variant that adds engine=v2.
    for attempt, form_fields in enumerate((scholarcy_headers, alt_headers)):
        try:
            # Re-open the PDF for every attempt: the handle is consumed by the
            # upload, so reusing it for the retry would send an empty file.
            with open(i, 'rb') as pdf_handle:
                r = requests.post(url, files={'file': pdf_handle}, data=form_fields)
            print(r)
            # Treat HTTP errors as failures instead of storing their body as references.
            r.raise_for_status()
            refdict[i] = r.json()
            print(i + " " + "is good to go")
            break
        except Exception as e:
            if attempt == 0:
                print(str(e) + " " + "Trying API v2 instead...")
            else:
                print("That didn't work either..." + " " + str(e))


The attachments have been procured. Let's do something with them. Thanks to [PDF Text Miner](https://github.com/prldc/pdf_text_miner) for this function.

In [None]:
def extract_pdfs(list, ocr_lang="por", ocr_dpi=500, min_text_bytes=2000):  # You can easily extract a list from a .csv with pandas.
    """Extract the text of each PDF into a ``.txt`` file and a DataFrame.

    Every ``.pdf`` path is read with PyMuPDF and its text written to a file of
    the same name with a ``.txt`` extension. When that file is smaller than
    ``min_text_bytes`` the PDF is assumed to lack a text layer: it is rendered
    to images and run through Tesseract, and the ``.txt`` file is rewritten
    with the OCR output. The page images are handed to Tesseract directly, so
    no intermediate ``.jpg`` files are written.

    Args:
        list: Paths of the files to process (the name shadows the builtin but
            is kept for compatibility). Entries without a ``.pdf`` extension
            are skipped.
        ocr_lang: Tesseract language code used for the OCR fallback. The
            default ``"por"`` is the value this function has always used;
            pass e.g. ``"eng"`` for an English corpus.
        ocr_dpi: Resolution used when rendering pages for OCR.
        min_text_bytes: Size of the extracted ``.txt`` below which OCR is run.

    Returns:
        A DataFrame with columns ``file_name`` (path without extension),
        ``file_text`` and ``ocr``. Its first row is a ``'dummy'`` placeholder,
        followed by one row per PDF that was processed successfully.
    """
    columns = ['file_name', 'file_text', 'ocr']
    # Rows are collected in a plain list: DataFrame.append no longer exists in
    # pandas 2, and growing a frame row by row copies it every time.
    rows = [{'file_name': 'dummy', 'file_text': 'dummy', 'ocr': False}]
    total = len(list)
    for count, pdf in enumerate(list, start=1):
        try:
            name, ext = os.path.splitext(pdf)
            if ext.lower() != '.pdf':  # Guarantees that the file is a .pdf before any extraction is attempted.
                continue
            txt_path = f"{name}.txt"
            ocr = False
            # Close (and thereby flush) the text file BEFORE measuring it;
            # checking the size of a still-open, buffered file under-reports it
            # and sends perfectly good PDFs to the slow OCR path.
            with fitz.open(pdf) as doc, open(txt_path, 'w') as text_file:
                for page in doc:  # Extracts text from each page.
                    text_file.write(page.get_text("text"))
            if os.stat(txt_path).st_size < min_text_bytes:  # Assumes file lacks OCR based on .txt file size, starts Tesseract.
                ocr = True
                # Opening with 'w' discards the previously scraped text.
                with open(txt_path, 'w') as text_file:
                    for image in convert_from_path(pdf, ocr_dpi):  # Converts the PDF to images, one per page.
                        text = str(pytesseract.image_to_string(image, lang=ocr_lang))
                        text_file.write(text.replace('-\n', ''))  # Re-joins words hyphenated at line ends.
            with open(txt_path, 'r') as text_file:
                txt = " ".join(text_file.readlines())
            rows.append({'file_name': f"{name}", 'file_text': txt, 'ocr': ocr})
            end = datetime.datetime.now()
            print(
                f"Finished {name} at {end}. OCR = {ocr}. {count} files read. {round(count * 100 / total, 2)}% done.")
        except Exception as err:
            # Best effort: report the failure (with its cause) and carry on.
            print(f'Did not finish {pdf}... check out that one.')
            print(f'    reason: {err!r}')

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Extract the text of every downloaded PDF; `out` is the resulting DataFrame.
out = extract_pdfs(to_extract)

In [None]:
# Display the extraction results (file name, extracted text, OCR flag).
out # we are able to scrape the vast majority of the articles without a problem

In [None]:
#for rapid topic modeling/cleaning prototyping:
out.to_csv("extracted_text.csv")
with open('refs.txt', 'w') as convert_file:
     convert_file.write(json.dumps(refdict))

### #2. Cleaning up the text
Using strategies based on [this article](https://monkeylearn.com/blog/text-cleaning/).

Before we do any other cleaning tasks, we need to match the references we got from the Scholarcy API above against the extracted text and remove them.

#### Resources:
- [String comparison in Python](https://note.nkmk.me/en/python-str-compare/)
- [Potentially useful example on StackOverflow](https://stackoverflow.com/questions/39551029/if-else-statement-for-finding-the-index-of-a-character-in-a-string)

In [None]:
def referenceChecker(extracted_df, references=None):
    """Strip extracted reference strings from each document's text.

    For every file name in ``extracted_df['file_name']`` (placeholder rows
    containing ``"dummy"`` are skipped) the matching ``<name>.txt`` file is
    read and its newlines and tabs are removed. Each reference listed for
    ``<name>.pdf`` is then looked up by its first 24 characters; on a hit, a
    span as long as the reference is cut out starting at the hit.

    Removals accumulate: each reference is searched in, and removed from, the
    text left over by the previous ones. A document without any reference
    entry keeps its full text.

    Args:
        extracted_df: Mapping/DataFrame whose ``'file_name'`` entry yields the
            file names without extension.
        references: Dict mapping ``"<name>.pdf"`` to a dict with a
            ``"references"`` list (assumed to hold strings - confirm against
            the Scholarcy response). Defaults to the module-level ``refdict``.

    Returns:
        Dict mapping each file name to its text with the matched references
        removed.
    """
    if references is None:
        references = refdict  # filled by the Scholarcy step earlier in the notebook
    prefix_len = 24  # number of leading characters used to locate a reference
    ref_excluded = {}
    matches = 0
    nomatch = 0
    for name in extracted_df['file_name']:
        if "dummy" in name:
            continue
        with open(str(name) + ".txt", "r") as text_file:
            data = text_file.read()      # Read whole file to a string
        # NOTE(review): line breaks are dropped without inserting a space, so
        # words at line ends get glued together - left unchanged here because
        # the reference matching below depends on this normalisation.
        remaining = data.replace('\n', "").replace('\t', "")
        entry = references.get(str(name) + ".pdf")
        doc_refs = entry.get("references", []) if isinstance(entry, dict) else []
        for ref in doc_refs:
            prefix = ref[0:prefix_len]
            start = remaining.find(prefix) if prefix else -1
            if start == -1:
                nomatch += 1
                continue
            matches += 1
            # Cut the span out of the running text so earlier removals are kept.
            remaining = remaining[:start] + remaining[start + len(ref):]
        ref_excluded[name] = remaining
    print("Matches: " + str(matches) + " No Matches: " + str(nomatch))
    return ref_excluded

In [None]:
# Map each extracted document to its text with the matched references removed.
ref_ex = referenceChecker(out) #needs some work

### Cleaning Tasks
- Case Normalization
- Remove Unicode Characters
    - In the future, we may want to experiment with using [ftfy](https://github.com/rspeer/python-ftfy), which fixes text encoding issues, in this pipeline. We may also be interested in exploring [scrubadub](https://scrubadub.readthedocs.io/en/stable/index.html), which redacts potential PII from text.
- Remove Stopwords
- Lemmatize

In [None]:
def textCleaner(ref_ex_dict):
    """Normalise, tokenise and lemmatise every document in ``ref_ex_dict``.

    Each text is lower-cased, stripped with the regex below, tokenised with
    ``nltk.word_tokenize``, filtered (English stop words and non-alphabetic
    tokens are dropped) and lemmatised with WordNet.

    The input dict is left untouched, so the cell can be re-run safely; the
    cleaned tokens are returned in a new dict.

    Args:
        ref_ex_dict: Dict mapping a document name to its text (a string).

    Returns:
        A tuple ``(cleaned, full_corpus)``: ``cleaned`` maps each document
        name to its list of lemmas, ``full_corpus`` is the list of those
        lists in the same order.
    """
    from nltk.stem import WordNetLemmatizer

    # Loop-invariant helpers are built once instead of once per document.
    lemmatizer = WordNetLemmatizer()
    stop = set(stopwords.words('english'))  # set: O(1) membership tests
    # NOTE(review): in the first alternative the opening bracket is escaped, so
    # it only matches the literal text "@[A-Za-z0-9]" rather than @handles; the
    # second alternative removes every character that is not alphanumeric,
    # space or tab anyway (the "@" included). Pattern kept unchanged.
    noise = re.compile(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

    cleaned = {}
    full_corpus = []
    for name, data in ref_ex_dict.items():
        d = noise.sub("", data.lower())
        word_out = [
            lemmatizer.lemmatize(w)
            for w in nltk.word_tokenize(d)
            if w not in stop and w.isalpha()
        ]
        cleaned[name] = word_out
        full_corpus.append(word_out)
    return cleaned, full_corpus

In [None]:
# `a` is the pair (per-document token dict, list of per-document token lists).
a = textCleaner(ref_ex)

In [None]:
# Split the cleaner's result into its two parts, then flatten the
# per-document token lists into one long token sequence.
corpusdict, corpus = a[0], a[1]
full_corpus = []
for doc_tokens in corpus:
    full_corpus.extend(doc_tokens)

## Basic analysis

#### A quick frequency distribution of the *most common words* in the corpus, and the *most common two and three word collocations* in the corpus.

In [None]:
# Frequency of every single token in the flattened corpus.
word_fd = nltk.FreqDist(full_corpus)
word_fd

In [None]:
# Frequency of every pair of adjacent tokens in the flattened corpus.
bigram_fd = nltk.FreqDist(nltk.bigrams(full_corpus))
bigram_fd

In [None]:
# Frequency of every run of three adjacent tokens in the flattened corpus.
trigram_fd = nltk.FreqDist(nltk.trigrams(full_corpus))
trigram_fd

#### Using some of nltk's built-in functions to get more information about the collocation scores according to association measures.
See more information about the nltk collocation methodology [here](https://www.nltk.org/api/nltk.collocations.html)

In [None]:
# Build a collocation finder from the unigram and bigram frequency
# distributions computed earlier, then score each bigram with the
# raw-frequency association measure.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder(word_fd, bigram_fd)
finder.score_ngrams(bigram_measures.raw_freq)

### Optional File Cleanup

In [None]:
# Delete every file in the working directory whose name starts with
# "Document_" (the downloaded PDFs and the files derived from them).
leftovers = glob.glob("Document_*")
for leftover in leftovers:
    os.remove(leftover)