In [5]:
import arxiv
import requests
import time
import pandas as pd
import matplotlib.pyplot as plt
import random
import string as st
import re
import os
from PyPDF2 import PdfReader

# Logging configuration: only show PyPDF2 messages at ERROR level or above
import logging
logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.ERROR)

# Import the string-matching algorithm implementations
from LCS import longest_common_subsequence
from RapinKarp import RapinKarpSearch
from KMP import KMP

In [6]:
# Helper Methods
def bigO_time(length):
    """Measure how long an empty Python loop of ``length`` iterations takes.

    Args:
        length: Number of loop iterations to run.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter is the monotonic, highest-resolution clock available, so
    # short loops are not rounded to 0.0 or skewed by system clock adjustments.
    start_time = time.perf_counter()
    for _ in range(length):
        pass
    return time.perf_counter() - start_time

In [7]:
def refine_filename(filename):
    """Turn an arbitrary title into a filesystem-safe ``.pdf`` file name.

    Every run of characters outside ``[0-9a-zA-Z]`` collapses into a single
    underscore, one leading and one trailing underscore are stripped, and
    ``.pdf`` is appended.

    Args:
        filename: Raw name, e.g. a paper title.

    Returns:
        The sanitized name ending in ``.pdf``. If nothing alphanumeric
        remains, ``"untitled.pdf"`` is returned instead of the stem-less
        name ``".pdf"``.
    """
    # Replace all runs of non-alphanumeric characters with a single underscore
    stem = re.sub('[^0-9a-zA-Z]+', '_', filename)

    # Remove the leading and trailing underscore
    stem = re.sub('^_|_$', '', stem)

    # Titles made only of symbols / non-ASCII text sanitize to an empty stem
    if not stem:
        stem = 'untitled'

    # The stem holds only [0-9a-zA-Z_] at this point, so it can never already
    # end in ".pdf"; the extension is therefore always appended.
    return stem + '.pdf'

# Download the PDFs of the arXiv papers matching a query into a local folder
def extract_papers(query, filepath, volume, max_retries=5, retry_delay=10):
    """Download up to ``volume`` arXiv papers matching ``query`` as PDF files.

    Each PDF is saved in ``filepath`` under the name produced by
    ``refine_filename(result.title)``. Progress is printed on a single,
    continuously overwritten line. Nothing is returned.

    Args:
        query: Search string passed to ``arxiv.Search``.
        filepath: Folder the PDFs are written to; created if it is missing.
        volume: Maximum number of search results to download.
        max_retries: Download attempts per paper before it is skipped
            (values below 1 are treated as 1).
        retry_delay: Seconds to wait between two attempts for the same paper.

    Errors raised while searching or iterating the results are printed, not
    re-raised, so a failing search does not abort the notebook.
    """
    attempts_allowed = max(1, max_retries)
    papers_extracted = 0
    try:
        # Make sure the target folder exists before anything is downloaded into it
        if filepath:
            os.makedirs(filepath, exist_ok=True)

        for result in arxiv.Search(query=query, max_results=volume).results():
            target_name = refine_filename(result.title)
            for attempt in range(1, attempts_allowed + 1):
                try:
                    result.download_pdf(dirpath=filepath, filename=target_name)
                except Exception as download_error:
                    # Catching Exception (not a bare except) keeps Ctrl-C and
                    # kernel interrupts working while a download is retried.
                    if attempt == attempts_allowed:
                        # Give up on this paper instead of retrying forever
                        print(f"Skipping '{result.title}' after {attempts_allowed} failed attempts: {download_error}")
                    else:
                        time.sleep(retry_delay)
                else:
                    papers_extracted += 1
                    print(f"Papers extracted: {papers_extracted}/{volume}", end='\r')
                    break
    except Exception as e:
        print(e)

def read_pdf_files(folder_path):
    """Concatenate the extracted text of every ``.pdf`` file in a folder.

    Files are processed in sorted path order so the result is reproducible
    across runs. A PDF that cannot be opened or parsed is reported and
    skipped; it contributes no text at all.

    Args:
        folder_path: Folder to scan (not recursive) for names ending in ``.pdf``.

    Returns:
        One string with the text of all readable PDFs (``''`` when there are
        none), or ``None`` if the folder itself cannot be listed.
    """
    try:
        # Get a sorted list of all PDF files in the folder
        pdf_files = sorted(
            os.path.join(folder_path, name)
            for name in os.listdir(folder_path)
            if name.endswith('.pdf')
        )
    except Exception as e:
        print("An error occurred while reading PDF files:", e)
        return None

    text_chunks = []
    for pdf_file in pdf_files:
        try:
            with open(pdf_file, 'rb') as f:
                # Collect per file first so a failure part-way through a PDF
                # does not leave half of that paper in the result.
                # `or ''` guards against extract_text() yielding None.
                file_chunks = [page.extract_text() or '' for page in PdfReader(f).pages]
        except Exception as e:
            print(f"Skipping unreadable PDF {pdf_file}: {e}")
            continue
        text_chunks.extend(file_chunks)

    # Join once at the end instead of growing a string page by page
    return ''.join(text_chunks)

In [None]:
# Make sure the download folder exists on a fresh run, then fetch the papers
os.makedirs("PDFs/", exist_ok=True)
print("STARTED: Extracting papers")
extract_papers(query="psychology", filepath="PDFs/", volume=1000)
print("FINISH: Extracted all the papers")

STARTED: Extracting papers
Papers extracted: 139/1000

In [None]:
all_papers_string = read_pdf_files("PDFs/")
# read_pdf_files signals failure by returning None; stop here with a clear
# message rather than an opaque TypeError from len(None).
if all_papers_string is None:
    raise RuntimeError("Could not read the PDF files in 'PDFs/'; run the extraction cell first.")
print("Total characters extracted from Arxiv Database:", len(all_papers_string))

In [None]:
# Benchmark with the pattern length held constant and the text length varying,
# then save the measurements as CSV and plot them.

n_vals = []
KMP_vals = []
LCS_vals = []
RabinKarp_vals = []
baseline_vals = []

# Each algorithm is paired with the list that collects its runtimes;
# the order here is the order in which they are run for every text length.
timed_algorithms = [
    (KMP, KMP_vals),
    (RapinKarpSearch, RabinKarp_vals),
    (longest_common_subsequence, LCS_vals),
]

# Step of roughly 1/1000 of the text; at least 1 because range() rejects a
# zero step (which is what a text shorter than 1000 characters would give).
gap_size = max(1, len(all_papers_string) // 1000)
p = " ^_^ This text pattern never exists in research papers ^_^ "

for N in range(1000, len(all_papers_string), gap_size):
    t = all_papers_string[0:N]
    n_vals.append(len(t))

    # Time every algorithm on the same (pattern, text prefix) pair
    for algorithm, runtimes in timed_algorithms:
        start_time = time.perf_counter()
        algorithm(p, t)
        runtimes.append(time.perf_counter() - start_time)

    # Empirical baseline: time of an empty loop with n + m iterations
    baseline_vals.append(bigO_time(len(t) + len(p)))

data = {"n" : n_vals, "KMPrt" : KMP_vals,"RKrt" : RabinKarp_vals, "LCSrt" : LCS_vals, "O(n+m)" : baseline_vals}
df = pd.DataFrame(data)

# Create the output folders up front so the measurements are not lost to a
# missing directory after the long benchmark run.
os.makedirs('Files', exist_ok=True)
os.makedirs('Plots', exist_ok=True)

df.to_csv('Files/RT_manipulating_text.csv', encoding='utf-8', index=False)
for column, label in [('KMPrt', 'KMPrt'), ('RKrt', 'RKrt'), ('LCSrt', 'LCSrt'), ('O(n+m)', 'O(n + m)')]:
    plt.plot(df['n'], df[column], 'o', label=label)
plt.xlabel('n')
plt.ylabel('runtime (in secs.)')
plt.legend()
plt.title("Runtime of Algorithm (with varying text length)")
plt.savefig('Plots/plot_manipulating_text.png')
plt.show()

In [None]:
# Plotting a graph keeping the length of text constant, and length of pattern varying

def get_random_pattern(length):
    """Build a random string of ``length`` lowercase ASCII letters.

    One letter is drawn per position with ``random.choice``, so the result
    depends on the current state of the global ``random`` generator.
    """
    alphabet = st.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

m_vals = []
KMP_vals = []
LCS_vals = []
RabinKarp_vals = []
baseline_vals = []

# Each algorithm is paired with the list that collects its runtimes;
# the order here is the order in which they are run for every pattern length.
timed_algorithms = [
    (KMP, KMP_vals),
    (RapinKarpSearch, RabinKarp_vals),
    (longest_common_subsequence, LCS_vals),
]

# Step of roughly 1/1000 of the text; at least 1 because range() rejects a
# zero step (which is what a text shorter than 1000 characters would give).
gap_size = max(1, len(all_papers_string) // 1000)

for N in range(1000, len(all_papers_string), gap_size):
    m_vals.append(N)
    p = get_random_pattern(N)

    # Time every algorithm on the same (random pattern, full text) pair
    for algorithm, runtimes in timed_algorithms:
        start_time = time.perf_counter()
        algorithm(p, all_papers_string)
        runtimes.append(time.perf_counter() - start_time)

    # Empirical baseline: time of an empty loop with n + m iterations
    baseline_vals.append(bigO_time(len(all_papers_string) + len(p)))

data = {"m" : m_vals, "KMPrt" : KMP_vals,"RKrt" : RabinKarp_vals, "LCSrt" : LCS_vals, "O(n+m)" : baseline_vals}
df = pd.DataFrame(data)

# Create the output folders up front so the measurements are not lost to a
# missing directory after the long benchmark run.
os.makedirs('Files', exist_ok=True)
os.makedirs('Plots', exist_ok=True)

df.to_csv('Files/RT_manipulating_pattern.csv', encoding='utf-8', index=False)
for column, label in [('KMPrt', 'KMPrt'), ('RKrt', 'RKrt'), ('LCSrt', 'LCSrt'), ('O(n+m)', 'O(n + m)')]:
    plt.plot(df['m'], df[column], 'o', label=label)
plt.xlabel('m')
plt.ylabel('runtime (in secs.)')
plt.legend()
plt.title("Runtime of Algorithm (with varying pattern length)")
plt.savefig('Plots/plot_manipulating_pattern.png')
plt.show()