In [16]:
import io
import os
import random
import string as st
import time

import arxiv
import matplotlib.pyplot as plt
import pandas as pd
import PyPDF2
import requests

# Logging configuration: set the "PyPDF2" logger's level to ERROR so that
# only records of severity ERROR or higher from that logger are emitted.
import logging
logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.ERROR)

# Import the algorithm implementations being benchmarked
from LCS import longest_common_subsequence
from RapinKarp import RapinKarpSearch
from KMP import KMP

In [17]:
# Helper Methods
def bigO_time(length):
    """Time an empty loop of ``length`` iterations.

    The result is used as an empirical linear-time baseline to plot next to
    the measured algorithm runtimes.

    Args:
        length: Number of loop iterations to execute.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, so the interval cannot go negative or be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    for _ in range(length):
        pass
    return time.perf_counter() - start_time

In [18]:
# Extract papers from the arXiv database.
# Searches arXiv for a query, extracts the text of each matching paper's PDF, and returns the concatenated text.
def extract_papers(query, volume, timeout=30):
    """Download arXiv papers matching ``query`` and return their text concatenated.

    Args:
        query: Search string passed to ``arxiv.Search``.
        volume: Maximum number of results requested from the search.
        timeout: Seconds to wait on each PDF download before giving up on
            that paper.

    Returns:
        One string holding the extracted text of every paper that was
        downloaded and parsed successfully, in result order. A paper whose
        download or parsing fails is skipped entirely (no partial text).
    """
    search = arxiv.Search(
        query=query,
        max_results=volume,
    )
    paper_texts = []
    for result in search.results():
        # Fetch the binary PDF. A timeout keeps one stalled download from
        # hanging the whole crawl, and a failed request skips only this paper.
        try:
            response = requests.get(result.pdf_url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Download failed for {result.pdf_url} ({exc}). Skipping..")
            continue

        # Parse the PDF and pull the text out of every page. Catch Exception
        # rather than using a bare except so Ctrl-C / kernel interrupt still
        # stops the loop.
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))
            # `or ""` guards against a falsy return for pages without text.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        except Exception:
            print("PDF with Invalid formatting found. Skipping..")
            continue

        # Only keep the paper once all of its pages were read successfully.
        paper_texts.append("".join(page_texts))
        print(f"Papers extracted        : {len(paper_texts)}/{volume}", end='\r')
    return "".join(paper_texts)

In [None]:
# Download and extract the text corpus; this is the slow, network-bound step.
print("STARTED: Extracting papers")
all_papers_string = extract_papers(query="Robot", volume=1000)
print("FINISH: Extracted all the papers")
# len() of a str counts characters, not words, so the label says so.
print("Total characters extracted", len(all_papers_string))

STARTED: Extracting papers
Papers extracted: 20/1000

In [None]:
# Plotting a graph keeping the length of pattern constant, and length of text varying

n_vals = []
KMP_vals = []
LCS_vals = []
RabinKarp_vals = []
baseline_vals = []

# Aim for roughly 1000 sample points. Clamp the step to at least 1 because
# range() raises ValueError on a zero step, which the plain division yields
# whenever the corpus has fewer than 1000 characters.
gap_size = max(1, len(all_papers_string) // 1000)
p = " ^_^ This text pattern never exists in research papers ^_^ "

# Each algorithm is paired with the list that collects its runtimes.
# The tuple order is the order in which they run for every sample.
timed_algorithms = (
    (KMP, KMP_vals),
    (RapinKarpSearch, RabinKarp_vals),
    (longest_common_subsequence, LCS_vals),
)

for N in range(1000, len(all_papers_string), gap_size):
    # Text is the first N characters of the corpus; the pattern stays fixed.
    t = all_papers_string[:N]
    n_vals.append(len(t))

    for algorithm, runtimes in timed_algorithms:
        # perf_counter() is the monotonic, high-resolution benchmark clock.
        start_time = time.perf_counter()
        algorithm(p, t)
        runtimes.append(time.perf_counter() - start_time)

    # Empirical O(n + m) baseline: an empty loop over len(text) + len(pattern).
    baseline_vals.append(bigO_time(len(t) + len(p)))

data = {"n" : n_vals, "KMPrt" : KMP_vals,"RKrt" : RabinKarp_vals, "LCSrt" : LCS_vals, "O(n+m)" : baseline_vals}
df = pd.DataFrame(data)
for column, label in (
    ('KMPrt', 'KMPrt'),
    ('RKrt', 'RKrt'),
    ('LCSrt', 'LCSrt'),
    ('O(n+m)', 'O(n + m)'),
):
    plt.plot(df['n'], df[column], 'o', label=label)
plt.xlabel('n')
plt.ylabel('runtime (in secs.)')
plt.legend()
plt.title("Runtime of Algorithm (with varying text length)")
# Create the output folder first so a missing directory does not discard
# the figure after the whole benchmark has already run.
os.makedirs('Plots', exist_ok=True)
plt.savefig('Plots/plot_manipulating_text.png')
plt.show()

In [None]:
# Plotting a graph keeping the length of text constant, and length of pattern varying

def get_random_pattern(length):
    """Build a random string of ``length`` lowercase ASCII letters.

    Each character is drawn independently with ``random.choice`` from
    ``string.ascii_lowercase``.
    """
    alphabet = st.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

m_vals = []
KMP_vals = []
LCS_vals = []
RabinKarp_vals = []
baseline_vals = []

# Seed the generator so the random patterns (and therefore the measured
# runs) are the same on every Restart & Run All.
random.seed(42)

# Aim for roughly 1000 sample points. Clamp the step to at least 1 because
# range() raises ValueError on a zero step, which the plain division yields
# whenever the corpus has fewer than 1000 characters.
gap_size = max(1, len(all_papers_string) // 1000)

# Each algorithm is paired with the list that collects its runtimes.
# The tuple order is the order in which they run for every sample.
timed_algorithms = (
    (KMP, KMP_vals),
    (RapinKarpSearch, RabinKarp_vals),
    (longest_common_subsequence, LCS_vals),
)

# The pattern length m sweeps from 1000 up to (but not including) the
# length of the full text; the text itself stays fixed.
for N in range(1000, len(all_papers_string), gap_size):
    m_vals.append(N)
    p = get_random_pattern(N)

    for algorithm, runtimes in timed_algorithms:
        # perf_counter() is the monotonic, high-resolution benchmark clock.
        start_time = time.perf_counter()
        algorithm(p, all_papers_string)
        runtimes.append(time.perf_counter() - start_time)

    # Empirical O(n + m) baseline: an empty loop over len(text) + len(pattern).
    baseline_vals.append(bigO_time(len(all_papers_string) + len(p)))

data = {"m" : m_vals, "KMPrt" : KMP_vals,"RKrt" : RabinKarp_vals, "LCSrt" : LCS_vals, "O(n+m)" : baseline_vals}
df = pd.DataFrame(data)
for column, label in (
    ('KMPrt', 'KMPrt'),
    ('RKrt', 'RKrt'),
    ('LCSrt', 'LCSrt'),
    ('O(n+m)', 'O(n + m)'),
):
    plt.plot(df['m'], df[column], 'o', label=label)
plt.xlabel('m')
plt.ylabel('runtime (in secs.)')
plt.legend()
plt.title("Runtime of Algorithm (with varying pattern length)")
# Create the output folder first so a missing directory does not discard
# the figure after the whole benchmark has already run.
os.makedirs('Plots', exist_ok=True)
plt.savefig('Plots/plot_manipulating_pattern.png')
plt.show()