<a href="https://colab.research.google.com/github/harveyrutland/pdf_scanner/blob/main/paper_scanner_ML_algorithmns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pdfminer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycryptodome
  Downloading pycryptodome-3.16.0-cp35-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140096 sha256=f8a18a13fbd5a2a5d17ccbc24a513f3a51b2cb9340275d0d6c5cf7fa7d3f360d
  Stored in directory: /root/.cache/pip/wheels/1c/28/7d/f390b82bb0307deb63ff27a1474fd308ec68ee028

In [103]:
import re
from io import StringIO
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
import os




def pdf_to_text(pdf_file):
    """
    Extract the text of a PDF file as a list of strings.

    Every page is run through pdfminer's layout analysis, and the text of each
    top-level layout object that exposes ``get_text`` is collected. The result
    therefore holds one string per text-bearing layout object (typically
    several per page), not one string per page.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        A list of text fragments, in the order the pages and their layout
        objects are iterated. Empty if no layout object carried text.
    """
    text = []
    with open(pdf_file, 'rb') as fh:
        # Resource manager: stores resources shared between pages.
        rsrcmgr = PDFResourceManager()
        # Layout analysis parameters (library defaults).
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                interpreter.process_page(page)
                # The aggregator exposes the layout of the page just processed.
                layout = device.get_result()
                # Only some layout objects carry text; skip those without it.
                text.extend(
                    lt_obj.get_text()
                    for lt_obj in layout
                    if hasattr(lt_obj, "get_text")
                )
        finally:
            # Release the device even if a page fails to parse.
            device.close()

    return text


# Directory that holds the PDF files to scan.
pdf_dir = "/content/drive/MyDrive/PhD/Survey_Paper/"

# Model families mapped to the names and abbreviations searched for in the text.
# Built once, outside the per-file loop, because it never changes.
models = {
    "Linear Regression": ["Linear Regression", "LR", "Simple Linear Regression", "Multiple Linear Regression"],
    "Logistic Regression": ["Logistic Regression", "LogReg", "Binary Logistic Regression", "Multinomial Logistic Regression", "Ordinal Logistic Regression"],
    "Decision Trees": ["Decision Trees", "DT", "CART", "ID3", "C4.5", "CHAID", "Random Forest"],
    "Random Forest": ["Random Forest", "RF", "RandomForest", "Random Forest Classifier", "Random Forest Regressor"],
    "Gradient Boosting": ["GB", "GBM", "LightGBM", "CatBoost"],
    "Support Vector Machine (SVM)": ["SVM", "Support Vector Machine", "Linear SVM", "Non-Linear SVM"],
    "k-Nearest Neighbors (k-NN)": ["k-NN", "k-Nearest Neighbors", "KNN"],
    # "Perceptron" replaces the original "Perception", which was a typo and
    # matched the ordinary English word.
    "Neural Networks": ["NN", "Neural Network", "MLP", "Perceptron", "CNN", "RNN", "DNN", "Autoencoder", "Deep Belief Networks", "deep learning"],
    "Naive Bayes": ["Naive Bayes", "NB", "Gaussian Naive Bayes", "Multinomial Naive Bayes", "Bernoulli Naive Bayes"],
    "k-Means Clustering": ["k-Means", "KMeans"],
    "Principal Component Analysis (PCA)": ["PCA"],
    "Singular Value Decomposition (SVD)": ["SVD"],
    "Latent Dirichlet Allocation (LDA)": ["LDA"],
    "t-Distributed Stochastic Neighbor Embedding (t-SNE)": ["t-SNE", "tSNE"],
    "Generative Adversarial Networks (GAN)": ["GAN", "Generative Adversarial Networks"],
    "Autoencoder": ["Autoencoder", "AE", "Autoencoders"],
    "Q-Learning, Reinforcement Learning": ["Q-Learning", "Reinforcement Learning", "RL"],
    "Bagging, Boosting, AdaBoost": ["Bagging", "Boosting", "AdaBoost"],
    "Markov Chain Monte Carlo (MCMC)": ["MCMC", "Markov Chain Monte Carlo"],
    "Hidden Markov Models": ["HMM", "Hidden Markov Models"],
    "Conditional Random Fields": ["CRF", "Conditional Random Fields"],
    "Boltzmann Machines": ["BM", "Boltzmann Machines"],
    "Random Projection": ["Random Projection", "RP"],
    "Gaussian Mixture Model": ["GMM", "Gaussian Mixture Model"],
    "Variational Autoencoder": ["VAE", "Variational Autoencoder"],
    "Extreme Gradient Boosting": ["XGBoost", "LightGBM", "CatBoost"],
    "BERT, GPT, transformer-based models": ["BERT", "GPT", "transformer-based models"],
}

# One compiled, case-sensitive pattern per model family. Synonyms are escaped
# so that regex metacharacters in them (e.g. the "." in "C4.5") match
# literally, and the whole alternation is wrapped in word boundaries.
model_patterns = {
    model_type: re.compile(
        r"\b(?:" + "|".join(re.escape(synonym) for synonym in model_synonyms) + r")\b"
    )
    for model_type, model_synonyms in models.items()
}

# All directory entries; non-PDF names are filtered out in the loop below.
pdf_files = os.listdir(pdf_dir)

for pdf_file_name in pdf_files:
    # Skip anything that is not a PDF.
    if not pdf_file_name.endswith(".pdf"):
        continue

    pdf_file = os.path.join(pdf_dir, pdf_file_name)
    # List of text fragments extracted from this PDF.
    papers = pdf_to_text(pdf_file)

    # A dict is used as an ordered set: families are reported once each, in
    # the order they are first seen while walking the fragments.
    found_models = {}
    for paper in papers:
        for model_type, pattern in model_patterns.items():
            # Skip the regex search for families already recorded.
            if model_type not in found_models and pattern.search(paper):
                found_models[model_type] = None

    paper_models = list(found_models)
    print(pdf_file_name, paper_models)





paper1.pdf ['Extreme Gradient Boosting', 'Decision Trees', 'Random Forest', 'Gradient Boosting', 'Support Vector Machine (SVM)', 'Neural Networks', 'Bagging, Boosting, AdaBoost']
