In [3]:
import pdfplumber
import csv
from collections import Counter

In [4]:
# Open the PDF file
pdf_path = '1706.03762.pdf'
with pdfplumber.open(pdf_path) as pdf:
    # Extract text from each page of the PDF
    text = ''
    for page in pdf.pages:
        text += page.extract_text()

In [6]:
# Store the extracted text in a CSV file
csv_file = 'pdf_text.csv'
with open(csv_file, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Text'])
    writer.writerow([text])
print(f"Extracted text from the PDF has been saved in '{csv_file}'")

Extracted text from the PDF has been saved in '<_io.TextIOWrapper name='pdf_text.csv' mode='w' encoding='utf-8'>'


In [7]:
# Find the most repeated word
word_count = Counter(text.split())
most_common_word = word_count.most_common(1)[0][0]
print(f"The most repeated word in the PDF is: '{most_common_word}'")

The most repeated word in the PDF is: 'the'


# Topic Modelling

In [17]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\8092\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\8092\AppData\Roaming\nltk_data...


True

In [18]:
# Load the extracted text from the CSV file
csv_file = 'pdf_text.csv'
with open(csv_file, 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header row
    extracted_text = next(reader)[0]

In [19]:
# Tokenize and preprocess the text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [20]:
tokens = word_tokenize(extracted_text.lower())
tokens = [token for token in tokens if token.isalpha()]
tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

In [21]:
# Keyword extraction
word_freq = FreqDist(tokens)
most_common_keywords = word_freq.most_common(10)  # Change the number to adjust the number of keywords to extract

In [22]:
print("Most common keywords:")
for keyword, freq in most_common_keywords:
    print(f"{keyword}: {freq}")


Most common keywords:
model: 22
dap: 20
k: 17
n: 13
layer: 13
attention: 12
soe: 10
transformer: 9
x: 8
v: 8


In [23]:
# Topic modeling with Latent Dirichlet Allocation (LDA)
num_topics = 5  # Change the number to adjust the number of topics to extract
num_top_words = 5  # Change the number to adjust the number of top words to display for each topic

In [37]:
# Create the TF-IDF matrix
vectorizer = TfidfVectorizer(max_df=2, min_df=0.95, stop_words='english')
tfidf_matrix = vectorizer.fit_transform([extracted_text])

In [38]:
# Perform LDA
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(tfidf_matrix)

LatentDirichletAllocation(n_components=5, random_state=42)

In [39]:
# Get the top words for each topic
feature_names = vectorizer.get_feature_names()
topics = []
for topic_idx, topic in enumerate(lda.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
    topics.append(top_words)

In [40]:
print("\nTopics:")
for topic_idx, topic_words in enumerate(topics):
    print(f"Topic {topic_idx + 1}: {', '.join(topic_words)}")


Topics:
Topic 1: 512, deep, layer5, models, values
Topic 2: 512, deep, layer5, models, values
Topic 3: 512, deep, layer5, models, values
Topic 4: attention, dap, model, 25, 2016
Topic 5: 512, deep, layer5, models, values
