In [2]:
# import packages
import os
import re
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download every NLTK resource the functions below depend on, so the notebook
# also runs on a machine with an empty nltk_data directory.
# NOTE(review): recent NLTK releases may ask for differently named resources
# (e.g. 'punkt_tab'); the LookupError message names the one to download.
nltk.download('punkt')                       # word_tokenize / sent_tokenize
nltk.download('stopwords')                   # stopwords.words('english')
nltk.download('vader_lexicon')               # SentimentIntensityAnalyzer
nltk.download('wordnet')                     # WordNetLemmatizer
nltk.download('omw-1.4')                     # presumably needed by WordNet on some NLTK versions
nltk.download('averaged_perceptron_tagger')  # pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenne\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lenne\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# Download the pdfs from our S3 bucket
#import requests
#
#url = 'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Articles/article10.pdf'
#
#response = requests.get(url)
#with open('local_file.pdf', 'wb') as f:
#    f.write(response.content)
#print('PDF file downloaded successfully.')

In [None]:
# Define a function to extract text from PDF file
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Pages are joined with a newline so that the last word of one page is not
    fused with the first word of the next one. A page for which the reader
    yields no text contributes an empty string.

    Parameters
    ----------
    file_path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The extracted text, or ``""`` when the file does not exist or the
        extraction fails (a message is printed in both cases).
    """
    if not os.path.exists(file_path):
        print("File not found:", file_path)
        return ""

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ""` keeps one text-less page from aborting the whole document.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    except Exception as e:
        # Deliberately best-effort: an unreadable PDF is reported and skipped
        # so that the remaining files can still be processed by the caller.
        print("Error occurred while extracting text:", str(e))
        return ""

    return "\n".join(page_texts)


# Define a function to preprocess the text
def preprocess_text(text):
    """Convert raw text into a list of cleaned, lemmatized tokens.

    Steps, in order: punctuation is replaced by spaces, whitespace is
    collapsed, the text is tokenized, tokens are lowercased, English stop
    words are dropped and the remaining tokens are lemmatized.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        The processed tokens (empty list for empty input).
    """
    # Replace punctuation with a space instead of deleting it: deleting glues
    # neighbours together ("end.Next" -> "endNext", "well-known" -> "wellknown")
    # and turns contractions into non-stop-words ("don't" -> "dont"). With a
    # space, contractions split into fragments such as "don" and "t", which the
    # NLTK English stop word list contains.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Collapse whitespace runs, including the ones just introduced above
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization, then lowercase so stop word matching is case-insensitive
    tokens = [token.lower() for token in word_tokenize(text)]

    # Stop word removal followed by lemmatization
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]


# Part-of-speech tagging helper
def pos_tagging(tokens):
    """Tag ``tokens`` by delegating to ``pos_tag``.

    Whatever ``pos_tag`` produces for the given tokens is returned unchanged.
    """
    return pos_tag(tokens)

# Bag-of-words counts and scored 2-grams
def extract_bag_of_words(tokens):
    """Return unigram frequencies and raw-frequency-scored bigrams.

    The first element of the returned pair is the full ``most_common()``
    listing of a ``FreqDist`` built from ``tokens``. The second element is the
    list of bigrams scored with the ``raw_freq`` measure, ordered by score
    from highest to lowest.
    """
    # Unigrams: every distinct token with its count
    unigram_counts = FreqDist(tokens).most_common()

    # Bigrams: score each adjacent pair found in the token stream
    measures = BigramAssocMeasures()
    scored_bigrams = list(
        BigramCollocationFinder.from_words(tokens).score_ngrams(measures.raw_freq)
    )
    # Stable in-place sort on the score component, highest first
    scored_bigrams.sort(key=lambda scored: scored[1], reverse=True)

    return unigram_counts, scored_bigrams


# Define a function for text summarization
def text_summarization(text, num_keywords=10, min_matches=3):
    """Extractive summary: keep the sentences that mention the top keywords.

    Keywords are the ``num_keywords`` most frequent tokens of ``text`` after
    lowercasing and after discarding English stop words and tokens that are
    not purely alphanumeric (punctuation in particular). Without that filter
    the most frequent tokens are things like "the" or ",", and almost every
    sentence qualifies.

    Parameters
    ----------
    text : str
        Raw text to summarize.
    num_keywords : int, optional
        Number of most frequent content words used as keywords (default 10).
    min_matches : int, optional
        Minimum number of keyword occurrences a sentence needs in order to be
        selected (default 3). Repeated occurrences each count.

    Returns
    -------
    list of str
        The selected sentences, in their original order and original casing.
    """
    stop_words = set(stopwords.words('english'))

    # Tokenize every sentence once; lowercase so matching is case-insensitive
    sentences = sent_tokenize(text)
    tokenized_sentences = [
        [token.lower() for token in word_tokenize(sentence)]
        for sentence in sentences
    ]

    # Frequency of content words only
    content_words = (
        token
        for sentence_tokens in tokenized_sentences
        for token in sentence_tokens
        if token.isalnum() and token not in stop_words
    )
    # A set gives constant-time membership tests in the selection below
    keywords = {word for word, _ in FreqDist(content_words).most_common(num_keywords)}

    # Select the sentences that contain enough keyword occurrences
    return [
        sentence
        for sentence, sentence_tokens in zip(sentences, tokenized_sentences)
        if sum(1 for token in sentence_tokens if token in keywords) >= min_matches
    ]


# Sentiment scoring helper
def sentiment_analysis(text):
    """Score ``text`` with a freshly created ``SentimentIntensityAnalyzer``.

    The result of its ``polarity_scores`` call is returned unchanged.
    """
    return SentimentIntensityAnalyzer().polarity_scores(text)


# Define a function for topic modelling (Latent Dirichlet Allocation)
def latent_dirichlet_allocation(tokens, num_topics=3, num_top_words=10):
    """Fit an LDA model on ``tokens`` and return the top words of each topic.

    Parameters
    ----------
    tokens : list of str
        Preprocessed tokens; they are joined into ONE document before
        vectorization.
    num_topics : int, optional
        Number of LDA components (default 3).
    num_top_words : int, optional
        Number of highest-weighted words reported per topic (default 10).

    Returns
    -------
    list of list of str
        One keyword list per topic, highest weight first. An empty list is
        returned (and a message printed) when there are no tokens or the
        vectorizer ends up with an empty vocabulary.
    """
    if len(tokens) == 0:
        print("No tokens available for topic modeling.")
        return []

    vectorizer = CountVectorizer()
    try:
        # NOTE(review): the model sees a single document (one row); topics
        # derived from one document may not be very informative - confirm
        # whether fitting across all PDFs was intended.
        X = vectorizer.fit_transform([' '.join(tokens)])
    except ValueError:
        # CountVectorizer raises ValueError when no token survives its own
        # tokenization, so the empty case has to be handled here.
        print("Empty vocabulary. The documents may only contain stop words.")
        return []

    # get_feature_names() no longer exists in current scikit-learn releases;
    # keep it only as a fallback for old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(X)

    topic_keywords = []
    for topic in lda.components_:
        # argsort is ascending: reverse it, then keep the leading entries
        top_indices = topic.argsort()[::-1][:num_top_words]
        topic_keywords.append([feature_names[i] for i in top_indices])

    return topic_keywords


# Define a function to generate a word cloud
def generate_word_cloud(tokens):
    """Render a word cloud of ``tokens`` with matplotlib.

    Parameters
    ----------
    tokens : list of str
        Tokens to visualize; they are joined with spaces before being handed
        to ``WordCloud``.

    Returns
    -------
    None
        The figure is shown as a side effect. When ``tokens`` is empty a
        message is printed and nothing is drawn.
    """
    # WordCloud cannot build a cloud from empty text; bail out instead of
    # letting the exception abort the caller's loop over files.
    if len(tokens) == 0:
        print("No tokens available for word cloud.")
        return

    text = ' '.join(tokens)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [23]:
# Get the folder_path where the file is in (= current working directory)
folder_path = os.getcwd()

# sorted() makes the processing order reproducible between runs
for file_name in sorted(os.listdir(folder_path)):
    # Case-insensitive match so files named "*.PDF" are processed as well
    if not file_name.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(folder_path, file_name)
    text = extract_text_from_pdf(file_path)

    # extract_text_from_pdf returns "" on failure; a PDF may also simply hold
    # no extractable text. Either way there is nothing to analyse.
    if not text.strip():
        print("Skipped file (no text extracted):", file_name)
        continue

    preprocessed_text = preprocess_text(text)
    tagged_tokens = pos_tagging(preprocessed_text)
    most_common_words, most_common_2grams = extract_bag_of_words(preprocessed_text)
    selected_sentences = text_summarization(text)
    sentiment_scores = sentiment_analysis(text)
    topic_keywords = latent_dirichlet_allocation(preprocessed_text, num_topics=5)
    generate_word_cloud(preprocessed_text)

    print("Processed file:", file_name)
    print("Most common words:", most_common_words)
    print("Most common 2-grams:", most_common_2grams)
    print("Selected sentences:", selected_sentences)
    print("Sentiment scores:", sentiment_scores)
    print("Topic keywords:", topic_keywords)