    Importing Libraries and other Modules

In [1]:
from PyPDF2 import PdfReader
import pandas as pd
import os
import nltk
from nltk.tokenize.toktok import ToktokTokenizer 
import string
from nltk.stem import PorterStemmer
import requests
import re



In [2]:
stopword_list = nltk.corpus.stopwords.words('english')
nltk.download('punkt')  # 'punkt' tokenizer module must be downloaded to use tokenization in nltk
API_URL = "https://api-inference.huggingface.co/models/Sigma/financial-sentiment-analysis"
headers = {"Authorization": "Bearer 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"}

test1_path = r'C:\Users\hp\Desktop\preply\lars\testing_pdfs\test1'

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


    Data Extraction from PDF's:

In [4]:
def extract_pdf_text(file_path):
    name = os.path.basename(file_path).replace('.pdf', '')
    highlights = ''
    investment_rationale = ''

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        
        # Extract content from page 1 for Highlights:
        page1_text = pdf_reader.pages[0].extract_text()

        start_index_highlights = page1_text.find('Highlights')
        stop_index_highlights = page1_text.find('Investment Rationale/Risk')
        highlights += page1_text[start_index_highlights:stop_index_highlights]

        # Extract content from page 1 for Investment Rationale:
        start_index_investment = page1_text.find('Investment Rationale/Risk')
        stop_index_investment = page1_text.find('Price Performance')
        investment_rationale += page1_text[start_index_investment:stop_index_investment]
    
    return name, highlights, investment_rationale

pdf_folder_path = test1_path
pdf_files = [os.path.join(pdf_folder_path, file) for file in os.listdir(pdf_folder_path) if file.endswith('.pdf')]

data = []
for pdf_file in pdf_files:
    name, highlights, investment_rationale = extract_pdf_text(pdf_file)
    data.append({'Name': name, 'Highlights': highlights, 'Investment Rationale': investment_rationale})

df = pd.DataFrame(data)

In [5]:
df.head(10)

Unnamed: 0,Name,Highlights,Investment Rationale
0,EQUITY Factsheet_ Arista Networks Inc.,Highlights\nuFollowing sales growth of 48.6% i...,Investment Rationale/Risk\nuOur recent upgrade...
1,EQUITY Factsheet_ Boliden AB (publ),Highlights\nuBoliden’s (BOL) Q2 2023 revenue f...,Investment Rationale/Risk\nuOur call is 2-STAR...
2,EQUITY Factsheet_ Caterpillar Inc.,Highlights\nuThe Highlights section of this St...,Investment Rationale/Risk section of this \nSt...
3,EQUITY Factsheet_ Publicis Groupe S.A.,Highlights\nuPublicis (PUB) reported H1 2023 n...,Investment Rationale/Risk\nuOur recommendation...


    Preprocessing using NLP techniques:

In [6]:
df = df.astype(str)

# Remove 'u' characters from all columns
df = df.apply(lambda x: x.str.replace(r'\bu', '', regex=True))  # using regex python libraryto process our text better

df['Highlights'] = df['Highlights'].str.lower() # Text converted to lowercase for the column 'Highlights'
df['Investment Rationale'] = df['Investment Rationale'].str.lower()

tokenizer=ToktokTokenizer() # Initializing the Toktok tokenizer

# tokenize text in 'highlights' using our initialized instance of toktoktokenizer:
df['Highlights'] = df['Highlights'].apply(tokenizer.tokenize)   
df['Investment Rationale'] = df['Investment Rationale'].apply(tokenizer.tokenize)   



def remove_punc(tokens):
    # Initialize an empty list to store tokens without punctuation:
    tokens_without_punc_marks = []

    # Create a for loop to iterate through each token in the input list:
    for i in tokens:    # i stands for iterator
        if i not in string.punctuation: # Checking whether token is NOT a punctuation mark
            tokens_without_punc_marks.append(i) # Add token to the list if it is NOT a punc. mark

    return tokens_without_punc_marks

df['Highlights'] = df['Highlights'].apply(remove_punc)
df['Investment Rationale'] = df['Investment Rationale'].apply(remove_punc)

def remove_stopwords(tokens):
    filtered_tokens = []

    for i in tokens:
        if i not in stopword_list:
            filtered_tokens.append(i)
    
    return filtered_tokens


df['Highlights'] = df['Highlights'].apply(remove_stopwords)
df['Investment Rationale'] = df['Investment Rationale'].apply(remove_stopwords)

stemmer = nltk.porter.PorterStemmer()

# Without using list comp. (defining a func to perform stemming):
def stem_tokens(tokens):
    stemmed_tokens = []

    for i in tokens:
        stemmed_tokens. append(stemmer.stem(i)) # Passing current token to stem it.

    return stemmed_tokens

df['Highlights'] = df['Highlights'].apply(stem_tokens)
df['Investment Rationale'] = df['Investment Rationale'].apply(stem_tokens)


def remove_nums(tokens):
    alphabetic_tokens = []  # empty list to store alphabetic tokens only!

    for i in tokens:
        if i.isalpha(): # checks if the token contains only alphabetic characters.
            alphabetic_tokens.append(i)
    
    return alphabetic_tokens


df['Highlights'] = df['Highlights'].apply(remove_nums)
df['Investment Rationale'] = df['Investment Rationale'].apply(remove_nums)


df['Highlights'] = df['Highlights'].apply(lambda x: x[1:])
df['Investment Rationale'] = df['Investment Rationale'].apply(lambda x: x[1:])

In [7]:
df.head(10)

Unnamed: 0,Name,Highlights,Investment Rationale
0,EQUITY Factsheet_ Arista Networks Inc.,"[follow, sale, growth, see, growth, growth, re...","[recent, pgrade, hold, reflect, view, strong, ..."
1,EQUITY Factsheet_ Boliden AB (publ),"[boliden, bol, revenu, fell, drag, lower, volu...","[call, sell, view, stock, fulli, valu, current..."
2,EQUITY Factsheet_ Caterpillar Inc.,"[highlight, section, stock, report, pdate]","[section, stock, report, pdate, latest, news, ..."
3,EQUITY Factsheet_ Publicis Groupe S.A.,"[publici, pub, report, net, revenu, slightli, ...","[recommend, strong, buy, believ, pub, abl, mai..."


    Conversion list to string:

In [8]:
def list_to_str(lst):
    return ' '.join(lst)

df[['Highlights', 'Investment Rationale']] = df[['Highlights', 'Investment Rationale']].applymap(lambda x: list_to_str(x))
df.head(10)

  df[['Highlights', 'Investment Rationale']] = df[['Highlights', 'Investment Rationale']].applymap(lambda x: list_to_str(x))


Unnamed: 0,Name,Highlights,Investment Rationale
0,EQUITY Factsheet_ Arista Networks Inc.,follow sale growth see growth growth result su...,recent pgrade hold reflect view strong enterpr...
1,EQUITY Factsheet_ Boliden AB (publ),boliden bol revenu fell drag lower volum metal...,call sell view stock fulli valu current see lo...
2,EQUITY Factsheet_ Caterpillar Inc.,highlight section stock report pdate,section stock report pdate latest news stori c...
3,EQUITY Factsheet_ Publicis Groupe S.A.,publici pub report net revenu slightli capit i...,recommend strong buy believ pub abl maintain i...


In [9]:
type(df['Highlights'][0])

str

In [10]:
df['Content'] = df['Highlights'] + ' ' + df['Investment Rationale']
central_df = df[['Name', 'Content']]

In [11]:
central_df.head(10)

Unnamed: 0,Name,Content
0,EQUITY Factsheet_ Arista Networks Inc.,follow sale growth see growth growth result su...
1,EQUITY Factsheet_ Boliden AB (publ),boliden bol revenu fell drag lower volum metal...
2,EQUITY Factsheet_ Caterpillar Inc.,highlight section stock report pdate section s...
3,EQUITY Factsheet_ Publicis Groupe S.A.,publici pub report net revenu slightli capit i...


In [12]:
df = central_df.copy()

--------------------

In [13]:
def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

def analyze_large_text(text, max_chunk_size=512):
    sentiment = []
    # Split the text into chunks of max_chunk_size
    chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    for chunk in chunks:
        output = query({"inputs": chunk})
        sentiment.append(output)
    return sentiment

def get_average_sentiment_fast(sentiment_chunks):
    if not sentiment_chunks:
        return None 

    # Assuming sentiment_chunks is a list of lists of dictionaries
    label = sentiment_chunks[0][0][1]['label'] if 'label' in sentiment_chunks[0][0][1] else 'unknown'
    total_score = sum(chunk[0][0]["score"] for chunk in sentiment_chunks)  # Sum scores
    average_score = total_score / len(sentiment_chunks)

    return {'label': label, 'score': average_score}

def main(large_text):
    sentiment_chunks = analyze_large_text(large_text)
    average_sentiment = get_average_sentiment_fast(sentiment_chunks)
    return average_sentiment


In [None]:
df['Content']

In [None]:
main(df['Content'][1])

In [15]:
sentiment_list = []

for index, row in df.iterrows():
    result = main(row['Content'])
    if result is not None:  # Check if result is not None
        sentiment_list.append({'Name': row['Name'], 'Label': result['label'], 'Sentiment': result['score']})

sentiment_df = pd.DataFrame(sentiment_list, columns=['Name', 'Label', 'Sentiment'])
sentiment_df = sentiment_df.sort_values(by='Sentiment', ascending=False)

In [16]:
csv_file_path = r'C:\Users\hp\Desktop\preply\lars\testing_pdfs\test1\test1.csv'
csv_data = pd.read_csv(csv_file_path)

out_performance_values = []

# Iterate over each row in your DataFrame
for name in sentiment_df['Name']:
    # Search for the name in the 'BUY' column of the CSV data
    match = csv_data[csv_data['PDF Name'] == name]['Out-performance'].values
    # If a match is found, append the Out-Performance value to the list
    if len(match) > 0:
        out_performance_values.append(match[0])
    else:
        out_performance_values.append(None)  # Append None if no match is found

# Add the Out-Performance values to your existing DataFrame
sentiment_df['Out-Performance'] = out_performance_values

In [17]:
sentiment_df.head(10)

Unnamed: 0,Name,Label,Sentiment,Out-Performance
2,EQUITY Factsheet_ Caterpillar Inc.,LABEL_2,0.998843,6.8%
0,EQUITY Factsheet_ Arista Networks Inc.,LABEL_1,0.959013,-5.3%
1,EQUITY Factsheet_ Boliden AB (publ),LABEL_2,0.874807,-27.7%
3,EQUITY Factsheet_ Publicis Groupe S.A.,LABEL_2,0.719976,17.2%


In [None]:
file_path = r'C:\Users\hp\Desktop\preply\lars\project\module 5\final_scores.csv'
sentiment_df.to_csv(file_path, index=False)
print(f"CSV file saved successfully.")