In [None]:
# pip install tensorflow
# pip install transformers
# pip install torch torchvision torchsummary

# Read the PDF, isolate the Q&A section, and extract its text - DONE

In [1]:
import re
import os
import PyPDF2
from pdfminer.high_level import extract_text
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# Load the pre-trained FinBERT checkpoint and its tokenizer once, at module
# level; the functions defined below read `tokenizer` and `model` as globals.
# NOTE(review): class ids predicted by this model should be interpreted via
# model.config.id2label rather than by an assumed position -- confirm the order.
MODEL = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [2]:


def extract_pdf_portion(input_path, header_word, footer_text):
    """Extract the text between the Q&A header page and the footer page of a PDF.

    Args:
        input_path: path of the PDF file to read.
        header_word: marker string; extraction starts on the SECOND page (in
            page order) whose text contains it.
        footer_text: marker string; extraction stops just before the LAST page
            whose text contains it. When no page contains it, extraction runs
            to the end of the document.

    Returns:
        (start_page, extracted_text): `start_page` is the 1-based number of the
        first extracted page, or None when fewer than two pages contain
        `header_word` (in which case `extracted_text` is "").
    """
    with open(input_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)
        # Extract every page exactly once and reuse the text for both marker
        # searches and the final concatenation. `or ""` is purely defensive in
        # case a page yields no text object.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Locate the second page that mentions the header.
    start_page = None
    header_count = 0
    for page_num, page_text in enumerate(page_texts):
        if header_word in page_text:
            header_count += 1
            if header_count == 2:
                start_page = page_num + 1
                break

    if start_page is None:
        return None, ""

    # Locate the last page that mentions the footer, scanning backwards.
    end_page = None
    for page_num in range(len(page_texts) - 1, -1, -1):
        if footer_text in page_texts[page_num]:
            end_page = page_num
            break

    # Footer missing: previously range(start, None) raised TypeError. Fall back
    # to reading through the last page instead.
    if end_page is None:
        end_page = len(page_texts)

    # The footer page itself is excluded (end index is exclusive).
    return start_page, "".join(page_texts[start_page - 1:end_page])

#####################################################################################################################################################

def parse_filename(filename):
    """Derive (company_name, quarter_year, full_date) from a transcript filename.

    The '.pdf' extension and the ' Earnings Call' phrase are removed, and the
    remainder is split on commas. Two layouts are recognised:

    * 4 fields: company, quarter, month-day, year
    * 5 fields: company, (ignored field), quarter, month-day, year

    Any other field count yields "Unknown" for both the quarter and the date.
    The company name is always the first comma-separated field.
    """
    stem = filename.replace('.pdf', '').replace(' Earnings Call', '')
    fields = [field.strip() for field in stem.split(',')]

    # Position of the quarter field for each supported field count; the two
    # date fields always follow it directly.
    quarter_index = {5: 2, 4: 1}.get(len(fields))
    if quarter_index is None:
        return fields[0], "Unknown", "Unknown"

    month_day = fields[quarter_index + 1]
    year = fields[quarter_index + 2]
    return fields[0], fields[quarter_index], month_day + ", " + year

#####################################################################################################################################################

def sentiment_analysis(sentences):
    """Classify each sentence with the module-level `tokenizer` and `model`.

    Args:
        sentences: list of strings; they are tokenized together as one padded,
            truncated batch.

    Returns:
        list[int]: the highest-probability class id for each sentence, in input
        order. An empty list/tuple yields [].
        NOTE(review): the meaning of each id is defined by
        model.config.id2label -- confirm before interpreting the numbers.
    """
    # Nothing to score: do not hand an empty batch to the tokenizer.
    if isinstance(sentences, (list, tuple)) and len(sentences) == 0:
        return []

    inputs = tokenizer(sentences, return_tensors="pt", truncation=True, padding=True)

    # Inference only: without no_grad() every call records an autograd graph,
    # which costs memory and time for no benefit here.
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits, dim=1)  # per-sentence class probabilities
    return torch.argmax(probs, dim=1).tolist()


#####################################################################################################################################################

def score_retriever(text):
    """Score the sentiment of a Q&A transcript section.

    The text is split into sentences. A sentence ending in '?' starts a new
    question; the sentences that follow (until the next question) form its
    response. Each response is split into sentences again and classified with
    `sentiment_analysis`.

    Args:
        text: the extracted Q&A text.

    Returns:
        (final_score, aggregated_score):
        * final_score -- the most common per-response label, where a response's
          label is the majority class id among its sentences (ties resolve to
          the model's "neutral" id).
        * aggregated_score -- the sum of the per-sentence class ids over all
          responses.

    Raises:
        ValueError: if the text contains no sentence ending in '?', so there is
            nothing to score.

    NOTE(review): responses are keyed by question text, so a repeated question
    overwrites the earlier response; sentences that precede the first question
    are folded into the first response. `aggregated_score` is a plain sum of
    class ids, so its meaning depends on the model's id order -- confirm both
    are intended.
    """
    sentence_pattern = r'(?<=[.?!])\s+'
    sentences = re.split(sentence_pattern, text)

    qa_dict = {}
    current_question = ''
    current_response = ''

    for sentence in sentences:
        sentence = sentence.replace('\n', ' ')

        if sentence.endswith('?'):
            # A new question closes the pair that was open, if any.
            if current_question:
                qa_dict[current_question.strip()] = current_response.strip()
                current_response = ''
            current_question = sentence
        else:
            current_response += sentence + ' '

    # Close the last open pair.
    if current_question:
        qa_dict[current_question.strip()] = current_response.strip()

    # Previously an input without questions crashed later with an opaque
    # IndexError on most_common(1)[0]; fail early with a clear message.
    if not qa_dict:
        raise ValueError("no question/answer pairs found in text")

    # Tie-break label: ask the model which id means "neutral" instead of
    # assuming it is 1; fall back to 1 when the model does not expose a mapping.
    label2id = getattr(getattr(model, 'config', None), 'label2id', None) or {}
    neutral_id = next(
        (idx for name, idx in label2id.items() if str(name).lower() == 'neutral'),
        1,
    )

    aggregated_score = 0
    response_scores = []
    for response in qa_dict.values():
        # Run the model once per response and reuse the predictions for both
        # the aggregate and the majority vote.
        sentiment_scores = sentiment_analysis(re.split(sentence_pattern, response))
        aggregated_score += sum(sentiment_scores)

        counts = Counter(sentiment_scores)
        top_count = max(counts.values())
        winners = [label for label, count in counts.items() if count == top_count]
        response_scores.append(winners[0] if len(winners) == 1 else neutral_id)

    final_score = Counter(response_scores).most_common(1)[0][0]
    return final_score, aggregated_score

#####################################################################################################################################################

# Directory containing the transcript PDFs. Set the TRANSCRIPTS_DIR environment
# variable to point elsewhere without editing the notebook; the original
# location remains the default.
directory_path = os.environ.get(
    'TRANSCRIPTS_DIR',
    r'C:\Users\jerie\iCloudDrive\SMU MQF\mqf624- ML and FA\Transcripts',
)
header_word = "Question and Answer"
footer_text = "These materials have been prepared solely"
required_values = []


# os.listdir() returns entries in arbitrary order; sort so that the rows are
# produced in the same order on every run.
for filename in sorted(os.listdir(directory_path)):
    # Only PDF transcripts are processed.
    if not filename.endswith('.pdf'):
        continue

    input_path = os.path.join(directory_path, filename)

    company_name, quarter_year, full_date = parse_filename(filename)

    start_page, Q_AND_A = extract_pdf_portion(input_path, header_word, footer_text)

    # No Q&A section was located in this file.
    if start_page is None:
        continue

    final_score, aggregated_score = score_retriever(Q_AND_A)

    record = (company_name, quarter_year, full_date, final_score, aggregated_score)
    required_values.append(record)

    # Log only the new record: printing the whole list on every iteration made
    # the output grow quadratically with the number of files.
    print(record)


[('Alphabet', 'Q1 2014', 'Apr 16, 2014', 2, 649)]
[('Alphabet', 'Q1 2014', 'Apr 16, 2014', 2, 649), ('Alphabet', 'Q1 2015', 'Apr 23, 2015', 2, 544)]
[('Alphabet', 'Q1 2014', 'Apr 16, 2014', 2, 649), ('Alphabet', 'Q1 2015', 'Apr 23, 2015', 2, 544), ('Alphabet', 'Q1 2016', 'Apr 21, 2016', 2, 461)]
[('Alphabet', 'Q1 2014', 'Apr 16, 2014', 2, 649), ('Alphabet', 'Q1 2015', 'Apr 23, 2015', 2, 544), ('Alphabet', 'Q1 2016', 'Apr 21, 2016', 2, 461), ('Alphabet', 'Q1 2017', 'Apr 27, 2017', 2, 474)]
[('Alphabet', 'Q1 2014', 'Apr 16, 2014', 2, 649), ('Alphabet', 'Q1 2015', 'Apr 23, 2015', 2, 544), ('Alphabet', 'Q1 2016', 'Apr 21, 2016', 2, 461), ('Alphabet', 'Q1 2017', 'Apr 27, 2017', 2, 474), ('Alphabet', 'Q1 2018', 'Apr 23, 2018', 2, 401)]
[('Alphabet', 'Q1 2014', 'Apr 16, 2014', 2, 649), ('Alphabet', 'Q1 2015', 'Apr 23, 2015', 2, 544), ('Alphabet', 'Q1 2016', 'Apr 21, 2016', 2, 461), ('Alphabet', 'Q1 2017', 'Apr 27, 2017', 2, 474), ('Alphabet', 'Q1 2018', 'Apr 23, 2018', 2, 401), ('Alphabet', '

In [25]:
# One row per processed transcript, built from the tuples in required_values.
df = pd.DataFrame(
    required_values,
    columns=[
        'company_name',
        'QQ_YYYY',
        'Exact_Date_Of_Releasee',
        'Overall_Sentiment_Of_QA',
        'Net_Sentiment_Score',
    ],
)

In [26]:
def categorize(value, mean=None, std=None):
    """Bucket a score by its distance from a mean.

    Args:
        value: the score to classify.
        mean: centre of the band; defaults to the module-level `mean_val`.
        std: half-width of the band; defaults to the module-level `std_val`.

    Returns:
        -1 if value < mean - std, 1 if value > mean + std, otherwise 0.

    The optional arguments let the function be used (and tested) without
    relying on globals; calling it with a single argument behaves as before.
    """
    center = mean_val if mean is None else mean
    spread = std_val if std is None else std

    if value < center - spread:
        return -1
    if value > center + spread:
        return 1
    return 0
    
# Band parameters for categorize(): it reads these two globals when called
# with a single argument.
mean_val = df['Net_Sentiment_Score'].mean()
std_val = df['Net_Sentiment_Score'].std()

# -1 / 0 / 1 depending on where each score falls relative to mean +/- std.
df['class'] = df['Net_Sentiment_Score'].map(categorize)

In [27]:
# Parsed (datetime) copy of the release date strings taken from the filenames.
df['Exact_Date_Of_Releasee2'] = df['Exact_Date_Of_Releasee'].pipe(pd.to_datetime)



In [28]:
def map_quarters(value):
    """Map a 'Qn YYYY' label to the last day of that quarter.

    Args:
        value: a string such as 'Q1 2014'.

    Returns:
        'YYYY-03-31', 'YYYY-06-30', 'YYYY-09-30' or 'YYYY-12-31' for Q1..Q4,
        or None when `value` is not a two-token string with a known quarter
        (for example the "Unknown" placeholder produced by parse_filename,
        which previously raised ValueError on unpacking).

    NOTE(review): quarters are mapped to calendar quarter ends; confirm this is
    intended for companies whose fiscal quarters differ from calendar quarters.
    """
    quarter_ends = {'Q1': '03-31', 'Q2': '06-30', 'Q3': '09-30', 'Q4': '12-31'}

    parts = value.split() if isinstance(value, str) else []
    if len(parts) != 2 or parts[0] not in quarter_ends:
        return None

    quarter, year = parts
    return f'{year}-{quarter_ends[quarter]}'
    
# Quarter-end date string derived from each 'Qn YYYY' label.
df['Final_Date'] = df['QQ_YYYY'].map(map_quarters)

In [30]:
# Translate the company names parsed from the filenames into ticker symbols.
# Names that are not keys of this mapping end up as NaN.
df['ticker'] = df['company_name'].map({
    'Alphabet': 'GOOGL',
    'Netflix': 'NFLX',
    'Facebook': 'META',
    'Amazon.com': 'AMZN',
    'Apple Inc.': 'AAPL',
})

In [31]:
# Persist the table next to the notebook; the row index is written as well.
df.to_csv(path_or_buf='Sentiment.csv')

In [32]:
# Display the assembled sentiment table as the cell output.
df

Unnamed: 0,company_name,QQ_YYYY,Exact_Date_Of_Releasee,Overall_Sentiment_Of_QA,Net_Sentiment_Score,class,Exact_Date_Of_Releasee2,Final_Date,ticker
0,Alphabet,Q1 2014,"Apr 16, 2014",2,649,0,2014-04-16,2014-03-31,GOOGL
1,Alphabet,Q1 2015,"Apr 23, 2015",2,544,0,2015-04-23,2015-03-31,GOOGL
2,Alphabet,Q1 2016,"Apr 21, 2016",2,461,0,2016-04-21,2016-03-31,GOOGL
3,Alphabet,Q1 2017,"Apr 27, 2017",2,474,0,2017-04-27,2017-03-31,GOOGL
4,Alphabet,Q1 2018,"Apr 23, 2018",2,401,0,2018-04-23,2018-03-31,GOOGL
...,...,...,...,...,...,...,...,...,...
194,Apple Inc.,Q4 2018,"Nov 01, 2018",2,337,-1,2018-11-01,2018-12-31,AAPL
195,Apple Inc.,Q4 2019,"Oct 30, 2019",2,389,0,2019-10-30,2019-12-31,AAPL
196,Apple Inc.,Q4 2020,"Oct 29, 2020",2,436,0,2020-10-29,2020-12-31,AAPL
197,Apple Inc.,Q4 2021,"Oct 28, 2021",2,540,0,2021-10-28,2021-12-31,AAPL


# Keep for reference

In [None]:


def extract_pdf_portion(input_path, header_word, footer_text):
    """Return the text between the Q&A header page and the footer page of a PDF.

    Args:
        input_path: path of the PDF file to read.
        header_word: marker string; extraction starts on the SECOND page (in
            page order) whose text contains it.
        footer_text: marker string; extraction stops just before the LAST page
            whose text contains it. When no page contains it, extraction runs
            to the end of the document.

    Returns:
        str: the concatenated page text, or "" when fewer than two pages
        contain `header_word` (previously this case raised TypeError).
    """
    with open(input_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)
        # Extract every page exactly once; `or ""` is purely defensive in case
        # a page yields no text object.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Find the start page based on the second page containing the header word
    start_page = None
    header_count = 0
    for page_num, page_text in enumerate(page_texts):
        if header_word in page_text:
            header_count += 1
            if header_count == 2:
                start_page = page_num + 1
                break

    if start_page is None:
        return ""

    # Find the end page based on the footer text, scanning backwards
    end_page = None
    for page_num in range(len(page_texts) - 1, -1, -1):
        if footer_text in page_texts[page_num]:
            end_page = page_num
            break

    # Footer missing: read through the last page instead of failing on
    # range(start, None).
    if end_page is None:
        end_page = len(page_texts)

    # The footer page itself is excluded (end index is exclusive).
    return "".join(page_texts[start_page - 1:end_page])

#####################################################################################################################################################

def parse_filename(filename):
    """Derive (company_name, quarter_year, full_date) from a transcript filename.

    The '.pdf' extension and the ' Earnings Call' phrase are removed, and the
    remainder is split on commas. Two layouts are recognised:

    * 4 fields: company, quarter, month-day, year
    * 5 fields: company, (ignored field), quarter, month-day, year

    Any other field count yields "Unknown" for both the quarter and the date.
    The company name is always the first comma-separated field.
    """
    stem = filename.replace('.pdf', '').replace(' Earnings Call', '')
    fields = [field.strip() for field in stem.split(',')]

    # Position of the quarter field for each supported field count; the two
    # date fields always follow it directly.
    quarter_index = {5: 2, 4: 1}.get(len(fields))
    if quarter_index is None:
        return fields[0], "Unknown", "Unknown"

    month_day = fields[quarter_index + 1]
    year = fields[quarter_index + 2]
    return fields[0], fields[quarter_index], month_day + ", " + year

#####################################################################################################################################################

def sentiment_analysis(sentences):
    """Classify each sentence with the module-level `tokenizer` and `model`.

    Args:
        sentences: list of strings; they are tokenized together as one padded,
            truncated batch.

    Returns:
        list[int]: the highest-probability class id for each sentence, in input
        order. An empty list/tuple yields [].
        NOTE(review): the meaning of each id is defined by
        model.config.id2label -- confirm before interpreting the numbers.
    """
    # Nothing to score: do not hand an empty batch to the tokenizer.
    if isinstance(sentences, (list, tuple)) and len(sentences) == 0:
        return []

    inputs = tokenizer(sentences, return_tensors="pt", truncation=True, padding=True)

    # Inference only: without no_grad() every call records an autograd graph,
    # which costs memory and time for no benefit here.
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits, dim=1)  # per-sentence class probabilities
    return torch.argmax(probs, dim=1).tolist()


#####################################################################################################################################################

def score_retriever(text):
    """Score the sentiment of a Q&A transcript section.

    The text is split into sentences. A sentence ending in '?' starts a new
    question; the sentences that follow (until the next question) form its
    response. Each response is split into sentences again and classified with
    `sentiment_analysis`.

    Args:
        text: the extracted Q&A text.

    Returns:
        (final_score, aggregated_score):
        * final_score -- the most common per-response label, where a response's
          label is the majority class id among its sentences (ties resolve to
          the model's "neutral" id).
        * aggregated_score -- the sum of the per-sentence class ids over all
          responses.

    Raises:
        ValueError: if the text contains no sentence ending in '?', so there is
            nothing to score.

    NOTE(review): responses are keyed by question text, so a repeated question
    overwrites the earlier response; sentences that precede the first question
    are folded into the first response. `aggregated_score` is a plain sum of
    class ids, so its meaning depends on the model's id order -- confirm both
    are intended.
    """
    sentence_pattern = r'(?<=[.?!])\s+'
    sentences = re.split(sentence_pattern, text)

    qa_dict = {}
    current_question = ''
    current_response = ''

    for sentence in sentences:
        sentence = sentence.replace('\n', ' ')

        if sentence.endswith('?'):
            # A new question closes the pair that was open, if any.
            if current_question:
                qa_dict[current_question.strip()] = current_response.strip()
                current_response = ''
            current_question = sentence
        else:
            current_response += sentence + ' '

    # Close the last open pair.
    if current_question:
        qa_dict[current_question.strip()] = current_response.strip()

    # Previously an input without questions crashed later with an opaque
    # IndexError on most_common(1)[0]; fail early with a clear message.
    if not qa_dict:
        raise ValueError("no question/answer pairs found in text")

    # Tie-break label: ask the model which id means "neutral" instead of
    # assuming it is 1; fall back to 1 when the model does not expose a mapping.
    label2id = getattr(getattr(model, 'config', None), 'label2id', None) or {}
    neutral_id = next(
        (idx for name, idx in label2id.items() if str(name).lower() == 'neutral'),
        1,
    )

    aggregated_score = 0
    response_scores = []
    for response in qa_dict.values():
        # Run the model once per response and reuse the predictions for both
        # the aggregate and the majority vote.
        sentiment_scores = sentiment_analysis(re.split(sentence_pattern, response))
        aggregated_score += sum(sentiment_scores)

        counts = Counter(sentiment_scores)
        top_count = max(counts.values())
        winners = [label for label, count in counts.items() if count == top_count]
        response_scores.append(winners[0] if len(winners) == 1 else neutral_id)

    final_score = Counter(response_scores).most_common(1)[0][0]
    return final_score, aggregated_score

#####################################################################################################################################################

# Directory containing the transcript PDFs. Set the TRANSCRIPTS_DIR environment
# variable to point elsewhere without editing the notebook; the original
# location remains the default.
directory_path = os.environ.get(
    'TRANSCRIPTS_DIR',
    r'C:\Users\jerie\iCloudDrive\SMU MQF\mqf624- ML and FA\Transcripts',
)
header_word = "Question and Answer"
footer_text = "These materials have been prepared solely"
required_values = []


# os.listdir() returns entries in arbitrary order; sort so that the rows are
# produced in the same order on every run.
for filename in sorted(os.listdir(directory_path)):
    # Only PDF transcripts are processed.
    if not filename.endswith('.pdf'):
        continue

    input_path = os.path.join(directory_path, filename)

    company_name, quarter_year, full_date = parse_filename(filename)

    Q_AND_A = extract_pdf_portion(input_path, header_word, footer_text)

    # Nothing was extracted, so there is nothing to score: score_retriever
    # cannot produce a result for text without any question.
    if not Q_AND_A.strip():
        continue

    final_score, aggregated_score = score_retriever(Q_AND_A)

    record = (company_name, quarter_year, full_date, final_score, aggregated_score)
    required_values.append(record)

    # Log only the new record: printing the whole list on every iteration made
    # the output grow quadratically with the number of files.
    print(record)


In [None]:
from collections import Counter

# Load the pre-trained FinBERT checkpoint and its tokenizer; this rebinds the
# module-level `tokenizer` and `model` that the functions in this notebook use.
# NOTE(review): class ids predicted by this model should be interpreted via
# model.config.id2label rather than by an assumed position -- confirm the order.
MODEL = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def sentiment_analysis(sentences):
    """Classify each sentence with the module-level `tokenizer` and `model`.

    Args:
        sentences: list of strings; they are tokenized together as one padded,
            truncated batch.

    Returns:
        list[int]: the highest-probability class id for each sentence, in input
        order. An empty list/tuple yields [].
        NOTE(review): the meaning of each id is defined by
        model.config.id2label -- confirm before interpreting the numbers.
    """
    # Nothing to score: do not hand an empty batch to the tokenizer.
    if isinstance(sentences, (list, tuple)) and len(sentences) == 0:
        return []

    inputs = tokenizer(sentences, return_tensors="pt", truncation=True, padding=True)

    # Inference only: without no_grad() every call records an autograd graph,
    # which costs memory and time for no benefit here.
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits, dim=1)  # per-sentence class probabilities
    return torch.argmax(probs, dim=1).tolist()

text = Q_AND_A  # Replace this with your actual text

# Split text into sentences
sentences = re.split(r'(?<=[.?!])\s+', text)

# Initialize variables
qa_dict = {}
current_question = ''
current_response = ''
aggregated_score = 0
response_scores = []  # one majority label per response

# Walk the sentences: a sentence ending in '?' opens a new question, every
# other sentence is appended to the response being collected.
for sentence in sentences:
    # Replace newline characters
    sentence = sentence.replace('\n', ' ')

    if sentence.endswith('?'):
        # A new question closes the pair that was open, if any.
        if current_question:
            qa_dict[current_question.strip()] = current_response.strip()
            current_response = ''
        current_question = sentence
    else:
        current_response += sentence + ' '

# Close the last open pair
if current_question:
    qa_dict[current_question.strip()] = current_response.strip()

# Tie-break label: ask the model which id means "neutral" instead of assuming
# it is 1; fall back to 1 when the mapping has no such label.
neutral_id = next(
    (idx for name, idx in model.config.label2id.items() if str(name).lower() == 'neutral'),
    1,
)

# Process each question and response
for question, response in qa_dict.items():
    # Break the response into sentences and classify them ONCE; the same
    # predictions feed both the aggregate and the majority vote.
    sentences_of_a_response = re.split(r'(?<=[.?!])\s+', response)
    sentiment_scores = sentiment_analysis(sentences_of_a_response)

    ### un-majoritised score: plain sum of the per-sentence class ids ##
    aggregated_score += sum(sentiment_scores)

    # Majority label of the response
    score_counts = Counter(sentiment_scores)
    max_count = max(score_counts.values())
    majority_scores = [score for score, count in score_counts.items() if count == max_count]
    if len(majority_scores) == 1:
        response_score = majority_scores[0]
    else:
        response_score = neutral_id  # In case of a tie, treat the response as neutral

    # Add the response score to the list of response scores
    response_scores.append(response_score)

# Determine the final sentiment score based on the majority of response scores.
# When no question was found there is nothing to vote on, so leave it as None
# instead of failing on most_common(1)[0].
final_score_counts = Counter(response_scores)
final_score = final_score_counts.most_common(1)[0][0] if final_score_counts else None

# print("Final Sentiment Score:", final_score)
# print("Aggregated Sentiment Score:", aggregated_score)

In [None]:
import PyPDF2


def extract_pdf_portion(input_path, header_word, footer_text):
    """Extract the Q&A text and the third page's text from a transcript PDF.

    The selected pages are also written to 'output_portion.pdf' in the current
    working directory.

    Args:
        input_path: path of the PDF file to read.
        header_word: marker string; extraction starts on the SECOND page (in
            page order) whose text contains it.
        footer_text: marker string; extraction stops just before the LAST page
            whose text contains it. When no page contains it, extraction runs
            to the end of the document.

    Returns:
        (extracted_text, call_participants): the concatenated text of the
        selected pages and the text of the third page. `call_participants` is
        "" when the document has fewer than three pages; `extracted_text` is ""
        (and no output PDF is written) when fewer than two pages contain
        `header_word`. Both cases previously raised an exception.
    """
    with open(input_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)
        pages = pdf.pages
        # Extract every page exactly once; `or ""` is purely defensive in case
        # a page yields no text object.
        page_texts = [page.extract_text() or "" for page in pages]

        # "Call Participants" text: 0-based indexing, so 2 is the third page.
        call_participants = page_texts[2] if len(page_texts) > 2 else ""

        # Find the start page based on the second page containing the header word
        start_page = None
        header_count = 0
        for page_num, page_text in enumerate(page_texts):
            if header_word in page_text:
                header_count += 1
                if header_count == 2:
                    start_page = page_num + 1
                    break

        if start_page is None:
            return "", call_participants

        # Find the end page based on the footer text, scanning backwards
        end_page = None
        for page_num in range(len(page_texts) - 1, -1, -1):
            if footer_text in page_texts[page_num]:
                end_page = page_num
                break

        # Footer missing: read through the last page.
        if end_page is None:
            end_page = len(page_texts)

        # Copy the selected pages into a new PDF while the source file is
        # still open, and collect their text.
        output_pdf = PyPDF2.PdfWriter()
        for page_num in range(start_page - 1, end_page):
            output_pdf.add_page(pages[page_num])
        extracted_text = "".join(page_texts[start_page - 1:end_page])

        output_path = "output_portion.pdf"
        with open(output_path, 'wb') as output_file:
            output_pdf.write(output_file)

        return extracted_text, call_participants

    
# Example usage
input_path = 'Amazon.com Inc., Q2 2013 Earnings Call, Jul 25, 2013.pdf'
header_word = "Question and Answer"
footer_text = "These materials have been prepared solely"

# Call the extractor once and keep both results. The earlier, discarded call
# parsed the PDF and rewrote output_portion.pdf a second time for no benefit.
Q_AND_A, CALL_PARTICIPANTS = extract_pdf_portion(input_path, header_word, footer_text)


In [None]:

import re




text = Q_AND_A

# Sentence boundaries: whitespace that follows '.', '?' or '!'
sentences = re.split(r'(?<=[.?!])\s+', text)

# Question -> response mapping plus the pair currently being assembled
qa_dict = {}
current_question = ''
current_response = ''

for sentence in sentences:
    # Flatten line breaks inside the sentence
    sentence = sentence.replace('\n', ' ')

    if sentence.endswith('?'):
        # A new question closes the pair that was open, if any
        if current_question:
            qa_dict[current_question.strip()] = current_response.strip()
            current_response = ''
        current_question = sentence
    elif current_question and not sentence.endswith('.'):
        # Ends in neither '?' nor '.': glued onto the open question
        current_question += ' ' + sentence
    else:
        # Ends in '.', or no question has been seen yet: part of the response
        current_response += sentence + ' '

# Close the last open pair
if current_question:
    qa_dict[current_question.strip()] = current_response.strip()

# Show the pairs in the order they were collected
for question, response in qa_dict.items():
    print("Question:", question)
    print("++++++++++++++++++")
    print("Response:", response)
    print("------------------")

In [None]:
########################## build question -> {response: sentences} ###################################
import pandas as pd

testdict = {}
for question, response in qa_dict.items():
    # Each question maps to a one-entry dict: full response -> its sentences
    sentences_of_a_response = re.split(r'(?<=[.?!])\s+', response)
    testdict[question] = {response: sentences_of_a_response}


########################## convert to dataframe ###################################
# NOTE: this rebinds `df`, replacing any frame of that name built earlier.
df = pd.DataFrame(
    [
        (question_text, full_response, response_sentences)
        for question_text, by_response in testdict.items()
        for full_response, response_sentences in by_response.items()
    ],
    columns=['question', 'full_response', 'sentence_of_full_response'],
)

# One row per response sentence; question and full response are repeated.
df2 = df.explode('sentence_of_full_response', ignore_index=True)



########################## for analysing the escores out in dataframe ###################################

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the FinBERT tokenizer and model. This rebinds the module-level
# `tokenizer` and `model` names; both are passed explicitly to
# predict_classification below.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Score one piece of text against every label of the given model
def predict_classification(sentence, tokenizer, model):
    """Return the probability of every class for a single text.

    Args:
        sentence: the text to classify.
        tokenizer: object whose `encode_plus` turns the text into model inputs.
        model: callable returning an object with `.logits`; its
            `config.id2label` supplies the label names.

    Returns:
        list[dict]: one {'label': ..., 'score': ...} entry per class, in class
        id order, where 'score' is the softmax probability for the first (and
        only) row of the batch.
    """
    encoding_options = {
        'truncation': True,
        'padding': 'max_length',
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(sentence, **encoding_options)

    # Inference only, so gradients are not tracked.
    with torch.no_grad():
        logits = model(**encoded).logits

    first_row = torch.softmax(logits, dim=1)[0].tolist()
    id2label = model.config.id2label
    return [
        {'label': id2label[class_id], 'score': probability}
        for class_id, probability in enumerate(first_row)
    ]

# Text columns of df2 that are classified with FinBERT
columns_to_apply = ['question','full_response','sentence_of_full_response']  # Update with the desired column names

# Classify every cell of each listed column; the result stored per cell is the
# list of {'label', 'score'} dicts returned by predict_classification.
# NOTE(review): 'question' and 'full_response' repeat on every exploded row, so
# the same text is classified once per sentence row -- confirm this is wanted.
for column in columns_to_apply:
    df2[f'{column}_classification_finbert'] = df2[column].apply(lambda x: predict_classification(x, tokenizer, model))

# Spread each stored result into one numeric column per (source column, label).
# df2 gains columns via .at while it is being iterated row by row.
for index, row in df2.iterrows():
    for column in columns_to_apply:
        classification_results = row[f'{column}_classification_finbert']
        for label_score in classification_results:
            label = label_score['label']
            score = label_score['score']
            new_label = f'finbert_{column}_{label}'  # Column name: 'finbert_' prefix + source column + label
            df2.at[index, new_label] = score




In [None]:
from transformers import pipeline
import pandas as pd

# Assuming df is your DataFrame and 'question' is your column

# Initialize the emotion-classification pipeline.
# NOTE(review): recent transformers releases appear to deprecate
# return_all_scores in favour of top_k=None, which may change the nesting of
# the returned value -- confirm before upgrading.
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Apply the classifier to each row of the 'sentence_of_full_response' column




df2['classification'] = df2['sentence_of_full_response'].apply(lambda x: classifier(x))

# First element of every stored result.
# NOTE(review): this Series is overwritten by the loop variable of the same
# name below and is not read in between -- likely dead code.
classification_results = df2['classification'].apply(lambda x: x[0])

# Create new columns for each label with their respective scores; the stored
# result is indexed with [0] to reach the list of {'label', 'score'} dicts.
for index, row in df2.iterrows():
    classification_results = row['classification'][0]
    for label_score in classification_results:
        label = label_score['label']
        score = label_score['score']
        new_label = f'roberta_{label}'  # Add 'roberta_' prefix to the label
        df2.at[index, new_label] = score



In [None]:
## good to have down the road 

from transformers import pipeline
import pandas as pd

# Assuming df is your DataFrame and 'question' is your column

# Initialize the emotion-classification pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Text columns of df2 to classify
columns_to_apply = ['question','full_response','sentence_of_full_response']  # Update with the desired column names

# Classify every cell of each listed column and keep the raw pipeline result
for column in columns_to_apply:
    df2[f'{column}_classification_roberta'] = df2[column].apply(lambda x: classifier(x))


# Spread each stored result into one numeric column per (source column, label)
for index, row in df2.iterrows():
    for column in columns_to_apply:
        classification_results = row[f'{column}_classification_roberta']
        # For a single input string the pipeline result can be wrapped in an
        # outer list (the single-column cell above indexes it with [0]).
        # Unwrap it here too; otherwise label_score would be a list and
        # label_score['label'] would raise TypeError.
        if classification_results and isinstance(classification_results[0], list):
            classification_results = classification_results[0]
        for label_score in classification_results:
            label = label_score['label']
            score = label_score['score']
            new_label = f'roberta_{column}_{label}'  # Column name: 'roberta_' prefix + source column + label
            df2.at[index, new_label] = score




In [None]:
text = Q_AND_A

# Sentence boundaries: whitespace that follows '.', '?' or '!'
sentences = re.split(r'(?<=[.?!])\s+', text)

# Question -> response mapping plus the pair currently being assembled
qa_dict = {}
current_question = ''
current_response = ''

for sentence in sentences:
    # Flatten line breaks inside the sentence
    sentence = sentence.replace('\n', ' ')

    if sentence.endswith('?'):
        # A new question closes the pair that was open, if any
        if current_question:
            qa_dict[current_question.strip()] = current_response.strip()
            current_response = ''
        current_question = sentence
    elif current_question and not sentence.endswith('.'):
        # Ends in neither '?' nor '.': glued onto the open question
        current_question += ' ' + sentence
    else:
        # Ends in '.', or no question has been seen yet: part of the response
        current_response += sentence + ' '

# Close the last open pair
if current_question:
    qa_dict[current_question.strip()] = current_response.strip()

# Show the pairs in the order they were collected
for question, response in qa_dict.items():
    print("Question:", question)
    print("++++++++++++++++++")
    print("Response:", response)
    print("------------------")

In [None]:
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

path_file = 'Netflix, Inc., Q4 2022 Earnings Call, Jan 19, 2023.pdf'
text = extract_text(path_file)

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Split the text into sentences on '.'
sentences = text.split(".")

# Use the checkpoint's own id -> label table instead of a hand-written one, so
# the printed sentiment always matches what the model actually predicted.
class_mapping = dict(model.config.id2label)
selected_sentences = []
predicted_sentiments = []

for sentence in sentences:
    encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    # Inference only, so gradients are not tracked.
    with torch.no_grad():
        model_output = model(**encoded_input)
        logits = model_output.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_sentiment = class_mapping[predicted_class]
    selected_sentences.append(sentence)
    predicted_sentiments.append(predicted_sentiment)

# Print the selected sentences and their predicted sentiment
for sentence, sentiment in zip(selected_sentences, predicted_sentiments):
    print(f"Sentence: {sentence}")
    print(f"Predicted Sentiment: {sentiment}")
    print()



In [None]:
#### 

from pdfminer.high_level import extract_text
path_file = 'Netflix, Inc., Q4 2022 Earnings Call, Jan 19, 2023.pdf'
text = extract_text(path_file)


#### finbert #####

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# The whole transcript is encoded as ONE sequence. With truncation=True only
# the leading part that fits the model's input length is classified.
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

# Inference only, so gradients are not tracked.
with torch.no_grad():
    model_output = model(**encoded_input)
    logits = model_output.logits

predicted_class = torch.argmax(logits, dim=1).item()
predicted_probability = torch.softmax(logits, dim=1)[0][predicted_class].item()

# Use the checkpoint's own id -> label table instead of a hand-written one, so
# the printed sentiment always matches what the model actually predicted.
class_mapping = dict(model.config.id2label)
predicted_sentiment = class_mapping[predicted_class]

print(f"Predicted Sentiment: {predicted_sentiment}")
print(f"Probability: {predicted_probability:.4f}")

In [None]:
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

path_file = 'Netflix, Inc., Q4 2022 Earnings Call, Jan 19, 2023.pdf'
text = extract_text(path_file)

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Split the text into sentences on '.'
sentences = text.split(".")

# Use the checkpoint's own id -> label table instead of a hand-written one, so
# the printed sentiment always matches what the model actually predicted.
class_mapping = dict(model.config.id2label)
predicted_sentiments = []

for sentence in sentences:
    encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    # Inference only, so gradients are not tracked.
    with torch.no_grad():
        model_output = model(**encoded_input)
        logits = model_output.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_sentiment = class_mapping[predicted_class]
    predicted_sentiments.append(predicted_sentiment)

# Print the sentences and their predicted sentiment
for sentence, sentiment in zip(sentences, predicted_sentiments):
    print(f"Sentence: {sentence}")
    print(f"Predicted Sentiment: {sentiment}")
    print()

