In [None]:
import transformers
import torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [None]:
import spacy 
import re

# load the Spacy English language model
nlp = spacy.load('en_core_web_lg')

# define the path to your file
file_path = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_exports/MEG_2017.txt'

# open the file and read its contents
with open(file_path, 'r') as f:
    text = f.read()

# process the text with the Spacy nlp pipeline
doc = nlp(text)

# split the text into sentences using Spacy
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents]

# define a regular expression pattern to match one-word sentences
pattern = re.compile(r'^\w+(\s+\w+)*[.?!]$')

# filter out sentences that match the pattern
filtered_sentences = [sent for sent in sentences if not pattern.match(sent)]

# print the filtered sentences
for sent in filtered_sentences:
    print(sent)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# initialize the FinBERT model and tokenizer
model_name = 'ProsusAI/finbert'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

for sent in filtered_sentences:
    # tokenize the sentence and add special tokens for classification
    inputs = tokenizer(sent, return_tensors='pt', padding=True, truncation=True)
    # classify the sentiment using the FinBERT model
    outputs = model(**inputs)
    logits = outputs.logits.detach().numpy()[0]
    probs = torch.softmax(outputs.logits, dim=1).tolist()[0]
    # print the sentence and its sentiment scores
    print(f"Sentence: {sent}")
    print(f"Positive sentiment score: {probs[0]:.4f}")
    print(f"Neutral sentiment score: {probs[1]:.4f}")
    print(f"Negative sentiment score: {probs[2]:.4f}")
    print()


In [None]:
import spacy 
import re
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the Spacy English language model
nlp = spacy.load('en_core_web_lg')

# Initialize the FinBERT model and tokenizer
model_name = 'ProsusAI/finbert'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the path to the folder containing text files
folder_path = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_exports/'

# Loop through each file in the folder and process its contents
for file_name in os.listdir(folder_path):
    if file_name.endswith('.txt'):
        # Open the file and read its contents
        with open(os.path.join(folder_path, file_name), 'r') as f:
            text = f.read()

        # Process the text with the Spacy nlp pipeline
        doc = nlp(text)

        # Split the text into sentences using Spacy
        sentences = [sent.text.strip() for sent in doc.sents]

        # Define a regular expression pattern to match one-word sentences
        pattern = re.compile(r'^\w+(\s+\w+)*[.?!]$')

        # Filter out sentences that match the pattern
        filtered_sentences = [sent for sent in sentences if not pattern.match(sent)]

        # Create an empty DataFrame to store the results
        sentiment_df = pd.DataFrame(columns=['sentence', 'positive', 'neutral', 'negative'])

        for sent in filtered_sentences:
            # Tokenize the sentence and add special tokens for classification
            inputs = tokenizer(sent, return_tensors='pt', padding=True, truncation=True)

            # Classify the sentiment using the FinBERT model
            outputs = model(**inputs)
            logits = outputs.logits.detach().numpy()[0]
            probs = torch.softmax(outputs.logits, dim=1).tolist()[0]

            # Save the sentence and its sentiment scores in the DataFrame
            new_row = pd.DataFrame({'sentence': [sent], 'positive': [probs[0]], 'neutral': [probs[1]], 'negative': [probs[2]]})
            sentiment_df = pd.concat([sentiment_df, new_row], ignore_index=True)

        # Save the results to a CSV file with the same name as the input file
        output_file_name = file_name.split('.')[0] + '_sentiment.csv'
        output_file_path = os.path.join(folder_path, output_file_name)
        sentiment_df.to_csv(output_file_path, index=False)

        # print a message to indicate that the file has been processed
        print(file_name, 'has been processed')


In [None]:
import spacy 
import re
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# load the Spacy English language model
nlp = spacy.load('en_core_web_lg')

# initialize the FinBERT model and tokenizer
model_name = 'ProsusAI/finbert'
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# set a fixed seed for the random number generator
torch.manual_seed(0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# define the path to your input folder
input_folder = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_exports'

# define the path to your output folder
output_folder = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_processed'

# define a regular expression pattern to match one-word sentences
pattern = re.compile(r'^\w+(\s+\w+)*[.?!]$')

# iterate over all files in the input folder
for file_name in os.listdir(input_folder):
    # define the path to the input file
    file_path = os.path.join(input_folder, file_name)
    
    # open the input file and read its contents
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # process the text with the Spacy nlp pipeline
    doc = nlp(text)

    # split the text into sentences using Spacy
    sentences = [sent.text.strip() for sent in doc.sents]

    # filter out sentences that match the pattern
    filtered_sentences = [sent for sent in sentences if not pattern.match(sent)]

    # create an empty DataFrame to store the results
    sentiment_df = pd.DataFrame(columns=['sentence', 'positive', 'neutral', 'negative'])

    for sent in filtered_sentences:
        # tokenize the sentence and add special tokens for classification
        inputs = tokenizer(sent, return_tensors='pt', padding=True, truncation=True)
        # process using CPU
        device = torch.device('cpu')
        model.to(device)
        inputs.to(device)
        # classify the sentiment using the FinBERT model
        outputs = model(**inputs)
        logits = outputs.logits.detach().cpu().numpy()[0]
        probs = torch.softmax(outputs.logits, dim=1).tolist()[0]
        # save the sentence and its sentiment scores in the DataFrame
        new_row = pd.DataFrame({'sentence': [sent], 'positive': [probs[0]], 'neutral': [probs[1]], 'negative': [probs[2]]})
        sentiment_df = pd.concat([sentiment_df, new_row], ignore_index=True)

    # define the path to the output file
    output_file_path = os.path.join(output_folder, file_name.replace('.txt', '.csv'))

    # save the results to a CSV file
    sentiment_df.to_csv(output_file_path, index=False)

    # print a message to indicate that the file has been processed
    print(f"{file_name} has been processed.")


In [None]:
another test updated 

In [100]:
import spacy 
import re
import os
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# load the Spacy English language model
nlp = spacy.load('en_core_web_lg')

# initialize the FinBERT model and tokenizer
model_name = 'ProsusAI/finbert'
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# set a fixed seed for the random number generator
torch.manual_seed(0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# define the path to your input folder
input_folder = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed'

# define the path to your output folder
output_folder = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_processed'

# add regular expressions to match "-" and currency symbols
pattern = re.compile(r'^\w+(\s+\w+)*[.?!]?(\s+\$)?(\s+¥)?(\s+€)?(\s+£)?(\s+\u20B1)?(\s+-)?$', re.IGNORECASE)


# iterate over all files in the input folder
for file_name in os.listdir(input_folder):
    # define the path to the input file
    file_path = os.path.join(input_folder, file_name)
    
    # read the csv file into a pandas DataFrame
    df = pd.read_csv(file_path)
    
    # replace NaN values with empty strings
    df = df.replace(np.nan, '', regex=True)
    
    # get the sentences from the second column of the DataFrame
    sentences = df.iloc[:, 1]

    # filter out sentences that match the pattern
    filtered_sentences = [sent for sent in sentences if not pattern.match(sent)]

    # create an empty DataFrame to store the results
    sentiment_df = pd.DataFrame(columns=['sentence', 'positive', 'neutral', 'negative'])

    for sent in filtered_sentences:
        # tokenize the sentence and add special tokens for classification
        inputs = tokenizer(sent, return_tensors='pt', padding=True, truncation=True)
        # process using CPU
        device = torch.device('cpu')
        model.to(device)
        inputs.to(device)
        # classify the sentiment using the FinBERT model
        outputs = model(**inputs)
        logits = outputs.logits.detach().cpu().numpy()[0]
        probs = torch.softmax(outputs.logits, dim=1).tolist()[0]
        # save the sentence and its sentiment scores in the DataFrame
        new_row = pd.DataFrame({'sentence': [sent], 'positive': [probs[0]], 'neutral': [probs[2]], 'negative': [probs[1]]})
        sentiment_df = pd.concat([sentiment_df, new_row], ignore_index=True)

    # define the path to the output file
    output_file_path = os.path.join(output_folder, file_name.replace('.csv', '.csv'))

    # save the results to a CSV file
    sentiment_df.to_csv(output_file_path, index=False)

    # print a message to indicate that the file has been processed
    print(f"{file_name} has been processed.")


AEV_2021.csv has been processed.
GLO_2018.csv has been processed.
GLO_2019.csv has been processed.
AEV_2020.csv has been processed.
BPI_2017.csv has been processed.
.DS_Store has been processed.
BDO_2017.csv has been processed.
AC_2017.csv has been processed.
AEV_2018.csv has been processed.
GLO_2021.csv has been processed.
GLO_2020.csv has been processed.
AEV_2019.csv has been processed.
MEG_2020.csv has been processed.
MEG_2021.csv has been processed.
ACEN_2021.csv has been processed.
ACEN_2020.csv has been processed.
ALI_2017.csv has been processed.
AGI_2017.csv has been processed.
ACEN_2018.csv has been processed.
ACEN_2019.csv has been processed.
MEG_2019.csv has been processed.
AP_2017.csv has been processed.
MEG_2018.csv has been processed.
ALI_2020.csv has been processed.
AGI_2018.csv has been processed.
AGI_2019.csv has been processed.
ALI_2021.csv has been processed.
AP_2019.csv has been processed.
AP_2018.csv has been processed.
MEG_2017.csv has been processed.
AP_2020.csv h

In [91]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity", problem_type="multi_label_classification")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_ids = torch.arange(0, logits.shape[-1])[torch.sigmoid(logits).squeeze(dim=0) > 0.5]

# To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
num_labels = len(model.config.id2label)
model = BertForSequenceClassification.from_pretrained(
    "textattack/bert-base-uncased-yelp-polarity", num_labels=num_labels, problem_type="multi_label_classification"
)

labels = torch.sum(
    torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=num_labels), dim=1
).to(torch.float)
loss = model(**inputs, labels=labels).loss

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/520 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
import re
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import nltk

# download the NLTK tokenizer
nltk.download('punkt')

# initialize the FinBERT model and tokenizer
# model_name = 'ProsusAI/finbert'
# model = AutoModelForSequenceClassification.from_pretrained(model_name)

model_name = "yiyanghkust/finbert-tone"
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# set a fixed seed for the random number generator
torch.manual_seed(0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# define the path to your input folder
input_folder = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_exports'

# define the path to your output folder
output_folder = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_processed'

# define a regular expression pattern to match one-word sentences
pattern = re.compile(r'^\w+(\s+\w+)*[.?!]$')

# iterate over all files in the input folder
for file_name in os.listdir(input_folder):
    # define the path to the input file
    file_path = os.path.join(input_folder, file_name)
    
    # open the input file and read its contents
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # split the text into sentences using NLTK
    sentences = nltk.sent_tokenize(text)

    # filter out sentences that match the pattern
    filtered_sentences = [sent for sent in sentences if not pattern.match(sent)]

    # create an empty DataFrame to store the results
    sentiment_df = pd.DataFrame(columns=['sentence', 'positive', 'neutral', 'negative'])

    for sent in filtered_sentences:
        # tokenize the sentence and add special tokens for classification
        inputs = tokenizer(sent, return_tensors='pt', padding=True, truncation=True)
        # process using CPU
        device = torch.device('cpu')
        model.to(device)
        inputs.to(device)
        # classify the sentiment using the FinBERT model
        outputs = model(**inputs)
        logits = outputs.logits.detach().cpu().numpy()[0]
        probs = torch.softmax(outputs.logits, dim=1).tolist()[0]
        # save the sentence and its sentiment scores in the DataFrame
        new_row = pd.DataFrame({'sentence': [sent], 'positive': [probs[0]], 'neutral': [probs[2]], 'negative': [probs[1]]})
        sentiment_df = pd.concat([sentiment_df, new_row], ignore_index=True)

    # define the path to the output file
    output_file_path = os.path.join(output_folder, file_name.replace('.txt', '.csv'))

    # save the results to a CSV file
    sentiment_df.to_csv(output_file_path, index=False)

    # print a message to indicate that the file has been processed
    print(f"{file_name} has been processed.")


In [90]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

sentences = ["Consolidated EBITDA increased by 8% in 2018, primarily due to the fresh contributions from PEC and Hedcor Bukidnon, which commenced commercial operations in March 2018 and July 2018, respectively, and further augmented by higher contributions from GMCP due to higher availability factor in 2018 as compared to the previous year."]
results = nlp(sentences)
print(results)  #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative


[{'label': 'Positive', 'score': 0.9999780654907227}]


In [89]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

sentences = ["there is a shortage of capital, and we need extra financing",  
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]
results = nlp(sentences)
print(results)  #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative


[{'label': 'Negative', 'score': 0.9966174960136414}, {'label': 'Positive', 'score': 1.0}, {'label': 'Negative', 'score': 0.9999710321426392}, {'label': 'Neutral', 'score': 0.9889442920684814}]


In [None]:
import os
import csv
import nltk

# set the path to the input folder
input_folder = "/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_exports"

# set the path to the output folder
output_folder = "/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed"

# create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# initialize the NLTK sentence tokenizer
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# iterate over all files in the input folder
for file_name in os.listdir(input_folder):
    # define the path to the input file
    file_path = os.path.join(input_folder, file_name)
    
    # open the input file and read its contents
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # split the text into sentences using NLTK
    sentences = tokenizer.tokenize(text)

    # create a list of tuples, where each tuple contains the filename and a sentence
    rows = [(file_name, sent.strip()) for sent in sentences]

    # define the path to the output file
    output_file_path = os.path.join(output_folder, file_name.replace('.txt', '.csv'))

    # write the list of tuples to a CSV file
    with open(output_file_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['filename', 'sentence'])
        writer.writerows(rows)

    # print a message to indicate that the file has been processed
    print(f"{file_name} has been processed.")


In [88]:
import os
import re
from pdfminer.high_level import extract_text

# set the path to the input folder
input_folder = "/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/testing/test_convert_pdf"

# set the path to the output folder
output_folder = "/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/testing/test_convert_output"

# create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# iterate over all files in the input folder
for file_name in os.listdir(input_folder):
    # check if the file is a PDF file
    if file_name.endswith(".pdf"):
        # define the path to the input file
        input_file_path = os.path.join(input_folder, file_name)
        
        # define the path to the output file
        output_file_path = os.path.join(output_folder, os.path.splitext(file_name)[0] + ".txt")
        
        # extract the text from the PDF file
        text = extract_text(input_file_path)
        
        # find the start and end positions of the target section
        start_pos = text.find("Management’s Discussion and Analysis", text.find("Management’s Discussion and Analysis")+1)
        end_pos = text.find("Financial Statements and Supplementary Schedules", text.find("Financial Statements and Supplementary Schedules")+1)
        
        # check if the target section was found
        if start_pos == -1 or end_pos == -1:
            print(f"Target section not found in {file_name}.")
            continue
        
        # extract the target section
        target_section = text[start_pos:end_pos]
        
        # write the extracted text to the output file
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(target_section)
        
        # print a message to indicate that the file has been processed
        print(f"PDF file {input_file_path} has been processed and saved as {output_file_path}.")


PDF file /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/testing/test_convert_pdf/AC_2017.pdf has been processed and saved as /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/testing/test_convert_output/AC_2017.txt.


In [93]:
import csv
import os

input_folder = "/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed"

# iterate over all CSV files in the input folder
for file_name in os.listdir(input_folder):
    if file_name.endswith(".csv"):
        print(f"Processing CSV file: {file_name}")
        # define the path to the input file
        input_file_path = os.path.join(input_folder, file_name)

        # read the data from the CSV file
        with open(input_file_path, newline='', encoding='utf-8') as fh:
            reader = csv.DictReader(fh)
            rows = list(reader)

        # update the sentences by removing newlines
        for row in rows:
            row['sentence'] = row['sentence'].replace('\n', ' ')

        # write the updated data back to the CSV file
        with open(input_file_path, 'w', newline='', encoding='utf-8') as fh:
            writer = csv.DictWriter(fh, fieldnames=['filename', 'sentence'])
            writer.writeheader()
            writer.writerows(rows)

        print(f"CSV file {input_file_path} has been updated.")


Processing CSV file: AEV_2021.csv
CSV file /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed/AEV_2021.csv has been updated.
Processing CSV file: GLO_2018.csv
CSV file /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed/GLO_2018.csv has been updated.
Processing CSV file: GLO_2019.csv
CSV file /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed/GLO_2019.csv has been updated.
Processing CSV file: AEV_2020.csv
CSV file /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed/AEV_2020.csv has been updated.
Processing CSV file: BPI_2017.csv
CSV file /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed/BPI_2017.csv has been updated.
Processing CSV file: BDO_2017.csv
CSV file /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17a_scrubbed/BDO_2017.csv has been updated.
Processing CSV file: AC_2017.csv
CSV file /Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/_data/17

ValueError: dict contains fields not in fieldnames: 'year', 'ticker'