# PART C: Modeling

## Implement a statistical model baseline

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression

## Load and Prepare Dataset

This section loads our own labelled dataset from a CSV file and removes any rows with missing 'Sentiment' values. It also loads the public Financial PhraseBank dataset from an external source. The two datasets are combined in the next section.

In [4]:
# Load our own labelled dataset from a CSV file.
# The 'utf-8-sig' codec strips a leading byte-order mark if the file has one.
new_df = pd.read_csv('final-data-updated.csv', encoding='utf-8-sig')
# Remove rows that are missing the label OR the text. A missing 'Summary' is read
# as NaN (a float), which the TF-IDF vectorizer cannot process as a document,
# so such rows must not reach the training set.
new_df = new_df.dropna(subset=['Summary', 'Sentiment'])

# Load additional dataset from the 'financial_phrasebank'
# (configuration 'sentences_50agree'); only its 'train' split is used here.
dataset = load_dataset('financial_phrasebank', 'sentences_50agree')
train_dataset = dataset['train']

# Extract sentences and labels. list() guarantees plain Python lists, whatever
# sequence type the installed `datasets` version returns for a column.
x = list(train_dataset['sentence'])
y = list(train_dataset['label'])

# Extract the text and label columns from our own dataset.
# NOTE(review): these labels are later appended to the PhraseBank labels, so
# 'Sentiment' must use the same label encoding as `train_dataset['label']` - confirm.
new_x = new_df['Summary'].tolist()
new_y = new_df['Sentiment'].tolist()

## Split Dataset

Split the Financial PhraseBank data into training and test sets (80/20) so the models can be evaluated on held-out sentences. Our own labelled data is then appended to the training set only, so the test set contains PhraseBank sentences exclusively.

In [5]:
# Hold out a fixed fraction of the PhraseBank samples for testing;
# the fixed seed makes the split reproducible across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Our own samples are added to the training portion only;
# the test portion is left untouched.
x_train.extend(new_x)
y_train.extend(new_y)

# Report the resulting sizes
print(f"Trainset: {len(x_train)}")
print(f"Validset: {len(x_test)}")

Trainset: 6152
Validset: 970


## Feature Extraction with TF-IDF

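Use Term Frequency-Inverse Document Frequency (TF-IDF) to convert the text data into numeric feature vectors that the machine learning models can consume. The vectorizer is fitted on the training data only and then applied unchanged to the test data.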

In [6]:
# Upper bound on the vocabulary size kept by the vectorizer
MAX_TFIDF_FEATURES = 1000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# Learn the vocabulary and IDF weights from the training texts only,
# then convert the sparse result to a dense array
train_sparse = vectorizer.fit_transform(x_train)
x_train_tfidf = train_sparse.toarray()

# Encode the held-out texts with the already-fitted vectorizer (no re-fitting)
valid_sparse = vectorizer.transform(x_test)
x_valid_tfidf = valid_sparse.toarray()

In [8]:
# Logistic Regression baseline trained on the TF-IDF features.
# The `multi_class` argument is deprecated since scikit-learn 1.5 and scheduled
# for removal. With the 'lbfgs' solver a multinomial model is already what gets
# fitted for problems with three or more classes, so the argument is omitted to
# avoid the FutureWarning without changing the fitted model.
# NOTE(review): assumes the labels contain 3+ sentiment classes - confirm.
model = LogisticRegression(solver='lbfgs')
model.fit(x_train_tfidf, y_train)

## Train the SVM Classifier

Train a Support Vector Machine (SVM) classifier with a linear kernel on the TF-IDF features. Its performance on the test set is evaluated in the next section.

In [9]:
# Linear-kernel SVM with regularization parameter C=1
svm_classifier = SVC(C=1, kernel='linear')
# Fit it on the dense TF-IDF training matrix and the matching labels
svm_classifier.fit(X=x_train_tfidf, y=y_train)

## Predict and Calculate the Accuracy

This section predicts labels for the test data with both classifiers (the SVM and the logistic regression model, each trained on the same TF-IDF features) and calculates their accuracy and weighted F1 scores.

In [12]:
# Predict labels for the test data with both classifiers.
# Both models consume the same TF-IDF features; they differ only in the classifier.
y_pred_svm = svm_classifier.predict(x_valid_tfidf)
y_pred = model.predict(x_valid_tfidf)  # logistic regression predictions

# Calculate accuracy and F1 score. average='weighted' averages the per-class F1
# scores weighted by each class's support in y_test.

# SVM evaluation
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_f1_score = f1_score(y_test, y_pred_svm, average='weighted')

# Logistic regression evaluation
logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_f1_score = f1_score(y_test, y_pred, average='weighted')

# Legacy names: these metrics were previously labelled "TF-IDF", which is the
# feature representation shared by both models, not a model. The old variable
# names are kept so any later cell that still reads them keeps working.
TF_IDF_accuracy = logreg_accuracy
TF_IDF_score = logreg_f1_score

print(f"SVM Accuracy: {svm_accuracy:.5f}")
print(f"SVM F1-score: {svm_f1_score:.5f}")

print(f"Logistic Regression Accuracy: {logreg_accuracy:.5f}")
print(f"Logistic Regression F1-score: {logreg_f1_score:.5f}")

SVM Accuracy: 0.77629
SVM F1-score: 0.76472
TF-IDF Accuracy: 0.77216
TF-IDF F1-score: 0.76128
