In [147]:
import warnings
warnings.filterwarnings('ignore')

In [148]:
# Load data
import pandas as pd

df = pd.read_csv('data/News Articles/Amy/Testing.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Author,Date,Desc,Sentiment
0,0,Europe Races to Craft Plan to Save Zelenskiy a...,"By Alex Wickham, Andrea Palasciano, and Irina ...",2025.3.3 at GMT+8 07:02 Updated on 2025.3.3 at...,The Polish zloty recently hit its strongest le...,NEU
1,1,European Leaders Seek ??Coalition of Willing?�...,By Alex Wickham and Irina Anghel,2025.3.2 at GMT+8 17:26 Updated on 2025.3.3 at...,"European leaders are working to form a ""coalit...",NEU
2,2,"Trump Sees Zelenskiy as the Problem, But Ukrai...","By Daryna Krasnolutska, Olesia Safronova, and ...",2025.3.2 at GMT+8 15:14,Tensions between US President Donald Trump and...,NEU
3,3,"Trump Heads Toward Tariff Barrage on Canada, M...",By Josh Wingrove,2025.3.3 at GMT+8 03:18 Updated on 2025.3.3 at...,President Trump plans to impose 25% tariffs on...,NEG
4,4,Bessent Says He Sees Inflation Reaching Fed?�?...,By Tony Czuczka,2025.3.3 at GMT+8 00:21,Treasury Secretary Scott Bessent expressed con...,NEG


In [149]:
# Preprocess data

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if not isinstance(text, str) or text is None:
        return ""
    # Remove extra spaces and normalize
    text = ' '.join(text.split())
    tokens = word_tokenize(text.lower())
    # Retain alphanumeric tokens to keep numbers (e.g., "20%")
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(cleaned_tokens)

df['Title_processed'] = df['Title'].apply(preprocess)
df['Desc_processed'] = df['Desc'].apply(preprocess)

[nltk_data] Downloading package wordnet to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [150]:
# Python libraries

# 1.DistilBERT
from transformers import pipeline
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def DistilBERT(sentence):
    cl = classifier(sentence)
    t = cl[0]['label']
    score = cl[0]['score']
    label = 'NEU'
    if t == 'POSITIVE':
        label = 'POS'
    elif t == 'NEGATIVE':
        label = 'NEG'
    return (label, score)

# 2.Flair
from flair.data import Sentence
from flair.nn import Classifier
tagger = Classifier.load('sentiment')

def Flair(sentence):
    sentence = Sentence(sentence)
    tagger.predict(sentence)
    cl = sentence.labels[0]
    t = cl.value
    label = 'NEU'
    if t == 'POSITIVE':
        label = 'POS'
    elif t == 'NEGATIVE':
        label = 'NEG'
    score = cl.score
    return (label, score)

# 3.FinBERT
classifier1 = pipeline("sentiment-analysis", model="ProsusAI/finbert")

def FinBERT(sentence):
    cl = classifier(sentence)
    t = cl[0]['label']
    score = cl[0]['score']
    label = 'NEU'
    if t == 'positive':
        label = 'POS'
    elif t == 'negative':
        label = 'NEG'
    return (label, score)

print(DistilBERT('Hello'))
print(Flair('Hello'))
print(FinBERT('Hello'))


('POS', 0.9995185136795044)
('POS', 0.8004733920097351)
('NEU', 0.9995185136795044)


In [151]:
# Simple ML models

# data which they were trained on
df1 = pd.read_csv("data/News Articles/headlines_with_sentiment (grok).csv")
df1['Headline'] = df1['Headline'].apply(preprocess)
X = df1['Headline']
y = df1['Sentiment_label']

# 1.SVM
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC

pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LinearSVC())
])
pipeline_svc.fit(X, y)

def SVC(sentence):
    return pipeline_svc.predict(sentence)

# 2.KNN
# KNN
from sklearn.neighbors import KNeighborsClassifier

pipeline_knn = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', KNeighborsClassifier())
])
pipeline_knn.fit(X, y)

def KNN(sentence):
    return pipeline_knn.predict(sentence)

# 3.GBDT
from sklearn.ensemble import GradientBoostingClassifier

pipeline_GBDT = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', GradientBoostingClassifier())
])

pipeline_GBDT.fit(X, y)

def GBDT(sentence):
    return pipeline_GBDT.predict(sentence)

print(SVC(['Herllo i am good']))
print(KNN(['Herllo i am bad']))
print(GBDT(['Herllo i am bad']))

['POS']
['NEG']
['NEG']


In [172]:
df['DistilBERT'] = df['Title_processed'].apply(DistilBERT).apply(lambda x:x[0] )
df['Flair'] = df['Title_processed'].apply(Flair).apply(lambda x:x[0] )
df['FinBERT'] = df['Title_processed'].apply(FinBERT).apply(lambda x:x[0] )
df['SVC'] = pipeline_svc.predict(df['Title_processed'])
df['KNN'] = pipeline_knn.predict(df['Title_processed'])
df['GBDT'] = pipeline_GBDT.predict(df['Title_processed'])

In [169]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(accuracy_score(df['Sentiment'], df['DistilBERT']))
print(accuracy_score(df['Sentiment'], df['Flair']))
print(accuracy_score(df['Sentiment'], df['FinBERT']))
print(accuracy_score(df['Sentiment'], df['SVC']))
print(accuracy_score(df['Sentiment'], df['KNN']))
print(accuracy_score(df['Sentiment'], df['GBDT']))

0.43902439024390244
0.4634146341463415
0.3780487804878049
0.4268292682926829
0.4024390243902439
0.3902439024390244


In [171]:
print(classification_report(df['Sentiment'], df['DistilBERT']))

              precision    recall  f1-score   support

         NEG       0.44      0.90      0.59        31
         NEU       0.00      0.00      0.00        31
         POS       0.44      0.40      0.42        20

    accuracy                           0.44        82
   macro avg       0.29      0.43      0.34        82
weighted avg       0.27      0.44      0.33        82

