# Model training

trained using the same 2000 entries. train-test split 8-2

In [1]:
# labelling all the data using the best model: distilbert

import pandas as pd

df = pd.read_csv("data/Training Data/headlines_with_sentiment.csv")
print(df.head())
print(df.describe())

   Index                                           Headline Sentiment_label
0      0   Johnson is asking Santa for a Christmas recovery             POS
1      1  ‘I now fear the worst’: four grim tales of wor...             NEG
2      2  Five key areas Sunak must tackle to serve up e...             NEU
3      3  Covid-19 leaves firms ‘fatally ill-prepared’ f...             NEG
4      4  The Week in Patriarchy Bacardi's 'lady vodka':...             NEG
             Index
count  1984.000000
mean    993.127520
std     575.397447
min       0.000000
25%     495.750000
50%     991.500000
75%    1487.250000
max    1999.000000
   Index                                           Headline Sentiment_label
0      0   Johnson is asking Santa for a Christmas recovery             POS
1      1  ‘I now fear the worst’: four grim tales of wor...             NEG
2      2  Five key areas Sunak must tackle to serve up e...             NEU
3      3  Covid-19 leaves firms ‘fatally ill-prepared’ f...          

In [2]:
# set up

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if not isinstance(text, str) or text is None:
        return ""
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    test_sentence = " ".join(cleaned_tokens)
    return test_sentence

headline = list(df['Headline'])
processed_headline = list(map(preprocess, headline))

data = pd.DataFrame({'Headline': processed_headline, 'Sentiment': df['Sentiment_label'] })
data.head()

[nltk_data] Downloading package wordnet to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Jay
[nltk_data]     Tai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Headline,Sentiment
0,johnson asking santa christmas recovery,POS
1,fear worst four grim tale working life upended,NEG
2,five key area sunak must tackle serve economic...,NEU
3,leaf firm fatally brexit,NEG
4,week patriarchy bacardi vodka latest long line...,NEG


In [3]:
# Split the dataset  into test and train
# 90% train , 10% test and random state 999

from sklearn.model_selection import train_test_split
import numpy as np
np.random.seed(999)

# Define the features and target variable
X = data['Headline']  # Features
y = data['Sentiment']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=999)

# Print the shape of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (1785,)
X_test shape: (199,)
y_train shape: (1785,)
y_test shape: (199,)


## Testing using different models

In [17]:
# SVM

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LinearSVC())
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("LinearSVC")

print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

LinearSVC
accuracy score: 90.45%
Confusion Matrix:
 [[171   1   1]
 [  4   2   3]
 [ 10   0   7]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.92      0.99      0.96       173
         NEU       0.67      0.22      0.33         9
         POS       0.64      0.41      0.50        17

    accuracy                           0.90       199
   macro avg       0.74      0.54      0.60       199
weighted avg       0.89      0.90      0.89       199



In [18]:
# LR

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Create a pipeline with CountVectorizer, TfidfTransformer, and Logistic Regression
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegression())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test dataset
y_pred = pipeline.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression")
print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Logistic Regression
accuracy score: 86.93%
Confusion Matrix:
 [[173   0   0]
 [  9   0   0]
 [ 17   0   0]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.87      1.00      0.93       173
         NEU       0.00      0.00      0.00         9
         POS       0.00      0.00      0.00        17

    accuracy                           0.87       199
   macro avg       0.29      0.33      0.31       199
weighted avg       0.76      0.87      0.81       199



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
# Multinomial NB

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Create a pipeline with CountVectorizer, TfidfTransformer, and Multinomial Naive Bayes
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', MultinomialNB())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test dataset
y_pred = pipeline.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("MultinomialNB")
print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

MultinomialNB
accuracy score: 86.93%
Confusion Matrix:
 [[173   0   0]
 [  9   0   0]
 [ 17   0   0]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.87      1.00      0.93       173
         NEU       0.00      0.00      0.00         9
         POS       0.00      0.00      0.00        17

    accuracy                           0.87       199
   macro avg       0.29      0.33      0.31       199
weighted avg       0.76      0.87      0.81       199



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [20]:
# Binomial NB

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Create a pipeline with CountVectorizer, TfidfTransformer, and Bernoulli Naive Bayes
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', BernoulliNB())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test dataset
y_pred = pipeline.predict(X_test)
print("BernoulliNB")
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


BernoulliNB
accuracy score: 86.43%
Confusion Matrix:
 [[172   0   1]
 [  9   0   0]
 [ 17   0   0]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.87      0.99      0.93       173
         NEU       0.00      0.00      0.00         9
         POS       0.00      0.00      0.00        17

    accuracy                           0.86       199
   macro avg       0.29      0.33      0.31       199
weighted avg       0.76      0.86      0.81       199



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [21]:
# Grad Boost

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Create a pipeline with CountVectorizer, TfidfTransformer, and Gradient Boosting Classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', GradientBoostingClassifier())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test dataset
y_pred = pipeline.predict(X_test)
print("GradientBoostingClassifier")
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

GradientBoostingClassifier
accuracy score: 87.94%
Confusion Matrix:
 [[172   1   0]
 [  7   0   2]
 [ 12   2   3]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.90      0.99      0.95       173
         NEU       0.00      0.00      0.00         9
         POS       0.60      0.18      0.27        17

    accuracy                           0.88       199
   macro avg       0.50      0.39      0.41       199
weighted avg       0.83      0.88      0.84       199



In [22]:
# XG Boost

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Create a label encoder
label_encoder = LabelEncoder()

# Fit the label encoder on the sentiment labels and transform them to numerical values
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create a pipeline with CountVectorizer, TfidfTransformer, and XGBoost Classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', XGBClassifier())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train_encoded)

# Predict on the test dataset
y_pred_encoded = pipeline.predict(X_test)
print("XGBoostClassifier")
# Calculate accuracy score
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test_encoded, y_pred_encoded))

# Print classification report
print("Classification Report:\n", classification_report(y_test_encoded, y_pred_encoded, target_names=label_encoder.classes_))

XGBoostClassifier
accuracy score: 86.43%
Confusion Matrix:
 [[168   1   4]
 [  8   1   0]
 [ 13   1   3]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.89      0.97      0.93       173
         NEU       0.33      0.11      0.17         9
         POS       0.43      0.18      0.25        17

    accuracy                           0.86       199
   macro avg       0.55      0.42      0.45       199
weighted avg       0.82      0.86      0.84       199



In [23]:
# DT

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming you have already split the data into X_train, X_test, y_train, y_test
# If not, please refer to the previous code snippets

# Create a pipeline with CountVectorizer, TfidfTransformer, and Decision Tree Classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', DecisionTreeClassifier())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test dataset
y_pred = pipeline.predict(X_test)
print("DecisionTreeClassifier")
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

print(pipeline.named_steps['model'].feature_importances_)

DecisionTreeClassifier
accuracy score: 83.42%
Confusion Matrix:
 [[161   4   8]
 [  5   1   3]
 [ 11   2   4]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.91      0.93      0.92       173
         NEU       0.14      0.11      0.12         9
         POS       0.27      0.24      0.25        17

    accuracy                           0.83       199
   macro avg       0.44      0.43      0.43       199
weighted avg       0.82      0.83      0.83       199

[0. 0. 0. ... 0. 0. 0.]


In [24]:
# RF

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming you have already split the data into X_train, X_test, y_train, y_test
# If not, please refer to the previous code snippets

# Create a pipeline with CountVectorizer, TfidfTransformer, and Decision Tree Classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', RandomForestClassifier())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test dataset
y_pred = pipeline.predict(X_test)
print("RandomForestClassifier")
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

RandomForestClassifier
accuracy score: 87.44%
Confusion Matrix:
 [[172   0   1]
 [  8   1   0]
 [ 15   1   1]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.88      0.99      0.93       173
         NEU       0.50      0.11      0.18         9
         POS       0.50      0.06      0.11        17

    accuracy                           0.87       199
   macro avg       0.63      0.39      0.41       199
weighted avg       0.83      0.87      0.83       199



In [25]:
# KNN

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Create a pipeline with CountVectorizer, TfidfTransformer, and K-Nearest Neighbors Classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', KNeighborsClassifier())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test dataset
y_pred = pipeline.predict(X_test)
print("KNeighborsClassifier")
accuracy = accuracy_score(y_test, y_pred)
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("accuracy score: {:.2f}%".format(accuracy * 100))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

KNeighborsClassifier
accuracy score: 87.94%
Confusion Matrix:
 [[171   1   1]
 [  7   1   1]
 [ 14   0   3]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.89      0.99      0.94       173
         NEU       0.50      0.11      0.18         9
         POS       0.60      0.18      0.27        17

    accuracy                           0.88       199
   macro avg       0.66      0.43      0.46       199
weighted avg       0.85      0.88      0.85       199



## Compare the best ML model with available Libraries

In [12]:
# SVM

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LinearSVC())
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(pipeline.named_steps['model'].decision_function(X_test))

ValueError: could not convert string to float: 'behind job returned key takeaway june u job report'

In [5]:
from transformers import pipeline, set_seed
set_seed(999)

# Initialize DistilBERT pipeline
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

# Apply DistilBERT to test set
y_pred_distilbert = [classifier(headline)[0]['label'] for headline in X_test]
y_pred_distilbert = ['POS' if s == 'POSITIVE' else 'NEG' if s == 'NEGATIVE' else 'NEU' for s in y_pred_distilbert]

# Compute and print metrics
accuracy_distilbert = accuracy_score(y_test, y_pred_distilbert)
print("DistilBERT")
print("Accuracy score: {:.2f}%".format(accuracy_distilbert * 100))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_distilbert))
print("Classification Report:\n", classification_report(y_test, y_pred_distilbert))

Device set to use cpu


DistilBERT
Accuracy score: 80.40%
Confusion Matrix:
 [[157   0  16]
 [  6   0   3]
 [ 14   0   3]]
Classification Report:
               precision    recall  f1-score   support

         NEG       0.89      0.91      0.90       173
         NEU       0.00      0.00      0.00         9
         POS       0.14      0.18      0.15        17

    accuracy                           0.80       199
   macro avg       0.34      0.36      0.35       199
weighted avg       0.78      0.80      0.79       199



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
