# Machine Learning
This notebook trains Recurrent Neural Network (RNN) and Multinomial Naive Bayes (MNB) models to predict the sentiment of construction news from Ireland and the UK.

In [10]:
# Optional one-time environment setup: uncomment to install the extra packages.
# NOTE(review): prefer `%pip install` with pinned versions so the install
# targets the kernel's own environment and the notebook stays reproducible.
#!pip install --upgrade gensim
#!pip install scikeras[tensorflow]

In [1]:
import pandas as pd
from functions import RNN, MNB
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
import plotly.express as px

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the cleaned news datasets, one DataFrame per country (Ireland first, then UK).
irelandNewsDF, ukNewsDF = (
    pd.read_csv(csv_path)
    for csv_path in ('datasets/ie_news_cleaned.csv', 'datasets/uk_news_cleaned.csv')
)

In [3]:
# Preview the first rows of the Ireland dataset to check the loaded columns.
irelandNewsDF.head()

Unnamed: 0.1,Unnamed: 0,title,publish_date,source_country,sentiment,cleaned_text,sentiment_label,word_length,char_length,month
0,1,Ministers Harris and Ryan welcome record numbe...,2023-01-17 21:41:06,ie,0.341,minister harris ryan welcome record number enr...,positive,16,113,January 2023
1,2,New €4.8m state-of-the-art facility planned fo...,2022-12-05 11:06:50,ie,0.377,new 4.8m state-of-the-art facility planned ucd...,positive,9,62,December 2022
2,3,‘Not possible’ to make Northern Ireland Protoc...,2023-02-01 20:45:45,ie,-0.146,possible make northern ireland protocol work e...,negative,12,69,February 2023
3,4,Shoppers must demand longer lasting goods and ...,2023-01-10 16:14:04,ie,-0.077,shopper must demand longer lasting good le pac...,negative,15,95,January 2023
4,5,Opportunity knocks for investors willing to ma...,2022-12-06 07:00:00,ie,-0.137,opportunity knock investor willing make long-t...,negative,13,90,December 2022


In [4]:
# Number of entries in the `sentiment_label` column of the Ireland dataset.
irelandNewsDF['sentiment_label'].shape

(426,)

## RNN Model

### Ireland RNN Model

In [5]:
# Train the baseline RNN on the Ireland dataset. The call prints the evaluation
# metrics shown below; its five return values are unpacked as the model plus the
# train/test inputs and labels that the tuning cells reuse.
model, X_train, X_test, y_train, y_test = RNN.train_model(irelandNewsDF)

Accuracy:  0.26744186046511625
Precision: 0.23958333333333334
Recall: 0.14743589743589744
F1 Score: 0.18253968253968256


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Run the hyperparameter search for the Ireland RNN. The returned object is read
# in the next cell through `best_estimator_`, `cv_results_` and `best_params_`.
ie_RNN_grid_search = RNN.hypertune(model, X_train_vec=X_train, y_train=y_train, y_test=y_test, X_test_vec=X_test)



In [7]:
# Get the best model found by the Ireland grid search
ie_best_model = ie_RNN_grid_search.best_estimator_

results = ie_RNN_grid_search.cv_results_
params = results['params']
mean_test_scores = results['mean_test_score']

# Print the mean cross-validated score for each parameter combination
for param, score in zip(params, mean_test_scores):
    print("Parameters:", param)
    print("Accuracy:", score)
    print()

print("BEST PARAMS: ", ie_RNN_grid_search.best_params_)

# Evaluate the best model on the test set. `evaluate` returns the loss followed
# by every compiled metric (four values in the recorded output), so ask for a
# name -> value mapping and print each value under its own name instead of
# labelling the whole list "Test Accuracy".
test_metrics = ie_best_model.model.evaluate(X_test, y_test, return_dict=True)

for metric_name, metric_value in test_metrics.items():
    print(f"Test {metric_name}: {metric_value}")

Parameters: {'batch_size': 32, 'epochs': 10}
Accuracy: 0.31764705882352945

Parameters: {'batch_size': 32, 'epochs': 20}
Accuracy: 0.26764705882352946

Parameters: {'batch_size': 32, 'epochs': 30}
Accuracy: 0.3088235294117647

Parameters: {'batch_size': 64, 'epochs': 10}
Accuracy: 0.30294117647058827

Parameters: {'batch_size': 64, 'epochs': 20}
Accuracy: 0.29411764705882354

Parameters: {'batch_size': 64, 'epochs': 30}
Accuracy: 0.3647058823529412

Parameters: {'batch_size': 128, 'epochs': 10}
Accuracy: 0.2235294117647059

Parameters: {'batch_size': 128, 'epochs': 20}
Accuracy: 0.3205882352941177

Parameters: {'batch_size': 128, 'epochs': 30}
Accuracy: 0.3205882352941177

BEST PARAMS:  {'batch_size': 64, 'epochs': 30}
Test Accuracy: [0.9164927005767822, 0.24418604373931885, 0.4655172526836395, 0.7941176295280457]


In [9]:
# Preview the first rows of the UK dataset to check the loaded columns.
ukNewsDF.head()

Unnamed: 0.1,Unnamed: 0,title,publish_date,source_country,sentiment,cleaned_text,sentiment_label,word_length,char_length,month
0,0,Biogas firm to hold consultation on £50 millio...,2022-11-28 14:19:32,GB,0.087,biogas firm hold consultation 50 million anaer...,positive,16,100,November 2022
1,1,Scots Biotechnology network’s investment hits ...,2023-02-06 15:54:43,GB,0.251,scots biotechnology network investment hits 25...,positive,7,57,February 2023
2,2,Council chases owners of thousands of empty ho...,2022-11-17 09:36:50,GB,-0.111,council chases owners thousands empty homes gl...,negative,10,60,November 2022
3,3,The west’s ‘dash for gas’ in Africa is nothing...,2022-11-13 07:00:04,GB,-0.018,west dash gas africa nothing energy colonialism,negative,13,75,November 2022
4,4,Mr Heaton-Harris was interested in how MJM Mar...,2022-11-17 14:26:52,GB,0.275,mr heaton-harris interested mjm marine mivan t...,positive,16,102,November 2022


In [10]:
# Train the baseline RNN on the UK dataset; the call prints the metrics shown below.
# Note: this rebinds `model`, replacing the Ireland model trained in the earlier cell.
model, uk_X_train, uk_X_test, uk_y_train, uk_y_test = RNN.train_model(ukNewsDF)

Accuracy:  0.2876712328767123
Precision: 0.23863636363636365
Recall: 0.18918918918918917
F1 Score: 0.21105527638190957


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### UK RNN Model

In [11]:
# Run the hyperparameter search for the UK RNN (`model` is the UK model at this
# point). The returned object is read in the next cell through
# `best_estimator_`, `cv_results_` and `best_params_`.
uk_RNN_grid_search = RNN.hypertune(model, X_train_vec=uk_X_train, y_train=uk_y_train, y_test=uk_y_test, X_test_vec=uk_X_test)



In [12]:
# Get the best model found by the UK grid search
uk_best_model = uk_RNN_grid_search.best_estimator_

results = uk_RNN_grid_search.cv_results_
params = results['params']
mean_test_scores = results['mean_test_score']

# Print the mean cross-validated score for each parameter combination
for param, score in zip(params, mean_test_scores):
    print("Parameters:", param)
    print("Accuracy:", score)
    print()

print("BEST PARAMS: ", uk_RNN_grid_search.best_params_)

# Evaluate the best model on the test set. `evaluate` returns the loss followed
# by every compiled metric (four values in the recorded output), so ask for a
# name -> value mapping and print each value under its own name instead of
# labelling the whole list "Test Accuracy".
# NOTE(review): the recorded run reported a negative loss here, which a
# cross-entropy loss should not produce with valid targets -- check the label
# encoding and loss function used inside RNN.train_model.
test_metrics = uk_best_model.model.evaluate(uk_X_test, uk_y_test, return_dict=True)

for metric_name, metric_value in test_metrics.items():
    print(f"Test {metric_name}: {metric_value}")

Parameters: {'batch_size': 32, 'epochs': 10}
Accuracy: 0.38283087027914614

Parameters: {'batch_size': 32, 'epochs': 20}
Accuracy: 0.3966633825944171

Parameters: {'batch_size': 32, 'epochs': 30}
Accuracy: 0.4035205254515599

Parameters: {'batch_size': 64, 'epochs': 10}
Accuracy: 0.35427914614121503

Parameters: {'batch_size': 64, 'epochs': 20}
Accuracy: 0.38749425287356326

Parameters: {'batch_size': 64, 'epochs': 30}
Accuracy: 0.39084400656814455

Parameters: {'batch_size': 128, 'epochs': 10}
Accuracy: 0.3760065681444992

Parameters: {'batch_size': 128, 'epochs': 20}
Accuracy: 0.3977274220032841

Parameters: {'batch_size': 128, 'epochs': 30}
Accuracy: 0.38526765188834156

BEST PARAMS:  {'batch_size': 32, 'epochs': 30}
Test Accuracy: [-18.21761131286621, 0.29223743081092834, 0.6328125, 0.75]


<i>Note: there was not enough time to collect the performance metrics from GridSearchCV and cross-validation programmatically, so the values plotted below were copied by hand from the printed results above.</i>

Hardcoding results is bad practice: the numbers go stale as soon as the models are retrained, so they should be read from the grid-search and cross-validation results instead.

In [9]:
def _grid_search_records(grid_search, country):
    """Return one chart record per parameter combination of a fitted grid search.

    Args:
        grid_search: fitted search object exposing `cv_results_` with the
            'params' and 'mean_test_score' entries.
        country: country code stored with every record ("ie" or "uk").

    Returns:
        A list of dicts with the keys "parameters" (e.g.
        'batch_size: 32, epochs: 10'), "accuracy" (the mean test score, which
        this notebook reports as accuracy) and "country".
    """
    cv_results = grid_search.cv_results_
    return [
        {
            "parameters": ", ".join(f"{key}: {value}" for key, value in combo.items()),
            "accuracy": score,
            "country": country,
        }
        for combo, score in zip(cv_results["params"], cv_results["mean_test_score"])
    ]


# Read the scores from the fitted grid searches instead of a hand-copied table:
# the copied values had the UK 'batch_size: 32' scores for 20 and 30 epochs
# swapped, and any copy goes stale whenever the search is re-run.
data = (
    _grid_search_records(ie_RNN_grid_search, "ie")
    + _grid_search_records(uk_RNN_grid_search, "uk")
)

RNNGridSearchMetric = pd.DataFrame(data)
fig = px.bar(RNNGridSearchMetric, x="parameters", y="accuracy",
             color='country', barmode='group',
             color_discrete_map={
                 "ie": "#FF883E",
                 "uk": "#00247D"
             },
             title="Accuracy of the RNN model for Ireland and United Kingdom News Models",
             # `labels` is keyed by column name; the previous 'x'/'y' keys matched no column.
             width=700, labels={'parameters': 'Parameters', 'accuracy': 'Accuracy'})
fig.show()

### Cross Validation of the RNN models

In [8]:
# Cross-validate the tuned Ireland RNN; the recorded output shows one block of
# Accuracy/Precision/Recall/F1 per iteration (five blocks).
# NOTE(review): `ie_best_model` is rebound to the Naive Bayes model in the MNB
# section, so run this cell before that section.
RNN.cross_validate(irelandNewsDF, ie_best_model)



Accuracy:  0.5813953488372093
Precision: 0.3877777777777778
Recall: 0.3904286770140429
F1 Score: 0.38749194068343


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy:  0.3058823529411765
Precision: 0.2222222222222222
Recall: 0.19259259259259257
F1 Score: 0.2063492063492063


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy:  0.27058823529411763
Precision: 0.23232323232323235
Recall: 0.17037037037037037
F1 Score: 0.19658119658119658


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy:  0.2
Precision: 0.21794871794871795
Recall: 0.1259259259259259
F1 Score: 0.15962441314553993


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy:  0.3411764705882353
Precision: 0.23577235772357721
Recall: 0.21481481481481482
F1 Score: 0.22480620155038758


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Cross-validate the tuned UK RNN; the recorded output shows one block of
# Accuracy/Precision/Recall/F1 per iteration (five blocks).
RNN.cross_validate(ukNewsDF, uk_best_model)



Accuracy:  0.3105022831050228
Precision: 0.24637681159420288
Recall: 0.20420420420420418
F1 Score: 0.22331691297208536


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy:  0.680365296803653
Precision: 0.45396825396825397
Recall: 0.4557800224466891
F1 Score: 0.4545848173755151


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy:  0.3532110091743119
Precision: 0.2467948717948718
Recall: 0.2333333333333333
F1 Score: 0.23987538940809972


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy:  0.2798165137614679
Precision: 0.24796747967479674
Recall: 0.1831831831831832
F1 Score: 0.21070811744386875


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy:  0.1834862385321101
Precision: 0.24691358024691357
Recall: 0.12012012012012012
F1 Score: 0.1616161616161616


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# NOTE(review): `go` is not used in the visible cells; the import is kept in case
# another cell relies on it, but it belongs in the imports cell at the top.
import plotly.graph_objects as go

# Per-iteration RNN cross-validation scores, transcribed from the printed output
# of the two RNN.cross_validate cells above (Ireland = "ie", United Kingdom = "uk").
# Each tuple is (Accuracy, Precision, Recall, F1 Score) for iterations 1..5.
_RNN_CV_SCORE_COLUMNS = ("Accuracy", "Precision", "Recall", "F1 Score")
_RNN_CV_SCORES = {
    "ie": [
        (0.5813953488372093, 0.3877777777777778, 0.3904286770140429, 0.38749194068343),
        (0.3058823529411765, 0.2222222222222222, 0.19259259259259257, 0.2063492063492063),
        (0.27058823529411763, 0.23232323232323235, 0.17037037037037037, 0.19658119658119658),
        (0.2, 0.21794871794871795, 0.1259259259259259, 0.15962441314553993),
        (0.3411764705882353, 0.23577235772357721, 0.21481481481481482, 0.22480620155038758),
    ],
    "uk": [
        (0.3105022831050228, 0.24637681159420288, 0.20420420420420418, 0.22331691297208536),
        (0.680365296803653, 0.45396825396825397, 0.4557800224466891, 0.4545848173755151),
        (0.3532110091743119, 0.2467948717948718, 0.2333333333333333, 0.23987538940809972),
        (0.2798165137614679, 0.24796747967479674, 0.1831831831831832, 0.21070811744386875),
        (0.1834862385321101, 0.24691358024691357, 0.12012012012012012, 0.1616161616161616),
    ],
}

# Keep the raw records and the DataFrame under separate names so re-running the
# cell never sees one name holding two different kinds of object.
crossValidationRecords = [
    {**dict(zip(_RNN_CV_SCORE_COLUMNS, scores)), "iteration": iteration, "country": country}
    for country, per_iteration in _RNN_CV_SCORES.items()
    for iteration, scores in enumerate(per_iteration, start=1)
]

crossValidationMetrics = pd.DataFrame(crossValidationRecords)

# One grouped bar chart per metric. `labels` is keyed by column name: the previous
# 'x'/'y' keys matched no column (and would have titled the Precision axis
# "Accuracy"), so only the x-axis column needs an explicit label.
for metric in ("Accuracy", "Precision"):
    fig = px.bar(crossValidationMetrics, x="iteration", y=metric,
                 color='country', barmode='group',
                 color_discrete_map={
                     "ie": "#FF883E",
                     "uk": "#00247D"
                 },
                 title=f"Cross Validation {metric} of the RNN model for Ireland and United Kingdom News Datasets",
                 width=700, labels={'iteration': 'Iteration'})
    fig.show()

## Naive Bayes Model

### Ireland MNB Model

In [5]:
# Train the baseline Naive Bayes model on the Ireland dataset; the call prints
# the metrics shown below.
# Note: this rebinds `y_train` and `y_test`, which the RNN section also uses.
ie_naive_bayes_model, X_train_features, X_test_features, y_train, y_test = MNB.train_model(irelandNewsDF)

Accuracy: 0.6395348837209303
Precision: 0.6754082137555666
Recall: 0.6395348837209303
F1 Score: 0.6427135884500226



In [6]:
# Tune the Ireland Naive Bayes model; the call prints the best parameters and
# the resulting test metrics shown below.
# Note: this rebinds `ie_best_model`, which held the tuned RNN in the RNN section.
ie_best_model = MNB.hypertune_model(ie_naive_bayes_model, X_train_features, y_train, X_test_features, y_test)

Best parameters: {'alpha': 1.0}
Accuracy: 0.6395348837209303
Precision: 0.6754082137555666
Recall: 0.6395348837209303
F1 Score: 0.6427135884500226





In [7]:
# Cross-validate the tuned Ireland Naive Bayes model; the recorded output shows
# one block of Accuracy/Precision/Recall/F1 per iteration (five blocks).
MNB.cross_validate(irelandNewsDF, ie_best_model)

Accuracy: 0.6511627906976745
Precision: 0.6498977174849269
Recall: 0.6511627906976745
F1 Score: 0.6423597792095493

Accuracy: 0.6941176470588235
Precision: 0.6955017301038063
Recall: 0.6941176470588235
F1 Score: 0.6912758346581875

Accuracy: 0.6823529411764706
Precision: 0.6947779111644659
Recall: 0.6823529411764706
F1 Score: 0.6807666886979511

Accuracy: 0.7058823529411765
Precision: 0.7118912080961417
Recall: 0.7058823529411765
F1 Score: 0.7006100775777662

Accuracy: 0.6470588235294118
Precision: 0.6483616702494952
Recall: 0.6470588235294118
F1 Score: 0.6473522890309117



  _warn_prf(average, modifier, msg_start, len(result))


### UK MNB Model

In [163]:
# Train the baseline Naive Bayes model on the UK dataset; the call prints the
# metrics shown below.
# Note: this rebinds `uk_y_train` and `uk_y_test`, which the RNN section also uses.
uk_naive_bayes_model, uk_X_train_features, uk_X_test_features, uk_y_train, uk_y_test = MNB.train_model(ukNewsDF)

Accuracy: 0.684931506849315
Precision: 0.6854142577016
Recall: 0.684931506849315
F1 Score: 0.6844706975365019



In [165]:
# Grid-search `alpha` for the UK Naive Bayes model with 5-fold cross-validation.
param_grid = {'alpha': [0.1, 0.5, 1.0]}
grid_search = GridSearchCV(uk_naive_bayes_model, param_grid, cv=5)
grid_search.fit(uk_X_train_features, uk_y_train)

# Score the best estimator on the held-out UK test features.
best_model = grid_search.best_estimator_
uk_y_pred = best_model.predict(uk_X_test_features)

print("Best parameters:", grid_search.best_params_)

# `evaluate_model` is neither defined nor imported in the visible notebook, so
# calling it fails with NameError on a fresh kernel. Use the sklearn metric
# functions imported in the imports cell instead.
# NOTE(review): weighted averaging is assumed because the recorded recall equals
# the recorded accuracy -- confirm against the evaluation helper used by MNB.
print("Accuracy:", accuracy_score(uk_y_test, uk_y_pred))
print("Precision:", precision_score(uk_y_test, uk_y_pred, average='weighted', zero_division=0))
print("Recall:", recall_score(uk_y_test, uk_y_pred, average='weighted', zero_division=0))
print("F1 Score:", f1_score(uk_y_test, uk_y_pred, average='weighted', zero_division=0))

Best parameters: {'alpha': 1.0}
Accuracy: 0.684931506849315
Precision: 0.6854142577016
Recall: 0.684931506849315
F1 Score: 0.6844706975365019




The least populated class in y has only 1 members, which is less than n_splits=5.



In [168]:
# Cross-validate the tuned UK Naive Bayes model (`best_model` comes from the grid
# search cell above); the recorded output shows one block of
# Accuracy/Precision/Recall/F1 per iteration (five blocks).
MNB.cross_validate(ukNewsDF, best_model)

Accuracy: 0.7168949771689498
Precision: 0.7183411220310845
Recall: 0.7168949771689498
F1 Score: 0.7166587413354794

Accuracy: 0.7442922374429224
Precision: 0.7427355749273558
Recall: 0.7442922374429224
F1 Score: 0.7421083978558666

Accuracy: 0.6880733944954128
Precision: 0.6884319592472613
Recall: 0.6880733944954128
F1 Score: 0.6880208815988633

Accuracy: 0.7568807339449541
Precision: 0.758277944205205
Recall: 0.7568807339449541
F1 Score: 0.7562800591666109

Accuracy: 0.6926605504587156
Precision: 0.6929617273653972
Recall: 0.6926605504587156
F1 Score: 0.6926799519768989




The least populated class in y has only 1 members, which is less than n_splits=5.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [6]:
# Per-iteration cross-validation scores of the tuned MNB models, as printed by
# the two MNB.cross_validate cells above (Ireland = "ie", United Kingdom = "uk").
# Each tuple is (Accuracy, Precision, Recall, F1 Score) for iterations 1..5.
_NB_SCORE_COLUMNS = ("Accuracy", "Precision", "Recall", "F1 Score")
_NB_CV_SCORES = {
    "ie": [
        (0.6511627906976745, 0.6498977174849269, 0.6511627906976745, 0.6423597792095493),
        (0.6941176470588235, 0.6955017301038063, 0.6941176470588235, 0.6912758346581875),
        (0.6823529411764706, 0.6947779111644659, 0.6823529411764706, 0.6807666886979511),
        (0.7058823529411765, 0.7118912080961417, 0.7058823529411765, 0.7006100775777662),
        (0.6470588235294118, 0.6483616702494952, 0.6470588235294118, 0.6473522890309117),
    ],
    "uk": [
        (0.7168949771689498, 0.7183411220310845, 0.7168949771689498, 0.7166587413354794),
        (0.7442922374429224, 0.7427355749273558, 0.7442922374429224, 0.7421083978558666),
        (0.6880733944954128, 0.6884319592472613, 0.6880733944954128, 0.6880208815988633),
        (0.7568807339449541, 0.758277944205205, 0.7568807339449541, 0.7562800591666109),
        (0.6926605504587156, 0.6929617273653972, 0.6926605504587156, 0.6926799519768989),
    ],
}

# Expand the compact table into one record per (country, iteration), keeping the
# key order iteration -> scores -> country so the DataFrame columns are unchanged.
NBMetrics = [
    {"iteration": iteration, **dict(zip(_NB_SCORE_COLUMNS, scores)), "country": country}
    for country, per_iteration in _NB_CV_SCORES.items()
    for iteration, scores in enumerate(per_iteration, start=1)
]

NBMetricsDF = pd.DataFrame(NBMetrics)

In [7]:
# One grouped bar chart per metric, comparing the Ireland and UK MNB models
# across the cross-validation iterations. `labels` is keyed by column name: the
# previous 'x'/'y' keys matched no column (and would have titled the Precision
# axis "Accuracy"), so only the x-axis column needs an explicit label.
for metric in ("Accuracy", "Precision"):
    fig = px.bar(NBMetricsDF, x="iteration", y=metric,
                 color='country', barmode='group',
                 color_discrete_map={
                     "ie": "#FF883E",
                     "uk": "#00247D"
                 },
                 title=f"Cross Validation {metric} of the MNB model for Ireland and United Kingdom News Datasets",
                 width=700, labels={'iteration': 'Iteration'})
    fig.show()