In [50]:
from dotenv import load_dotenv
import os
from pymongo import MongoClient
import pandas as pd
import functions.utils as utils
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import plotly.express as px
from scipy import stats
import numpy as np

In [2]:
load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")

In [3]:
# mongoClient = MongoClient(MONGODB_URI)
# db = mongoClient["news_data"]
# ieColl = db["ireland_news"]

### The Ireland News Dataset

In [4]:
# https://towardsdatascience.com/list-comprehensions-vs-for-loops-it-is-not-what-you-think-34071d4d8207
# Using array computation is a lot faster than using for loops and appending to a list
# irelandNews = list(ieColl.find({}))
# print(len(irelandNews))

# irelandNewsDF = pd.DataFrame(irelandNews)

# REMOVE THIS ONCE YOU ARE DONE WITH THE EDA
irelandNewsDF = pd.read_csv('datasets/ie_news.csv')
irelandNewsDF.head()

Unnamed: 0,_id,id,title,publish_date,source_country,sentiment
0,646fddfd0576b45037ddb2c6,111013564,Kilkenny-based recruitment company wins two in...,2023-04-18 08:59:00,ie,0.544
1,646fddfd0576b45037ddb2c7,88450232,Ministers Harris and Ryan welcome record numbe...,2023-01-17 21:41:06,ie,0.341
2,646fddfd0576b45037ddb2c8,75989338,New €4.8m state-of-the-art facility planned fo...,2022-12-05 11:06:50,ie,0.377
3,646fddfd0576b45037ddb2c9,92473892,‘Not possible’ to make Northern Ireland Protoc...,2023-02-01 20:45:45,ie,-0.146
4,646fddfd0576b45037ddb2ca,86433356,Shoppers must demand longer lasting goods and ...,2023-01-10 16:14:04,ie,-0.077


In [5]:
irelandNewsDF.describe()

Unnamed: 0,id,sentiment
count,505.0,505.0
mean,90490210.0,-0.003907
std,14507340.0,0.288732
min,64968580.0,-0.774
25%,77182770.0,-0.216
50%,90126080.0,-0.029
75%,103448600.0,0.186
max,112426300.0,0.879


In [6]:
irelandNewsDF.describe(include="object")

Unnamed: 0,_id,title,publish_date,source_country
count,505,505,505,505
unique,505,505,496,1
top,646fddfd0576b45037ddb2c6,Kilkenny-based recruitment company wins two in...,2022-11-20 03:30:00,ie
freq,1,1,3,505


In [7]:
irelandNewsDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   _id             505 non-null    object 
 1   id              505 non-null    int64  
 2   title           505 non-null    object 
 3   publish_date    505 non-null    object 
 4   source_country  505 non-null    object 
 5   sentiment       505 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 23.8+ KB


In [8]:
irelandNewsDF.drop(["_id", "id"], axis=1, inplace=True)
irelandNewsDF["publish_date"] = pd.to_datetime(irelandNewsDF["publish_date"])
# Remove news that were on april
irelandNewsDF = irelandNewsDF[irelandNewsDF["publish_date"].dt.month != 4]
irelandNewsDF.head()

Unnamed: 0,title,publish_date,source_country,sentiment
1,Ministers Harris and Ryan welcome record numbe...,2023-01-17 21:41:06,ie,0.341
2,New €4.8m state-of-the-art facility planned fo...,2022-12-05 11:06:50,ie,0.377
3,‘Not possible’ to make Northern Ireland Protoc...,2023-02-01 20:45:45,ie,-0.146
4,Shoppers must demand longer lasting goods and ...,2023-01-10 16:14:04,ie,-0.077
5,Opportunity knocks for investors willing to ma...,2022-12-06 07:00:00,ie,-0.137


In [9]:
irelandNewsDF.shape

(426, 4)

In [10]:
irelandNewsDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 1 to 504
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   title           426 non-null    object        
 1   publish_date    426 non-null    datetime64[ns]
 2   source_country  426 non-null    object        
 3   sentiment       426 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 16.6+ KB


Symbols for decimals and hyphenated words were not removed so that it the meaning of the words are not altered.

In [11]:
# Replace the &amp;rsquo; and &amp;lsquo; with ' symbols
irelandNewsDF['cleaned_text'] = irelandNewsDF['title'].str.replace('&amp;rsquo;', "'")
irelandNewsDF['cleaned_text'] = irelandNewsDF['title'].str.replace('&amp;lsquo;', "'")
irelandNewsDF['cleaned_text'] = irelandNewsDF['title'].apply(utils.lower_case)
#remove punctuations
irelandNewsDF['cleaned_text'] = irelandNewsDF['cleaned_text'].apply(utils.remove_punctuation)

In [12]:
irelandNewsDF['cleaned_text'] = irelandNewsDF['cleaned_text'].apply(utils.remove_stopwords)

Lemmatize the text so that inflectional endings are removed.

In [13]:
irelandNewsDF['cleaned_text'] = irelandNewsDF['cleaned_text'].apply(utils.lemmatize_text)

Feature extraction

In [14]:
irelandNewsDF['sentiment_label'] = irelandNewsDF['sentiment'].apply(utils.set_sentiment_label)

In [15]:
irelandNewsDF['word_length'] = irelandNewsDF['title'].apply(utils.count_words)
irelandNewsDF['char_length'] = irelandNewsDF['title'].str.len()

In [16]:
irelandNewsDF["month"] = irelandNewsDF["publish_date"].dt.strftime('%B %Y')

In [17]:
irelandNewsDF.head()

Unnamed: 0,title,publish_date,source_country,sentiment,cleaned_text,sentiment_label,word_length,char_length,month
1,Ministers Harris and Ryan welcome record numbe...,2023-01-17 21:41:06,ie,0.341,minister harris ryan welcome record number enr...,positive,16,113,January 2023
2,New €4.8m state-of-the-art facility planned fo...,2022-12-05 11:06:50,ie,0.377,new 4.8m state-of-the-art facility planned ucd...,positive,9,62,December 2022
3,‘Not possible’ to make Northern Ireland Protoc...,2023-02-01 20:45:45,ie,-0.146,possible make northern ireland protocol work e...,negative,12,69,February 2023
4,Shoppers must demand longer lasting goods and ...,2023-01-10 16:14:04,ie,-0.077,shopper must demand longer lasting good le pac...,negative,15,95,January 2023
5,Opportunity knocks for investors willing to ma...,2022-12-06 07:00:00,ie,-0.137,opportunity knock investor willing make long-t...,negative,13,90,December 2022


In [49]:
# PROTANOPIA COLOR PALLETTE
fig = px.box(
    irelandNewsDF, 
    y="sentiment", 
    x="month", 
    color="month",
    title="Box Plot of the Sentiments for News Headlines in Ireland from November 2022 to March 2023",
    color_discrete_map={
        "November 2022": "#AE9C45", 
        "December 2022": "#6073B1", 
        "January 2023": "#A7B8F8", 
        "February 2023": "#052955", 
        "March 2023": "#2E2B21"
    }
)
fig.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': ["November 2022", "December 2022", "January 2023", "February 2023", "March 2023",]})
fig.show()

### UK News Dataset

In [19]:
ukNewsDF = pd.read_csv('datasets/uk_news.csv')
ukNewsDF.head()

Unnamed: 0,id,title,publish_date,source_country,sentiment
0,73732360,Biogas firm to hold consultation on £50 millio...,2022-11-28 14:19:32,GB,0.087
1,93801960,Scots Biotechnology network’s investment hits ...,2023-02-06 15:54:43,GB,0.251
2,70080002,Council chases owners of thousands of empty ho...,2022-11-17 09:36:50,GB,-0.111
3,68758894,The west’s ‘dash for gas’ in Africa is nothing...,2022-11-13 07:00:04,GB,-0.018
4,70141242,Mr Heaton-Harris was interested in how MJM Mar...,2022-11-17 14:26:52,GB,0.275


In [20]:
ukNewsDF.describe()

Unnamed: 0,id,sentiment
count,1100.0,1100.0
mean,87284590.0,0.011718
std,12621860.0,0.295913
min,64793760.0,-0.72
25%,76591140.0,-0.194
50%,88271500.0,-0.008
75%,97894170.0,0.20525
max,108451100.0,0.894


In [21]:
ukNewsDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1100 non-null   int64  
 1   title           1100 non-null   object 
 2   publish_date    1100 non-null   object 
 3   source_country  1100 non-null   object 
 4   sentiment       1100 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 43.1+ KB


In [22]:
ukNewsDF.drop(["id"], axis=1, inplace=True)
ukNewsDF["publish_date"] = pd.to_datetime(ukNewsDF["publish_date"])
ukNewsDF = ukNewsDF[ukNewsDF["publish_date"].dt.month != 4]
ukNewsDF.head()

Unnamed: 0,title,publish_date,source_country,sentiment
0,Biogas firm to hold consultation on £50 millio...,2022-11-28 14:19:32,GB,0.087
1,Scots Biotechnology network’s investment hits ...,2023-02-06 15:54:43,GB,0.251
2,Council chases owners of thousands of empty ho...,2022-11-17 09:36:50,GB,-0.111
3,The west’s ‘dash for gas’ in Africa is nothing...,2022-11-13 07:00:04,GB,-0.018
4,Mr Heaton-Harris was interested in how MJM Mar...,2022-11-17 14:26:52,GB,0.275


In [23]:
ukNewsDF.shape

(1092, 4)

In [24]:
ukNewsDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1092 entries, 0 to 1099
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   title           1092 non-null   object        
 1   publish_date    1092 non-null   datetime64[ns]
 2   source_country  1092 non-null   object        
 3   sentiment       1092 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 42.7+ KB


In [25]:
# Replace the &amp;rsquo; and &amp;lsquo; with ' symbols
ukNewsDF['cleaned_text'] = ukNewsDF['title'].str.replace('&amp;rsquo;', "'")
ukNewsDF['cleaned_text'] = ukNewsDF['title'].str.replace('&amp;lsquo;', "'")
ukNewsDF['cleaned_text'] = ukNewsDF['title'].apply(utils.lower_case)
#remove punctuations
ukNewsDF['cleaned_text'] = ukNewsDF['cleaned_text'].apply(utils.remove_punctuation)

In [26]:
ukNewsDF['cleaned_text'] = ukNewsDF['cleaned_text'].apply(utils.remove_stopwords)

Feature Extraction

In [27]:
ukNewsDF['sentiment_label'] = ukNewsDF['sentiment'].apply(utils.set_sentiment_label)

In [28]:
ukNewsDF['word_length'] = ukNewsDF['title'].apply(utils.count_words)
ukNewsDF['char_length'] = ukNewsDF['title'].str.len()

In [29]:
ukNewsDF["month"] = ukNewsDF["publish_date"].dt.strftime('%B %Y')

In [30]:
ukNewsDF.head()

Unnamed: 0,title,publish_date,source_country,sentiment,cleaned_text,sentiment_label,word_length,char_length,month
0,Biogas firm to hold consultation on £50 millio...,2022-11-28 14:19:32,GB,0.087,biogas firm hold consultation 50 million anaer...,positive,16,100,November 2022
1,Scots Biotechnology network’s investment hits ...,2023-02-06 15:54:43,GB,0.251,scots biotechnology network investment hits 25...,positive,7,57,February 2023
2,Council chases owners of thousands of empty ho...,2022-11-17 09:36:50,GB,-0.111,council chases owners thousands empty homes gl...,negative,10,60,November 2022
3,The west’s ‘dash for gas’ in Africa is nothing...,2022-11-13 07:00:04,GB,-0.018,west dash gas africa nothing energy colonialism,negative,13,75,November 2022
4,Mr Heaton-Harris was interested in how MJM Mar...,2022-11-17 14:26:52,GB,0.275,mr heaton-harris interested mjm marine mivan t...,positive,16,102,November 2022


In [48]:
# PROTANOPIA COLOR PALLETTE
fig = px.box(
    irelandNewsDF, 
    y="sentiment", 
    x="month", 
    color="month",
    title="Box Plot of the Sentiments for News Headlines in United Kingdom from November 2022 to March 2023",
    color_discrete_map={
        "November 2022": "#AE9C45", 
        "December 2022": "#6073B1", 
        "January 2023": "#A7B8F8", 
        "February 2023": "#052955", 
        "March 2023": "#2E2B21"
    }
)
fig.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': ["November 2022", "December 2022", "January 2023", "February 2023", "March 2023",]})
fig.show()

## Inferential Statistics and Hypothesis testing


In [68]:
irelandNewsGroupedByMonth = irelandNewsDF.groupby(irelandNewsDF["month"]).agg(
    mean_value=("sentiment", "mean"),
    count=("sentiment", "count"),
    confidence_interval=("sentiment", lambda x: stats.t.interval(0.95, len(x)-1, loc=np.mean(x), scale=stats.sem(x)))
)
irelandNewsGroupedByMonth.reset_index(inplace=True)
irelandNewsGroupedByMonth

Unnamed: 0,month,mean_value,count,confidence_interval
0,December 2022,-0.019085,118,"(-0.06372473711687811, 0.025555245591454386)"
1,February 2023,0.004896,48,"(-0.07397868651394815, 0.08377035318061482)"
2,January 2023,-6.6e-05,76,"(-0.06788712866545676, 0.06775554971808834)"
3,March 2023,-0.00116,100,"(-0.0655385336175492, 0.06321853361754919)"
4,November 2022,-0.005583,84,"(-0.07448767450053856, 0.06332100783387187)"


In [70]:
ukNewsGroupedByMonth = ukNewsDF.groupby(ukNewsDF["month"]).agg(
    mean_value=("sentiment", "mean"),
    count=("sentiment", "count"),
    confidence_interval=("sentiment", lambda x: stats.t.interval(0.95, len(x)-1, loc=np.mean(x), scale=stats.sem(x)))
)
ukNewsGroupedByMonth.reset_index(inplace=True)
ukNewsGroupedByMonth

Unnamed: 0,month,mean_value,count,confidence_interval
0,December 2022,0.038705,183,"(-0.0008677261339427797, 0.07827756219951654)"
1,February 2023,-0.026508,183,"(-0.07509871382168022, 0.02208232037905726)"
2,January 2023,-0.015973,258,"(-0.051361453956841, 0.01941571752273248)"
3,March 2023,0.031914,233,"(-0.004460377362249662, 0.06828870354250718)"
4,November 2022,0.033562,235,"(-0.005457010641221879, 0.07258041489654102)"


In [79]:
t_test_results = []

for row1, row2 in zip(ukNewsGroupedByMonth.itertuples(index=False), irelandNewsGroupedByMonth.itertuples(index=False)):
    month = row1.month
    t_statistic, p_value = stats.ttest_ind(irelandNewsDF[irelandNewsDF["month"] == month]["sentiment"], ukNewsDF[ukNewsDF["month"] == month]["sentiment"])
    tempDict = {}
    tempDict[month] = (t_statistic, p_value)
    t_test_results.append(tempDict)
    
t_test_results

[{'December 2022': (-1.8733904953617007, 0.06198987153210318)},
 {'February 2023': (0.6023800136792646, 0.5475172120223593)},
 {'January 2023': (0.41953583225905267, 0.6750959121045581)},
 {'March 2023': (-0.9371583973815735, 0.3493603613464604)},
 {'November 2022': (-1.0020372504497483, 0.31708986592396127)}]

In [76]:
irelandNewsDF[irelandNewsDF["month"] == month]["sentiment"]

2      0.377
5     -0.137
7      0.381
10     0.128
19     0.014
       ...  
490    0.018
496   -0.347
497   -0.231
502    0.166
504    0.448
Name: sentiment, Length: 118, dtype: float64