In [2]:
from dotenv import load_dotenv
import os
from pymongo import MongoClient
import pandas as pd
import functions.utils as utils
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")

In [4]:
# mongoClient = MongoClient(MONGODB_URI)
# db = mongoClient["news_data"]
# ieColl = db["ireland_news"]

### The Ireland News Dataset

In [5]:
# https://towardsdatascience.com/list-comprehensions-vs-for-loops-it-is-not-what-you-think-34071d4d8207
# Using array computation is a lot faster than using for loops and appending to a list
# irelandNews = list(ieColl.find({}))
# print(len(irelandNews))

# irelandNewsDF = pd.DataFrame(irelandNews)

# REMOVE THIS ONCE YOU ARE DONE WITH THE EDA
irelandNewsDF = pd.read_csv('datasets/ie_news.csv')
irelandNewsDF.head()

Unnamed: 0,_id,id,title,publish_date,source_country,sentiment
0,646fddfd0576b45037ddb2c6,111013564,Kilkenny-based recruitment company wins two in...,2023-04-18 08:59:00,ie,0.544
1,646fddfd0576b45037ddb2c7,88450232,Ministers Harris and Ryan welcome record numbe...,2023-01-17 21:41:06,ie,0.341
2,646fddfd0576b45037ddb2c8,75989338,New €4.8m state-of-the-art facility planned fo...,2022-12-05 11:06:50,ie,0.377
3,646fddfd0576b45037ddb2c9,92473892,‘Not possible’ to make Northern Ireland Protoc...,2023-02-01 20:45:45,ie,-0.146
4,646fddfd0576b45037ddb2ca,86433356,Shoppers must demand longer lasting goods and ...,2023-01-10 16:14:04,ie,-0.077


In [6]:
irelandNewsDF.describe()

Unnamed: 0,id,sentiment
count,505.0,505.0
mean,90490210.0,-0.003907
std,14507340.0,0.288732
min,64968580.0,-0.774
25%,77182770.0,-0.216
50%,90126080.0,-0.029
75%,103448600.0,0.186
max,112426300.0,0.879


In [7]:
irelandNewsDF.describe(include="object")

Unnamed: 0,_id,title,publish_date,source_country
count,505,505,505,505
unique,505,505,496,1
top,646fddfd0576b45037ddb2c6,Kilkenny-based recruitment company wins two in...,2022-11-20 03:30:00,ie
freq,1,1,3,505


In [8]:
irelandNewsDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   _id             505 non-null    object 
 1   id              505 non-null    int64  
 2   title           505 non-null    object 
 3   publish_date    505 non-null    object 
 4   source_country  505 non-null    object 
 5   sentiment       505 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 23.8+ KB


In [9]:
irelandNewsDF.drop(["_id", "id"], axis=1, inplace=True)
irelandNewsDF["publish_date"] = pd.to_datetime(irelandNewsDF["publish_date"])
# Remove news that were on april
irelandNewsDF = irelandNewsDF[irelandNewsDF["publish_date"].dt.month != 4]
irelandNewsDF.head()

Unnamed: 0,title,publish_date,source_country,sentiment
1,Ministers Harris and Ryan welcome record numbe...,2023-01-17 21:41:06,ie,0.341
2,New €4.8m state-of-the-art facility planned fo...,2022-12-05 11:06:50,ie,0.377
3,‘Not possible’ to make Northern Ireland Protoc...,2023-02-01 20:45:45,ie,-0.146
4,Shoppers must demand longer lasting goods and ...,2023-01-10 16:14:04,ie,-0.077
5,Opportunity knocks for investors willing to ma...,2022-12-06 07:00:00,ie,-0.137


In [10]:
irelandNewsDF.shape

(426, 4)

In [11]:
irelandNewsDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 1 to 504
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   title           426 non-null    object        
 1   publish_date    426 non-null    datetime64[ns]
 2   source_country  426 non-null    object        
 3   sentiment       426 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 16.6+ KB


Symbols for decimals and hyphenated words were not removed so that it the meaning of the words are not altered.

In [12]:
# Replace the &amp;rsquo; and &amp;lsquo; with ' symbols
irelandNewsDF['cleaned_text'] = irelandNewsDF['title'].str.replace('&amp;rsquo;', "'")
irelandNewsDF['cleaned_text'] = irelandNewsDF['title'].str.replace('&amp;lsquo;', "'")
irelandNewsDF['cleaned_text'] = irelandNewsDF['title'].apply(utils.lower_case)
#remove punctuations
irelandNewsDF['cleaned_text'] = irelandNewsDF['cleaned_text'].apply(utils.remove_punctuation)

In [13]:
irelandNewsDF['cleaned_text'] = irelandNewsDF['cleaned_text'].apply(utils.remove_stopwords)

Lemmatize the text so that inflectional endings are removed.

In [14]:
irelandNewsDF['cleaned_text'] = irelandNewsDF['cleaned_text'].apply(utils.lemmatize_text)

Feature extraction

In [15]:
irelandNewsDF['sentiment_label'] = irelandNewsDF['sentiment'].apply(utils.set_sentiment_label)

In [16]:
irelandNewsDF['word_length'] = irelandNewsDF['title'].apply(utils.count_words)
irelandNewsDF['char_length'] = irelandNewsDF['title'].str.len()

In [17]:
irelandNewsDF.head()

Unnamed: 0,title,publish_date,source_country,sentiment,cleaned_text,sentiment_label,word_length,char_length
1,Ministers Harris and Ryan welcome record numbe...,2023-01-17 21:41:06,ie,0.341,minister harris ryan welcome record number enr...,positive,16,113
2,New €4.8m state-of-the-art facility planned fo...,2022-12-05 11:06:50,ie,0.377,new 4.8m state-of-the-art facility planned ucd...,positive,9,62
3,‘Not possible’ to make Northern Ireland Protoc...,2023-02-01 20:45:45,ie,-0.146,possible make northern ireland protocol work e...,negative,12,69
4,Shoppers must demand longer lasting goods and ...,2023-01-10 16:14:04,ie,-0.077,shopper must demand longer lasting good le pac...,negative,15,95
5,Opportunity knocks for investors willing to ma...,2022-12-06 07:00:00,ie,-0.137,opportunity knock investor willing make long-t...,negative,13,90


### UK News Dataset

In [18]:
ukNewsDF = pd.read_csv('datasets/uk_news.csv')
ukNewsDF.head()

Unnamed: 0,id,title,publish_date,source_country,sentiment
0,73732360,Biogas firm to hold consultation on £50 millio...,2022-11-28 14:19:32,GB,0.087
1,93801960,Scots Biotechnology network’s investment hits ...,2023-02-06 15:54:43,GB,0.251
2,70080002,Council chases owners of thousands of empty ho...,2022-11-17 09:36:50,GB,-0.111
3,68758894,The west’s ‘dash for gas’ in Africa is nothing...,2022-11-13 07:00:04,GB,-0.018
4,70141242,Mr Heaton-Harris was interested in how MJM Mar...,2022-11-17 14:26:52,GB,0.275


In [19]:
ukNewsDF.describe()

Unnamed: 0,id,sentiment
count,1100.0,1100.0
mean,87284590.0,0.011718
std,12621860.0,0.295913
min,64793760.0,-0.72
25%,76591140.0,-0.194
50%,88271500.0,-0.008
75%,97894170.0,0.20525
max,108451100.0,0.894


In [20]:
ukNewsDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1100 non-null   int64  
 1   title           1100 non-null   object 
 2   publish_date    1100 non-null   object 
 3   source_country  1100 non-null   object 
 4   sentiment       1100 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 43.1+ KB


In [21]:
ukNewsDF.drop(["id"], axis=1, inplace=True)
ukNewsDF["publish_date"] = pd.to_datetime(ukNewsDF["publish_date"])
ukNewsDF = ukNewsDF[ukNewsDF["publish_date"].dt.month != 4]
ukNewsDF.head()

Unnamed: 0,title,publish_date,source_country,sentiment
0,Biogas firm to hold consultation on £50 millio...,2022-11-28 14:19:32,GB,0.087
1,Scots Biotechnology network’s investment hits ...,2023-02-06 15:54:43,GB,0.251
2,Council chases owners of thousands of empty ho...,2022-11-17 09:36:50,GB,-0.111
3,The west’s ‘dash for gas’ in Africa is nothing...,2022-11-13 07:00:04,GB,-0.018
4,Mr Heaton-Harris was interested in how MJM Mar...,2022-11-17 14:26:52,GB,0.275


In [22]:
ukNewsDF.shape

(1092, 4)

In [23]:
ukNewsDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1092 entries, 0 to 1099
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   title           1092 non-null   object        
 1   publish_date    1092 non-null   datetime64[ns]
 2   source_country  1092 non-null   object        
 3   sentiment       1092 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 42.7+ KB


In [24]:
# Replace the &amp;rsquo; and &amp;lsquo; with ' symbols
ukNewsDF['cleaned_text'] = ukNewsDF['title'].str.replace('&amp;rsquo;', "'")
ukNewsDF['cleaned_text'] = ukNewsDF['title'].str.replace('&amp;lsquo;', "'")
ukNewsDF['cleaned_text'] = ukNewsDF['title'].apply(utils.lower_case)
#remove punctuations
ukNewsDF['cleaned_text'] = ukNewsDF['cleaned_text'].apply(utils.remove_punctuation)

In [25]:
ukNewsDF['cleaned_text'] = ukNewsDF['cleaned_text'].apply(utils.remove_stopwords)

Feature Extraction

In [26]:
ukNewsDF['sentiment_label'] = ukNewsDF['sentiment'].apply(utils.set_sentiment_label)

In [27]:
ukNewsDF['word_length'] = ukNewsDF['title'].apply(utils.count_words)
ukNewsDF['char_length'] = ukNewsDF['title'].str.len()

In [28]:
ukNewsDF.head()

Unnamed: 0,title,publish_date,source_country,sentiment,cleaned_text,sentiment_label,word_length,char_length
0,Biogas firm to hold consultation on £50 millio...,2022-11-28 14:19:32,GB,0.087,biogas firm hold consultation 50 million anaer...,positive,16,100
1,Scots Biotechnology network’s investment hits ...,2023-02-06 15:54:43,GB,0.251,scots biotechnology network investment hits 25...,positive,7,57
2,Council chases owners of thousands of empty ho...,2022-11-17 09:36:50,GB,-0.111,council chases owners thousands empty homes gl...,negative,10,60
3,The west’s ‘dash for gas’ in Africa is nothing...,2022-11-13 07:00:04,GB,-0.018,west dash gas africa nothing energy colonialism,negative,13,75
4,Mr Heaton-Harris was interested in how MJM Mar...,2022-11-17 14:26:52,GB,0.275,mr heaton-harris interested mjm marine mivan t...,positive,16,102


## Hypothesis testing


In [37]:
irelandNewsGroupedByMonth = irelandNewsDF.groupby([pd.Grouper(key="publish_date", freq="M"), "sentiment_label"]).mean()
irelandNewsGroupedByMonth

Unnamed: 0_level_0,Unnamed: 1_level_0,sentiment,word_length,char_length
publish_date,sentiment_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-11-30,negative,-0.269756,11.804878,76.0
2022-11-30,positive,0.246302,11.744186,77.697674
2022-12-31,negative,-0.203415,11.738462,75.692308
2022-12-31,positive,0.206981,11.679245,76.716981
2023-01-31,negative,-0.238231,11.230769,73.153846
2023-01-31,positive,0.250973,11.864865,79.594595
2023-02-28,negative,-0.21644,10.28,67.96
2023-02-28,positive,0.245478,11.608696,77.695652
2023-03-31,negative,-0.23463,11.777778,75.796296
2023-03-31,neutral,0.0,14.0,84.0


In [36]:
ukNewsGroupedByMonth = ukNewsDF.groupby([pd.Grouper(key="publish_date", freq="M"), "sentiment_label"]).mean()
ukNewsGroupedByMonth

Unnamed: 0_level_0,Unnamed: 1_level_0,sentiment,word_length,char_length
publish_date,sentiment_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-11-30,negative,-0.227273,12.154545,78.209091
2022-11-30,neutral,0.0,12.0,97.0
2022-11-30,positive,0.265218,11.169355,72.040323
2022-12-31,negative,-0.204875,11.7625,72.7625
2022-12-31,positive,0.227893,10.912621,69.582524
2023-01-31,negative,-0.216211,11.210884,69.088435
2023-01-31,positive,0.249207,10.774775,67.585586
2023-02-28,negative,-0.260705,11.247619,70.961905
2023-02-28,positive,0.288756,11.089744,70.846154
2023-03-31,negative,-0.200883,11.0,68.576577
