# **Initialization**

In [101]:
import pandas as pd
import re
import spacy
from collections import Counter
from textblob import TextBlob
# from google.colab import drive

from sklearn.feature_extraction.text import CountVectorizer
from numpy import array, log
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

#### Mount google drive where the csv files are stored

In [30]:
# Mount Google Drive
# drive.mount('/content/drive', force_remount=True)

### Confirm if drive is mounted by listing the drive contents

In [31]:
# !ls "/content/drive/My Drive"

In [32]:
# news_data = pd.read_csv('/content/drive/My Drive/all_news_data/data.csv')
# traffic_data = pd.read_csv('/content/drive/My Drive/all_news_data/traffic.csv')
# domains_location_data = pd.read_csv('/content/drive/My Drive/all_news_data/domains_location.csv')

In [33]:
news_data = pd.read_csv('../data/data.csv')
traffic_data = pd.read_csv('../data/traffic.csv')
domains_location_data = pd.read_csv('../data/domains_location.csv')

In [34]:
news_data.columns

Index(['article_id', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'category', 'full_content'],
      dtype='object')

In [35]:
news_data.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
0,89541,,International Business Times,Paavan MATHEMA,UN Chief Urges World To 'Stop The Madness' Of ...,UN Secretary-General Antonio Guterres urged th...,https://www.ibtimes.com/un-chief-urges-world-s...,https://d.ibtimes.com/en/full/4496078/nepals-g...,2023-10-30 10:12:35.000000,UN Secretary-General Antonio Guterres urged th...,Nepal,UN Secretary-General Antonio Guterres urged th...
1,89542,,Prtimes.jp,,RANDEBOOよりワンランク上の大人っぽさが漂うニットとベストが新登場。,[株式会社Ainer]\nRANDEBOO（ランデブー）では2023年7月18日(火)より公...,https://prtimes.jp/main/html/rd/p/000000147.00...,https://prtimes.jp/i/32220/147/ogp/d32220-147-...,2023-10-06 04:40:02.000000,"RANDEBOO2023718()WEB2023 Autumn Winter \n""Nepa...",Nepal,
2,89543,,VOA News,webdesk@voanews.com (Agence France-Presse),UN Chief Urges World to 'Stop the Madness' of ...,UN Secretary-General Antonio Guterres urged th...,https://www.voanews.com/a/un-chief-urges-world...,https://gdb.voanews.com/01000000-0a00-0242-60f...,2023-10-30 10:53:30.000000,"Kathmandu, Nepal UN Secretary-General Antonio...",Nepal,
3,89545,,The Indian Express,Editorial,Sikkim warning: Hydroelectricity push must be ...,Ecologists caution against the adverse effects...,https://indianexpress.com/article/opinion/edit...,https://images.indianexpress.com/2023/10/edit-...,2023-10-06 01:20:24.000000,At least 14 persons lost their lives and more ...,Nepal,At least 14 persons lost their lives and more ...
4,89547,,The Times of Israel,Jacob Magid,"200 foreigners, dual nationals cut down in Ham...","France lost 35 citizens, Thailand 33, US 31, U...",https://www.timesofisrael.com/200-foreigners-d...,https://static.timesofisrael.com/www/uploads/2...,2023-10-27 01:08:34.000000,"Scores of foreign citizens were killed, taken ...",Nepal,


# **Data preprocessing**

In [36]:
# Function to preprocess source_name into source_id
def preprocess_source_id(source_name):
    # Convert to lowercase
    source_id = source_name.lower()
    # Replace spaces with hyphens
    source_id = source_id.replace(" ", "-")
    # Keep only alphanumeric characters, hyphens, brackets and full stops
    source_id = re.sub(r'[^\w\-.()]+', '', source_id)
    return source_id

# Apply the function to the source_name column to create the source_id column
news_data['source_id'] = news_data['source_name'].apply(preprocess_source_id)

In [37]:
# Preprocessing for author column missing values
news_data['author'].fillna('Unknown', inplace=True)

# Deal with full_content
news_data.dropna(subset=['full_content'], inplace=True)

# Since url_to_image is not important we can fill with placeholder
news_data['url_to_image'].fillna('http://example.com/placeholder.jpg', inplace=True)

# same with description column, use placeholder, unless otherwise important
news_data['description'].fillna('No description provided', inplace=True)

# Replace missing values in 'category' with 'Unknown'
news_data['category'].fillna('Unknown', inplace=True)

# Replace missing values in 'title' with 'No title provided'
news_data['title'].fillna('No title provided', inplace=True)

In [38]:
# Perform preprocessing and clean missing values on the domains_info dataset
domains_location_data.dropna(subset=['Country'], inplace=True)

### More Preprocessing, on date fields

In [39]:
# prompt: Using dataframe news_data: perform data preprocessing and cleaning

# Check for missing values
news_data.isnull().sum()

# Drop rows with missing values
news_data.dropna(inplace=True)

# Remove duplicate rows
news_data.drop_duplicates(inplace=True)

# Convert the 'published_at' column to datetime format
news_data['published_at'] = pd.to_datetime(news_data['published_at'], format='ISO8601')
# news_data['published_at'] = pd.to_datetime(news_data['published_at'])

# Extract the year from the 'published_at' column
news_data['year'] = news_data['published_at'].dt.year

# Group the data by year and calculate

ValueError: time data '2023-10-30 10:12:35.000000' does not match format 'ISO8601' (match)

In [None]:
news_data.head()

In [40]:
missing_values_news_data = news_data.isnull().sum()
print(missing_values_news_data)

article_id      0
source_id       0
source_name     0
author          0
title           0
description     0
url             0
url_to_image    0
published_at    0
content         0
category        0
full_content    0
dtype: int64


In [41]:
missing_values_domains_location_data = domains_location_data.isnull().sum()
print(missing_values_domains_location_data)



SourceCommonName    0
location            0
Country             0
dtype: int64


In [42]:
missing_values_traffic_data = traffic_data.isnull().sum()
print(missing_values_traffic_data)

GlobalRank        0
TldRank           0
Domain            0
TLD               0
RefSubNets        0
RefIPs            0
IDN_Domain        0
IDN_TLD           0
PrevGlobalRank    0
PrevTldRank       0
PrevRefSubNets    0
PrevRefIPs        0
dtype: int64


In [43]:
domains_location_data.columns

Index(['SourceCommonName', 'location', 'Country'], dtype='object')

### Fix missing values

In [44]:
traffic_data.columns

Index(['GlobalRank', 'TldRank', 'Domain', 'TLD', 'RefSubNets', 'RefIPs',
       'IDN_Domain', 'IDN_TLD', 'PrevGlobalRank', 'PrevTldRank',
       'PrevRefSubNets', 'PrevRefIPs'],
      dtype='object')

In [45]:
news_data = news_data.dropna()
missing_values_news_data = news_data.isnull().sum()
print(missing_values_news_data)

article_id      0
source_id       0
source_name     0
author          0
title           0
description     0
url             0
url_to_image    0
published_at    0
content         0
category        0
full_content    0
dtype: int64


### Check outliers

In [46]:
# Check for missing values
news_data.isnull().all()

article_id      False
source_id       False
source_name     False
author          False
title           False
description     False
url             False
url_to_image    False
published_at    False
content         False
category        False
full_content    False
dtype: bool

In [47]:
len(news_data)

54889

In [48]:
domains_location_data.isnull().all()

SourceCommonName    False
location            False
Country             False
dtype: bool

In [49]:
domains_location_data.empty

False

In [50]:
news_data.columns

Index(['article_id', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'category', 'full_content'],
      dtype='object')

In [51]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54889 entries, 0 to 105374
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_id    54889 non-null  int64 
 1   source_id     54889 non-null  object
 2   source_name   54889 non-null  object
 3   author        54889 non-null  object
 4   title         54889 non-null  object
 5   description   54889 non-null  object
 6   url           54889 non-null  object
 7   url_to_image  54889 non-null  object
 8   published_at  54889 non-null  object
 9   content       54889 non-null  object
 10  category      54889 non-null  object
 11  full_content  54889 non-null  object
dtypes: int64(1), object(11)
memory usage: 5.4+ MB


In [52]:
news_data.describe()

Unnamed: 0,article_id
count,54889.0
mean,305940.568019
std,220864.360813
min,418.0
25%,105821.0
50%,271035.0
75%,469868.0
max,781308.0


# **NLP**

## Keyword Extraction with TF-IDF

In [108]:
def extract_keywords_custom_tfidf(text_data, max_features=10):
    # Use the 'article' column as the text data
    array_text = text_data.dropna().tolist()

     # Extend the stop words used by CountVectorizer
    stop_words = text.ENGLISH_STOP_WORDS.union(['said', 'new'])

    # Convert the set of stop words to a list
    stop_words = list(stop_words)

    # Initialize the CountVectorizer with stop_words set to 'english'
    vectorizer = CountVectorizer(max_features=max_features, stop_words=stop_words)
    
    # Fit the vectorizer on the text data
    tf = vectorizer.fit_transform([x.lower() for x in array_text])
    tf = tf.toarray()
    tf = log(tf + 1)

    # Compute IDF values
    df = pd.DataFrame(tf, columns=vectorizer.get_feature_names_out())
    # calculates the Inverse Document Frequency (IDF) for each word in the text data.
    idf = (len(array_text) / (df > 0).sum()).apply(log)

    # We are ready to multiply the TF and IDF values to get the TF-IDF values.
    tfidf = tf.copy()
    words = array(vectorizer.get_feature_names_out())

    for word in words:
        tfidf[:, words == word] = tfidf[:, words == word] * idf[word]

    keywords = []
    for j in range(tfidf.shape[0]):
        # Get the top 5 words with the highest TF-IDF values
        top_words = [words[i] for i in tfidf[j].argsort()[-5:][::-1]]
        keywords.append(top_words)

    return keywords

### Extract keywords for the first 10 and last 10 articles' title and content


In [109]:
# Extract keywords for the first 10 and last 10 articles' title and content
limited_news_data = pd.concat([news_data.head(10), news_data.tail(10)])
limited_news_data['title_keywords'] = extract_keywords_custom_tfidf(limited_news_data['title'])
limited_news_data['content_keywords'] = extract_keywords_custom_tfidf(limited_news_data['full_content'])

#### Check the added columns of keywords

In [112]:
limited_news_data.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content,title_keywords,content_keywords
0,89541,international-business-times,International Business Times,Paavan MATHEMA,UN Chief Urges World To 'Stop The Madness' Of ...,UN Secretary-General Antonio Guterres urged th...,https://www.ibtimes.com/un-chief-urges-world-s...,https://d.ibtimes.com/en/full/4496078/nepals-g...,2023-10-30 10:12:35.000000,UN Secretary-General Antonio Guterres urged th...,Nepal,UN Secretary-General Antonio Guterres urged th...,"[world, climate, chief, change, pay]","[world, people, million, state, israel]"
3,89545,the-indian-express,The Indian Express,Editorial,Sikkim warning: Hydroelectricity push must be ...,Ecologists caution against the adverse effects...,https://indianexpress.com/article/opinion/edit...,https://images.indianexpress.com/2023/10/edit-...,2023-10-06 01:20:24.000000,At least 14 persons lost their lives and more ...,Nepal,At least 14 persons lost their lives and more ...,"[world, pay, party, palestine, india]","[state, indian, government, people, world]"
6,89551,al-jazeera-english,Al Jazeera English,Kaushik Raj,Pro-Israel rallies allowed in India but Palest...,"India, the first non-Arab country to recognise...",https://www.aljazeera.com/news/2023/10/25/pro-...,https://www.aljazeera.com/wp-content/uploads/2...,2023-10-25 09:58:17.000000,"New Delhi, India Israels relentless bombing of...",Nepal,"India, the first non-Arab country to recognise...","[palestine, india, world, pay, party]","[israel, india, state, government, indian]"
7,89555,the-indian-express,The Indian Express,New York Times,No nation in the world is buying more planes t...,India's largest airlines have ordered nearly 1...,https://indianexpress.com/article/business/avi...,https://images.indianexpress.com/2023/11/igiai...,2023-11-02 05:48:58.000000,No nation in the world is buying as many airpl...,Nepal,Written by Alex Travelli and Hari Kumar No nat...,"[world, india, pay, party, palestine]","[india, million, indian, 000, world]"
12,89563,the-times-of-india,The Times of India,Durgesh Nandan Jha,PM Hasina’s war on terror gets daughter India’...,India News: NEW DELHI: India preferred Banglad...,https://timesofindia.indiatimes.com/india/pm-h...,"https://static.toiimg.com/thumb/msid-47529300,...",2023-11-02 01:12:47.000000,Ranked! Worlds most loved landmarks; Taj Mahal...,Nepal,NEW DELHI: India preferred Bangladesh over Nep...,"[india, world, pay, party, palestine]","[india, world, state, people, million]"


### Similarity of keywords between title and keywords in the news

In [125]:
# Convert the list of keywords for each article into a string
limited_news_data['title_keywords_str'] = limited_news_data['title_keywords'].apply(' '.join)
limited_news_data['content_keywords_str'] = limited_news_data['content_keywords'].apply(' '.join)

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Convert the keywords into TF-IDF vectors
title_tfidf = vectorizer.fit_transform(limited_news_data['title_keywords_str'])
content_tfidf = vectorizer.transform(limited_news_data['content_keywords_str'])

# Calculate the cosine similarity between the title and content keywords for each article
similarity_scores = cosine_similarity(title_tfidf, content_tfidf)

# Print the similarity scores
print("The similarity score between keywords in the title and content of the articles is as follows:")
for i, score in enumerate(similarity_scores):
    print(f"article {i+1}: {score[0]}")

The similarity score between keywords in the title and content of the articles is as follows:
article 1: 0.18886369489717672
article 2: 0.41171810764997363
article 3: 0.41171810764997363
article 4: 0.41171810764997363
article 5: 0.41171810764997363
article 6: 0.22109649138855314
article 7: 0.41171810764997363
article 8: 0.41171810764997363
article 9: 0.25158568922967073
article 10: 0.41171810764997363
article 11: 0.2761797570583058
article 12: 0.41171810764997363
article 13: 0.41171810764997363
article 14: 0.41171810764997363
article 15: 0.41171810764997363
article 16: 0.41171810764997363
article 17: 0.41171810764997363
article 18: 0.41171810764997363
article 19: 0.2526691720356441
article 20: 0.41171810764997363


## Categorize the title/content into known set of topic categories

In [126]:
# Import the necessary libraries
from bertopic import BERTopic

# Assume news_data is your DataFrame and 'content' is the column with text data
docs = news_data['content'].values.tolist()

# Create an instance of BERTopic
topic_model = BERTopic()

# Fit BERTopic to your documents and transform the documents into topics
topics, probs = topic_model.fit_transform(docs)

# Get an overview of the topics
topic_overview = topic_model.get_topic_info()

# Print the overview
print(topic_overview)

ModuleNotFoundError: No module named 'bertopic'