<a href="https://colab.research.google.com/github/jwym02/data-vibes/blob/main/data_cleaning_and_topic_modelling_for_news_excerpt_(bia_datathon).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib as plt
import networkx as nx
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

In [None]:
# Download necessary NLTK data files (if not already available).
# 'wordnet' and 'omw-1.4' are presumably the corpora the WordNetLemmatizer
# used later in this notebook depends on.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Load the parsed news excerpts workbook into a DataFrame.
# The path is relative, so the file is looked up from the notebook's working directory.
news_path = "news_excerpts_parsed.xlsx"
news_df = pd.read_excel(news_path)

In [None]:
# Keep the first rows of the news DataFrame (DataFrame.head()) as a quick preview
# of its structure and content; the preview is displayed in the next cell.
news_excerpts_summary = news_df.head()


In [None]:
# Render the preview of the news DataFrame captured in the previous cell
news_excerpts_summary

Unnamed: 0,Link,Text
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...


Preprocessing


In [None]:
# Preprocessing function
def preprocess_text(text):
    """Strip everything except ASCII letters and whitespace, then lowercase and trim.

    Digits and punctuation are deleted rather than replaced with a space, so
    hyphenated or possessive forms are fused together
    (e.g. "record-breaking" -> "recordbreaking").

    Parameters
    ----------
    text : str or missing value
        Raw excerpt. Non-string, non-missing values are converted with ``str``.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when ``text`` is missing (None / NaN).
        Returning ``None`` lets a later ``dropna`` discard the row instead of
        ``re.sub`` raising ``TypeError`` on a float NaN.
    """
    if not isinstance(text, str):
        # Empty spreadsheet cells arrive as NaN; keep them missing.
        if pd.isna(text):
            return None
        text = str(text)
    # Keep only letters and whitespace (digits are removed as well)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower().strip()

# Apply preprocessing steps
news_df['Cleaned_Text'] = news_df['Text'].apply(preprocess_text)

# Remove duplicates and empty rows.
# dropna() only catches missing values; an excerpt made up solely of digits or
# punctuation is cleaned down to '' and must be filtered out explicitly.
# .copy() gives an independent frame so later column assignments do not hit
# pandas' chained-assignment warning.
news_df = (
    news_df
    .dropna(subset=['Cleaned_Text'])
    .loc[lambda df: df['Cleaned_Text'].ne('')]
    .drop_duplicates(subset=['Cleaned_Text'])
    .copy()
)

# Initialize lemmatizer: one WordNetLemmatizer instance, created once and
# referenced as a module-level name by lemmatize_text
lemmatizer = WordNetLemmatizer()

# Lemmatization function
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is passed to the module-level ``lemmatizer.lemmatize`` with its
    default arguments, and the results are joined back with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

# Apply lemmatization to the cleaned text, storing the result in a new
# 'Lemmatized_Text' column (the 'Cleaned_Text' column itself is not modified here)
news_df['Lemmatized_Text'] = news_df['Cleaned_Text'].apply(lemmatize_text)



In [None]:
# Review the lemmatized data: a blank line, the heading, then the first 5 rows.
# One print call with a newline separator emits the same text as two calls.
print("\nLemmatized News Data:", news_df.head(), sep="\n")



Lemmatized News Data:
                                                Link  \
0  https://edition.cnn.com/2023/09/29/business/st...   
1  https://www.channelnewsasia.com/singapore/su-w...   
2  https://edition.cnn.com/2023/05/22/tech/meta-f...   
3  https://www.channelnewsasia.com/singapore/bill...   
4  https://edition.cnn.com/2024/03/05/politics/li...   

                                                Text  \
0  Starbucks violated federal labor law when it i...   
1  The first suspect to plead guilty in Singapore...   
2  Meta has been fined a record-breaking €1.2 bil...   
3  SINGAPORE: A 45-year-old man linked to Singapo...   
4  The Department of Education imposed a record $...   

                                        Cleaned_Text  \
0  starbucks violated federal labor law when it i...   
1  the first suspect to plead guilty in singapore...   
2  meta has been fined a recordbreaking  billion ...   
3  singapore a yearold man linked to singapores l...   
4  the department of ed

In [None]:
# Collapse every run of one or more newline characters into a single space.
# Other whitespace (for example repeated spaces) is left as-is.
news_df['Cleaned_Text'] = news_df['Cleaned_Text'].str.replace(r'\n+', ' ', regex=True)


In [None]:
# Spot-check the first few values of the 'Cleaned_Text' column
print(news_df['Cleaned_Text'].head())

0    starbucks violated federal labor law when it i...
1    the first suspect to plead guilty in singapore...
2    meta has been fined a recordbreaking  billion ...
3    singapore a yearold man linked to singapores l...
4    the department of education imposed a record  ...
Name: Cleaned_Text, dtype: object


In [None]:
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then build a set from the English list.
# A set is used so it can be extended with custom entries via update().
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Custom stop words, grouped by theme; they are merged into `stop_words` right after
custom_stop_words = set().union(
    # Calendar months
    ("january", "february", "march", "april", "may", "june",
     "july", "august", "september", "october", "november", "december"),
    # Time units
    ("year", "years", "week", "weeks", "day", "days", "month", "months"),
    # Number words
    ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"),
    ("first", "second", "third"),
    # Organisational / reporting vocabulary
    ("united", "nations", "committee", "report", "review", "internal",
     "official", "officer", "program", "project"),
    # Places
    ("airport", "terminal", "location", "place", "zone"),
    # Qualifiers
    ("approximately", "next", "prior", "details", "percent", "inclusive"),
    # New additions
    ("summer", "winter", "fall", "spring"),
    ("memo", "dra", "control", "panel", "annual", "result"),
    ("us", "uk", "eu", "board", "group", "provisional"),
    ("department", "infrastructure", "agency", "also"),
    ("said", "mr", "dr"),
)
# Extend the stop-word set in place with the custom entries
stop_words.update(custom_stop_words)

# Modal / auxiliary verbs to drop alongside the stop words.
# Contractions are spelled without apostrophes ("cant", "dont") because
# preprocess_text deletes punctuation before this list is applied.
# Kept as a list in first-occurrence order; each entry appears exactly once
# ("would" was previously listed twice).
modal_verbs = ["would", "can", "cant", "could", "couldnt", "did", "didnt", "may", "might", "must", "mustnt",
    "shall", "shant", "should", "shouldnt", "will", "wont", "wouldnt", "dont"]

# Function to remove stop words
def remove_stopwords(text):
    """Return ``text`` without stop words or modal verbs.

    The text is split on whitespace; any token found in the module-level
    ``stop_words`` or ``modal_verbs`` collections is discarded, and the
    surviving tokens are re-joined with single spaces.
    """
    kept = (
        token
        for token in text.split()
        if not (token in stop_words or token in modal_verbs)
    )
    return ' '.join(kept)

# Apply to the cleaned text column.
# NOTE(review): this reads 'Cleaned_Text', not 'Lemmatized_Text', so the lemmatized
# column computed earlier does not feed 'Text_No_Stopwords' — confirm this is intended.
news_df['Text_No_Stopwords'] = news_df['Cleaned_Text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# The vectorizer object will be used to transform text to count-vector form.
# max_df=0.9 (a proportion) and min_df=25 (an absolute count) bound the document
# frequency of the terms that are kept.
# token_pattern is a raw string so the regex escapes (\w, \$, \d, \S) reach the
# regex engine untouched instead of triggering Python's invalid-escape warning.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# Apply the transformation; .toarray() converts the result to a dense array
tf = vectorizer.fit_transform(news_df['Text_No_Stopwords']).toarray()

# tf_feature_names tells us what word each column in the matrix represents
tf_feature_names = vectorizer.get_feature_names_out()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics (n_components) the LDA model is configured with
number_of_topics = 8

# random_state=0 is passed explicitly, presumably so repeated runs give the same topics
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [None]:
# Fit the LDA model on the document-term count array `tf`
model.fit(tf)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row holds one topic's per-feature weights.
    feature_names : sequence
        ``feature_names[i]`` is the word for column ``i`` of ``components_``.
    no_top_words : int
        How many words to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, "Topic N words" and "Topic N weights"
        (weights formatted to one decimal place), ordered from the largest
        weight to the smallest.
    """
    columns = {}
    for idx, weights in enumerate(model.components_):
        # argsort is ascending; the reversed slice takes the last
        # `no_top_words` indices, i.e. the largest weights first.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        columns["Topic %d words" % (idx)] = [
            '{}'.format(feature_names[j]) for j in top_indices
        ]
        columns["Topic %d weights" % (idx)] = [
            '{:.1f}'.format(weights[j]) for j in top_indices
        ]
    return pd.DataFrame(columns)

In [None]:
# Show the 8 highest-weighted words (and their weights) for each topic
no_top_words = 8
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights
0,south,98.9,media,123.0,singapore,155.2,court,131.3,per,212.0,time,94.9,singapore,192.7,china,226.9
1,north,86.0,social,87.2,family,111.1,yearold,70.0,company,203.5,school,88.6,hospital,87.6,trade,169.4
2,korea,78.1,company,51.2,people,71.5,found,56.6,cent,192.1,university,75.0,university,81.8,sanctions,118.1
3,singapore,74.6,last,48.1,like,70.3,law,51.8,billion,162.8,ms,74.0,hong,81.0,foreign,109.9
4,people,69.8,world,43.6,lee,54.4,case,49.7,million,159.9,new,56.6,national,79.0,chinese,95.1
5,minister,56.3,time,43.6,food,51.2,last,48.3,technology,110.9,health,55.4,research,76.1,russia,85.7
6,israel,53.1,news,37.2,back,48.7,data,47.5,ai,95.4,people,55.3,covid,62.4,relations,71.2
7,new,52.1,ukraine,36.1,made,46.0,guilty,47.1,market,91.7,tan,48.7,health,59.8,government,68.1


In [None]:
# Compare the text before and after stop-word removal for the first rows
print(news_df[['Cleaned_Text', 'Text_No_Stopwords']].head())

                                        Cleaned_Text  \
0  starbucks violated federal labor law when it i...   
1  the first suspect to plead guilty in singapore...   
2  meta has been fined a recordbreaking  billion ...   
3  singapore a yearold man linked to singapores l...   
4  the department of education imposed a record  ...   

                                   Text_No_Stopwords  
0  starbucks violated federal labor law increased...  
1  suspect plead guilty singapores largest money ...  
2  meta fined recordbreaking billion billion euro...  
3  singapore yearold man linked singapores larges...  
4  education imposed record million fine liberty ...  


In [None]:
# import spacy
# from itertools import combinations
# from collections import Counter
# import networkx as nx
# import matplotlib.pyplot as plt

# # Load spaCy's English model
# nlp = spacy.load("en_core_web_sm")

# # Function to extract entities
# def extract_entities(text):
#     doc = nlp(text)
#     entities = [ent.text for ent in doc.ents]
#     return entities

# # Function to extract relationships (co-occurrence of entities)
# def extract_relationships(text):
#     entities = extract_entities(text)
#     return list(combinations(entities, 2)) if len(entities) > 1 else []

# # Apply entity and relationship extraction to the dataset
# news_df['Relationships'] = news_df['Text_No_Stopwords'].apply(extract_relationships)

In [None]:
# from itertools import combinations
# import pandas as pd

# # Sample function to extract relationships based on co-occurrence
# def extract_relationships(text, entities):
#     # Find all entity pairs
#     entity_pairs = list(combinations(entities, 2))
#     return entity_pairs

# # Example data for entities (mock entity extraction)
# entities = ["airport", "administrative", "division"]

# # Apply to the dataset
# news_df['Relationships'] = news_df['Text_No_Stopwords'].apply(
#     lambda x: extract_relationships(x, entities)  # Replace `entities` with actual entity extraction logic
# )


In [None]:
# List the columns currently held by news_df.
# NOTE(review): the saved output of this cell includes 'Relationships', but the only
# code that creates that column sits in the commented-out cells above, so a fresh
# Restart & Run All would presumably not produce it — confirm and re-run.
news_df.columns

Index(['Link', 'Text', 'Cleaned_Text', 'Lemmatized_Text', 'Text_No_Stopwords',
       'Relationships'],
      dtype='object')

In [None]:
# Export the processed DataFrame to 'news_cleaned.csv' without writing the index
news_df.to_csv('news_cleaned.csv', index=False)