In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [34]:
# Load the news articles data
news_en = pd.read_csv('data/news-articles-eng.csv')
print("Successfully loaded", len(news_en), "rows from news-articles-eng.csv")
news_ara = pd.read_csv('data/news-articles-ara.csv')
print("Successfully loaded", len(news_ara), "rows from news-articles-ara.csv")

Successfully loaded 86660 rows from news-articles-eng.csv
Successfully loaded 85511 rows from news-articles-ara.csv


In [None]:
# Investigate specific columns in detail
# 1. isDuplicate column
print("1. isDuplicate Analysis:")
duplicates_eng = news_en[news_en['isDuplicate'] == True]
duplicates_ara = news_ara[news_ara['isDuplicate'] == True]
print(f"\nDuplicate articles: English={len(duplicates_eng):,}, Arabic={len(duplicates_ara):,}")

# 2. dataType column
print(f"\n2. dataType Analysis:")
print(f"English dataType values:", news_en['dataType'].value_counts())
print(f"Arabic dataType values:", news_ara['dataType'].value_counts())

1. isDuplicate Analysis:

Duplicate articles: English=0, Arabic=0

2. dataType Analysis:
English dataType values: dataType
news    86660
Name: count, dtype: int64
Arabic dataType values: dataType
news    85511
Name: count, dtype: int64


So we have 0 duplicates and all articles are tagged as news. I still do not fully grasp what the columns, sim, wgt and relevance are telling me though.

Perhaps: 
- sim: Similarity score (0-1) measuring content similarity to other articles
- wgt: Weight/importance score for the article (not sure how measured or how estimated)
- relevance: Relevance score (perhaps to query/topic - but not sure to what topic and how measured...)
- Sources (different news outlets covering same story)?

In [37]:
# Basic data overview
def summarize_dataset(df, name):
    print(f"\n{name} Dataset Summary:")
    print(f"  Total articles: {len(df):,}")
    print(f"  Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"  Unique sources: {df['source'].apply(lambda x: eval(x)['uri'] if isinstance(x, str) else 'unknown').nunique()}")
    print(f"  Articles with images: {df['image'].notna().sum():,} ({df['image'].notna().mean()*100:.1f}%)")
    print(f"  Articles with authors: {df['authors'].apply(lambda x: len(eval(x)) if isinstance(x, str) and x != '[]' else 0).sum():,}")
    print(f"  Duplicates: {df['isDuplicate'].sum():,} ({df['isDuplicate'].mean()*100:.1f}%)")
    
    # Check for missing values in key columns
    key_cols = ['title', 'body', 'url', 'source', 'date']
    print(f"  Missing values:")
    for col in key_cols:
        missing = df[col].isna().sum()
        print(f"    {col}: {missing:,} ({missing/len(df)*100:.1f}%)")

summarize_dataset(news_en, "English")
summarize_dataset(news_ara, "Arabic")


English Dataset Summary:
  Total articles: 86,660
  Date range: 2024-06-23 to 2024-07-24
  Unique sources: 4759
  Articles with images: 81,476 (94.0%)
  Articles with authors: 35,911
  Duplicates: 0 (0.0%)
  Missing values:
    title: 3 (0.0%)
    body: 0 (0.0%)
    url: 0 (0.0%)
    source: 0 (0.0%)
    date: 0 (0.0%)

Arabic Dataset Summary:
  Total articles: 85,511
  Date range: 2024-06-23 to 2024-07-24
  Unique sources: 415
  Articles with images: 80,132 (93.7%)
  Articles with authors: 15,447
  Duplicates: 0 (0.0%)
  Missing values:
    title: 37 (0.0%)
    body: 0 (0.0%)
    url: 0 (0.0%)
    source: 0 (0.0%)
    date: 0 (0.0%)


# Open Questions and Next Steps
- We have many more articles than sources - would it make sense to use this fact for analysis somehow. Cluster the articles by relevance according to source - focusing on the most relevant source? Or doing some sort of sentiment analysis where we find out if different sources have significant different sentiments?

- Other than that, data looks quite clean and ready to use. We may remove the ones with no titles to keep it really smooth.

- I think whether or not to clean the text is a question without a certain answer. We may lowercaes all, remoce white spaces etc but I think SOTA NLP, LLM tools should be able to handle everything without much cleaning. So perhaps we skip this for now. Let's see if I change my mind later.

In [68]:
def remove_missing_titles(df, dataset_name):
    """Remove rows with missing titles as derived from analysis above."""
    original_count = len(df)
    missing_titles = df['title'].isna().sum()
    
    print(f"\n{dataset_name}:")
    print(f"  Original articles: {original_count:,}")
    print(f"  Missing titles: {missing_titles:,} ({missing_titles/original_count*100:.2f}%)")
    
    # Remove missing titles
    cleaned_df = df.dropna(subset=['title']).copy()
    final_count = len(cleaned_df)
    removed = original_count - final_count
    
    print(f"  Removed: {removed:,} articles")
    
    return cleaned_df

# Clean both datasets
english_news_clean = remove_missing_titles(news_en, "English News")
arabic_news_clean = remove_missing_titles(news_ara, "Arabic News")



English News:
  Original articles: 86,660
  Missing titles: 3 (0.00%)
  Removed: 3 articles

Arabic News:
  Original articles: 85,511
  Missing titles: 37 (0.04%)
  Removed: 37 articles


In [69]:
english_news_clean.columns

Index(['uri', 'lang', 'isDuplicate', 'date', 'time', 'dateTime', 'dateTimePub',
       'dataType', 'sim', 'url', 'title', 'body', 'source', 'authors', 'image',
       'eventUri', 'sentiment', 'wgt', 'relevance', 'userHasPermissions'],
      dtype='object')

In [70]:
def clean_news_text(text):
    """
    Minimal cleaning function for news articles.
    Preserves meaningful content while removing the usual formating artifacts (/n, extra spaces, etc).
    
    Args:
        text (str): Raw news article text
        
    Returns:
        str: Cleaned text
    """
    if not text or not isinstance(text, str):
        return ""
    
    # Replace multiple newlines with single space
    text = re.sub(r'\n+', ' ', text)
    
    # Replace multiple spaces/tabs with single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove leading/trailing whitespace
    text = text.strip() 
    
    return text

In [73]:
# Clean English news with simple apply function to keep it easy here. In scale, perhaps there is a faster way
print("\nCleaning English news...")
english_news_clean['title'] = english_news_clean['title'].apply(clean_news_text)
english_news_clean['body'] = english_news_clean['body'].apply(clean_news_text)

# Clean Arabic news  
print("Cleaning Arabic news...")
arabic_news_clean['title'] = arabic_news_clean['title'].apply(clean_news_text)
arabic_news_clean['body'] = arabic_news_clean['body'].apply(clean_news_text)




Cleaning English news...
Cleaning Arabic news...


In [79]:
# Create non-random subset with seed to ensure reproducibility for easier sampling and development.
# I opt to use a non-random sample to keep the time series structure of the data intact.
# Let's pick a block of n rows from a random start index though.
n_rows_en = len(english_news_clean)
n_rows_ara = len(arabic_news_clean)
block_size = 10000

# set random seed
np.random.seed(42)

# pick a random start index so that the slice fits inside the dataframe
start_en = np.random.randint(0, n_rows_en - block_size + 1)
start_ara = np.random.randint(0, n_rows_ara - block_size + 1)

# slice the dataframe
subset_new_en = english_news_clean.iloc[start_en:start_en + block_size].reset_index(drop=True)
subset_new_ara = arabic_news_clean.iloc[start_ara:start_ara + block_size].reset_index(drop=True)

In [None]:
# Save Excel files
output_path = 'new_data/english_news_clean.xlsx'
english_news_clean.to_excel(output_path, index=False)
print(f"Saved translated data to {output_path}")

Saved translated data to new_data/english_news_clean.xlsx


Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,title,body,source,authors,image,eventUri,sentiment,wgt,relevance,userHasPermissions
0,8216521939,eng,False,2024-07-09,08:15:07,2024-07-09T08:15:07Z,2024-07-09T08:14:39Z,news,0.631373,https://english.enabbaladi.net/archives/2024/0...,Israeli escalation on southern Lebanon : Al-As...,Hussam al-Mahmoud | Yamen Moghrabi | Hassan Ib...,"{'uri': 'english.enabbaladi.net', 'dataType': ...","[{'uri': 'enab10_ula@english.enabbaladi.net', ...",https://cdn.enabbaladi.net/english/wp-content/...,eng-9713305,-0.505882,503,503,
1,8235923227,eng,False,2024-07-21,08:16:52,2024-07-21T08:16:52Z,2024-07-21T08:16:12Z,news,0.0,https://www.jewishpress.com/indepth/analysis/j...,"Israel Should NOT be Stuck with a ""Fair Fight""...",t's an excellent axiom to never put your citiz...,"{'uri': 'jewishpress.com', 'dataType': 'news',...","[{'uri': 'j_e_dyer@jewishpress.com', 'name': '...",https://www.jewishpress.com/wp-content/uploads...,,0.003922,477,477,
2,2024-07-412131568,eng,False,2024-07-05,08:47:41,2024-07-05T08:47:41Z,2024-07-05T08:36:20Z,news,0.788235,https://eu.poconorecord.com/story/opinion/2024...,"Iran and Hamas are targeting Jordan, hoping to...",Iran and Hamas pose an escalating threat to th...,"{'uri': 'eu.poconorecord.com', 'dataType': 'ne...",[],https://www.gannett-cdn.com/authoring/authorin...,eng-9716644,-0.003922,423,423,
3,8191876537,eng,False,2024-06-23,21:11:10,2024-06-23T21:11:10Z,2024-06-23T21:09:27Z,news,0.635294,https://syrianobserver.com/foreign-actors/syri...,Syria Today - Conflict in Lebanon Displaces Sy...,The simmering conflict between Israel and Hezb...,"{'uri': 'syrianobserver.com', 'dataType': 'new...",[],https://syrianobserver.com/wp-content/uploads/...,eng-9669018,-0.545098,423,423,
4,8210853386,eng,False,2024-07-05,08:54:07,2024-07-05T08:54:07Z,2024-07-05T08:53:35Z,news,0.780392,https://www.yahoo.com/news/iran-hamas-targetin...,"Iran and Hamas are targeting Jordan, hoping to...",Iran and Hamas pose an escalating threat to th...,"{'uri': 'yahoo.com', 'dataType': 'news', 'titl...",[],https://s.yimg.com/ny/api/res/1.2/DmwzdEvMx2_4...,eng-9716644,-0.129412,418,418,


In [None]:
# Save Excel files
output_path = 'new_data/arabic_news_clean.xlsx'
arabic_news_clean.to_excel(output_path, index=False)
print(f"Saved translated data to {output_path}")

Saved translated data to new_data/arabic_news_clean.xlsx


Unnamed: 0,uri,lang,isDuplicate,date,time,dateTime,dateTimePub,dataType,sim,url,title,body,source,authors,image,eventUri,sentiment,wgt,relevance
0,8221345226,ara,False,2024-07-12,00:55:33,2024-07-12T00:55:33Z,2024-07-12T00:55:00Z,news,0.615686,https://www.almanar.com.lb/12211949,الصحافة اليوم: 12-7-2024,‫‫ محمد الهندي: من يرى فيديوهات المقاومين يجب ...,"{'uri': 'almanar.com.lb', 'dataType': 'news', ...",[],https://www.almanar.com.lb/framework/includes/...,ara-1673095,,533,533
1,8202585568,ara,False,2024-06-30,10:00:50,2024-06-30T10:00:50Z,2024-06-30T10:00:18Z,news,0.0,https://www.enabbaladi.net/706124/انفجار-جنوبي...,انفجار جنوبي لبنان.. الأسد حاضر دون أدوات,حسام المحمود | يامن المغربي | حسن إبراهيم أفاق...,"{'uri': 'enabbaladi.net', 'dataType': 'news', ...",[],https://cdn.enabbaladi.net/arabic/wp-content/u...,,,516,516
2,8203455578,ara,False,2024-07-01,02:22:38,2024-07-01T02:22:38Z,2024-07-01T02:22:23Z,news,0.0,https://www.almanar.com.lb/12164385,الصحافة اليوم: 1-7-2024,‫‫الحزب الديمقراطي اللبناني في ذكرى تأسيسه الـ...,"{'uri': 'almanar.com.lb', 'dataType': 'news', ...",[],https://www.almanar.com.lb/framework/includes/...,,,458,458
3,8201361178,ara,False,2024-06-29,10:09:34,2024-06-29T10:09:34Z,2024-06-29T10:09:15Z,news,0.0,https://www.shorouknews.com/news/view.aspx?cda...,الكاتب الفلسطيني ربعي المدهون في حوار مطوّل: ا...,الفارق بين حرب غزة والهولوكوست هو أن الحرب تُب...,"{'uri': 'shorouknews.com', 'dataType': 'news',...",[],https://www.shorouknews.com/uploadedimages/Gal...,,,455,455
4,8201914853,ara,False,2024-06-29,19:34:06,2024-06-29T19:34:06Z,2024-06-29T19:33:54Z,news,0.0,https://www.maghress.com/lakome/348309,"""طوفان الأقصى"": الحاجة إلى الحرب.. مرحلة ما بع...","""إذا أراد الله أحداثا عظيمة هيأ أسبابها، تخطئ ...","{'uri': 'maghress.com', 'dataType': 'news', 't...",[],https://images2.maghress.com/lakome/lakome,,,452,452
