# Factiva News Data Exploration

This notebook performs exploratory data analysis (EDA) on the Factiva News dataset from 2025. 

## Objectives:
- Load and examine sample data files
- Analyze metadata structure and content
- Understand data quality and characteristics
- Identify key fields and patterns


In [32]:

# Import necessary libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
from joblib import Parallel, delayed
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")


Libraries imported successfully!


In [33]:
# Define data directory path
data_dir = Path("/ephemeral/home/xiong/data/Fund/Factiva_News/2025")

# List all available files
print("Available data files:")
json_files = list(data_dir.glob("*.json"))
for i, file in enumerate(json_files):
    file_size_mb = file.stat().st_size / (1024 * 1024)
    print(f"{i+1}. {file.name} ({file_size_mb:.1f} MB)")

# Select the first file as sample
sample_file = json_files[0]
print(f"\nSelected sample file: {sample_file.name}")
print(f"File size: {sample_file.stat().st_size / (1024 * 1024):.1f} MB")


Available data files:
1. 2025_articles_4.json (635.4 MB)
2. 2025_articles.json (632.1 MB)
3. 2025_articles_1.json (636.8 MB)
4. 2025_articles_3.json (634.3 MB)
5. 2025_articles_5.json (633.0 MB)
6. 2025_articles_7.json (629.7 MB)
7. 2025_articles_2.json (639.1 MB)
8. 2025_articles_6.json (632.3 MB)

Selected sample file: 2025_articles_4.json
File size: 635.4 MB


In [34]:
# Load a sample of the data to understand structure
def load_data(file_path, sample_size=None):
    """
    Load a sample of articles from the JSON file
    
    Args:
        file_path: Path to the JSON file
        sample_size: Number of articles to load. If None, loads all articles
    """
    articles = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        if sample_size is None:
            articles = data
        else:
            articles = data[:sample_size]

    except Exception as e:
        print(f"Error loading data: {e}")
        return []
    return articles


In [35]:
# Load sample data
sample_articles = load_data(sample_file)
print(len(sample_articles))

89557


In [36]:
def transform_dates(articles, verbose=True):
    """Transform date/datetime fields to readable format"""
    if not articles:
        return articles
    
    # Handle single article case
    if isinstance(articles, dict):
        articles = [articles]
    
    date_patterns = {'date', 'datetime', 'time', 'timestamp', 'created', 'modified', 
                     'published', 'ingestion', 'publication', 'modification'}
    
    total_transformed = 0
    
    for article in articles:
        if not isinstance(article, dict):
            continue
        
        for key, value in article.items():
            if (any(p in key.lower() for p in date_patterns) and 
                isinstance(value, str) and value.isdigit()):
                try:
                    ts = int(value) / (1000 if len(value) == 13 else 1)
                    article[key] = datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
                    total_transformed += 1
                except (ValueError, OSError):
                    continue
    
    if verbose:
        print(f"Transformed {total_transformed} date fields")
    
    # Return single article if input was single article
    if len(articles) == 1 and isinstance(articles[0], dict):
        return articles[0]
    return articles



In [37]:
# Apply date transformation
# sample_articles = transform_dates(sample_articles,verbose=True)

In [None]:
def prepare_articles_for_dataframe(articles,org_filename=None,
                                   transform_date_fields=True,
                                   other_transform_func=None,
                                   verbose=True):
    """
    Process articles to prepare for DataFrame conversion:
    - Remove 'body' and 'snippet' fields
    - Add 'original_filename' field
    - Convert to pandas DataFrame
    """
    if not articles:
        return pd.DataFrame()
    
    processed_articles = []
    
    for article in articles:
        if isinstance(article, dict):
            # Create a copy to avoid modifying original
            processed_article = article.copy()
            # Remove body and snippet fields
            processed_article.pop('body', None)
            processed_article.pop('snippet', None)
            # Add original_filename field (you may need to adjust this based on your data source)
            if org_filename is not None:
                processed_article['original_filename'] = org_filename
            else:
                processed_article['original_filename'] = ''  # Adjust as needed
            if transform_date_fields:
                processed_article = transform_dates(processed_article,verbose=False)
            if other_transform_func is not None:
                processed_article = other_transform_func(processed_article)
            processed_articles.append(processed_article)

    # Convert to DataFrame
    df = pd.DataFrame(processed_articles)
    if verbose:
        print(f"Processed {len(processed_articles)} articles into DataFrame")
        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
    
    return df




In [43]:
# Process articles and create DataFrame
articles_df = prepare_articles_for_dataframe(sample_articles)

Processed 89557 articles into DataFrame
DataFrame shape: (89557, 41)
Columns: ['source_name', 'ingestion_datetime', 'currency_codes', 'company_codes_association_ticker_exchange', 'title', 'company_codes_lineage_ticker_exchange', 'an', 'company_codes_occur_ticker_exchange', 'copyright', 'modification_date', 'industry_codes', 'language_code', 'company_codes_ticker_exchange', 'region_of_origin', 'company_codes_relevance', 'word_count', 'company_codes_about', 'byline', 'person_codes', 'company_codes_lineage', 'publication_datetime', 'publication_date', 'subject_codes', 'publisher_name', 'source_code', 'credit', 'art', 'company_codes_association', 'document_type', 'modification_datetime', 'company_codes', 'action', 'region_codes', 'market_index_codes', 'company_codes_about_ticker_exchange', 'company_codes_occur', 'section', 'company_codes_relevance_ticker_exchange', 'original_filename', 'dateline', 'availability_datetime']


In [45]:
articles_df.head()

Unnamed: 0,source_name,ingestion_datetime,currency_codes,company_codes_association_ticker_exchange,title,company_codes_lineage_ticker_exchange,an,company_codes_occur_ticker_exchange,copyright,modification_date,...,action,region_codes,market_index_codes,company_codes_about_ticker_exchange,company_codes_occur,section,company_codes_relevance_ticker_exchange,original_filename,dateline,availability_datetime
0,MarketLine Industry Profiles,2025-05-09 11:56:39,,,South Korea - Security Services - Market Overview,,DMRP000020250509el3d000fs,,"© 2025, MarketLine. All rights reserved",2025-05-10 07:32:52,...,add,",apacz,asiaz,easiaz,skorea,",,,",twban,",,,,,
1,La Tribuna de Ciudad Real,2025-05-01 07:20:34,,,Habrá plan de control de subvenciones a grupos...,,TRIBCR0020250501el4u00002,,Copyright 2025. La Tribuna de Ciudad Real,2025-05-02 07:25:09,...,add,",castil,eecz,eurz,medz,spain,weurz,",,,,Cortes Regionales,,,,
2,Agência CMA Latam,2025-04-25 18:25:08,,,"RADAR COLOMBIA: Tensión con Congreso, reformas...",,AGCMALAM20250425el4p002mh,,Copyright 2025 © Agência CMA.,2025-05-23 07:26:38,...,rep,",col,devgcoz,dvpcoz,lamz,samz,",,,,,,,,
3,中国经济信息网 (简体),2025-04-29 12:11:33,,,中小银行IPO漫漫长跑路,,CEINCN0020250429el4t0001s,",6030:XHKG,CI9:XFRA,600030:XSHG,CIIHY:PINX,CII...",(c) 2025 中国经济信息网版权所有,2025-05-16 07:22:26,...,rep,",apacz,asiaz,china,chinaz,devgcoz,dvpcoz,easiaz,",,,",citich,",,,,"北京, 2025年4月29日",2025-04-29 12:13:51
4,Cinco Dias.com,2025-04-29 07:01:20,,,Eléctricas y aseguradoras cubrirán parte de lo...,,CINCOM0020250429el4t00009,,"© DIARIO CINCO DÍAS, S.A",2025-05-14 07:31:38,...,rep,",eecz,eurz,medz,spain,weurz,",,,,Companías,,,,
