In [7]:
# ===================================================================================
# THE DEFINITIVE DATA PREPARATION SCRIPT (FOR VS CODE)
# This script combines all 27+ datasets into a single, clean, balanced master file.
# ===================================================================================

import pandas as pd
import os

print("--- Worldwide Information Credibility Dataset Builder ---")

# --- Step 1: Define File Lists Based on Our Strategy ---
# We intentionally EXCLUDE test.csv and valid_data.csv from the training set.
files_with_internal_labels = [
    'fake_news_dataset.csv', 'train.csv', 'train_data.csv', 'news_dataset.csv'
]
files_to_label_as_credible = {
    'True.csv': ['title', 'text'], 'abcnews-date-text.csv': ['headline_text'],
    'articles.csv': ['text'], 'BBCHindi.csv': ['description'],
    'bbc-news-data.csv': ['content'], 'business_data.csv': ['text'],
    'DainikBhaskar.csv': ['content'], 'DataSet_Misinfo_TRUE.csv': ['text'],
    'education_data.csv': ['text'], 'entertainment_data.csv': ['text'],
    'IndianFinancialNews.csv': ['Title', 'Description'], 'JagranNews.csv': ['content'],
    'kosmopulse_articles_with_entities.csv': ['text'], 'Live_Hindustan.csv': ['content'],
    'NavbharatTimes.csv': ['content'], 'news_articles.csv': ['text'],
    'news_summary.csv': ['text'], 'news_summary_more.csv': ['headlines', 'text'],
    'sports_data.csv': ['text'], 'technology_data.csv': ['text'],
}
files_to_label_as_misinformation = {
    'Fake.csv': ['title', 'text'], 'EXTRA_RussianPropagandaSubset.csv': ['content'],
    'DataSet_Misinfo_FAKE.csv': ['text']
}

all_dataframes = []
processed_log = []
error_log = []

# --- Step 2: Processing Logic ---
def process_file(filepath, label, text_columns):
    global all_dataframes, processed_log, error_log
    try:
        df = pd.read_csv(filepath, on_bad_lines='skip')
        df['label'] = label
        df['text'] = df[text_columns].fillna('').agg(' '.join, axis=1)
        all_dataframes.append(df[['text', 'label']])
        processed_log.append(filepath)
    except Exception as e:
        error_log.append((filepath, e))

print("\n--- Processing Files ---")
for fp, cols in files_to_label_as_credible.items(): process_file(fp, 1, cols)
for fp, cols in files_to_label_as_misinformation.items(): process_file(fp, 0, cols)
for fp in files_with_internal_labels:
    try:
        df = pd.read_csv(fp, on_bad_lines='skip')
        if 'label' in df.columns:
            if pd.api.types.is_numeric_dtype(df['label']):
                df['label'] = df['label'].apply(lambda x: 1 if x == 1 else 0)
            else:
                df['label'] = df['label'].astype(str).str.lower().apply(lambda x: 0 if 'fake' in x else 1)
            text_parts = []
            for col in ['title', 'author', 'text']:
                if col in df.columns: text_parts.append(df[col].fillna(''))
            df['text'] = pd.concat(text_parts, axis=1).agg(' '.join, axis=1)
            all_dataframes.append(df[['text', 'label']])
            processed_log.append(fp)
        else:
            error_log.append((fp, "No 'label' column found"))
    except Exception as e:
        error_log.append((fp, e))

# --- Step 3: Combine, Clean, and Balance ---
if not all_dataframes:
    print("\nCRITICAL: No data was processed. Please check file names.")
else:
    print("\nCombining all processed data...")
    df_master = pd.concat(all_dataframes, ignore_index=True)
    df_master.dropna(subset=['text'], inplace=True)
    df_master = df_master[df_master['text'].str.strip() != '']
    df_master.drop_duplicates(subset=['text'], inplace=True)
    
    print("\n--- Balancing the Dataset (Downsampling) ---")
    misinfo_df = df_master[df_master['label'] == 0]
    credible_df = df_master[df_master['label'] == 1]
    
    if len(credible_df) > len(misinfo_df):
        credible_downsampled_df = credible_df.sample(n=len(misinfo_df), random_state=42)
        df_balanced = pd.concat([misinfo_df, credible_downsampled_df])
    else: # In case there's more misinformation, which is unlikely but safe to handle
        misinfo_downsampled_df = misinfo_df.sample(n=len(credible_df), random_state=42)
        df_balanced = pd.concat([misinfo_downsampled_df, credible_df])

    df_final_training = df_balanced.sample(frac=1).reset_index(drop=True)

    print(f"Final Balanced Training Set Size: {len(df_final_training)}")
    print(f"CREDIBLE articles in final set: {df_final_training['label'].value_counts().get(1, 0)}")
    print(f"MISINFORMATION articles in final set: {df_final_training['label'].value_counts().get(0, 0)}")

    # --- Step 4: Save the Final Dataset ---
    print("\nSaving the final, balanced training dataset...")
    df_final_training.to_csv('master_balanced_training_data.csv', index=False)
    print("SUCCESS: File 'master_balanced_training_data.csv' saved!")


--- Worldwide Information Credibility Dataset Builder ---

--- Processing Files ---

Combining all processed data...

--- Balancing the Dataset (Downsampling) ---
Final Balanced Training Set Size: 160090
CREDIBLE articles in final set: 80045
MISINFORMATION articles in final set: 80045

Saving the final, balanced training dataset...
SUCCESS: File 'master_balanced_training_data.csv' saved!


In [8]:
# ===================================================================================
# THE DEFINITIVE DATA PREPARATION SCRIPT (FOR VS CODE)
# This script combines all 27+ datasets into a single, clean, balanced master file.
# ===================================================================================

import pandas as pd
import os

print("--- Worldwide Information Credibility Dataset Builder ---")

# --- Step 1: Define File Lists Based on Our Strategy ---
# We intentionally EXCLUDE test.csv and valid_data.csv from the training set.
files_with_internal_labels = [
    'fake_news_dataset.csv', 'train.csv', 'train_data.csv', 'news_dataset.csv'
]
files_to_label_as_credible = {
    'True.csv': ['title', 'text'], 'abcnews-date-text.csv': ['headline_text'],
    'articles.csv': ['text'], 'BBCHindi.csv': ['description'],
    'bbc-news-data.csv': ['content'], 'business_data.csv': ['text'],
    'DainikBhaskar.csv': ['content'], 'DataSet_Misinfo_TRUE.csv': ['text'],
    'education_data.csv': ['text'], 'entertainment_data.csv': ['text'],
    'IndianFinancialNews.csv': ['Title', 'Description'], 'JagranNews.csv': ['content'],
    'kosmopulse_articles_with_entities.csv': ['text'], 'Live_Hindustan.csv': ['content'],
    'NavbharatTimes.csv': ['content'], 'news_articles.csv': ['text'],
    'news_summary.csv': ['text'], 'news_summary_more.csv': ['headlines', 'text'],
    'sports_data.csv': ['text'], 'technology_data.csv': ['text'],
}
files_to_label_as_misinformation = {
    'Fake.csv': ['title', 'text'], 'EXTRA_RussianPropagandaSubset.csv': ['content'],
    'DataSet_Misinfo_FAKE.csv': ['text']
}

all_dataframes = []
processed_log = []
error_log = []

# --- Step 2: Processing Logic ---
def process_file(filepath, label, text_columns):
    global all_dataframes, processed_log, error_log
    try:
        df = pd.read_csv(filepath, on_bad_lines='skip')
        df['label'] = label
        df['text'] = df[text_columns].fillna('').agg(' '.join, axis=1)
        all_dataframes.append(df[['text', 'label']])
        processed_log.append(filepath)
    except Exception as e:
        error_log.append((filepath, e))

print("\n--- Processing Files ---")
for fp, cols in files_to_label_as_credible.items(): process_file(fp, 1, cols)
for fp, cols in files_to_label_as_misinformation.items(): process_file(fp, 0, cols)
for fp in files_with_internal_labels:
    try:
        df = pd.read_csv(fp, on_bad_lines='skip')
        if 'label' in df.columns:
            if pd.api.types.is_numeric_dtype(df['label']):
                df['label'] = df['label'].apply(lambda x: 1 if x == 1 else 0)
            else:
                df['label'] = df['label'].astype(str).str.lower().apply(lambda x: 0 if 'fake' in x else 1)
            text_parts = []
            for col in ['title', 'author', 'text']:
                if col in df.columns: text_parts.append(df[col].fillna(''))
            df['text'] = pd.concat(text_parts, axis=1).agg(' '.join, axis=1)
            all_dataframes.append(df[['text', 'label']])
            processed_log.append(fp)
        else:
            error_log.append((fp, "No 'label' column found"))
    except Exception as e:
        error_log.append((fp, e))

# --- Step 3: Combine, Clean, and Balance ---
if not all_dataframes:
    print("\nCRITICAL: No data was processed. Please check file names.")
else:
    print("\nCombining all processed data...")
    df_master = pd.concat(all_dataframes, ignore_index=True)
    df_master.dropna(subset=['text'], inplace=True)
    df_master = df_master[df_master['text'].str.strip() != '']
    df_master.drop_duplicates(subset=['text'], inplace=True)
    
    print("\n--- Balancing the Dataset (Downsampling) ---")
    misinfo_df = df_master[df_master['label'] == 0]
    credible_df = df_master[df_master['label'] == 1]
    
    if len(credible_df) > len(misinfo_df):
        credible_downsampled_df = credible_df.sample(n=len(misinfo_df), random_state=42)
        df_balanced = pd.concat([misinfo_df, credible_downsampled_df])
    else: # In case there's more misinformation, which is unlikely but safe to handle
        misinfo_downsampled_df = misinfo_df.sample(n=len(credible_df), random_state=42)
        df_balanced = pd.concat([misinfo_downsampled_df, credible_df])

    df_final_training = df_balanced.sample(frac=1).reset_index(drop=True)

    print(f"Final Balanced Training Set Size: {len(df_final_training)}")
    print(f"CREDIBLE articles in final set: {df_final_training['label'].value_counts().get(1, 0)}")
    print(f"MISINFORMATION articles in final set: {df_final_training['label'].value_counts().get(0, 0)}")

    # --- Step 4: Save the Final Dataset ---
    print("\nSaving the final, balanced training dataset...")
    df_final_training.to_csv('master_balanced_training_data.csv', index=False)
    print("SUCCESS: File 'master_balanced_training_data.csv' saved!")


--- Worldwide Information Credibility Dataset Builder ---

--- Processing Files ---

Combining all processed data...

--- Balancing the Dataset (Downsampling) ---
Final Balanced Training Set Size: 160090
CREDIBLE articles in final set: 80045
MISINFORMATION articles in final set: 80045

Saving the final, balanced training dataset...
SUCCESS: File 'master_balanced_training_data.csv' saved!
