# Data Exploration

This notebook explores the dataset, shows statistics, visualizes class distribution, and analyzes text characteristics.


In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the parent of the current working directory importable so that the
# `src` package and the `config` module can be resolved.
# NOTE(review): this assumes the notebook is launched from a direct
# subdirectory of the project root -- confirm against the repo layout.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
_project_root = str(Path().resolve().parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)
from src import data_preprocessing
import config

# Global plot styling used by every figure in this notebook
plt.style.use('seaborn-v0_8')
sns.set_palette('Set2')


## Load Dataset

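First, create a sample dataset for demonstration. To explore real data instead, load it from a file using the commented-out `load_dataset` call in the cell below.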


In [None]:
# Demonstration data: build a sample dataset of 1000 rows.
# To explore a real dataset, replace the call below with the file loader:
# df = data_preprocessing.load_dataset(Path("data/raw/news_data.csv"))
df = data_preprocessing.create_sample_dataset(n_samples=1000)

# Quick overview: dimensions, column names, then a preview of the first rows
print("Dataset shape:", df.shape)
print()
print("Columns:", df.columns.tolist())
df.head()


## Dataset Statistics


In [None]:
# Basic statistics: structure, numeric summary, and missing-value counts
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n" + "="*50)
print("\nBasic Statistics:")
print(df.describe())
print("\n" + "="*50)
print("\nMissing Values:")
print(df.isnull().sum())


## Class Distribution


In [None]:
# Class distribution: print counts and percentages, then draw a bar chart
if 'label' in df.columns:
    class_counts = df['label'].value_counts()
    print("Class Distribution:")
    print(class_counts)
    print("\nPercentage:")
    print(class_counts / len(df) * 100)

    # value_counts() orders rows by frequency, so the position of each class
    # depends on the data. Sort by label value for plotting and derive the
    # tick names and colors from the index, so a bar can never be captioned
    # with the wrong class (e.g. when label 1 is the more frequent one).
    label_names = {0: 'Real', 1: 'Fake'}
    label_colors = {0: 'skyblue', 1: 'lightcoral'}
    counts_by_label = class_counts.sort_index()
    # Labels outside the 0/1 convention fall back to their own value and a
    # neutral color instead of breaking the tick/label count.
    tick_names = [label_names.get(label, str(label)) for label in counts_by_label.index]
    bar_colors = [label_colors.get(label, 'lightgray') for label in counts_by_label.index]

    # Visualize
    fig, ax = plt.subplots(figsize=(8, 6))
    counts_by_label.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_xlabel('Label (0=Real, 1=Fake)', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
    ax.set_xticklabels(tick_names, rotation=0)
    plt.tight_layout()
    plt.show()


## Text Characteristics Analysis


In [None]:
# Derive per-row size features from the text and compare them across labels
if 'text' in df.columns:
    # Character count and whitespace-separated token count for every row
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()

    # Compare real vs fake
    if 'label' in df.columns:
        # One entry per panel: (column to plot, panel title, y-axis label)
        panels = [
            ('text_length', 'Text Length by Label', 'Text Length (characters)'),
            ('word_count', 'Word Count by Label', 'Word Count'),
        ]

        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        for panel_ax, (column, title, y_label) in zip(axes, panels):
            # Box plot of the feature grouped by label, drawn on this panel
            df.boxplot(column=column, by='label', ax=panel_ax)
            panel_ax.set_title(title, fontweight='bold')
            panel_ax.set_xlabel('Label (0=Real, 1=Fake)')
            panel_ax.set_ylabel(y_label)

        plt.tight_layout()
        plt.show()

        # Summary statistics of both features for each label
        print("\nText Statistics by Label:")
        print(df.groupby('label')[['text_length', 'word_count']].describe())


## Sample Posts


In [None]:
# Show up to three sample posts per class, truncated to a short preview
if 'text' in df.columns and 'label' in df.columns:
    preview_chars = 200
    # (heading to print, label value to filter on); 0=Real, 1=Fake
    sections = [
        ("Sample Real News Posts:", 0),
        ("\n\nSample Fake News Posts:", 1),
    ]

    for heading, label_value in sections:
        print(heading)
        print("="*50)
        # Drop missing texts before sampling: slicing a NaN (a float) would
        # raise TypeError, and dropping first still yields up to three posts.
        samples = (
            df.loc[df['label'] == label_value, 'text']
            .dropna()
            .head(3)
            .astype(str)
        )
        for i, text in enumerate(samples, 1):
            # Only mark the preview with an ellipsis when text was actually cut
            suffix = "..." if len(text) > preview_chars else ""
            print(f"\n{i}. {text[:preview_chars]}{suffix}")
