# SentiSight - Exploratory Data Analysis

This notebook performs exploratory data analysis on the customer feedback dataset.
**Important:** The dataset is large, so we use chunked loading to avoid memory issues.

## 1. Import Libraries and Setup

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import sys
import warnings

# Put the parent directory on the import path so the local `src` package resolves
sys.path.append('../')
from src.preprocessing import DataLoader, TextPreprocessor

# Suppress every warning category for the remainder of the session
warnings.simplefilter('ignore')

# Plot appearance: seaborn grid theme and a wider default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Libraries imported successfully!")

In [None]:
# Pandas display settings, declared as a table and applied in one pass
display_options = {
    'display.max_columns': 50,
    'display.max_rows': 100,
    'display.max_colwidth': 100,
    'display.precision': 2,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("✓ Pandas display options configured")

## 2. Load Dataset Information (without loading all data)

In [None]:
# Point a chunked DataLoader at the raw CSV
data_path = '../data/twcs.csv'
loader = DataLoader(data_path, chunksize=5000)

# Fetch dataset metadata through the loader (intended to avoid reading the whole file)
info = loader.get_info()

# Print the metadata as an aligned key/value listing framed by rule lines
banner = "=" * 60
print(banner)
print("DATASET INFORMATION")
print(banner)
for field_name, field_value in info.items():
    print(f"{field_name:20s}: {field_value}")
print(banner)

## 3. Load Sample Data for Analysis

In [None]:
# Pull a 5000-row sample through the loader so the full file never has to be held in memory
sample_size = 5000
print(f"Loading {sample_size} random samples from the dataset...")
df_sample = loader.load_sample(n_rows=sample_size)

loaded_rows = len(df_sample)
print(f"\n✓ Loaded {loaded_rows} samples")

# deep=True also counts the Python objects (e.g. strings) behind object columns
memory_mb = df_sample.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.2f} MB")

In [None]:
# Preview the first ten rows of the sample
print("Sample Data Preview:")
df_sample.iloc[:10]

In [None]:
# Print a per-column overview of the sample (dtype and non-null count for each column).
# `DataFrame.info()` writes its report directly to stdout and returns None,
# so nothing is rendered as the cell's output value.
print("Data Types and Info:")
df_sample.info()

## 4. Data Quality Analysis

In [None]:
# Count nulls per column once, then derive the percentage from the same counts
null_counts = df_sample.isnull().sum()
missing_data = pd.DataFrame({
    'Column': df_sample.columns,
    'Missing_Count': null_counts.values,
    'Missing_Percentage': (null_counts.values / len(df_sample) * 100).round(2)
})

# Keep only columns that actually have gaps, worst offenders first
has_missing = missing_data['Missing_Count'] > 0
missing_data = missing_data.loc[has_missing].sort_values('Missing_Count', ascending=False)

if missing_data.empty:
    print("✓ No missing values found in the sample!")
else:
    print("Missing Values Summary:")
    print(missing_data.to_string(index=False))

In [None]:
# Summary statistics for every column, transposed so each source column becomes a row
print("Statistical Summary:")
df_sample.describe(include='all').transpose()

## 5. Text Analysis

We'll analyze the text column to understand content patterns, length distribution, and other text characteristics.

In [None]:
# Identify the free-text column.
#
# Selection order (only string/object-typed columns are ever considered):
#   1. columns whose name IS one of the hints (e.g. exactly 'text'),
#   2. columns whose name merely CONTAINS a hint,
#   3. any string column.
# Within the first non-empty group, the column with the longest average string
# length wins.
#
# Why not "first column containing a hint": a plain substring test also matches
# identifier columns (e.g. a `tweet_id` column contains 'tweet'), and taking the
# first hit in column order can select such an ID column instead of the real text.
text_name_hints = ('text', 'message', 'content', 'tweet')

# Columns that can hold free text at all (object or string dtype)
str_cols = [
    col for col, dtype in df_sample.dtypes.items()
    if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
]
if not str_cols:
    raise ValueError(
        "Could not identify a text column: the sample has no string-typed columns."
    )

# str(col) keeps the name checks safe for non-string column labels
exact_matches = [col for col in str_cols if str(col).lower() in text_name_hints]
partial_matches = [
    col for col in str_cols
    if any(hint in str(col).lower() for hint in text_name_hints)
]
candidates = exact_matches or partial_matches or str_cols

# Break ties between candidates by average string length (free text is the longest)
avg_lengths = {col: df_sample[col].astype(str).str.len().mean() for col in candidates}
text_col = max(avg_lengths, key=avg_lengths.get)

print(f"Text column identified: '{text_col}'")
# Guard the preview so an empty sample does not raise IndexError
if len(df_sample) > 0:
    print(f"Sample text: {df_sample[text_col].iloc[0]}")

In [None]:
# Calculate text statistics.
#
# Missing entries are mapped to the empty string before measuring: a bare
# `astype(str)` renders NaN as the literal 'nan', which would be counted as
# 3 characters / 1 word and skew the averages and the minimum.
raw_text = df_sample[text_col]
text_values = raw_text.astype(str).where(raw_text.notna(), '')

# These two columns are added to df_sample because later cells read them
df_sample['text_length'] = text_values.str.len()
df_sample['word_count'] = text_values.str.split().str.len()

print("Text Statistics:")
print(f"Average text length: {df_sample['text_length'].mean():.1f} characters")
print(f"Average word count: {df_sample['word_count'].mean():.1f} words")
print(f"Min text length: {df_sample['text_length'].min()}")
print(f"Max text length: {df_sample['text_length'].max()}")

In [None]:
# Side-by-side histograms of character length and word count, each with its mean marked
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (source column, x-axis label, panel title, extra histogram styling)
panels = [
    ('text_length', 'Text Length (characters)', 'Distribution of Text Length', {}),
    ('word_count', 'Word Count', 'Distribution of Word Count', {'color': 'orange'}),
]

for ax, (column, x_label, title, extra_style) in zip(axes, panels):
    values = df_sample[column]
    mean_value = values.mean()
    ax.hist(values, bins=50, edgecolor='black', alpha=0.7, **extra_style)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    # Dashed red line marks the mean; the legend shows its value
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.1f}')
    ax.legend()

plt.tight_layout()
plt.show()

## 6. Feature Extraction Demo

Let's extract features using our preprocessing module to get insights about text characteristics.

In [None]:
# Run the project's feature extractor over the first 100 sampled rows only
sample_subset = df_sample.iloc[:100]
# NOTE(review): extract_features is assumed to return one dict-like record per text — confirm
features = sample_subset[text_col].apply(TextPreprocessor.extract_features)
features_df = pd.DataFrame(features.tolist())

print("Feature Extraction Sample:")
print(features_df.iloc[:5])

# Attach the features to their source rows; the subset's index is reset so both
# frames line up by position
subset_positional = sample_subset.reset_index(drop=True)
df_with_features = pd.concat([subset_positional, features_df], axis='columns')
df_with_features.iloc[:5]

In [None]:
# 2x2 overview of the extracted features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
urgency_ax, emotion_ax = axes[0]
exclaim_ax, caps_ax = axes[1]

# Top-left: number of texts without / with an urgency flag
urgent_mask = features_df['has_urgency']
urgency_counts = [len(features_df[~urgent_mask]), len(features_df[urgent_mask])]
urgency_ax.bar(['No Urgency', 'Has Urgency'], urgency_counts,
               color=['green', 'red'], alpha=0.7)
urgency_ax.set(title='Urgency Indicators', ylabel='Count')

# Top-right: box plots of the negative vs. positive emotion word counts
emotion_data = pd.DataFrame({
    'Negative': features_df['negative_emotion_count'],
    'Positive': features_df['positive_emotion_count']
})
emotion_data.plot.box(ax=emotion_ax)
emotion_ax.set(title='Emotion Word Distribution', ylabel='Count')

# Bottom-left: exclamation mark counts
exclaim_ax.hist(features_df['exclamation_count'], bins=20, edgecolor='black', alpha=0.7)
exclaim_ax.set(title='Exclamation Marks Distribution', xlabel='Count', ylabel='Frequency')

# Bottom-right: capital letters ratio
caps_ax.hist(features_df['caps_ratio'], bins=20, edgecolor='black', alpha=0.7, color='purple')
caps_ax.set(title='Capital Letters Ratio', xlabel='Ratio', ylabel='Frequency')

plt.tight_layout()
plt.show()

## 7. Summarize Findings

Collect key insights and statistics into a summary dictionary and print them for future reference (nothing is written to disk).

In [None]:
# Gather the headline numbers from the earlier cells into one dictionary.
# The urgency and emotion figures come from features_df, i.e. only the rows that
# went through feature extraction, not the whole sample.
summary = dict(
    dataset_size=info['num_rows'],
    dataset_columns=info['num_columns'],
    sample_size=len(df_sample),
    text_column=text_col,
    avg_text_length=df_sample['text_length'].mean(),
    avg_word_count=df_sample['word_count'].mean(),
    has_urgency_pct=(features_df['has_urgency'].sum() / len(features_df) * 100),
    negative_emotion_avg=features_df['negative_emotion_count'].mean(),
    positive_emotion_avg=features_df['positive_emotion_count'].mean(),
)

rule = "=" * 60
print("\n" + rule)
print("EDA SUMMARY")
print(rule)
for stat_name, stat_value in summary.items():
    # Floats are shown with two decimals; everything else uses its default rendering
    rendered = f"{stat_value:.2f}" if isinstance(stat_value, float) else f"{stat_value}"
    print(f"{stat_name:25s}: {rendered}")
print(rule)

print("\n✓ EDA Complete! Ready for model training.")