# Feature Engineering

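This notebook extracts content, user, temporal, and network features, visualizes the distributions of key features, analyzes feature correlations (including correlation with the label), and saves the extracted features for downstream feature selection and modeling.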


In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the parent of the current working directory to the import path so the
# project-local `src` package and `config` module below can be imported.
# This must run before those imports.
sys.path.append(str(Path().resolve().parent))
from src import feature_extractor, data_preprocessing, network_builder, visualization
import config

# Plot styling applied to every figure drawn later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette('Set2')


## Load Data and Build Network


In [None]:
# Sample dataset that every later cell works from
df = data_preprocessing.create_sample_dataset(n_samples=1000)

# Interaction graph keyed on the user_id column; source of the network features
G = network_builder.build_interaction_graph(df, user_column="user_id")

# Graph-level inputs consumed later by the network feature extraction
centrality_df = network_builder.calculate_centrality_measures(G)
communities = network_builder.detect_communities(G)

# Short summary of what was loaded and built
summary_lines = [
    f"Data shape: {df.shape}",
    f"Network nodes: {G.number_of_nodes()}",
    f"Network edges: {G.number_of_edges()}",
]
print("\n".join(summary_lines))


## Extract All Features


In [None]:
# Feature extractor with BERT embeddings disabled.
# Pass use_bert=True if transformers is installed and BERT embeddings are wanted.
extractor = feature_extractor.FeatureExtractor(use_bert=False)

print("Extracting content features...")
content_features = extractor.extract_content_features(df, text_column="text")

print("Extracting user features...")
user_features = extractor.extract_user_features(df, user_column="user_id")

print("Extracting temporal features...")
temporal_features = extractor.extract_temporal_features(df, timestamp_column="timestamp")

# Column-wise concatenation of the three feature tables, in this order
feature_groups = {
    "Content": content_features,
    "User": user_features,
    "Temporal": temporal_features,
}
all_features = pd.concat(list(feature_groups.values()), axis=1)

# Report the overall and per-category column counts
print(f"\nTotal features extracted: {len(all_features.columns)}")
print("\nFeature categories:")
for group_name, group_frame in feature_groups.items():
    print(f"  {group_name} features: {len(group_frame.columns)}")

print("\n\nFeature columns:")
print(all_features.columns.tolist())
all_features.head()


## Add Network Features


In [None]:
# Map users to their network features.
# `user_to_idx` numbers the distinct users in order of first appearance. The
# number is a position in the unique-user list, NOT a row label of `df`.
user_to_idx = {user: idx for idx, user in enumerate(df['user_id'].unique())}
# Keep only the graph nodes that are users present in `df`.
node_mapping = {node: user_to_idx[node] for node in G.nodes() if node in user_to_idx}

# Extract network features
network_features = extractor.extract_network_features_from_graph(
    G,
    node_mapping,
    centrality_df=centrality_df,
    communities=communities
)

# Row-level frame that receives one `network_<feature>` column per network feature.
df_with_features = df.copy().reset_index(drop=True)

# Every key of `node_mapping` is itself a user_id (see the filter above), so the
# node is the correct lookup key for a row's user. The previous version used
# `df_with_features.loc[idx, 'user_id']`, treating the unique-user position as
# a row label, which attributes a node's features to whichever user happens to
# sit in row `idx` whenever a user appears in more than one row.
# NOTE(review): as before, this assumes `network_features` is indexed by graph
# node -- confirm against `extract_network_features_from_graph`.
for col in network_features.columns:
    user_network_map = {node: network_features.loc[node, col] for node in node_mapping}
    # Users that are not graph nodes have no network features; they get 0.
    df_with_features[f'network_{col}'] = df_with_features['user_id'].map(user_network_map).fillna(0)

# `all_features` is kept as a separate frame and is not merged into
# `df_with_features`, so the total is the sum of both column counts.
print(f"Added {len(network_features.columns)} network features")
print(f"Total features now: {len(all_features.columns) + len(network_features.columns)}")


## Feature Distributions


In [None]:
# Histograms are drawn from the numeric columns only
numeric_features = all_features.select_dtypes(include=[np.number])

# Short list of features to visualize; only those actually present are kept
key_features = ['text_length', 'word_count', 'sentiment_polarity',
                'sentiment_compound', 'hour_of_day', 'day_of_week']
available_features = [name for name in key_features if name in numeric_features.columns]

if available_features:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    # One histogram per available feature; the 2x3 grid holds at most six
    for ax, feature in zip(axes, available_features):
        ax.hist(numeric_features[feature].dropna(), bins=30, alpha=0.7, color='steelblue', edgecolor='black')
        ax.set_title(feature.replace('_', ' ').title(), fontweight='bold')
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    # Switch off the grid cells that received no histogram
    for ax in axes[len(available_features):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    # Summary statistics for the plotted features
    print("Feature Statistics:")
    print(numeric_features[available_features].describe())


## Feature Correlations


In [None]:
# Use the key features found in the distributions cell; if none were available,
# fall back to the first ten numeric columns
if available_features:
    key_numeric = numeric_features[available_features]
else:
    key_numeric = numeric_features.iloc[:, :10]

# A correlation matrix needs at least two columns
if key_numeric.shape[1] > 1:
    visualization.plot_feature_correlations(key_numeric)

    # Correlation of each selected feature with the label column, largest first
    if 'label' in df.columns:
        label_corr = key_numeric.corrwith(df['label'])
        correlations = label_corr.sort_values(ascending=False)
        print("\nFeature Correlations with Label:")
        print("=" * 50)
        print(correlations)


## Save Features


In [None]:
# Save extracted features for use in modeling
features_file = '../data/processed/extracted_features.csv'

# `to_csv` does not create missing directories, so make sure the target folder
# exists; otherwise this cell fails on a fresh checkout.
Path(features_file).parent.mkdir(parents=True, exist_ok=True)

# Identifier and label columns first, followed by every extracted feature column.
# NOTE(review): the `network_*` columns added to `df_with_features` are not part
# of `all_features`, so they are not written here -- confirm whether the
# modeling step expects them in this file.
features_output = pd.concat([df[['id', 'label']], all_features], axis=1)
features_output.to_csv(features_file, index=False)
print(f"Features saved to {features_file}")
print(f"Total features saved: {len(all_features.columns)}")
