# Content Performance Fusion Engine - FIXED VERSION
## Multi-Signal Integration for Viral Content Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Load and Inspect Signal Data

In [2]:
import pandas as pd
import numpy as np

HAWKES = pd.read_csv("/kaggle/input/apply-hawkes-process-for-self-exciting-momentum/signal_hawkes.csv")
TBI_BURST = pd.read_csv("/kaggle/input/tbi-novelty-detection/tbi_novelty_bursts.csv")
FUNDAMENTALS = pd.read_csv("/kaggle/input/fundamental-analysis-comments-likes-tags/signal_fundamentals.csv")
DECAY = pd.read_csv("/kaggle/input/trend-decay-analysis-multi-signal-combined/signal_decay.csv")

print("Hawkes output: ")
display(HAWKES.head())

print("TBI Burst output: ")
display(TBI_BURST.head())

print("Fundamental analysis output: ")
display(FUNDAMENTALS.head())

print("Trend decay analysis output: ")
display(DECAY.head())

Hawkes output: 


Unnamed: 0,videoId,total_comments,active_hours,duration_hours,peak_hourly_comments,decay_rate,decay_r2,momentum_ratio,recent_momentum,velocity,...,composite_momentum_scaled,momentum_ratio_scaled,recent_momentum_scaled,acceleration_scaled,velocity_scaled,hawkes_R_current_scaled,momentum_category,hawkes_R_max,hawkes_R_mean,hawkes_R_early
0,0,526,309,46946.0,29,1e-05,0.069322,0.571142,0.768463,0.011204,...,6.758358,4.219316,5.750146,10.0,0.124363,6.758358,High,6.758358,6.758358,4.219316
1,11,483,129,1565.0,54,0.001133,0.138372,0.247429,0.165803,0.308626,...,0.0,0.434463,0.0,0.023324,4.91285,0.0,Low,0.0,0.0,0.434463
2,33,2643,1497,14680.0,10,7.6e-05,0.139883,0.507023,1.0,0.180041,...,5.179153,3.469643,8.030363,0.280366,2.842628,5.179153,High,5.179153,5.179153,3.469643
3,45,16074,7087,19895.0,28,2.9e-05,0.042053,0.686303,1.5,0.807942,...,8.554439,5.565786,10.0,5.728832,10.0,8.554439,Viral,8.554439,8.554439,5.565786
4,54,107,49,19760.0,20,4.1e-05,0.047542,0.356923,0.307692,0.005415,...,1.107789,1.714668,1.212391,0.001571,0.031154,1.107789,Low,1.107789,1.107789,1.714668


TBI Burst output: 


Unnamed: 0,novelty_date,novelty_day_index,novelty_strength,novelty_duration,total_activity,peak_activity,avg_tbi,max_tbi,time_series_length,novelty_ratio,components_used,videoId
0,2024-01-19 00:00:00+00:00,80,1.83634,8,526,76,0.772085,1.83634,176,0.045455,3,0
1,2025-04-05 00:00:00+00:00,2,2.732159,1,483,143,0.906893,2.732159,31,0.032258,3,11
2,2023-11-28 00:00:00+00:00,41,2.697786,12,2643,76,0.879547,2.697786,302,0.039735,3,33
3,2024-07-28 00:00:00+00:00,418,2.758616,18,16074,319,1.00052,2.758616,745,0.024161,3,45
4,2024-11-25 00:00:00+00:00,21,2.40781,6,488,75,0.791909,2.40781,140,0.042857,3,67


Fundamental analysis output: 


Unnamed: 0,videoId,total_comments,total_likes,unique_commenters,engagement_ratio,commenter_depth,avg_emojis_per_comment,total_emojis,avg_hashtags_per_comment,lang_diversity,avg_text_length,video_viewCount,saturation,engagement_quality,depth_score,content_richness,fundamental_health
0,0,526,667,526,1.265655,0.998102,0.193916,102,0.0,0.062619,37.762357,,0.0,0.126565,0.998102,0.072157,0.371704
1,1,1,4,1,2.0,0.5,1.0,1,0.0,0.5,27.0,4783.0,0.172921,0.2,0.5,0.1205,0.26615
2,2,1,0,1,0.0,0.5,0.0,0,0.0,0.5,42.0,938.0,0.515996,0.0,0.5,0.063,0.1689
3,6,5,1,5,0.166667,0.833333,0.0,0,0.0,0.333333,103.4,4359.0,0.93301,0.016667,0.833333,0.1551,0.303197
4,7,1,0,1,0.0,0.5,1.0,1,0.0,0.5,1.0,40.0,0.961538,0.0,0.5,0.0815,0.17445


Trend decay analysis output: 


Unnamed: 0,videoId,decay_strength_combined,half_life_hours_combined,state_combined
0,0.0,0.016334,no_decay,rising
1,1.0,0.0,no_decay,no_decay
2,2.0,0.0,no_decay,no_decay
3,6.0,0.0,no_decay,no_decay
4,7.0,0.0,no_decay,no_decay


In [3]:
# Display data shapes and basic info
print("=== DATA SHAPES ===")
print(f"Hawkes: {HAWKES.shape}")
print(f"TBI Burst: {TBI_BURST.shape}")
print(f"Fundamentals: {FUNDAMENTALS.shape}")
print(f"Decay: {DECAY.shape}")

print("\n=== VIDEO ID OVERLAP ===")
hawkes_ids = set(HAWKES['videoId'].unique())
tbi_ids = set(TBI_BURST['videoId'].unique())
fundamental_ids = set(FUNDAMENTALS['videoId'].unique())
decay_ids = set(DECAY['videoId'].unique())

common_all = hawkes_ids & tbi_ids & fundamental_ids & decay_ids
common_fundamental_decay = fundamental_ids & decay_ids

print(f"Videos with all 4 signals: {len(common_all)}")
print(f"Videos with Fundamental + Decay: {len(common_fundamental_decay)}")

=== DATA SHAPES ===
Hawkes: (5383, 28)
TBI Burst: (5387, 12)
Fundamentals: (39938, 17)
Decay: (39938, 4)

=== VIDEO ID OVERLAP ===
Videos with all 4 signals: 4251
Videos with Fundamental + Decay: 39938


## Signal Processing and Normalization - FIXED

In [4]:
class SignalProcessor:
    def __init__(self):
        pass
        
    def safe_normalize(self, series, feature_name=""):
        """Safely normalize a series, handling edge cases and non-numeric data"""
        if len(series) == 0:
            return pd.Series([0] * len(series), index=series.index)
        
        # Convert to numeric if needed
        if not pd.api.types.is_numeric_dtype(series):
            series = pd.to_numeric(series, errors='coerce')
        
        # Fill NaN values
        series = series.fillna(0)
        
        # Handle all same values or constant series
        if series.std() == 0 or pd.isna(series.std()):
            return pd.Series([0.5] * len(series), index=series.index)
        
        # Min-Max normalization
        min_val = series.min()
        max_val = series.max()
        
        if max_val == min_val:
            return pd.Series([0.5] * len(series), index=series.index)
        
        normalized = (series - min_val) / (max_val - min_val)
        return normalized
    
    def process_hawkes_signals(self, df):
        """Process Hawkes process signals"""
        result = df[['videoId']].copy()
        
        # Key Hawkes features
        hawkes_features = {}
        
        # Momentum features
        if 'composite_momentum_scaled' in df.columns:
            hawkes_features['momentum'] = df['composite_momentum_scaled']
        elif 'recent_momentum_scaled' in df.columns:
            hawkes_features['momentum'] = df['recent_momentum_scaled']
        else:
            hawkes_features['momentum'] = df.get('total_comments', pd.Series([0]*len(df)))
        
        # Velocity features
        if 'velocity_scaled' in df.columns:
            hawkes_features['velocity'] = df['velocity_scaled']
        else:
            hawkes_features['velocity'] = df.get('peak_hourly_comments', pd.Series([0]*len(df)))
        
        # Acceleration features
        if 'acceleration_scaled' in df.columns:
            hawkes_features['acceleration'] = df['acceleration_scaled']
        else:
            hawkes_features['acceleration'] = df.get('active_hours', pd.Series([0]*len(df)))
        
        # Normalize features
        for key, series in hawkes_features.items():
            result[f'hawkes_{key}'] = self.safe_normalize(series.fillna(0))
        
        # Create composite Hawkes score
        hawkes_score = (
            result['hawkes_momentum'] * 0.4 +
            result['hawkes_velocity'] * 0.3 +
            result['hawkes_acceleration'] * 0.3
        )
        result['hawkes_health_score'] = self.safe_normalize(hawkes_score)
        
        return result
    
    def process_tbi_signals(self, df):
        """Process TBI burst signals"""
        result = df[['videoId']].copy()
        
        # TBI features
        tbi_features = {}
        
        if 'novelty_strength' in df.columns:
            tbi_features['strength'] = df['novelty_strength']
        else:
            tbi_features['strength'] = df.get('max_tbi', pd.Series([0]*len(df)))
        
        if 'peak_activity' in df.columns:
            tbi_features['activity'] = df['peak_activity']
        else:
            tbi_features['activity'] = df.get('total_activity', pd.Series([0]*len(df)))
        
        if 'novelty_ratio' in df.columns:
            tbi_features['ratio'] = df['novelty_ratio']
        else:
            tbi_features['ratio'] = pd.Series([0]*len(df))
        
        # Normalize features
        for key, series in tbi_features.items():
            result[f'tbi_{key}'] = self.safe_normalize(series.fillna(0))
        
        # Create composite TBI score
        tbi_score = (
            result['tbi_strength'] * 0.4 +
            result['tbi_activity'] * 0.4 +
            result['tbi_ratio'] * 0.2
        )
        result['tbi_burst_score'] = self.safe_normalize(tbi_score)
        
        return result
    
    def process_fundamental_signals(self, df):
        """Process fundamental analysis signals"""
        result = df[['videoId']].copy()
        
        # Fundamental features
        fundamental_features = {}
        
        if 'engagement_ratio' in df.columns:
            fundamental_features['engagement'] = df['engagement_ratio']
        else:
            fundamental_features['engagement'] = pd.Series([0]*len(df))
        
        if 'commenter_depth' in df.columns:
            fundamental_features['depth'] = df['commenter_depth']
        else:
            fundamental_features['depth'] = pd.Series([0]*len(df))
        
        if 'content_richness' in df.columns:
            fundamental_features['richness'] = df['content_richness']
        else:
            fundamental_features['richness'] = pd.Series([0]*len(df))
        
        if 'fundamental_health' in df.columns:
            fundamental_features['health'] = df['fundamental_health']
        else:
            fundamental_features['health'] = pd.Series([0]*len(df))
        
        # Normalize features
        for key, series in fundamental_features.items():
            result[f'fundamental_{key}'] = self.safe_normalize(series.fillna(0))
        
        # Create composite fundamental score
        fundamental_score = (
            result['fundamental_engagement'] * 0.3 +
            result['fundamental_depth'] * 0.25 +
            result['fundamental_richness'] * 0.25 +
            result['fundamental_health'] * 0.2
        )
        result['fundamental_quality_score'] = self.safe_normalize(fundamental_score)
        
        return result
    
    def process_decay_signals(self, df):
        """Process trend decay signals"""
        result = df[['videoId']].copy()
        
        # Decay features
        decay_features = {}
        
        if 'decay_strength_combined' in df.columns:
            # Inverse relationship - lower decay strength is better
            decay_features['strength'] = 1 / (1 + df['decay_strength_combined'].fillna(0))
        else:
            decay_features['strength'] = pd.Series([1]*len(df))  # Default good score
        
        if 'half_life_hours_combined' in df.columns:
            # Longer half-life is better (normalized)
            decay_features['half_life'] = df['half_life_hours_combined'].fillna(0)
        else:
            decay_features['half_life'] = pd.Series([0]*len(df))
        
        # Handle state_combined column (convert strings to numeric values)
        if 'state_combined' in df.columns:
            # Convert string states to numeric values (rising = 1, no_decay = 0.5, falling = 0)
            state_mapping = {'rising': 1.0, 'no_decay': 0.5, 'falling': 0.0, 'unknown': 0.5}
            # Handle any unexpected string values by mapping them to 0.5
            decay_features['state'] = df['state_combined'].map(state_mapping).fillna(0.5)
        else:
            decay_features['state'] = pd.Series([0.5]*len(df))  # Default neutral state
        
        # Normalize features
        for key, series in decay_features.items():
            # Ensure series contains only numeric values before normalization
            if not pd.api.types.is_numeric_dtype(series):
                series = pd.to_numeric(series, errors='coerce').fillna(0)
            result[f'decay_{key}'] = self.safe_normalize(series)
        
        # Create composite decay score
        decay_score = (
            result['decay_strength'] * 0.4 +
            result['decay_half_life'] * 0.4 +
            result['decay_state'] * 0.2
        )
        result['decay_health_score'] = self.safe_normalize(decay_score)
        
        return result

## Fusion Engine Implementation - FIXED

In [5]:
class FusionEngine:
    def __init__(self):
        self.signal_processor = SignalProcessor()
        self.weights = {
            'hawkes': 0.3,
            'tbi': 0.25,
            'fundamental': 0.25,
            'decay': 0.2
        }
        
    def create_unified_dataset(self, hawkes_df, tbi_df, fundamental_df, decay_df):
        """Create a unified dataset with all video IDs and processed signals"""
        print("Creating unified dataset...")
        
        # Get all unique video IDs
        all_video_ids = set()
        all_video_ids.update(hawkes_df['videoId'].unique())
        all_video_ids.update(tbi_df['videoId'].unique())
        all_video_ids.update(fundamental_df['videoId'].unique())
        all_video_ids.update(decay_df['videoId'].unique())
        
        print(f"Total unique video IDs: {len(all_video_ids)}")
        
        # Create base dataframe
        unified_df = pd.DataFrame({'videoId': list(all_video_ids)})
        
        # Process each signal
        print("Processing Hawkes signals...")
        hawkes_processed = self.signal_processor.process_hawkes_signals(hawkes_df)
        
        print("Processing TBI signals...")
        tbi_processed = self.signal_processor.process_tbi_signals(tbi_df)
        
        print("Processing Fundamental signals...")
        fundamental_processed = self.signal_processor.process_fundamental_signals(fundamental_df)
        
        print("Processing Decay signals...")
        decay_processed = self.signal_processor.process_decay_signals(decay_df)
        
        # Merge all processed signals
        print("Merging all signals...")
        unified_df = unified_df.merge(hawkes_processed, on='videoId', how='left')
        unified_df = unified_df.merge(tbi_processed, on='videoId', how='left')
        unified_df = unified_df.merge(fundamental_processed, on='videoId', how='left')
        unified_df = unified_df.merge(decay_processed, on='videoId', how='left')
        
        print(f"Unified dataset shape: {unified_df.shape}")
        return unified_df
    
    def calculate_composite_scores(self, unified_df):
        """Calculate composite scores from unified signals"""
        print("Calculating composite scores...")
        
        # Fill NaN values with 0 for score calculation
        hawkes_score = unified_df.get('hawkes_health_score', pd.Series([0]*len(unified_df))).fillna(0)
        tbi_score = unified_df.get('tbi_burst_score', pd.Series([0]*len(unified_df))).fillna(0)
        fundamental_score = unified_df.get('fundamental_quality_score', pd.Series([0]*len(unified_df))).fillna(0)
        decay_score = unified_df.get('decay_health_score', pd.Series([1]*len(unified_df))).fillna(1)  # Default to good
        
        # Calculate weighted composite score
        composite_score = (
            hawkes_score * self.weights['hawkes'] +
            tbi_score * self.weights['tbi'] +
            fundamental_score * self.weights['fundamental'] +
            decay_score * self.weights['decay']
        )
        
        # Add scores to dataframe
        unified_df['composite_score'] = composite_score
        unified_df['hawkes_component'] = hawkes_score * self.weights['hawkes']
        unified_df['tbi_component'] = tbi_score * self.weights['tbi']
        unified_df['fundamental_component'] = fundamental_score * self.weights['fundamental']
        unified_df['decay_component'] = decay_score * self.weights['decay']
        
        print(f"Composite scores calculated. Range: {composite_score.min():.3f} - {composite_score.max():.3f}")
        return unified_df
    
    def categorize_performance(self, unified_df):
        """Categorize content performance based on composite scores"""
        def categorize_score(score):
            if pd.isna(score):
                return 'Unknown'
            elif score >= 0.8:
                return 'Viral'
            elif score >= 0.6:
                return 'High'
            elif score >= 0.4:
                return 'Medium'
            else:
                return 'Low'
        
        unified_df['performance_category'] = unified_df['composite_score'].apply(categorize_score)
        return unified_df
    
    def generate_insights(self, unified_df):
        """Generate actionable insights from the fused signals"""
        insights = []
        
        for idx, row in unified_df.iterrows():
            video_id = row['videoId']
            insights_text = f"Video {video_id}: "
            
            # Determine strongest signal
            components = {
                'Hawkes': row.get('hawkes_component', 0),
                'TBI Burst': row.get('tbi_component', 0),
                'Fundamental': row.get('fundamental_component', 0),
                'Decay': row.get('decay_component', 0)
            }
            
            # Remove NaN values and zero values
            components = {k: v for k, v in components.items() if not pd.isna(v) and v > 0}
            
            if components:
                strongest_signal = max(components, key=components.get)
                insights_text += f"Strongest driver: {strongest_signal}. "
            else:
                # If all are zero or NaN, find the one with highest raw score
                raw_scores = {
                    'Hawkes': row.get('hawkes_health_score', 0),
                    'TBI Burst': row.get('tbi_burst_score', 0),
                    'Fundamental': row.get('fundamental_quality_score', 0),
                    'Decay': row.get('decay_health_score', 1)
                }
                raw_scores = {k: v for k, v in raw_scores.items() if not pd.isna(v)}
                if raw_scores:
                    strongest_signal = max(raw_scores, key=raw_scores.get)
                    insights_text += f"Strongest driver: {strongest_signal}. "
                else:
                    insights_text += "No clear signal drivers. "
            
            # Add performance category
            category = row.get('performance_category', 'Unknown')
            insights_text += f"Performance: {category}. "
            
            # Add recommendations
            if category == 'Viral':
                insights_text += "Recommended: Amplify and leverage for maximum reach."
            elif category == 'High':
                insights_text += "Recommended: Continue engagement strategies."
            elif category == 'Medium':
                insights_text += "Recommended: Optimize content and engagement tactics."
            else:
                insights_text += "Recommended: Reassess content strategy and engagement tactics."
            
            insights.append(insights_text)
        
        unified_df['insights'] = insights
        return unified_df

## Execute Fusion Engine

In [6]:
# Initialize fusion engine
fusion_engine = FusionEngine()

# Display data info
print("=== DATA OVERVIEW ===")
print(f"Hawkes shape: {HAWKES.shape}")
print(f"TBI shape: {TBI_BURST.shape}")
print(f"Fundamentals shape: {FUNDAMENTALS.shape}")
print(f"Decay shape: {DECAY.shape}")

=== DATA OVERVIEW ===
Hawkes shape: (5383, 28)
TBI shape: (5387, 12)
Fundamentals shape: (39938, 17)
Decay shape: (39938, 4)


In [7]:
# Create unified dataset
print("\n=== CREATING UNIFIED DATASET ===")
unified_dataset = fusion_engine.create_unified_dataset(HAWKES, TBI_BURST, FUNDAMENTALS, DECAY)

# Display sample of unified dataset
print("\nSample of unified dataset:")
display(unified_dataset.head(10))

# Show statistics
score_cols = ['hawkes_health_score', 'tbi_burst_score', 'fundamental_quality_score', 'decay_health_score']
print("\nScore distributions:")
for col in score_cols:
    if col in unified_dataset.columns:
        valid_data = unified_dataset[col].dropna()
        if len(valid_data) > 0:
            print(f"{col}: mean={valid_data.mean():.3f}, std={valid_data.std():.3f}, range={valid_data.min():.3f}-{valid_data.max():.3f}")
        else:
            print(f"{col}: No valid data")
    else:
        print(f"{col}: Column not found")


=== CREATING UNIFIED DATASET ===
Creating unified dataset...
Total unique video IDs: 39938
Processing Hawkes signals...
Processing TBI signals...
Processing Fundamental signals...
Processing Decay signals...
Merging all signals...
Unified dataset shape: (39938, 18)

Sample of unified dataset:


Unnamed: 0,videoId,hawkes_momentum,hawkes_velocity,hawkes_acceleration,hawkes_health_score,tbi_strength,tbi_activity,tbi_ratio,tbi_burst_score,fundamental_engagement,fundamental_depth,fundamental_richness,fundamental_health,fundamental_quality_score,decay_strength,decay_half_life,decay_state,decay_health_score
0,0,0.675836,0.012436,1.0,0.574065,0.148716,0.021879,0.240642,0.180816,0.004178,0.996234,0.116382,0.3447,0.461164,0.967857,0.0,1.0,0.609162
1,1,,,,,,,,,0.006602,0.0,0.194355,0.180587,0.114761,1.0,0.0,0.0,0.412532
2,2,,,,,,,,,0.0,0.0,0.101613,0.029385,0.041411,1.0,0.0,0.0,0.412532
3,6,,,,,,,,,0.00055,0.666686,0.250161,0.238187,0.366729,1.0,0.0,0.0,0.412532
4,7,,,,,,,,,0.0,0.0,0.131452,0.038014,0.053571,1.0,0.0,0.0,0.412532
5,8,,,,,,,,,0.0011,0.925953,0.122736,0.272165,0.419579,0.99798,0.0,0.0,0.411683
6,11,0.0,0.491285,0.002332,0.148085,0.545134,0.041424,0.15797,0.428327,0.015931,0.995896,0.116997,0.566249,0.524584,0.957029,0.0,1.0,0.604611
7,15,,,,,,,,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.412532
8,19,,,,,,,,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.412532
9,22,,,,,,,,,0.0,0.333343,0.220161,0.141407,0.220632,1.0,0.0,0.0,0.412532



Score distributions:
hawkes_health_score: mean=0.263, std=0.181, range=0.000-1.000
tbi_burst_score: mean=0.261, std=0.111, range=0.000-1.000
fundamental_quality_score: mean=0.270, std=0.176, range=0.000-1.000
decay_health_score: mean=0.418, std=0.112, range=0.000-1.000


In [8]:
# Calculate composite scores
print("\n=== CALCULATING COMPOSITE SCORES ===")
final_scores = fusion_engine.calculate_composite_scores(unified_dataset)

# Categorize performance
categorized_results = fusion_engine.categorize_performance(final_scores)

# Generate insights
final_results = fusion_engine.generate_insights(categorized_results)

print("\nFinal results sample:")
result_columns = ['videoId', 'composite_score', 'performance_category', 'insights']
component_cols = ['hawkes_component', 'tbi_component', 'fundamental_component', 'decay_component']
for col in component_cols:
    if col in final_results.columns:
        result_columns.append(col)

display(final_results[result_columns].head(10))


=== CALCULATING COMPOSITE SCORES ===
Calculating composite scores...
Composite scores calculated. Range: 0.052 - 0.758

Final results sample:


Unnamed: 0,videoId,composite_score,performance_category,insights,hawkes_component,tbi_component,fundamental_component,decay_component
0,0,0.454547,Medium,Video 0: Strongest driver: Hawkes. Performance...,0.17222,0.045204,0.115291,0.121832
1,1,0.111197,Low,Video 1: Strongest driver: Decay. Performance:...,0.0,0.0,0.02869,0.082506
2,2,0.092859,Low,Video 2: Strongest driver: Decay. Performance:...,0.0,0.0,0.010353,0.082506
3,6,0.174189,Low,Video 6: Strongest driver: Fundamental. Perfor...,0.0,0.0,0.091682,0.082506
4,7,0.095899,Low,Video 7: Strongest driver: Decay. Performance:...,0.0,0.0,0.013393,0.082506
5,8,0.187231,Low,Video 8: Strongest driver: Fundamental. Perfor...,0.0,0.0,0.104895,0.082337
6,11,0.403576,Medium,Video 11: Strongest driver: Fundamental. Perfo...,0.044426,0.107082,0.131146,0.120922
7,15,0.082506,Low,Video 15: Strongest driver: Decay. Performance...,0.0,0.0,0.0,0.082506
8,19,0.082506,Low,Video 19: Strongest driver: Decay. Performance...,0.0,0.0,0.0,0.082506
9,22,0.137664,Low,Video 22: Strongest driver: Decay. Performance...,0.0,0.0,0.055158,0.082506


## Analysis and Validation

In [9]:
# Detailed analysis summary
print("\n=== FUSION ENGINE ANALYSIS SUMMARY ===")
print(f"Total videos analyzed: {len(final_results)}")

if 'composite_score' in final_results.columns:
    valid_scores = final_results['composite_score'].dropna()
    if len(valid_scores) > 0:
        print(f"Average composite score: {valid_scores.mean():.3f}")
        print(f"Score range: {valid_scores.min():.3f} - {valid_scores.max():.3f}")
        
        # Find best and worst performing videos
        max_score_idx = valid_scores.idxmax()
        min_score_idx = valid_scores.idxmin()
        
        max_video_id = final_results.loc[max_score_idx, 'videoId']
        min_video_id = final_results.loc[min_score_idx, 'videoId']
        
        print(f"Highest performing video: {max_video_id} (Score: {valid_scores.max():.3f})")
        print(f"Lowest performing video: {min_video_id} (Score: {valid_scores.min():.3f})")
    else:
        print("No valid composite scores found")
else:
    print("Composite score column not found")

if 'performance_category' in final_results.columns:
    print("\nPerformance Category Breakdown:")
    category_counts = final_results['performance_category'].value_counts()
    for category, count in category_counts.items():
        percentage = (count / len(final_results)) * 100
        print(f"  {category}: {count} videos ({percentage:.1f}%)")

print("\n=== SAMPLE DETAILED INSIGHTS ===")
if 'insights' in final_results.columns:
    # Show diverse examples
    sample_videos = final_results.sample(min(10, len(final_results)))
    for idx, row in sample_videos.iterrows():
        print(row['insights'])


=== FUSION ENGINE ANALYSIS SUMMARY ===
Total videos analyzed: 39938
Average composite score: 0.171
Score range: 0.052 - 0.758
Highest performing video: 71485 (Score: 0.758)
Lowest performing video: 53332 (Score: 0.052)

Performance Category Breakdown:
  Low: 38530 videos (96.5%)
  Medium: 1399 videos (3.5%)
  High: 9 videos (0.0%)

=== SAMPLE DETAILED INSIGHTS ===
Video 67204: Strongest driver: Fundamental. Performance: Low. Recommended: Reassess content strategy and engagement tactics.
Video 73334: Strongest driver: Decay. Performance: Low. Recommended: Reassess content strategy and engagement tactics.
Video 41893: Strongest driver: Fundamental. Performance: Low. Recommended: Reassess content strategy and engagement tactics.
Video 81854: Strongest driver: Decay. Performance: Low. Recommended: Reassess content strategy and engagement tactics.
Video 11070: Strongest driver: Decay. Performance: Low. Recommended: Reassess content strategy and engagement tactics.
Video 16419: Strongest dr

## Advanced Fusion Features

In [10]:
class AdvancedFusionEngine(FusionEngine):
    def __init__(self):
        super().__init__()
        self.ml_model = RandomForestRegressor(n_estimators=50, random_state=42, max_depth=10)
    
    def predict_future_performance(self, unified_df):
        """Use machine learning to predict future performance"""
        print("Training ML model for future performance prediction...")
        
        # Select feature columns
        feature_cols = []
        
        # Add raw scores as features
        raw_score_cols = ['hawkes_health_score', 'tbi_burst_score', 
                         'fundamental_quality_score', 'decay_health_score']
        for col in raw_score_cols:
            if col in unified_df.columns:
                feature_cols.append(col)
        
        # Add component scores as features
        component_cols = ['hawkes_component', 'tbi_component', 
                         'fundamental_component', 'decay_component']
        for col in component_cols:
            if col in unified_df.columns:
                feature_cols.append(col)
        
        if 'composite_score' not in unified_df.columns or len(feature_cols) == 0:
            print("Warning: Not enough features for ML prediction")
            unified_df['predicted_future_score'] = unified_df.get('composite_score', 0)
            return unified_df
        
        # Prepare data
        X = unified_df[feature_cols].copy()
        y = unified_df['composite_score'].copy()
        
        # Handle missing values
        X = X.fillna(0)
        y = y.fillna(0)
        
        # Remove rows with invalid target values
        valid_mask = ~(np.isinf(y) | np.isnan(y))
        X_valid = X[valid_mask]
        y_valid = y[valid_mask]
        
        if len(X_valid) < 10:
            print(f"Warning: Not enough valid data for ML prediction (only {len(X_valid)} samples)")
            unified_df['predicted_future_score'] = unified_df.get('composite_score', 0)
            return unified_df
        
        try:
            # Train model on a subset if too large
            if len(X_valid) > 10000:
                sample_size = min(5000, len(X_valid))
                sample_indices = np.random.choice(X_valid.index, sample_size, replace=False)
                X_train = X_valid.loc[sample_indices]
                y_train = y_valid.loc[sample_indices]
            else:
                X_train = X_valid
                y_train = y_valid
            
            # Train model
            self.ml_model.fit(X_train, y_train)
            
            # Predict on all data
            X_full = X.fillna(0)
            predictions = self.ml_model.predict(X_full)
            unified_df['predicted_future_score'] = predictions
            
            print(f"ML prediction completed. Range: {predictions.min():.3f} - {predictions.max():.3f}")
            
        except Exception as e:
            print(f"ML prediction failed: {str(e)}")
            unified_df['predicted_future_score'] = unified_df.get('composite_score', 0)
        
        return unified_df
    
    def optimize_weights(self, unified_df):
        """Optimize signal weights based on variance and distribution"""
        print("Optimizing signal weights...")
        
        weights = {}
        
        # Get scores for each signal
        signal_scores = {}
        signal_columns = {
            'hawkes': 'hawkes_health_score',
            'tbi': 'tbi_burst_score',
            'fundamental': 'fundamental_quality_score',
            'decay': 'decay_health_score'
        }
        
        for signal, col_name in signal_columns.items():
            if col_name in unified_df.columns:
                scores = unified_df[col_name].dropna()
                if len(scores) > 0 and scores.std() > 0:
                    signal_scores[signal] = scores.std()
                else:
                    signal_scores[signal] = 0.1  # Small weight for constant signals
            else:
                signal_scores[signal] = 0.1  # Small weight if signal missing
        
        # Normalize weights
        total_weight = sum(signal_scores.values())
        if total_weight > 0:
            weights = {k: v/total_weight for k, v in signal_scores.items()}
        else:
            weights = self.weights  # Keep default weights
        
        print("Optimized weights:")
        for signal, weight in weights.items():
            print(f"  {signal.capitalize()}: {weight:.3f}")
        
        return weights

In [11]:
# Initialize advanced fusion engine
print("\n=== ADVANCED FUSION ANALYSIS ===")
advanced_fusion = AdvancedFusionEngine()

# Optimize weights
optimized_weights = advanced_fusion.optimize_weights(final_results)

# Update weights in the engine
advanced_fusion.weights = optimized_weights

# Recalculate composite scores with optimized weights
final_results_optimized = advanced_fusion.calculate_composite_scores(final_results.copy())
final_results_optimized = advanced_fusion.categorize_performance(final_results_optimized)
final_results_optimized = advanced_fusion.generate_insights(final_results_optimized)

# Predict future performance
predicted_results = advanced_fusion.predict_future_performance(final_results_optimized)

# Display predictions
print("\nFuture Performance Predictions (Sample):")
if 'predicted_future_score' in predicted_results.columns:
    pred_sample = predicted_results[['videoId', 'composite_score', 'predicted_future_score']].head(10)
    for idx, row in pred_sample.iterrows():
        print(f"Video {row['videoId']}: Current={row['composite_score']:.3f}, "
              f"Predicted={row['predicted_future_score']:.3f}")


=== ADVANCED FUSION ANALYSIS ===
Optimizing signal weights...
Optimized weights:
  Hawkes: 0.312
  Tbi: 0.192
  Fundamental: 0.303
  Decay: 0.193
Calculating composite scores...
Composite scores calculated. Range: 0.063 - 0.774
Training ML model for future performance prediction...
ML prediction completed. Range: 0.080 - 0.561

Future Performance Predictions (Sample):
Video 0.0: Current=0.471, Predicted=0.445
Video 1.0: Current=0.114, Predicted=0.114
Video 2.0: Current=0.092, Predicted=0.092
Video 6.0: Current=0.191, Predicted=0.190
Video 7.0: Current=0.096, Predicted=0.096
Video 8.0: Current=0.207, Predicted=0.207
Video 11.0: Current=0.404, Predicted=0.393
Video 15.0: Current=0.080, Predicted=0.080
Video 19.0: Current=0.080, Predicted=0.080
Video 22.0: Current=0.146, Predicted=0.148


## Export Results

In [12]:
# Export final results
export_columns = ['videoId']

# Add core columns
core_columns = ['composite_score', 'performance_category', 'insights']
for col in core_columns:
    if col in predicted_results.columns:
        export_columns.append(col)

# Add component columns
component_columns = ['hawkes_component', 'tbi_component', 'fundamental_component', 'decay_component']
for col in component_columns:
    if col in predicted_results.columns:
        export_columns.append(col)

# Add raw scores
raw_score_columns = ['hawkes_health_score', 'tbi_burst_score', 'fundamental_quality_score', 'decay_health_score']
for col in raw_score_columns:
    if col in predicted_results.columns:
        export_columns.append(col)

# Add prediction column if available
if 'predicted_future_score' in predicted_results.columns:
    export_columns.append('predicted_future_score')

# Filter to available columns
available_export_columns = [col for col in export_columns if col in predicted_results.columns]

if available_export_columns:
    final_export = predicted_results[available_export_columns].copy()
    
    # Handle any remaining NaN values
    for col in final_export.columns:
        if col != 'videoId' and col != 'insights':
            final_export[col] = final_export[col].fillna(0)
    
    # Export to CSV
    final_export.to_csv('all_signals_combined.csv', index=False)
    
    print(f"\nResults exported to 'fusion_engine_results.csv' ({len(final_export)} rows)")
    print("\nSample of exported data:")
    display(final_export.head())
    
    # Show export statistics
    print("\nExport file statistics:")
    print(f"Total rows: {len(final_export)}")
    print(f"Columns: {list(final_export.columns)}")
    
    if 'composite_score' in final_export.columns:
        valid_scores = final_export['composite_score'].dropna()
        if len(valid_scores) > 0:
            print(f"Score range: {valid_scores.min():.3f} - {valid_scores.max():.3f}")
            
            # Category distribution
            if 'performance_category' in final_export.columns:
                category_dist = final_export['performance_category'].value_counts()
                print("\nPerformance categories:")
                for cat, count in category_dist.items():
                    pct = (count / len(final_export)) * 100
                    print(f"  {cat}: {count} ({pct:.1f}%)")
else:
    print("No valid columns to export")


Results exported to 'fusion_engine_results.csv' (39938 rows)

Sample of exported data:


Unnamed: 0,videoId,composite_score,performance_category,insights,hawkes_component,tbi_component,fundamental_component,decay_component,hawkes_health_score,tbi_burst_score,fundamental_quality_score,decay_health_score,predicted_future_score
0,0,0.471024,Medium,Video 0: Strongest driver: Hawkes. Performance...,0.17898,0.034768,0.139696,0.11758,0.574065,0.180816,0.461164,0.609162,0.44483
1,1,0.11439,Low,Video 1: Strongest driver: Decay. Performance:...,0.0,0.0,0.034764,0.079626,0.0,0.0,0.114761,0.412532,0.114243
2,2,0.092171,Low,Video 2: Strongest driver: Decay. Performance:...,0.0,0.0,0.012544,0.079626,0.0,0.0,0.041411,0.412532,0.092175
3,6,0.190717,Low,Video 6: Strongest driver: Fundamental. Perfor...,0.0,0.0,0.11109,0.079626,0.0,0.0,0.366729,0.412532,0.190393
4,7,0.095854,Low,Video 7: Strongest driver: Decay. Performance:...,0.0,0.0,0.016228,0.079626,0.0,0.0,0.053571,0.412532,0.095826



Export file statistics:
Total rows: 39938
Columns: ['videoId', 'composite_score', 'performance_category', 'insights', 'hawkes_component', 'tbi_component', 'fundamental_component', 'decay_component', 'hawkes_health_score', 'tbi_burst_score', 'fundamental_quality_score', 'decay_health_score', 'predicted_future_score']
Score range: 0.063 - 0.774

Performance categories:
  Low: 38169 (95.6%)
  Medium: 1755 (4.4%)
  High: 14 (0.0%)


In [13]:
print("\n=== FUSION ENGINE EXECUTION COMPLETE ===")
print("The fusion engine has successfully processed all four signals and generated a comprehensive content performance analysis.")
print("Key outputs include:")
print("1. Composite performance scores for each video")
print("2. Performance categorization (Viral, High, Medium, Low)")
print("3. Actionable insights for content optimization")
print("4. Component contribution analysis")
print("5. Future performance predictions")
print("6. Exported results in CSV format")

# Final validation
if 'composite_score' in predicted_results.columns:
    scores = predicted_results['composite_score'].dropna()
    if len(scores) > 0:
        unique_scores = scores.nunique()
        print(f"\nValidation: {unique_scores} unique scores generated out of {len(scores)} total scores")
        if unique_scores > 1:
            print("✓ SUCCESS: Diverse score distribution achieved")
        else:
            print("⚠ WARNING: All scores are identical - check data processing")
    else:
        print("⚠ WARNING: No valid scores generated")


=== FUSION ENGINE EXECUTION COMPLETE ===
The fusion engine has successfully processed all four signals and generated a comprehensive content performance analysis.
Key outputs include:
1. Composite performance scores for each video
2. Performance categorization (Viral, High, Medium, Low)
3. Actionable insights for content optimization
4. Component contribution analysis
5. Future performance predictions
6. Exported results in CSV format

Validation: 26386 unique scores generated out of 39938 total scores
✓ SUCCESS: Diverse score distribution achieved
