In [1]:
import os
import json
import unicodedata
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import plotly.graph_objects as go
from scipy import stats
import scipy
from plotly.colors import qualitative

# Root folder of the collected data. load_data_into_df() appends
# '<scenario>/<scenario>-<control|experiment>/' to this prefix, so it must end with '/'.
files_path = '../data/'

In [2]:
# Scenario IDs to load; load_data_into_df() looks for a '-control' and an
# '-experiment' user folder under each of them.
scenarios = [18,22,23,24,38,39,40,41,42]

In [3]:
# Target hashtags the experiment users were personalized with.
# The analysis functions below count how often these appear in the hashtags
# of the videos each user was served.

target_hashtags = ["football", "food", "championsleague", "movie", "foodtiktok", "gaming", "film", 
                   "tiktokfood", "gta6", "gta", "minecraft", "marvel", "cat", "dog", "pet", 
                   "dogsoftiktok", "catsoftiktok", "cute", "puppy", "dogs", "cats", "animals", 
                   "petsoftiktok", "kitten", "comedy", "asmr", "learnontiktok", "satisfying", 
                   "lol", "love", "humour", "couple", "foodie", "baby", "car", "cars", "jokes", 
                   "lifehack", "satisfyingvideo", "relationship", "cooking", "laugh", "fun", "roblox"]

In [4]:
def clean_text(text):
    """Return ``text`` reduced to plain ASCII.

    The string is first NFKD-normalized so that accented letters and
    compatibility characters decompose into a base character plus combining
    marks; every code point outside the ASCII range is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Keeping only code points below 128 is what an ASCII encode with
    # errors='ignore' followed by a decode produces.
    return ''.join(ch for ch in decomposed if ord(ch) < 128)

def parse_info(video, user_id):
    """Flatten one raw video record into a single-row DataFrame.

    Parameters:
    - video: dict for one entry of a response's ``itemList``. Missing keys
      fall back to the placeholder defaults used below.
    - user_id: label stored in the ``user_id`` column (e.g. '18-control').

    Returns:
    - pd.DataFrame with exactly one row. ``hashtags`` holds the names of all
      ``textExtra`` entries of type 1, joined with '_'; ``repost_count`` is
      cast to int.
    """
    author = video.get("author", {})
    # Named `counts` (not `stats`) so the module-level scipy `stats` import is not shadowed.
    counts = video.get("stats", {})
    music = video.get("music", {})
    video_meta = video.get("video", {})

    # `contents` may be missing, None or an empty list; all three mean "no hashtags".
    contents = video.get("contents") or [{}]
    text_extras = contents[0].get("textExtra") or []

    # Hashtags from textExtra (type 1). Entries without a hashtagName are
    # skipped instead of crashing clean_text() with None.
    hashtags = []
    for extra in text_extras:
        if extra.get("type") != 1:
            continue
        name = extra.get("hashtagName")
        if name is None:
            continue
        hashtags.append(clean_text(name))

    d = {
        "user_id": user_id,

        # Basic video details
        "play_count": counts.get("playCount", 0),
        "digg_count": counts.get("diggCount", 0),
        "share_count": counts.get("shareCount", 0),
        "comment_count": counts.get("commentCount", 0),
        "collect_count": counts.get("collectCount", 0),
        "repost_count": video.get("statsV2", {}).get("repostCount", 0),
        "loudness": video_meta.get("volumeInfo", {}).get("Loudness", "N/A"),
        "duration": video_meta.get("duration", "N/A"),  # Video duration

        # Author details
        "author_id": author.get("id", "Unknown Author"),
        "author_name": clean_text(author.get("nickname", "Unknown Author Name")),
        "author_bio": clean_text(author.get("signature", "No bio")),
        "nickname": author.get("uniqueId", "Unknown Nickname"),

        # Music details
        "music_name": clean_text(music.get("title", "Unknown Music Title")),
        "music_id": music.get("id", "Unknown Music ID"),
        "music_author": clean_text(music.get("authorName", "Unknown Music Author")),
        "music_album": clean_text(music.get("album", "Unknown Album")),

        # NOTE(review): '_' is also a legal character inside a hashtag, so a
        # tag containing '_' is split apart by downstream `.split('_')` calls.
        "hashtags": '_'.join(hashtags),

        "video_id": video.get("id", "Unknown ID"),
    }

    df = pd.DataFrame([d])
    df["repost_count"] = df["repost_count"].astype(int)
    return df

In [5]:
def load_data_into_df(scenarios: list) -> pd.DataFrame:
    """Load every recorded response of the given scenarios into one DataFrame.

    For each scenario ID and each of the 'control' / 'experiment' users, the
    folder ``files_path + '<id>/<id>-<type>/'`` is walked; every ``*.json``
    file (except ``config.json``) directly inside a ``responses`` directory
    is parsed, and each entry of its ``itemList`` becomes one row via
    ``parse_info``.

    Parameters:
    - scenarios: list of scenario IDs (ints or strings).

    Returns:
    - pd.DataFrame with one row per video plus a ``run_number`` column that
      numbers the rows of each ``user_id`` from 1 in load order.

    Raises:
    - ValueError: if no video was found for any of the scenarios.
    """
    frames = []
    for user_id in scenarios:
        for user_type in ['control', 'experiment']:
            user_label = f'{user_id}-{user_type}'
            user_path = files_path + f'{user_id}/{user_label}/'

            for root, dirs, _ in os.walk(user_path):
                # os.walk / os.listdir return entries in arbitrary order;
                # sort so that run_number is reproducible across machines.
                # NOTE(review): this is lexicographic order - confirm that the
                # directory and file names sort chronologically.
                dirs.sort()
                if 'responses' not in dirs:
                    continue
                responses_path = os.path.join(root, 'responses')
                # Only look at files directly in responses folder
                response_files = sorted(
                    name for name in os.listdir(responses_path)
                    if name.endswith('.json') and name != 'config.json'
                )
                for file_name in response_files:
                    file_path = os.path.join(responses_path, file_name)
                    with open(file_path, 'r', encoding='utf-8') as handle:
                        payload = json.load(handle)
                    # Responses without an itemList carry no videos; skip
                    # them instead of failing with KeyError.
                    videos = payload.get('itemList') if isinstance(payload, dict) else None
                    for video in videos or []:
                        frames.append(parse_info(video, user_label))

    if not frames:
        raise ValueError(
            f"No videos found under {files_path!r} for scenarios {list(scenarios)!r}"
        )

    data_combined = pd.concat(frames, ignore_index=True)
    data_combined['run_number'] = data_combined.groupby('user_id').cumcount() + 1

    return data_combined

In [6]:
# Load all videos of every scenario in `scenarios` (control and experiment users)
# into one frame; the bare last expression displays it.
data_combined = load_data_into_df(scenarios)
data_combined

Unnamed: 0,user_id,play_count,digg_count,share_count,comment_count,collect_count,repost_count,loudness,duration,author_id,author_name,author_bio,nickname,music_name,music_id,music_author,music_album,hashtags,video_id,run_number
0,18-control,47400000,3000000,25600,31900,75200,0,-20.6,268,6980785150019781637,SB Mowing,Please keep up with me on Snapchat or YouTube ...,sbmowing,original sound - SB Mowing,7463305390462094126,SB Mowing,Unknown Album,,7463304946616651050,1
1,18-control,6600000,1400000,78100,1928,146500,0,-14,71,7329520300319540257,Scare Prank USA,ueafollow me uea,scareprankusa54,original sound,7458665057581288214,Scare Prank USA,Unknown Album,scarecam_scareprank_funnyprank_scaring_jumpy_v...,7458665050828492054,2
2,18-control,4900000,324700,967,260,20200,0,-23.9,119,7425531431735985198,user7291424162817,,user7291424162817,original sound,7451878052642016031,user7291424162817,Unknown Album,movie_tiktokmovies_movieclips_clip_movieshowsc...,7451878163384110366,3
3,18-control,584300,25700,155,201,2923,0,-8.7,131,7319790687351882795,Jay rose,,jayrose044,AeuEA,7433574331416529706,Jay rose,Unknown Album,satisfyin_fyp_storytime_foryou_viral_tiktok_tr...,7433574400735104302,4
4,18-control,2300000,258600,4980,1021,30800,0,-11.9,83,6982084681585394693,UniversalCore,(ua Mental health matters ua) \nTrus...,universalcore____,Closer,7309864547584968706,Nuages,Closer,corecore_sadcore_hopecore_coretok_core_mentalh...,7443535065801379115,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18877,42-experiment,1900000,377000,9905,2172,54100,0,-21.8,132,6676293103072658437,Brady LXIX,IG/FB:BRADYLXIX \nother account:bradylxix2\nME...,bradylxix,original sound,7461417766466243370,Brady LXIX,Unknown Album,bradylxix,7461417768965950763,1066
18878,42-experiment,1200000,80300,2216,673,33100,0,-13.8,35,6793632786105713666,CAPCUT TEMPLATE TRENDS,uiMUSIC PROMOTION DMui\n\nCREDITS TO THE OWNER...,capcut_templatetrends,AURA,7387573619378046993,Ogryzek,AURA,capcut_capcutpioneer_capcut_edit_fyp_trend_for...,7447105323816602898,1067
18879,42-experiment,3800000,493000,13500,3063,104800,0,-6.7,62,7250349443498066990,vidz,Trying to influence a bigger audience uoOe\nI ...,vidztoker,original sound,7451284899840953130,vidz,Unknown Album,foryou_fyp_edit,7451284883516771630,1068
18880,42-experiment,1500000,94100,10100,14000,16400,0,-9.3,61,6936022572887360517,hydgjgf,,hydgjgf,original sound,7445855150561905414,hydgjgf,Unknown Album,fyp_fypC_story_satisfyingvideo_comedyvideo_mar...,7445855110065818885,1069


```
analyze_hashtags_for_scenario
```

1. Separating experiment/control groups based on `user_id`
2. Truncating both groups to the same number of videos (the length of the shorter group) when both are requested
3. Counting, for each video, how many of its hashtags contain one of the specified `target_hashtags` (substring match)
4. Returning each group sorted by `run_number`, ready for comparing hashtag trends over time


In [7]:
def analyze_hashtags_for_scenario(data_combined, target_hashtags, scenario, group_type='both'):
    """Select one scenario's videos and count target hashtags per video.

    Parameters:
    - data_combined: DataFrame with at least ``user_id``, ``run_number`` and
      ``hashtags`` ('_'-joined hashtag names) columns.
    - target_hashtags: iterable of target hashtag strings.
    - scenario: scenario ID; rows are selected where ``user_id`` equals
      '<scenario>-experiment' / '<scenario>-control'.
    - group_type: 'both', 'control', or 'experiment'.

    Returns:
    - (experiment_data, control_data, max_videos). A group that was not
      requested is None. Each returned frame is sorted by ``run_number`` and
      has a ``target_hashtag_count`` column. With 'both', the two frames are
      truncated to the smaller of their maximum ``run_number`` values, which
      is also returned as ``max_videos``.

    Raises:
    - ValueError: if ``group_type`` is not one of the three accepted values.
    """
    valid_group_types = ('both', 'control', 'experiment')
    if group_type not in valid_group_types:
        raise ValueError(f"group_type must be one of {list(valid_group_types)}")

    def select_group(group):
        # Exact match: a substring/regex match would let scenario 18 also
        # pick up e.g. '118-experiment'.
        mask = data_combined['user_id'] == f'{scenario}-{group}'
        return data_combined[mask].sort_values('run_number')

    def count_target_hashtags(hashtags):
        # Missing values (e.g. NaN after a CSV round-trip) count as no hashtags.
        if not isinstance(hashtags, str) or not hashtags:
            return 0
        hashtag_list = hashtags.split('_')
        # NOTE(review): this is a substring match, so one hashtag can count
        # several times (e.g. 'catsoftiktok' matches 'cat', 'cats' and
        # 'catsoftiktok') - confirm this is the intended metric.
        return sum(
            1
            for target in target_hashtags
            for hashtag in hashtag_list
            if target in hashtag
        )

    experiment_data = select_group('experiment') if group_type in ('both', 'experiment') else None
    control_data = select_group('control') if group_type in ('both', 'control') else None

    # Find the minimum number of videos between selected groups
    if group_type == 'both':
        max_videos = min(experiment_data['run_number'].max(), control_data['run_number'].max())
        experiment_data = experiment_data[experiment_data['run_number'] <= max_videos]
        control_data = control_data[control_data['run_number'] <= max_videos]
    elif group_type == 'experiment':
        max_videos = experiment_data['run_number'].max()
    else:
        max_videos = control_data['run_number'].max()

    # .assign() returns a new frame, so the caller's data is never modified
    # and no SettingWithCopyWarning is raised on the filtered slices.
    if experiment_data is not None:
        experiment_data = experiment_data.assign(
            target_hashtag_count=experiment_data['hashtags'].apply(count_target_hashtags)
        )
    if control_data is not None:
        control_data = control_data.assign(
            target_hashtag_count=control_data['hashtags'].apply(count_target_hashtags)
        )

    return experiment_data, control_data, max_videos

```
plot_hashtag_trends
```

1. Averaging target-hashtag counts over consecutive, non-overlapping buckets of videos (default 250 videos/bucket)
2. Calculating **Spearman correlation** (ρ) and p-values for trend significance
3. Generating interactive plots comparing experimental/control groups, each with a dashed linear trend line
4. Printing per-bucket averages plus the overall, first-bucket, and last-bucket averages across scenarios


In [8]:
def _summarize_buckets(data, bucket_size, min_fill=0.75):
    """Average ``target_hashtag_count`` over consecutive buckets of ``data``.

    Buckets holding fewer than ``min_fill * bucket_size`` rows are dropped.
    Returns a list of dicts with the 1-based ``bucket`` number, ``run`` (the
    row position at the nominal end of the bucket) and the bucket ``mean``.
    """
    summaries = []
    for start in range(0, len(data), bucket_size):
        bucket = data.iloc[start:start + bucket_size]
        if len(bucket) >= bucket_size * min_fill:
            summaries.append({
                'bucket': start // bucket_size + 1,
                'run': start + bucket_size,  # Use end of bucket as x-coordinate
                'mean': bucket['target_hashtag_count'].mean(),
            })
    return summaries


def _add_group_traces(fig, data, scenario, legend_name, color, bucket_size):
    """Print and plot the bucket means of one group; return the list of means."""
    summaries = _summarize_buckets(data, bucket_size)
    for summary in summaries:
        print(f"\n{legend_name} Group Scenario {scenario} Bucket {summary['bucket']}:")
        print(f"Average: {summary['mean']:.2f}")

    if not summaries:
        return []

    buckets_df = pd.DataFrame(summaries)
    fig.add_trace(go.Scatter(
        x=buckets_df['run'],
        y=buckets_df['mean'],
        mode='lines+markers',
        line=dict(color=color, width=2),
        name=f'{legend_name} {scenario}'
    ))

    # A correlation / linear fit needs at least two buckets.
    if len(buckets_df) >= 2:
        rho, p_value = stats.spearmanr(buckets_df['run'], buckets_df['mean'])

        # Linear regression is used only to draw the trend line.
        slope, intercept = np.polyfit(buckets_df['run'], buckets_df['mean'], 1)
        x_range = np.array([buckets_df['run'].min(), buckets_df['run'].max()])
        trend_y = slope * x_range + intercept

        fig.add_trace(go.Scatter(
            x=x_range,
            y=trend_y,
            mode='lines',
            line=dict(color=color, width=1, dash='dash'),
            name=f'{legend_name} {scenario} trend (ρ={rho:.4f}, p={p_value:.3f})'
        ))

    return buckets_df['mean'].tolist()


def plot_hashtag_trends(scenarios_data, target_hashtags, group_type='both', 
                       exp_legend_name="Experimental", ctrl_legend_name="Control",
                       bucket_size=250):
    """Plot bucketed target-hashtag averages per scenario and print summaries.

    Parameters:
    - scenarios_data: dict mapping scenario -> (exp_data, ctrl_data, max_videos)
      as returned by ``analyze_hashtags_for_scenario``; each frame needs a
      ``target_hashtag_count`` column. A frame may be None.
    - target_hashtags: not used by this function; kept so existing calls work.
    - group_type: 'both', 'control', or 'experiment'.
    - exp_legend_name / ctrl_legend_name: names used in legends and printouts.
    - bucket_size: number of consecutive videos averaged per bucket.

    Shows one figure and prints every bucket average, followed by the overall,
    first-bucket and last-bucket averages across all scenarios. Returns None.
    """
    # Color schemes
    exp_colors = ['rgb(31, 119, 180)', 'rgb(44, 160, 44)', 'rgb(214, 39, 40)', 
                  'rgb(148, 103, 189)', 'rgb(140, 86, 75)']
    ctrl_colors = ['rgb(255, 127, 14)', 'rgb(227, 119, 194)', 'rgb(127, 127, 127)', 
                  'rgb(188, 189, 34)', 'rgb(23, 190, 207)']

    # (dict key, legend name, palette, is the group requested?)
    group_specs = [
        (0, exp_legend_name, exp_colors, group_type in ['both', 'experiment']),
        (1, ctrl_legend_name, ctrl_colors, group_type in ['both', 'control']),
    ]
    summaries = {0: {'all': [], 'first': [], 'last': []},
                 1: {'all': [], 'first': [], 'last': []}}

    fig2 = go.Figure()

    for idx, (scenario, (exp_data, ctrl_data, _)) in enumerate(scenarios_data.items()):
        frames = {0: exp_data, 1: ctrl_data}
        for key, legend_name, palette, enabled in group_specs:
            data = frames[key]
            if not enabled or data is None:
                continue
            means = _add_group_traces(
                fig2, data, scenario, legend_name, palette[idx % len(palette)], bucket_size
            )
            if means:
                summaries[key]['all'].extend(means)
                summaries[key]['first'].append(means[0])
                # Exactly one "last" bucket per scenario: the final kept one.
                summaries[key]['last'].append(means[-1])

    fig2.update_layout(
        title=f'Likes hashtags - Average Target Hashtags per {bucket_size}-Video Bucket',
        # x is the end position of each bucket, not the bucket's average run number.
        xaxis_title='Run Number at End of Bucket',
        yaxis_title='Average Number of Target Hashtags',
        showlegend=True,
        template='plotly_white'
    )

    fig2.show()

    # Print statistics
    sections = [
        ("\nOverall Averages Across All Scenarios:", 'all',
         "Overall {name} group average: {value:.2f}"),
        ("\nFirst Bucket Averages Across All Scenarios:", 'first',
         "{name} group first bucket average: {value:.2f}"),
        ("\nLast Bucket Averages Across All Scenarios:", 'last',
         "{name} group last bucket average: {value:.2f}"),
    ]
    for header, stat_key, template in sections:
        print(header)
        for key, legend_name, _palette, enabled in group_specs:
            values = summaries[key][stat_key]
            if enabled and values:
                print(template.format(name=legend_name, value=sum(values) / len(values)))


In [9]:
# NOTE: this rebinds the notebook-level `scenarios` (a list of ints in the
# earlier configuration cell) to a list of strings holding only the two
# scenarios plotted here.
scenarios = ['18','22']
scenarios_data = {}

for scenario in scenarios:
    # Call analyze_hashtags_for_scenario with group_type='both' to get both experimental and control data
    # (with 'both', the two groups are truncated to the same number of videos).
    exp_data, ctrl_data, max_videos = analyze_hashtags_for_scenario(data_combined, target_hashtags, scenario, group_type='both')
    scenarios_data[scenario] = (exp_data, ctrl_data, max_videos)

# Plot trends and print statistics with default group_type='both'
plot_hashtag_trends(scenarios_data, target_hashtags, group_type='both')


Experimental Group Scenario 18 Bucket 1:
Average: 1.66

Experimental Group Scenario 18 Bucket 2:
Average: 1.09

Experimental Group Scenario 18 Bucket 3:
Average: 0.56

Experimental Group Scenario 18 Bucket 4:
Average: 0.62

Control Group Scenario 18 Bucket 1:
Average: 1.02

Control Group Scenario 18 Bucket 2:
Average: 1.65

Control Group Scenario 18 Bucket 3:
Average: 1.18

Control Group Scenario 18 Bucket 4:
Average: 1.43

Experimental Group Scenario 22 Bucket 1:
Average: 1.01

Experimental Group Scenario 22 Bucket 2:
Average: 1.33

Experimental Group Scenario 22 Bucket 3:
Average: 1.84

Control Group Scenario 22 Bucket 1:
Average: 0.99

Control Group Scenario 22 Bucket 2:
Average: 0.92

Control Group Scenario 22 Bucket 3:
Average: 0.94



Overall Averages Across All Scenarios:
Overall Experimental group average: 1.16
Overall Control group average: 1.16

First Bucket Averages Across All Scenarios:
Experimental group first bucket average: 1.34
Control group first bucket average: 1.00

Last Bucket Averages Across All Scenarios:
Experimental group last bucket average: 1.23
Control group last bucket average: 1.19


`get_target_hashtag_count`

1. Normalizing input tags (removes special chars, lowercase conversion)
2. Using exact string matching against provided target hashtags
3. Returning count of matches while handling empty/missing tags


In [10]:
def get_target_hashtag_count(tags, target_hashtags):
    """Return how many distinct normalized tags are contained in ``target_hashtags``.

    ``tags`` is a '_'-joined string of hashtag names. Each name is reduced to
    its alphanumeric characters and lower-cased; duplicates collapse because
    the normalized names are collected in a set. Falsy input yields 0.
    """
    if not tags:
        return 0

    normalized = set()
    for raw_tag in tags.split('_'):
        normalized.add(''.join(filter(str.isalnum, raw_tag)).lower())

    matches = [tag for tag in normalized if tag in target_hashtags]
    return len(matches)

`analyze_and_plot_target_hashtags_agg`

1. Analyzing individual scenarios and aggregated groups in **control and experiment users**
2. Calculating **bucket-based averages** (default 25 videos/bucket) with trend correlations (Spearman's ρ)
3. Generating **visualizations** showing:
   - Experimental vs control group comparisons
   - Statistical annotations (ρ and p-values)
   - Aggregated trend lines across scenario groups
4. **Dynamic scenario grouping** with custom labeling


In [15]:
def _hex_to_rgba(hex_color, alpha=0.5):
    """Convert a '#RRGGBB' color into an 'rgba(r, g, b, alpha)' string."""
    digits = hex_color.lstrip("#")
    rgb = tuple(int(digits[i:i+2], 16) for i in (0, 2, 4))
    return f'rgba{rgb + (alpha,)}'


def _add_series_with_trend(fig, positions, values, label, color):
    """Add a lines+markers trace and, if possible, its dashed linear trend.

    The Spearman correlation of ``values`` against ``positions`` is appended
    to ``label``. With fewer than two points no correlation or fit can be
    computed, so rho/p are reported as nan and no trend line is drawn.
    """
    has_trend = len(values) >= 2
    if has_trend:
        rho, p = scipy.stats.spearmanr(positions, values)
    else:
        rho, p = float('nan'), float('nan')

    # Main trace
    fig.add_trace(go.Scatter(
        x=positions,
        y=values,
        mode='lines+markers',
        name=f'{label} (ρ={rho:.2f}, p={p:.3f})',
        line=dict(color=color),
        marker=dict(color=color)
    ))

    # Trend line
    if has_trend:
        trend = np.poly1d(np.polyfit(positions, values, 1))(positions)
        fig.add_trace(go.Scatter(
            x=positions,
            y=trend,
            mode='lines',
            line=dict(color=color, dash='dash'),
            showlegend=False
        ))


def analyze_and_plot_target_hashtags_agg(data_combined, scenarios, target_hashtags, bucket_size=25,
                          plot_type='both', # New parameter - can be 'control', 'experiment' or 'both'
                          aggregate_scenarios=None, legend_labels=None):
    """
    Analyze presence of target hashtags in control and/or experiment groups

    Parameters:
    - data_combined: DataFrame with video data (``user_id``, ``run_number``,
      ``hashtags`` columns)
    - scenarios: List of scenario IDs to analyze
    - target_hashtags: List of target hashtags to count (normalized to
      lower-case alphanumerics before matching)
    - bucket_size: Size of analysis buckets; only full buckets are used
    - plot_type: Type of plots to show ('control', 'experiment', or 'both')
    - aggregate_scenarios: List of scenario groups (lists of scenario IDs) to
      aggregate; scenarios not listed in ``scenarios`` are ignored
    - legend_labels: Optional dict of custom labels. Multi-scenario groups are
      looked up by ``tuple(sorted(group))``; single-scenario groups by the bare
      scenario ID, falling back to the one-element tuple.

    Returns:
    - dict with ``'individual'``: {scenario: {group: [bucket averages]}} and
      ``'aggregated'``: {tuple(sorted(scenario_group)): {group: [averages]}}
      for every aggregated group that had data.

    Raises:
    - ValueError: if ``plot_type`` is not a valid value.
    """
    # Validate plot_type
    valid_plot_types = ['control', 'experiment', 'both']
    if plot_type not in valid_plot_types:
        raise ValueError(f"plot_type must be one of {valid_plot_types}")

    # Determine which groups to plot
    plot_groups = ('control', 'experiment') if plot_type == 'both' else (plot_type,)

    # A missing label mapping simply means "use the default labels".
    labels = legend_labels or {}

    # Clean target hashtags
    target_hashtags = set(''.join(c for c in tag if c.isalnum()).lower() 
                         for tag in target_hashtags)

    # Color setup
    color_palette = qualitative.Plotly
    scenario_colors = {s: color_palette[i%len(color_palette)] 
                      for i, s in enumerate(scenarios)}

    # Store results
    results = {'individual': {}, 'aggregated': {}}

    # Individual scenario analysis
    for scenario in scenarios:
        results['individual'][scenario] = {}

        for group in plot_groups:
            data = data_combined[data_combined['user_id'] == f'{scenario}-{group}'].sort_values('run_number')

            # Bucket processing: a trailing partial bucket is dropped.
            avg_counts = []
            for i in range(0, len(data) - bucket_size + 1, bucket_size):
                bucket = data.iloc[i:i+bucket_size]
                counts = [get_target_hashtag_count(tags, target_hashtags) 
                         for tags in bucket['hashtags']]
                avg_counts.append(np.mean(counts))

            results['individual'][scenario][group] = avg_counts

    # Visualization
    fig_ind = go.Figure()
    fig_agg = go.Figure() if aggregate_scenarios else None

    # Individual scenario plots
    for scenario in scenarios:
        for group in plot_groups:
            counts = results['individual'][scenario][group]
            if not counts:
                # Fewer than bucket_size videos: nothing to plot.
                print(f"No data available for scenario {scenario} {group}")
                continue
            positions = [i*bucket_size + bucket_size/2 for i in range(len(counts))]
            color = scenario_colors[scenario]
            # When both groups share a figure, the experiment line is faded.
            if plot_type == 'both' and group != 'control':
                color = _hex_to_rgba(color)

            _add_series_with_trend(fig_ind, positions, counts,
                                   f'Scenario {scenario} {group}', color)

    # Aggregated analysis
    if aggregate_scenarios:
        for group_idx, scenario_group in enumerate(aggregate_scenarios):
            color = color_palette[group_idx%len(color_palette)]
            group_key = tuple(sorted(scenario_group))
            default_label = f'Group {scenario_group}'

            # Handle single-scenario groups differently for legend labels
            if len(scenario_group) == 1:
                base_label = labels.get(scenario_group[0],
                                        labels.get(group_key, default_label))
            else:
                base_label = labels.get(group_key, default_label)

            for group in plot_groups:
                # Bucket series of every analyzed scenario that has data
                series = [results['individual'][scenario][group]
                          for scenario in scenario_group
                          if scenario in results['individual']
                          and results['individual'][scenario][group]]

                # Skip if no scenarios have data
                if not series:
                    print(f"No data available for group {scenario_group} {group}")
                    continue

                # The longest scenario defines the number of positions
                max_len = max(len(counts) for counts in series)
                positions = [i*bucket_size + bucket_size/2 for i in range(max_len)]

                # For each position, average the scenarios that reach it
                avg_counts = [
                    np.mean([counts[pos] for counts in series if pos < len(counts)])
                    for pos in range(max_len)
                ]
                results['aggregated'].setdefault(group_key, {})[group] = avg_counts

                # Same coloring as the original: faded for every non-control
                # group, regardless of plot_type.
                plot_color = color if group == 'control' else _hex_to_rgba(color)

                _add_series_with_trend(fig_agg, positions, avg_counts,
                                       f'{base_label} {group}', plot_color)

    # Figure formatting
    fig_ind.update_layout(
        xaxis_title='Video Position',
        yaxis_title='Average Target Hashtag Count',
        template='plotly_white',
        width=800,
        margin=dict(l=50, r=50, t=50, b=50),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="center",
            x=0.5
        )
    )
    fig_ind.show()

    if fig_agg:
        fig_agg.update_layout(
            xaxis_title='Number of watched videos',
            yaxis_title='Average Target Hashtag Count',
            template='plotly_white',
            width=800,
            margin=dict(l=20, r=20, t=1, b=1),
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="center",
                x=0.5
            )
        )
        fig_agg.show()

    return results

In [16]:
# Compare the two aggregated scenario groups for the experiment users.
test_scenarios = [18,22,38,39,40,41,42] # All scenarios to work with
aggregate_groups = [[18,22],[38,39,40,41,42]] # Aggregate by groups 
# Labels for multi-scenario groups are looked up by the sorted tuple of their scenario IDs.
group_labels = {
    (18,22): 'Likes hashtags',
    (38,39,40,41,42): 'Watch longer hashtags'
}

analysis_results = analyze_and_plot_target_hashtags_agg(
    data_combined=data_combined,
    scenarios=test_scenarios,
    target_hashtags=target_hashtags,
    bucket_size=50, # Number of consecutive videos averaged per (non-overlapping) bucket
    plot_type='experiment',  # Can be 'control', 'experiment' or 'both'
    aggregate_scenarios=aggregate_groups,
    legend_labels=group_labels
)

In [17]:
# Same analysis, with the scenarios 38-42 split into three aggregation groups.
# Note: `analysis_results` from the previous cell is overwritten here.
test_scenarios = [18,22,38,39,40,41,42] # All scenarios to work with
aggregate_groups = [[38],[39,40],[41,42]] # Aggregate by groups 
group_labels = {
    # `(38)` is the plain int 38, not a one-element tuple; single-scenario
    # groups are looked up by their bare scenario ID, so this key matches.
    (38): 'Watch longer 50%',
    (39,40): 'Watch longer 200%',
    (41,42): 'Watch longer 400%'
}

analysis_results = analyze_and_plot_target_hashtags_agg(
    data_combined=data_combined,
    scenarios=test_scenarios,
    target_hashtags=target_hashtags,
    bucket_size=50, # Number of consecutive videos averaged per (non-overlapping) bucket
    plot_type='experiment',  # Can be 'control', 'experiment' or 'both'
    aggregate_scenarios=aggregate_groups,
    legend_labels=group_labels
)