In [1]:
import os
import json
import unicodedata
import pandas as pd
import plotly.graph_objects as go
import scipy.stats
import numpy as np
import pandas as pd
from plotly.colors import qualitative

files_path = '../data/'

In [2]:
scenarios = [16,18,22,23,24,28,29,33,34,35,37,38,39,40,41,42]

In [3]:
def clean_text(text):
    """Ensure text is UTF-8 encoded and strip out non-ASCII characters."""
    # Normalize text to remove accents and unusual unicode characters
    normalized_text = unicodedata.normalize('NFKD', text)
    # Encode to ASCII, ignoring characters that cannot be encoded, then decode back to string
    return normalized_text.encode('ascii', 'ignore').decode('ascii')

def parse_info(video, user_id):
    d = {}
    d["user_id"] = user_id
    
    author = video.get("author", {})
    stats = video.get("stats", {})
    music = video.get("music", {})
    video_stats = video.get("video", {})
    text_extras = video.get("contents", [{}])[0].get("textExtra", [])
    # df["author"] = author
    # df["stats"] = stats
    # df["music"] = music
    # df["video_stats"] = video_stats
    # df["text_extras"] = text_extras

    # Basic video details
    d["play_count"] = stats.get("playCount", 0)
    d["digg_count"] = stats.get("diggCount", 0)
    d["share_count"] = stats.get("shareCount", 0)
    d["comment_count"] = stats.get("commentCount", 0)
    d["collect_count"] = stats.get("collectCount", 0)
    d["repost_count"] = video.get("statsV2", {}).get("repostCount", 0)
    d["loudness"] = video_stats.get("volumeInfo", {}).get("Loudness", "N/A")
    d["duration"] = video_stats.get("duration", "N/A")  # Video duration

    # Author details
    d["author_id"] = author.get("id", "Unknown Author")
    d["author_name"] = clean_text(author.get("nickname", "Unknown Author Name"))
    d["author_bio"] = clean_text(author.get("signature", "No bio"))  # Cleaned Author bio

    # Music details
    d["music_name"] = clean_text(music.get("title", "Unknown Music Title"))
    d["music_id"] = music.get("id", "Unknown Music ID")
    d["music_author"] = clean_text(music.get("authorName", "Unknown Music Author"))
    d["music_album"] = clean_text(music.get("album", "Unknown Album"))  # Cleaned Music album

    # Hashtags from textExtra
    hashtags = [clean_text(extra.get("hashtagName")) for extra in text_extras if extra.get("type") == 1]
    d["hashtags"] = '_'.join(hashtags)

    d["video_id"] = video.get("id", "Unknown ID")

    df = pd.DataFrame([d])
    df["repost_count"] = df["repost_count"].astype(int)
    return df

In [4]:
def load_data_into_df(scenarios: list) -> pd.DataFrame:
    df_jsons = []
    for user_id in scenarios:
        for user_type in ['control', 'experiment']:
            user_path = files_path + f'{user_id}/{user_id}-{user_type}/'

            for root, dirs, files in os.walk(user_path):
                if 'responses' in dirs:
                    responses_path = os.path.join(root, 'responses')
                    # Only look at files directly in responses folder
                    response_files = [f for f in os.listdir(responses_path) 
                                    if f.endswith('.json') and f != 'config.json']
                    for file in response_files:
                        file_path = os.path.join(responses_path, file)
                        with open(file_path, 'r') as f:
                            data = json.load(f)
                            for video in data['itemList']:
                                df_jsons.append(parse_info(video, str(user_id) + '-' + user_type))

    data_combined = pd.concat(df_jsons, ignore_index=True)
    data_combined['run_number'] = data_combined.groupby('user_id').cumcount() + 1
    
    # Remove specified hashtags including partial matches
    hashtags_to_ignore = [" ", "fyp", "foryou", "fyr", "'fyp'", "tiktok", "capcut", 
                         "CapCut", "viral", "video", "trend", "' '", "''", "['']"]
    
    # Split hashtags string into list, filter out ignored tags, rejoin
    data_combined['hashtags'] = data_combined['hashtags'].apply(lambda x: '_'.join(
        [tag for tag in x.split('_') if not any(ignore in tag.lower() for ignore in hashtags_to_ignore)]
    ))

    return data_combined

In [5]:
data_combined = load_data_into_df(scenarios)
data_combined

Unnamed: 0,user_id,play_count,digg_count,share_count,comment_count,collect_count,repost_count,loudness,duration,author_id,author_name,author_bio,music_name,music_id,music_author,music_album,hashtags,video_id,run_number
0,16-control,17800000,1300000,14100,4585,123600,0,-18.4,131,7426628785489314862,Alison,,original sound,7440407226923240223,Alison,,movies,7440407367017123102,1
1,16-control,12600000,2000000,161200,3359,174300,0,-10.4,67,7422934866856444970,Scare Prank USA,Follow me uOeuOe,original sound,7462017662344907550,Scare Prank USA,,prank_scaring_jumpy_funnyprank_scareprank_scar...,7462017792586288415,2
2,16-control,3500000,14000,160,0,1456,0,,25985,7070621747376604162,Tiktok for Business,,Promoted Music,7414003614940695312,Unknown Music Author,Unknown Album,,7414003587438611713,3
3,16-control,4700000,251500,1920,752,20700,0,-13.8,175,7375457841778099242,ayejeje,,original sound,7448295532134484782,ayejeje,,pov_funny_pov,7448295527004982574,4
4,16-control,64500000,7800000,630700,25800,682500,0,-16,122,79639104283377664,Akamz,Mail Pro : Contact@atlas-agence.com,son original,7452067172380003094,Akamz,,boyzinthehood,7452067110429150486,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34068,42-experiment,1900000,377000,9905,2172,54100,0,-21.8,132,6676293103072658437,Brady LXIX,IG/FB:BRADYLXIX \nother account:bradylxix2\nME...,original sound,7461417766466243370,Brady LXIX,Unknown Album,bradylxix,7461417768965950763,1066
34069,42-experiment,1200000,80300,2216,673,33100,0,-13.8,35,6793632786105713666,CAPCUT TEMPLATE TRENDS,uiMUSIC PROMOTION DMui\n\nCREDITS TO THE OWNER...,AURA,7387573619378046993,Ogryzek,AURA,edit_smoothvelocity_softslowmotion_softslowmo_...,7447105323816602898,1067
34070,42-experiment,3800000,493000,13500,3063,104800,0,-6.7,62,7250349443498066990,vidz,Trying to influence a bigger audience uoOe\nI ...,original sound,7451284899840953130,vidz,Unknown Album,edit,7451284883516771630,1068
34071,42-experiment,1500000,94100,10100,14000,16400,0,-9.3,61,6936022572887360517,hydgjgf,,original sound,7445855150561905414,hydgjgf,Unknown Album,story_marrkadams,7445855110065818885,1069


In [6]:
def get_hashtag_sets(df):
    """Extract sets of hashtags from each video in the dataframe"""
    def clean_hashtags(tags):
        # Convert to lowercase and remove non-alphanumeric characters
        cleaned = []
        for tag in tags.split('_'):
            # Keep only alphanumeric chars and convert to lowercase
            clean_tag = ''.join(c for c in tag if c.isalnum()).lower()
            if clean_tag:  # Only add if not empty
                cleaned.append(clean_tag)
        return set(cleaned)
    
    return [clean_hashtags(tags) for tags in df['hashtags'] if tags]

`analyze_hashtag_similarity`

1. Comparing **experimental vs control groups** in video buckets (default 150 videos/bucket)
2. Calculating **hashtag overlaps** (≥1 shared hashtag = match)
3. Tracking similarity trends through:
   - First/last bucket comparison metrics
4. Generating formatted tables showing:
   - Scenario-specific similarity progression
   - Aggregate trend differences (last - first bucket)

In [7]:
def analyze_hashtag_similarity(data_combined, scenarios, bucket_size=150):
    """
    Analyze hashtag similarity between videos within buckets.
    A match occurs if two videos share at least one hashtag.
    
    Parameters:
    - data_combined: DataFrame with the video data 
    - scenarios: List of scenario numbers to analyze
    - bucket_size: Size of buckets for aggregating results
    """
    all_similarities = {}
    
    # Process each scenario
    for scenario in scenarios:
        bucket_similarities = []
        
        # Get data for the scenario
        exp_data = data_combined[data_combined['user_id'] == f'{scenario}-experiment'].sort_values('run_number')
        ctrl_data = data_combined[data_combined['user_id'] == f'{scenario}-control'].sort_values('run_number')
            
        # Determine analysis length and get buckets
        max_videos = min(len(exp_data), len(ctrl_data))
        exp_data = exp_data.iloc[:max_videos]
        ctrl_data = ctrl_data.iloc[:max_videos]
        
        # Calculate similarities between buckets
        for i in range(0, max_videos - bucket_size + 1, bucket_size):
            exp_bucket = exp_data.iloc[i:i+bucket_size]
            ctrl_bucket = ctrl_data.iloc[i:i+bucket_size]
            
            exp_tag_sets = get_hashtag_sets(exp_bucket)
            ctrl_tag_sets = get_hashtag_sets(ctrl_bucket)
            
            # Combine both buckets
            all_tag_sets = exp_tag_sets + ctrl_tag_sets
            
            # Calculate matches for each video
            total_matches = 0
            total_comparisons = 0
            
            for idx, video_tags in enumerate(all_tag_sets):
                # Compare with all other videos except itself
                for other_idx in range(len(all_tag_sets)):
                    if idx != other_idx:
                        total_comparisons += 1
                        if video_tags.intersection(all_tag_sets[other_idx]):
                            total_matches += 1
            
            # Calculate average similarity for this bucket
            avg_similarity = total_matches / total_comparisons if total_comparisons > 0 else 0
            bucket_similarities.append(avg_similarity)
        
        # Store results
        all_similarities[scenario] = bucket_similarities
        
    # Print table
    print("\nAVERAGE HASHTAG SIMILARITY BY VIDEO BUCKET")
    print("="*80)
    
    headers = ["Scenario"]
    max_buckets = max(len(sims) for sims in all_similarities.values())
    for i in range(max_buckets):
        start = i * bucket_size + 1
        end = (i + 1) * bucket_size
        headers.append(f"Videos {start}-{end}")
    
    print(" | ".join(headers))
    print("-" * 80)
    
    # Track sums for overall averages
    first_bucket_sum = 0
    last_bucket_sum = 0
    scenario_count = 0
    
    for scenario in scenarios:
        scenario_similarities = all_similarities[scenario]
        row = [str(scenario)]
        
        for i in range(max_buckets):
            if i < len(scenario_similarities):
                row.append(f"{scenario_similarities[i]:.4f}")
                if i == 0:
                    first_bucket_sum += scenario_similarities[0]
                if i == len(scenario_similarities) - 1:
                    last_bucket_sum += scenario_similarities[-1]
            else:
                row.append("-")
        
        scenario_count += 1
        print(" | ".join(row))
    
    # Print average row
    print("-" * 80)
    if scenario_count > 0:
        first_bucket_avg = first_bucket_sum / scenario_count
        last_bucket_avg = last_bucket_sum / scenario_count
        print(f"First Bucket [size {bucket_size}] Average: {first_bucket_avg:.4f}")
        print(f"Last Bucket [size {bucket_size}] Average: {last_bucket_avg:.4f}")
        print(f"Difference (Last - First): {(last_bucket_avg - first_bucket_avg):.4f}")

    return all_similarities

In [8]:
# Example usage:
scenarios = [33,34,35,37,38,39,40,41,42]  # List of scenarios to analyze
results = analyze_hashtag_similarity(
    data_combined=data_combined,
    scenarios=scenarios,
    bucket_size=125
)


AVERAGE HASHTAG SIMILARITY BY VIDEO BUCKET
Scenario | Videos 1-125 | Videos 126-250 | Videos 251-375 | Videos 376-500 | Videos 501-625 | Videos 626-750 | Videos 751-875 | Videos 876-1000 | Videos 1001-1125
--------------------------------------------------------------------------------
33 | 0.0678 | 0.0696 | 0.0707 | 0.0720 | 0.0647 | 0.0535 | 0.0673 | 0.0520 | -
34 | 0.0241 | 0.0140 | 0.0380 | 0.0276 | 0.0423 | 0.0301 | 0.0191 | 0.0236 | -
35 | 0.0639 | 0.0736 | 0.0440 | 0.0692 | 0.0462 | 0.0466 | - | - | -
37 | 0.0624 | 0.0401 | 0.0362 | 0.0509 | 0.0522 | 0.0386 | 0.0509 | 0.0456 | -
38 | 0.0269 | 0.0289 | 0.1183 | 0.1083 | 0.1232 | 0.0973 | 0.0508 | 0.0484 | -
39 | 0.1097 | 0.0760 | 0.0582 | 0.0518 | 0.0663 | 0.0443 | - | - | -
40 | 0.0461 | 0.0311 | 0.0588 | 0.0714 | 0.0979 | - | - | - | -
41 | 0.0206 | 0.0276 | 0.0224 | 0.0357 | 0.0589 | 0.0391 | 0.0430 | 0.0301 | 0.0389
42 | 0.0392 | 0.0326 | 0.0611 | 0.0716 | 0.0793 | 0.0653 | 0.0462 | - | -
------------------------------------

`analyze_control_vs_experiment`

1. Calculating **control vs experiment similarity scores** using:
   - Bucket-based comparisons (default 25 videos/bucket)
   - Intersection ratios of hashtag sets
2. Tracking trends with:
   - Spearman correlation (ρ) and p-values
   - Trend lines
3. Visualizing results through:
   - Individual scenario trajectories
   - Aggregated group comparisons

In [9]:
def analyze_control_vs_experiment(data_combined, scenarios, bucket_size=25, 
                                  aggregate_scenarios=None, legend_labels=None):
    """
    Analyze hashtag similarity between control and experiment groups with color-matched trends

    Parameters:
    - data_combined: DataFrame with video data
    - scenarios: List of scenario IDs to analyze 
    - bucket_size: Size of analysis buckets
    - aggregate_scenarios: List of scenario groups to aggregate
    - legend_labels: Custom labels for scenario groups
    """

    # Color setup
    color_palette = qualitative.Plotly
    scenario_colors = {s: color_palette[i%len(color_palette)] 
                       for i, s in enumerate(scenarios)}

    # Store results
    results = {'individual': {}, 'aggregated': {}}

    # Individual scenario analysis
    for scenario in scenarios:
        # Data preparation
        control = data_combined[data_combined['user_id'] == f'{scenario}-control'].sort_values('run_number')
        experiment = data_combined[data_combined['user_id'] == f'{scenario}-experiment'].sort_values('run_number')
        
        min_length = min(len(control), len(experiment))
        control = control.iloc[:min_length]
        experiment = experiment.iloc[:min_length]

        # Bucket processing
        similarities = []
        for i in range(0, min_length - bucket_size + 1, bucket_size):
            c_bucket = control.iloc[i:i+bucket_size]
            e_bucket = experiment.iloc[i:i+bucket_size]

            c_tags = get_hashtag_sets(c_bucket)
            e_tags = get_hashtag_sets(e_bucket)

            matches = sum(1 for c in c_tags for e in e_tags if c & e)
            total = len(c_tags) * len(e_tags)
            similarities.append(matches/total if total > 0 else 0)

        results['individual'][scenario] = similarities

    # Visualization
    fig_ind = go.Figure()
    fig_agg = go.Figure() if aggregate_scenarios else None

    # Individual scenario plots
    for idx, scenario in enumerate(scenarios):
        similarities = results['individual'][scenario]
        positions = [i*bucket_size + bucket_size/2 for i in range(len(similarities))]
        color = scenario_colors[scenario]

        # Correlation analysis
        rho, p = scipy.stats.spearmanr(positions, similarities)
        slope = np.polyfit(positions, similarities, 1)[0]

        # Main trace
        fig_ind.add_trace(go.Scatter(
            x=positions,
            y=similarities,
            mode='lines+markers',
            name=f'Scenario {scenario} (ρ={rho:.2f}, p={p:.3f})',
            line=dict(color=color),
            marker=dict(color=color)
        ))

        # Trend line
        trend = np.poly1d(np.polyfit(positions, similarities, 1))(positions)
        fig_ind.add_trace(go.Scatter(
            x=positions,
            y=trend,
            mode='lines',
            line=dict(color=color, dash='dash'),
            showlegend=False
        ))

    # Aggregated analysis
    if aggregate_scenarios:
        for group_idx, scenario_group in enumerate(aggregate_scenarios):
            group_sims = []
            positions = None
            color = color_palette[group_idx%len(color_palette)]

            # Get video counts for each scenario
            scenario_lengths = {}
            for scenario in scenario_group:
                if scenario in results['individual']:
                    sims = results['individual'][scenario]
                    scenario_lengths[scenario] = len(sims)    
            
            # Find scenario with most videos
            max_scenario = max(scenario_lengths.items(), key=lambda x: x[1])
            max_len = max_scenario[1]
            
            if max_len > 0:
                positions = [i*bucket_size + bucket_size/2 for i in range(max_len)]
                
                # For each position, average available similarities
                avg_sims = []
                for pos in range(max_len):
                    pos_sims = []
                    for scenario in scenario_group:
                        if scenario in results['individual']:
                            sims = results['individual'][scenario]
                            if pos < len(sims):
                                pos_sims.append(sims[pos])
                    if pos_sims:
                        avg_sims.append(np.mean(pos_sims))
                    else:
                        # If no scenarios have data for this position, use data from longest scenario
                        avg_sims.append(results['individual'][max_scenario[0]][pos])

                # Correlation analysis
                rho, p = scipy.stats.spearmanr(positions, avg_sims)


                label = legend_labels.get(tuple(scenario_group), 
                        f'Group {scenario_group}') + f' (ρ={rho:.2f}, p={p:.3f})'

                # Main trace
                fig_agg.add_trace(go.Scatter(
                    x=positions,
                    y=avg_sims,
                    mode='lines+markers',
                    name=label,
                    line=dict(color=color),
                    marker=dict(color=color)
                ))

                # Trend line
                trend = np.poly1d(np.polyfit(positions, avg_sims, 1))(positions)
                fig_agg.add_trace(go.Scatter(
                    x=positions,
                    y=trend,
                    mode='lines',
                    line=dict(color=color, dash='dash'),
                    showlegend=False
                ))

    # Figure formatting
    fig_ind.update_layout(
        title='Control vs Experiment Hashtag Similarity',
        xaxis_title='Video Position',
        yaxis_title='Similarity Score',
        template='plotly_white'
    )
    fig_ind.show()

    if fig_agg:
        fig_agg.update_layout(
            title='Aggregated Similarity Comparisons for all liking scenarios',
            xaxis_title='Video Position',
            yaxis_title='Similarity Score',
            template='plotly_white'
        )
        fig_agg.show()

    return results

# Example usage
test_scenarios = [16,18,22]
aggregate_groups = [[16,18,22]]
group_labels = {
    (16,18,22): 'Likes hashtags',
}

analysis_results = analyze_control_vs_experiment(
    data_combined=data_combined,
    scenarios=test_scenarios,
    bucket_size=50,
    aggregate_scenarios=aggregate_groups,
    legend_labels=group_labels
)


`analyze_hashtag_similarity_same_group`

1. Comparing **first vs last video buckets** within a single group (control/experiment)
2. Calculating **pairwise similarity scores** using:
   - Undirected hashtag overlaps (≥1 shared tag)
   - Normalized match ratios (matches/total_pairs)
3. Generating formatted comparisons showing:
   - Per-scenario bucket similarity differences
   - Aggregate trend direction (increasing/decreasing)


In [10]:
def analyze_hashtag_similarity_same_group(data_combined, scenarios, group='control', bucket_size=150):
    """
    Analyze hashtag similarity between videos within first and last buckets.
    A match occurs if two videos share at least one hashtag.

    Parameters:
    - data_combined: DataFrame with the video data 
    - scenarios: List of scenario numbers to analyze
    - group: Either 'control' or 'experiment'
    - bucket_size: Size of buckets for aggregating results
    """
    all_results = {}

    # Process each scenario
    for scenario in scenarios:
        scenario_data = data_combined[data_combined['user_id'] == f'{scenario}-{group}'].sort_values('run_number')

        # Calculate similarities for first and last buckets
        similarities = []
        for bucket in [scenario_data.iloc[:bucket_size], 
                       scenario_data.iloc[-bucket_size:]]:

            tag_sets = get_hashtag_sets(bucket)
            total_pairs = len(tag_sets) * (len(tag_sets) - 1)
            matches = 0

            # Compare all unique pairs
            for i in range(len(tag_sets)):
                for j in range(i+1, len(tag_sets)):
                    if tag_sets[i] & tag_sets[j]:
                        matches += 1

            # Calculate similarity score (undirected pairs)
            similarity = (2 * matches) / total_pairs if total_pairs > 0 else 0
            similarities.append(similarity)

        all_results[scenario] = similarities

    # Generate formatted output
    print(f"\n{'='*60}")
    print(f"HASHTAG SIMILARITY ANALYSIS - {group.upper()} GROUP")
    print(f"{'='*60}")
    print(f"{'Scenario':<10} | {'First Bucket':<12} | {'Last Bucket':<12} | {'Difference':<10}")
    print(f"{'-'*60}")

    first_avg, last_avg = [], []
    for scenario, sims in all_results.items():
        diff = sims[1] - sims[0]
        first_avg.append(sims[0])
        last_avg.append(sims[1])
        print(f"{scenario:<10} | {sims[0]:<12.4f} | {sims[1]:<12.4f} | {diff:+.4f}")

    # Calculate overall averages
    print(f"{'-'*60}")
    print(f"{'Overall':<10} | {np.mean(first_avg):<12.4f} | {np.mean(last_avg):<12.4f} | "
          f"{np.mean(last_avg)-np.mean(first_avg):+.4f}")

    return all_results

# Example usage
scenarios = [33,34,35,37,38,39,40,41,42]
results = analyze_hashtag_similarity_same_group(
    data_combined=data_combined,
    scenarios=scenarios,
    group='control',
    bucket_size=125
)



HASHTAG SIMILARITY ANALYSIS - CONTROL GROUP
Scenario   | First Bucket | Last Bucket  | Difference
------------------------------------------------------------
33         | 0.0807       | 0.0687       | -0.0121
34         | 0.0243       | 0.0327       | +0.0084
35         | 0.0762       | 0.0774       | +0.0012
37         | 0.0598       | 0.0519       | -0.0079
38         | 0.0352       | 0.0581       | +0.0229
39         | 0.0417       | 0.0527       | +0.0109
40         | 0.0583       | 0.0526       | -0.0057
41         | 0.0249       | 0.0467       | +0.0218
42         | 0.0446       | 0.0344       | -0.0102
------------------------------------------------------------
Overall    | 0.0495       | 0.0528       | +0.0033
