In [1]:
import os
import json
import unicodedata
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go

# Root folder of the collected data, relative to this notebook; the loader
# below appends '<scenario>/<scenario>-<group>/' to it.
files_path = "../data/"

In [2]:
# Scenario ids whose control/experiment recordings are loaded from disk.
scenarios = [
    18, 22, 23, 24,
    38, 39, 40, 41, 42,
]

In [3]:
# Account handles (compared against the `nickname` column) that count as
# "target" accounts in the analysis below.
target_nicknames = [
    "charlidamelio",
    "bellapoarch",
    "addisonre",
    "willsmith",
    "zachking",
    "tiktok",
    "dixiedamelio",
    "jasonderulo",
    "spencerx",
    "lorengray",
    "justmaiko",
    "kyliejenner",
    "brentrivera",
    "itsjojosiwa",
    "selenagomez",
    "avani",
    "joealbanese",
    "jamescharles",
    "dobretwins",
    "babyariel",
    "wigofellas",
    "qpark",
    "aehaunted",
    "ketikajazan",
    "tiffytoky",
    "dylan.page",
    "ooliviakathleenn",
    "funny.pets090",
    "nbcsnl",
    "tvshows_analyst",
    "ishowspeed",
    "corevibezzz_",
    "oneilthomas97",
    "bonjourmusic",
    "virall.music",
    "brainrotdaily_",
    "lambotelevision",
    "nitronuke",
]

In [4]:
def clean_text(text):
    """Return an ASCII-only version of ``text``.

    The text is NFKD-normalised so that accented letters decompose into a base
    letter plus combining marks; every character that still cannot be encoded
    as ASCII (combining marks, emoji, non-Latin scripts, ...) is then dropped.

    Args:
        text: Value to clean. ``None`` yields an empty string and any other
            non-string value is converted with ``str()`` first, so a caller
            that passes the result of ``dict.get`` without a default does not
            crash on a missing field.

    Returns:
        str: The cleaned ASCII string (possibly empty).
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)
    # Decompose first so "é" becomes "e" + combining accent instead of vanishing.
    normalized_text = unicodedata.normalize('NFKD', text)
    # Encode to ASCII, silently dropping whatever cannot be represented.
    return normalized_text.encode('ascii', 'ignore').decode('ascii')

def parse_info(video, user_id):
    """Flatten one raw video record into a single-row DataFrame.

    Args:
        video: Dict describing one video. The keys read here are ``author``,
            ``stats``, ``statsV2``, ``music``, ``video``, ``contents`` and
            ``id``; every one of them is optional and falls back to a default.
        user_id: Label stored unchanged in the ``user_id`` column.

    Returns:
        pd.DataFrame: One row with the columns ``user_id``, the engagement
        counters, ``loudness``, ``duration``, the author and music fields,
        ``hashtags`` (hashtag names joined with ``'_'``) and ``video_id``.
        ``repost_count`` is cast to ``int``.
    """
    author = video.get("author", {})
    stats = video.get("stats", {})
    music = video.get("music", {})
    video_stats = video.get("video", {})

    # "contents" may be missing, None or an empty list; indexing [0] directly
    # on an empty list would raise IndexError, so fall back to one empty entry.
    contents = video.get("contents") or [{}]
    text_extras = contents[0].get("textExtra") or []

    d = {}
    d["user_id"] = user_id

    # Basic video details
    d["play_count"] = stats.get("playCount", 0)
    d["digg_count"] = stats.get("diggCount", 0)
    d["share_count"] = stats.get("shareCount", 0)
    d["comment_count"] = stats.get("commentCount", 0)
    d["collect_count"] = stats.get("collectCount", 0)
    d["repost_count"] = video.get("statsV2", {}).get("repostCount", 0)
    d["loudness"] = video_stats.get("volumeInfo", {}).get("Loudness", "N/A")
    d["duration"] = video_stats.get("duration", "N/A")  # Video duration

    # Author details (free-text fields are reduced to ASCII)
    d["author_id"] = author.get("id", "Unknown Author")
    d["author_name"] = clean_text(author.get("nickname", "Unknown Author Name"))
    d["author_bio"] = clean_text(author.get("signature", "No bio"))
    d["nickname"] = author.get("uniqueId", "Unknown Nickname")

    # Music details
    d["music_name"] = clean_text(music.get("title", "Unknown Music Title"))
    d["music_id"] = music.get("id", "Unknown Music ID")
    d["music_author"] = clean_text(music.get("authorName", "Unknown Music Author"))
    d["music_album"] = clean_text(music.get("album", "Unknown Album"))

    # Hashtags from textExtra.
    # NOTE(review): entries with type == 1 are presumably the hashtag entries;
    # confirm against the payload schema.
    # A missing hashtagName is treated as "" instead of being passed on as None.
    hashtags = [
        clean_text(extra.get("hashtagName") or "")
        for extra in text_extras
        if extra.get("type") == 1
    ]
    d["hashtags"] = '_'.join(hashtags)

    d["video_id"] = video.get("id", "Unknown ID")

    df = pd.DataFrame([d])
    df["repost_count"] = df["repost_count"].astype(int)
    return df

In [5]:
def load_data_into_df(scenarios: list, base_path=None) -> pd.DataFrame:
    """Load every recorded response of the given scenarios into one DataFrame.

    For each scenario id and each group (``control`` / ``experiment``) the
    directory ``<base_path>/<id>/<id>-<group>/`` is walked. In every folder
    that contains a ``responses`` sub-folder, each ``*.json`` file directly
    inside ``responses`` (except ``config.json``) is read and every entry of
    its ``itemList`` is converted with ``parse_info``.

    Args:
        scenarios: Scenario ids (ints or strings).
        base_path: Root data directory. Defaults to the module-level
            ``files_path`` when omitted.

    Returns:
        pd.DataFrame: One row per video, labelled ``'<id>-<group>'`` in
        ``user_id``, plus ``run_number``: the 1-based position of the row
        within its ``user_id`` in load order.

    Raises:
        ValueError: If no video could be loaded at all.
    """
    if base_path is None:
        base_path = files_path

    frames = []
    for user_id in scenarios:
        for user_type in ('control', 'experiment'):
            user_label = f'{user_id}-{user_type}'
            user_path = os.path.join(base_path, str(user_id), user_label)

            for root, dirs, _files in os.walk(user_path):
                # os.walk / os.listdir return entries in arbitrary order; sort so
                # run_number is reproducible across machines and re-runs.
                # NOTE(review): this is lexicographic order - confirm it matches
                # the order in which the responses were captured.
                dirs.sort()
                if 'responses' not in dirs:
                    continue

                responses_path = os.path.join(root, 'responses')
                # Only look at files directly in the responses folder
                response_files = sorted(
                    name for name in os.listdir(responses_path)
                    if name.endswith('.json') and name != 'config.json'
                )
                for name in response_files:
                    file_path = os.path.join(responses_path, name)
                    # Explicit encoding: do not depend on the platform default.
                    with open(file_path, 'r', encoding='utf-8') as fh:
                        data = json.load(fh)

                    # Responses without an item list are skipped instead of
                    # aborting the whole load with a KeyError.
                    items = data.get('itemList') if isinstance(data, dict) else None
                    for video in items or []:
                        frames.append(parse_info(video, user_label))

    if not frames:
        raise ValueError(
            f"No videos found for scenarios {list(scenarios)!r} under {base_path!r}"
        )

    data_combined = pd.concat(frames, ignore_index=True)
    data_combined['run_number'] = data_combined.groupby('user_id').cumcount() + 1

    return data_combined

In [6]:
# Load all responses of the scenarios defined above into one DataFrame.
# The bare name on the last line makes the notebook display the frame.
data_combined = load_data_into_df(scenarios)
data_combined

Unnamed: 0,user_id,play_count,digg_count,share_count,comment_count,collect_count,repost_count,loudness,duration,author_id,author_name,author_bio,nickname,music_name,music_id,music_author,music_album,hashtags,video_id,run_number
0,18-control,47400000,3000000,25600,31900,75200,0,-20.6,268,6980785150019781637,SB Mowing,Please keep up with me on Snapchat or YouTube ...,sbmowing,original sound - SB Mowing,7463305390462094126,SB Mowing,Unknown Album,,7463304946616651050,1
1,18-control,6600000,1400000,78100,1928,146500,0,-14,71,7329520300319540257,Scare Prank USA,ueafollow me uea,scareprankusa54,original sound,7458665057581288214,Scare Prank USA,Unknown Album,scarecam_scareprank_funnyprank_scaring_jumpy_v...,7458665050828492054,2
2,18-control,4900000,324700,967,260,20200,0,-23.9,119,7425531431735985198,user7291424162817,,user7291424162817,original sound,7451878052642016031,user7291424162817,Unknown Album,movie_tiktokmovies_movieclips_clip_movieshowsc...,7451878163384110366,3
3,18-control,584300,25700,155,201,2923,0,-8.7,131,7319790687351882795,Jay rose,,jayrose044,AeuEA,7433574331416529706,Jay rose,Unknown Album,satisfyin_fyp_storytime_foryou_viral_tiktok_tr...,7433574400735104302,4
4,18-control,2300000,258600,4980,1021,30800,0,-11.9,83,6982084681585394693,UniversalCore,(ua Mental health matters ua) \nTrus...,universalcore____,Closer,7309864547584968706,Nuages,Closer,corecore_sadcore_hopecore_coretok_core_mentalh...,7443535065801379115,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18877,42-experiment,1900000,377000,9905,2172,54100,0,-21.8,132,6676293103072658437,Brady LXIX,IG/FB:BRADYLXIX \nother account:bradylxix2\nME...,bradylxix,original sound,7461417766466243370,Brady LXIX,Unknown Album,bradylxix,7461417768965950763,1066
18878,42-experiment,1200000,80300,2216,673,33100,0,-13.8,35,6793632786105713666,CAPCUT TEMPLATE TRENDS,uiMUSIC PROMOTION DMui\n\nCREDITS TO THE OWNER...,capcut_templatetrends,AURA,7387573619378046993,Ogryzek,AURA,capcut_capcutpioneer_capcut_edit_fyp_trend_for...,7447105323816602898,1067
18879,42-experiment,3800000,493000,13500,3063,104800,0,-6.7,62,7250349443498066990,vidz,Trying to influence a bigger audience uoOe\nI ...,vidztoker,original sound,7451284899840953130,vidz,Unknown Album,foryou_fyp_edit,7451284883516771630,1068
18880,42-experiment,1500000,94100,10100,14000,16400,0,-9.3,61,6936022572887360517,hydgjgf,,hydgjgf,original sound,7445855150561905414,hydgjgf,Unknown Album,fyp_fypC_story_satisfyingvideo_comedyvideo_mar...,7445855110065818885,1069


`analyze_nicknames_for_scenario` prepares one scenario for comparison by:

1. Aligning experiment/control group video counts for **direct comparison**
2. Tracking target nickname appearances across:
   - Chronological video sequences (using `run_number`)
   - Both experimental and control groups
3. Generating **time-series data** for:
   - Per-video nickname occurrences
   - Maximum comparable observation window

In [7]:
def analyze_nicknames_for_scenario(data_combined, target_nicknames, scenario):
    """Build aligned experiment/control timelines for one scenario.

    Args:
        data_combined: DataFrame with at least the columns ``user_id``,
            ``run_number`` and ``nickname``. It is not modified.
        target_nicknames: Iterable of nicknames counted as targets.
        scenario: Scenario id (int or str). Rows are selected where
            ``user_id`` equals ``'<scenario>-experiment'`` or
            ``'<scenario>-control'`` exactly.

    Returns:
        tuple: ``(experiment_data, control_data, max_videos)``. Both frames are
        sorted by ``run_number``, limited to ``run_number <= max_videos`` and
        carry a new 0/1 column ``target_nickname_count``. ``max_videos`` is the
        smaller of the two groups' highest ``run_number``, or ``0`` when either
        group has no rows (both frames are then empty).
    """
    def _select_group(group):
        # Exact match: a substring/regex match would let scenario '8' also
        # pick up the rows of '18-...' or '38-...'.
        mask = data_combined['user_id'] == f'{scenario}-{group}'
        return data_combined.loc[mask].sort_values('run_number')

    experiment_data = _select_group('experiment')
    control_data = _select_group('control')

    # Find the minimum number of videos between control and experimental groups.
    # An empty group has a NaN max, which makes min() order-dependent, so that
    # case is handled explicitly.
    if experiment_data.empty or control_data.empty:
        max_videos = 0
    else:
        max_videos = min(experiment_data['run_number'].max(),
                         control_data['run_number'].max())

    targets = list(target_nicknames)

    def _trim_and_flag(group_data):
        # Limit to the comparable window, then flag target nicknames. assign()
        # returns a new frame, so no column is written onto a slice.
        trimmed = group_data[group_data['run_number'] <= max_videos]
        flags = trimmed['nickname'].isin(targets).astype(int)
        return trimmed.assign(target_nickname_count=flags)

    return _trim_and_flag(experiment_data), _trim_and_flag(control_data), max_videos

`plot_nickname_trends` plots and summarises the prepared scenarios by:

1. Creating **comparisons** between control/experiment groups using:
   - 250-video buckets
2. Calculating **three key metrics**:
   - Overall group averages
   - First/last bucket comparisons
   - Per-scenario temporal patterns


In [8]:
def _bucket_means(group_data, bucket_size, min_fill=0.75):
    """Average ``target_nickname_count`` over consecutive buckets of rows.

    Returns a list of dicts with the 1-based bucket index (``bucket``), the
    mean target count (``mean``) and the mean ``run_number`` (``run``). A
    bucket holding fewer than ``min_fill * bucket_size`` rows is skipped.
    """
    buckets = []
    for start in range(0, len(group_data), bucket_size):
        bucket = group_data.iloc[start:start + bucket_size]
        if len(bucket) < bucket_size * min_fill:
            continue
        buckets.append({
            'bucket': start // bucket_size + 1,
            'mean': bucket['target_nickname_count'].mean(),
            'run': bucket['run_number'].mean(),
        })
    return buckets


def _print_average(label, values):
    """Print ``'<label>: <mean>'`` with two decimals; print nothing if empty."""
    if len(values) > 0:
        print(f"{label}: {sum(values)/len(values):.2f}")


def plot_nickname_trends(scenarios_data, target_nicknames, bucket_size=250):
    """Plot bucketed target-nickname rates and print summary averages.

    Args:
        scenarios_data: Mapping ``scenario -> (exp_data, ctrl_data, max_videos)``.
            Both frames need the columns ``target_nickname_count`` and
            ``run_number``; the third tuple element is ignored.
        target_nicknames: Unused; kept so existing calls keep working.
        bucket_size: Number of consecutive rows averaged per bucket.

    Returns:
        None. Shows a plotly figure with one experimental (blue) and one
        control (red) line per scenario, and prints the per-bucket averages
        followed by the overall, first-bucket and last-bucket averages.

    Raises:
        ValueError: If ``bucket_size`` is not positive.
    """
    if bucket_size <= 0:
        raise ValueError("bucket_size must be a positive integer")

    group_styles = (('Experimental', 'blue'), ('Control', 'red'))
    first_bucket = {group: [] for group, _ in group_styles}
    last_bucket = {group: [] for group, _ in group_styles}
    all_counts = {group: [] for group, _ in group_styles}

    # Plot bucket averages across all scenarios
    fig2 = go.Figure()

    for scenario, (exp_data, ctrl_data, _) in scenarios_data.items():
        buckets_by_group = {}
        for (group, _color), group_data in zip(group_styles, (exp_data, ctrl_data)):
            buckets = _bucket_means(group_data, bucket_size)
            buckets_by_group[group] = buckets
            all_counts[group].extend(group_data['target_nickname_count'])

            for bucket in buckets:
                print(f"\n{group} Group Scenario {scenario} Bucket {bucket['bucket']}:")
                print(f"Average: {bucket['mean']:.2f}")

            if buckets:
                # Only the trailing bucket can be under-filled, so the first
                # retained bucket is always bucket 1. The last retained bucket
                # is taken directly: the previous index test
                # (i + bucket_size >= len - bucket_size) also flagged the
                # second-to-last bucket and skewed the "last bucket" averages.
                first_bucket[group].append(buckets[0]['mean'])
                last_bucket[group].append(buckets[-1]['mean'])

        # Plot a scenario only when both groups produced at least one bucket
        if all(buckets_by_group[group] for group, _ in group_styles):
            for group, color in group_styles:
                buckets_df = pd.DataFrame(buckets_by_group[group])
                fig2.add_trace(go.Scatter(
                    x=buckets_df['run'],
                    y=buckets_df['mean'],
                    mode='lines+markers',
                    name=f'{group} {scenario}',
                    line=dict(color=color, width=2)
                ))

    fig2.update_layout(
        title=f'Average Target Nicknames per {bucket_size}-Video Bucket',
        xaxis_title='Average Run Number in Bucket',
        yaxis_title='Average Number of Target Nicknames',
        showlegend=True,
        template='plotly_white'
    )

    fig2.show()

    # Calculate and print the required averages
    print("\nOverall Averages Across All Scenarios:")
    _print_average("Overall experimental group average", all_counts['Experimental'])
    _print_average("Overall control group average", all_counts['Control'])

    print("\nFirst Bucket Averages Across All Scenarios:")
    _print_average("Experimental group first bucket average", first_bucket['Experimental'])
    _print_average("Control group first bucket average", first_bucket['Control'])

    print("\nLast Bucket Averages Across All Scenarios:")
    _print_average("Experimental group last bucket average", last_bucket['Experimental'])
    _print_average("Control group last bucket average", last_bucket['Control'])

In [9]:
# Analyze data for each scenario
# NOTE(review): this rebinds `scenarios`, which an earlier cell defined as a list
# of int ids for loading; here it holds string ids for the scenarios to plot.
scenarios = ['23','24']
scenarios_data = {}  # scenario id -> (experiment_data, control_data, max_videos)

for scenario in scenarios:
    exp_data, ctrl_data, max_videos = analyze_nicknames_for_scenario(data_combined, target_nicknames, scenario)
    scenarios_data[scenario] = (exp_data, ctrl_data, max_videos)

# Plot trends and print statistics
plot_nickname_trends(scenarios_data, target_nicknames)


Experimental Group Scenario 23 Bucket 1:
Average: 0.10

Experimental Group Scenario 23 Bucket 2:
Average: 0.03

Experimental Group Scenario 23 Bucket 3:
Average: 0.03

Experimental Group Scenario 23 Bucket 4:
Average: 0.05

Control Group Scenario 23 Bucket 1:
Average: 0.05

Control Group Scenario 23 Bucket 2:
Average: 0.00

Control Group Scenario 23 Bucket 3:
Average: 0.03

Control Group Scenario 23 Bucket 4:
Average: 0.01

Experimental Group Scenario 24 Bucket 1:
Average: 0.04

Experimental Group Scenario 24 Bucket 2:
Average: 0.02

Experimental Group Scenario 24 Bucket 3:
Average: 0.01

Experimental Group Scenario 24 Bucket 4:
Average: 0.04

Control Group Scenario 24 Bucket 1:
Average: 0.05

Control Group Scenario 24 Bucket 2:
Average: 0.02

Control Group Scenario 24 Bucket 3:
Average: 0.02

Control Group Scenario 24 Bucket 4:
Average: 0.03



Overall Averages Across All Scenarios:
Overall experimental group average: 0.04
Overall control group average: 0.03

First Bucket Averages Across All Scenarios:
Experimental group first bucket average: 0.07
Control group first bucket average: 0.05

Last Bucket Averages Across All Scenarios:
Experimental group last bucket average: 0.05
Control group last bucket average: 0.02
