In [50]:
import pandas as pd
import os
import numpy as np
from tqdm.auto import tqdm
from collections import Counter
import pickle
import re
def extract_v_id(string):
    m = re.findall(r'\d{19}', string)
    return int(m[0])

In [5]:
ddp_path = "data/DDPs_Sept_2023/"
ddp_files = os.listdir(ddp_path)

In [6]:
# rename action var
actions_dict = {'You posted a video':"posting", 
                'You blocked someone':"blocking",
                'You watched a video':"watching", 
                'You liked a video':"liking", 
                'You logged in':"logging_in",
                'You searched for something':"searching", 
                'You followed someone':"following",
               'You favorited a video':"favorites",
                'You shared a video':"sharing",
               'You sent/received a private message':"PM", 
                'You watched a live stream':"watching_live",
               'You posted a comment':"commenting",
               }

# Filter for sessions and keep only the ones without duplciates

In [8]:
usability_info = []
no_watching = []
usable_total = 0
unusable_total = 0

for ddp_f in tqdm(ddp_files):
#for ddp_f in tqdm(["ID_8613.csv"]):
    path = ddp_path+ddp_f
    ddp_temp = pd.read_csv(path)
    
    # format date
    ddp_temp["Date"] = [pd.to_datetime(i.replace("UTC","")) for i in ddp_temp["Date"]]
    ddp_temp["Date"] = pd.to_datetime(ddp_temp["Date"])
    
    # format Action
    ddp_temp["Action"] = ddp_temp["Action"].map(actions_dict)
    
    # format video_id
    v_ids = []
    for i in ddp_temp["Url"]:
        if pd.notna(i):
            try:
                v_ids.append(extract_v_id(i))
            except:
                v_ids.append("live stream") # live stream links
        else:
            v_ids.append("no video")
    ddp_temp["v_ids"] = v_ids

    # subset from first watch entry in ddp
    first_video = ddp_temp[ddp_temp["Action"]=="watching"]["Date"].min()
    ddp_temp = ddp_temp[ddp_temp["Date"]>=first_video]
    
    # get delta between time stamps and delete first 
    ddp_temp = ddp_temp.reset_index(drop=True)
    activity_time = ddp_temp["Date"].diff(periods=1)[1:].dt.total_seconds()
    # drop the final activity per ddp to align with deltas
    ddp_temp = ddp_temp.drop(ddp_temp.tail(1).index)
    # add deltas as activity time (moved by one row up)
    ddp_temp["activity_time"] = list(activity_time)
    
    ### split in sessions
    # get indices for each session
    split_idxs = list(activity_time[activity_time>=105].index)
    split_idys = list(split_idxs)[1:]
    split_idys.extend([-1])
    
    
    usable_count = 0
    unusable_count = 0
    no_w_count = 0
    for x,y in zip(split_idxs, split_idys):
        
        session = ddp_temp.loc[x:y-1]   
        
        # erase any session without watching! not usefull
        
        # drop "logging_in" & "following"
        session = session[session["Action"]!="logging_in"]
        session = session[session["Action"]!="following"]
        
        # remove non watching activities at the beginning of a session
        remove_until = 0
        for action in session.Action:
            if action != "watching":
                remove_until += 1
            else:
                break
        session = session.reset_index(drop=True)
        session = session.iloc[remove_until:]
        session = session.reset_index(drop=True)


        if "watching" in list(session.Action):
            
            # filter for not usable sessions
            if len(session) >= 2: # if 
                duplicates = session.Date.duplicated(keep="first")
                duplicates_counter = Counter(duplicates)

                if duplicates_counter[True]/len(session) != 0:
                    unusable_count += 1

                else: 
                    usable_count += 1
                    # save successfull sessions
                    f_name = f"{ddp_f[:-4]+'_'+str(usable_count)}.csv"
                    session.to_csv(f"data/session_dataframes/{f_name}", index=False)
        else:
            no_w_count += 1
            
            
    frac_unusable = unusable_count/(usable_count+unusable_count)
    usability_info.append(frac_unusable)
    no_watching.append(no_w_count)
    
    unusable_total += unusable_count
    usable_total += usable_count
    print(ddp_f, len(ddp_temp), usable_count, unusable_count, frac_unusable, no_w_count)

  0%|          | 0/18 [00:00<?, ?it/s]

ID_6592.csv 94387 1401 1642 0.5395990798554059 212
ID_8613.csv 2519 65 111 0.6306818181818182 34
ID_4256.csv 2536 31 65 0.6770833333333334 26
ID_2354.csv 57081 660 807 0.5501022494887525 60
ID_3000.csv 2992 34 68 0.6666666666666666 2
ID_6173.csv 1742 9 11 0.55 5
ID_7005.csv 472 2 14 0.875 11
ID_6871.csv 18108 337 484 0.5895249695493301 912
ID_9228.csv 23701 25 212 0.8945147679324894 596
ID_8802.csv 118025 2534 1079 0.29864378632715194 326
ID_6266.csv 193985 640 4057 0.8637428145624867 1657
ID_5209.csv 88428 3080 1201 0.28054192945573464 576
ID_4855.csv 1211 17 19 0.5277777777777778 5
ID_8720.csv 15999 194 374 0.6584507042253521 70
ID_5319.csv 8785 370 199 0.34973637961335674 31
ID_6994.csv 37612 1046 726 0.40970654627539504 109
ID_1435.csv 1557 59 10 0.14492753623188406 6
ID_1065.csv 50109 3613 1671 0.31623769871309615 359


In [13]:
print(f"Sessions without duplicates in total (usable sessions):{usable_total}")
print(f"Sessions with duplicates in total (unusable sessions):{unusable_total}")
print(f"Fraction of unusable sessions: {(unusable_total/(unusable_total+usable_total))*100}")

Sessions without duplicates in total (usable sessions):14117
Sessions with duplicates in total (unusable sessions):12750
Fraction of unusable sessions: 47.45598689842558


# Extract video IDS of latest 100 sessions for each user for meta data collection

In [18]:
session_files = os.listdir("data/session_dataframes/")
session_files = [i for i in session_files if "._" not in i]

ids = [int(i[3:7]) for i in session_files]
u_ids = list(set(ids))

In [25]:
session_sample = []
v_ids_sample = []

for u_id in tqdm(u_ids):    
    for i in range(1,101):
        
        try:
            file_name = f"ID_{u_id}_{i}"
            temp_df = pd.read_csv(f"data/session_dataframes/{file_name}.csv")
            session_sample.append(file_name)
            
            temp_df = temp_df[temp_df["Action"]=="watching"]
            v_ids = [extract_v_id(i) for i in temp_df.Url if pd.notna(i)] 
            v_ids_sample.extend(v_ids)
            
            
        except FileNotFoundError:
            pass

  0%|          | 0/18 [00:00<?, ?it/s]

In [27]:
print(f"Total number of sessions in sample: {len(set(session_sample))}")
print(f"Total number of videos in sample: {len(set(v_ids_sample))}")

Total number of sessions in sample: 1242
Total number of videos in sample: 13342


In [28]:
with open("data/session_sample_vIDs_100.pkl", "wb") as f:
    pickle.dump(session_sample,f)
with open("data/video_ids_sample_100.pkl", "wb") as f:
    pickle.dump(list(set(v_ids_sample)),f)

# Generate csv. for analysis, add video length from metadata

In [53]:
active_endorsement = ["liking", "favorites","sharing","commenting"]
other_activities = ["searching","PM","watching_live"]

In [67]:
# prepare functions

def generate_active_endorsment_dict(df,active_endorsements):
    endorsement_dict = {}

    df = df.reset_index(drop=True)
    
    
    active_endorsement_col = []
    for i in df.index:
    
        user = df.loc[i,"ID"]
        session = df.loc[i,"session"]
        activity = df.loc[i,"Action"]
        engagement_count = 0
        if activity in active_endorsement:
            for j in range(len(active_endorsement)): #up to x active endorsements are possible
                idx = j+1
                prev_activity = df.loc[i-idx,"Action"]
                if prev_activity == "watching":
                    video_id = df.loc[i-idx,"v_ids"]
                    time_stamp = df.loc[i-idx,"Date"]
                    identity = str(video_id)+"_"+str(time_stamp)
                    engagement_count += 1
                    break  #, if previous activity was watching no more endorsement related to it, can be

                elif prev_activity in other_activities:
                    break
                    # if previous activity was in "other activities" we can not relate the endorsement reliably anymore
                else: 
                    pass # in any other case (prev was itself endorsement), we assume stacked endorsement
        
        
        active_endorsement_col.append(engagement_count)
        
    return active_endorsement_col

def return_video_length(v_id):
    try:
        path = f"data/meta_data_small/{str(v_id)}.pkl"
        with open(path, "rb") as f:
            meta_temp = pickle.load(f)
        video_duration = meta_temp["video_duration"]
    except:
        video_duration = None
        
    return video_duration

def delta(df):

    df = df.reset_index(drop=True)
    
    watch_time = df["Date"].diff(periods=1)[1:].dt.total_seconds()
    
    augmented_session = df.drop(df.tail(1).index)
    augmented_session["watch_time"] = list(watch_time)
    
    return augmented_session

In [68]:
with open("data/session_sample_vIDs_100.pkl", "rb") as f:
    session_sample = pickle.load(f)

In [69]:
dfs = []

for i in tqdm(session_sample):
    temp_df = pd.read_csv(f"data/session_dataframes/{i}.csv")
    _,u_id,n_session = i.split("_")
    temp_df["ID"] = u_id
    temp_df["session"] = n_session
        
    # add active endorsment
    a_engagement_col = generate_active_endorsment_dict(temp_df, active_endorsement)
    a_engagement_col = a_engagement_col[1:]
    a_engagement_col.append(0)
    temp_df["active_engament"] = a_engagement_col
    
    # Add activity times of active endorsement to the respective video
    # delete rows of such activities, then
    temp_df = temp_df.reset_index(drop=True)
    
    for idx in range(len(temp_df)):
        activity = temp_df.loc[idx,"Action"]
        a_engagement = temp_df.loc[idx,"active_engament"]

        if a_engagement >0:
            wt_toadd = 0
            for i in range(a_engagement):
                if idx+i >= len(temp_df):
                    break
                else:

                    follow_activity = temp_df.loc[idx+(i+1),"Action"]
                    if follow_activity in active_endorsement:
                        follow_wt = temp_df.loc[idx+(i+1),"activity_time"]
                        wt_toadd += follow_wt
            
            temp_df.loc[idx,"activity_time"] += wt_toadd

    keep = ['watching']
    mask = temp_df['Action'].isin(keep)
    temp_df = temp_df[mask]
    
    
    # add passive endorsement
    durations = []
    for v_id in temp_df.v_ids:
        durations.append(return_video_length(v_id))
    temp_df["video_length"] = durations
    
    temp_df["wt_frac"] = temp_df["activity_time"]/temp_df["video_length"]
    
    # remove last video in session (better underestimate than over estimate)
    
    temp_df = temp_df.iloc[:-1]
    
    
    dfs.append(temp_df)

  0%|          | 0/1242 [00:00<?, ?it/s]

In [70]:
final_df = pd.concat(dfs)
final_df = final_df.replace(np.inf, np.nan) # inf are pictore galleries (no default length)
final_df = final_df.dropna(subset=["video_length"], axis=0)
final_df["wt_frac"] = final_df.wt_frac.astype(float)
final_df.head()

Unnamed: 0,Date,Action,Url,OperatingSystem,likes,v_ids,activity_time,ID,session,active_engament,video_length,wt_frac
0,2023-03-17 11:37:38,watching,https://www.tiktokv.com/share/video/6984429060...,,,6984429060914990342,26.0,6592,1,0,24.0,1.083333
2,2023-03-17 11:38:05,watching,https://www.tiktokv.com/share/video/7211168761...,,,7211168761695604014,20.0,6592,1,0,12.0,1.666667
3,2023-03-17 11:38:25,watching,https://www.tiktokv.com/share/video/7211204623...,,,7211204623296630022,6.0,6592,1,0,10.0,0.6
5,2023-03-17 11:38:34,watching,https://www.tiktokv.com/share/video/7194066849...,,,7194066849594756354,37.0,6592,1,0,26.0,1.423077
6,2023-03-17 11:39:11,watching,https://www.tiktokv.com/share/video/7208232985...,,,7208232985055841542,9.0,6592,1,0,15.0,0.6


In [73]:
# passive engagement at median watch time PER USER! different
final_df.wt_frac.describe()
# median at 74% of watch time

count    11253.000000
mean         0.961998
std          1.142212
min          0.000000
25%          0.181818
50%          0.818182
75%          1.214286
max         16.800000
Name: wt_frac, dtype: float64

In [74]:
median_wt_per_user = {}

for u_id in final_df.ID.unique():
    
    temp_df = final_df[final_df["ID"]==u_id]
    desc = temp_df.wt_frac.describe()
    median_wt_per_user[u_id] = desc.mean()


In [75]:
median_wt_per_user

{'6592': 187.83593233952533,
 '4256': 18.786942219821103,
 '8802': 242.40239207219622,
 '8613': 37.552077240620676,
 '5319': 84.17364958473631,
 '1065': 80.50147508370902,
 '4855': 12.34585705910883,
 '7005': 1.9813758689865584,
 '9228': 16.135553216298923,
 '8720': 46.34780046179741,
 '2354': 168.81254493075667,
 '6994': 127.7015668163112,
 '6871': 81.63068129702766,
 '3000': 44.870085187922285,
 '5209': 93.40742343324051,
 '6266': 57.57636616175423,
 '1435': 114.56333260487574,
 '6173': 20.49731915680694}

In [76]:
final_df["passive_endorsement"] = 0

In [77]:
final_df.reset_index(drop=True, inplace=True)

In [78]:
p_e_col = []
control = []
for i in range(len(final_df)):
    
    u_id = final_df.loc[i,"ID"]
    frac_ac_time = final_df.loc[i,"wt_frac"]
    if frac_ac_time*100 >= median_wt_per_user[u_id]:
        p_e_col.append(1)
    else:
        p_e_col.append(0)
    

In [79]:
final_df["passive_endorsement"] = p_e_col


In [80]:
len(final_df)
Counter(p_e_col)

Counter({0: 8016, 1: 3459})

# Add classification

In [81]:
import pickle

In [85]:
with open('classifier/SVM_text_long_classifier.pkl', 'rb') as f:
    clf = pickle.load(f)    

In [86]:
with open("data/video_ids_sample_100.pkl", "rb") as f:
    v_ids_sample = pickle.load(f)
len(v_ids_sample)

13342

In [87]:
class_dict = {}
no_data = []
for v_id in tqdm(v_ids_sample):
    try:
        with open(f"/Volumes/Intenso/TT_explore_DB/video_data/embeddings/description_512/{v_id}.pkl", "rb") as f:
             text = pickle.load(f)
        pred = clf.predict(text.reshape(1,-1))
        class_dict[v_id] = pred[0]
        
    except:
        no_data.append(v_id)
        class_dict[v_id] = 9 #no data
        

  0%|          | 0/13342 [00:00<?, ?it/s]

In [88]:
final_df.reset_index(drop=True, inplace=True)
final_df["v_ids"] = final_df["v_ids"].astype(int)

In [89]:
final_df["class"] = final_df["v_ids"].map(class_dict)

In [90]:
final_df

Unnamed: 0,Date,Action,Url,OperatingSystem,likes,v_ids,activity_time,ID,session,active_engament,video_length,wt_frac,passive_endorsement,class
0,2023-03-17 11:37:38,watching,https://www.tiktokv.com/share/video/6984429060...,,,6984429060914990342,26.0,6592,1,0,24.0,1.083333,0,0
1,2023-03-17 11:38:05,watching,https://www.tiktokv.com/share/video/7211168761...,,,7211168761695604014,20.0,6592,1,0,12.0,1.666667,0,0
2,2023-03-17 11:38:25,watching,https://www.tiktokv.com/share/video/7211204623...,,,7211204623296630022,6.0,6592,1,0,10.0,0.600000,0,0
3,2023-03-17 11:38:34,watching,https://www.tiktokv.com/share/video/7194066849...,,,7194066849594756354,37.0,6592,1,0,26.0,1.423077,0,1
4,2023-03-17 11:39:11,watching,https://www.tiktokv.com/share/video/7208232985...,,,7208232985055841542,9.0,6592,1,0,15.0,0.600000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11470,2023-07-27 11:04:01,watching,https://www.tiktokv.com/share/video/7253144510...,,,7253144510421536005,2.0,6173,7,0,17.0,0.117647,0,0
11471,2023-08-19 08:10:13,watching,https://www.tiktokv.com/share/video/7259306127...,,,7259306127991508250,8.0,6173,8,0,89.0,0.089888,0,0
11472,2023-09-03 12:51:59,watching,https://www.tiktokv.com/share/video/7267256594...,,,7267256594923130145,10.0,6173,9,0,127.0,0.078740,0,0
11473,2023-09-03 12:53:44,watching,https://www.tiktokv.com/share/video/7274322118...,,,7274322118974672170,1.0,6173,9,0,131.0,0.007634,0,1


In [91]:
final_df.to_csv("data/session_df.csv", index=False)

In [92]:
Counter(final_df["class"])

Counter({0: 10933, 1: 542})