In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load CSVs as dataframes

ht_df = pd.read_csv('./csv/sponsored/samples/hashtag_sponsored_2021-06-08 16_58_04.csv')
user_vids_df = pd.read_csv('./csv/sponsored/samples/users_videos_sponsored.csv')
user_dets_df = pd.read_csv('./csv/sponsored/samples/get_user_details_sponsored_2021-06-08 22_46_32.csv')

In [3]:
# Convert epoch times to datetime

user_vids_df['creation_time'] = pd.to_datetime(user_vids_df['creation_time'],unit='s')
user_dets_df['account_created'] = pd.to_datetime(user_dets_df['account_created'],unit='s')

In [4]:
# Calculate engagement metric for each video

user_vids_df['video_engagement'] = ((user_vids_df.diggs + 
                                     user_vids_df.comments + 
                                     user_vids_df.shares) / 
                                    user_vids_df.plays)

In [5]:
# Dropping and renaming columns

user_dets_df.drop(columns=['heart_count'], inplace=True)

In [6]:
user_vids_df['sponsored-ad_ht'] = np.where(user_vids_df['description'].str.contains('#Sponsored|#sponsored|#ad |#Ad ', na=False), 1, 0)

In [7]:
user_vids_df['sponsored-ad_ht'].value_counts(normalize=True)

0    0.98104
1    0.01896
Name: sponsored-ad_ht, dtype: float64

In [8]:
recent_engagement = (user_vids_df.groupby('username').apply(lambda x: x.sort_values('creation_time', ascending=False).head(20)['diggs'].sum()) +
user_vids_df.groupby('username').apply(lambda x: x.sort_values('creation_time', ascending=False).head(20)['comments'].sum()) +
user_vids_df.groupby('username').apply(lambda x: x.sort_values('creation_time', ascending=False).head(20)['shares'].sum())) / user_vids_df.groupby('username').apply(lambda x: x.sort_values('creation_time', ascending=False).head(20)['plays'].sum())

recent_engagement = pd.DataFrame(recent_engagement, columns=['recent_engagement'])

user_dets_df = pd.merge(user_dets_df, recent_engagement, left_on='user_name', right_on='username')

In [9]:
user_vids_df.groupby('username')['sponsored-ad_ht'].sum()

username
208skindoc          16
accelbyzantine       1
adampukeonhaters     1
adamselihi           2
alexcortex           1
                    ..
yoeslan              2
yoga.classes         1
yun_bao              2
zacharyyryann        5
zhangarang           6
Name: sponsored-ad_ht, Length: 193, dtype: int64

In [10]:
sponsored_ad_engagement = user_vids_df.groupby(['username','sponsored-ad_ht']).agg({'diggs': 'sum', 
                                                                                    'shares': 'sum', 
                                                                                    'comments': 'sum', 
                                                                                    'plays': 'sum'})

sponsored_ad_engagement

Unnamed: 0_level_0,Unnamed: 1_level_0,diggs,shares,comments,plays
username,sponsored-ad_ht,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
208skindoc,0,27201284,331045,245127,272625546
208skindoc,1,366481,13945,3057,3536900
accelbyzantine,0,336666,1966,4627,1639764
accelbyzantine,1,91800,138,1539,840100
adampukeonhaters,0,40993684,130234,299052,242621379
...,...,...,...,...,...
yun_bao,1,133200,1779,1006,1152900
zacharyyryann,0,81498789,485906,137188,365170210
zacharyyryann,1,1601973,24158,5901,7447776
zhangarang,0,12029943,308184,61770,64832462


In [11]:
overall_ad_engagement = []

for author in user_dets_df['user_name']:
    ad_engagement = (sponsored_ad_engagement.xs((author, 1))['diggs'] + sponsored_ad_engagement.xs((author, 1))['shares'] +
    sponsored_ad_engagement.xs((author, 1))['comments']) / sponsored_ad_engagement.xs((author, 1))['plays']
    overall_ad_engagement.append([author, ad_engagement])

ad_engagement = pd.DataFrame(overall_ad_engagement, columns=['username', 'ad_engagement'])

user_dets_df = pd.merge(user_dets_df, ad_engagement, left_on='user_name', right_on='username')
user_dets_df.drop(columns=['username'], inplace=True)

In [12]:
overall_nonad_engagement = []

for author in user_dets_df['user_name']:
    try:
        nonad_engagement = (sponsored_ad_engagement.xs((author, 0))['diggs'] + sponsored_ad_engagement.xs((author, 0))['shares'] + sponsored_ad_engagement.xs((author, 0))['comments']) / sponsored_ad_engagement.xs((author, 0))['plays']
        overall_nonad_engagement.append([author, nonad_engagement])
    except:
        overall_nonad_engagement.append([author, 0])

non_ad_engagement = pd.DataFrame(overall_nonad_engagement, columns=['username', 'non_ad_engagement'])
user_dets_df = pd.merge(user_dets_df, non_ad_engagement, left_on='user_name', right_on='username')
user_dets_df.drop(columns=['username'], inplace=True)

In [13]:
sponsored_vid_pct = user_vids_df.groupby('username')['sponsored-ad_ht'].sum() / user_vids_df.groupby('username')['sponsored-ad_ht'].count()

sponsored_vid_pct = pd.DataFrame(sponsored_vid_pct)

user_dets_df = pd.merge(user_dets_df, sponsored_vid_pct, left_on='user_name', right_on='username')

user_dets_df.rename(columns={"sponsored-ad_ht": "sponsored_vids_pct"}, inplace=True)

In [14]:
user_dets_df.head()

Unnamed: 0,user_name,user_id,nickname,account_created,verified,bio_link,followers,following,heart,videos,diggs,recent_engagement,ad_engagement,non_ad_engagement,sponsored_vids_pct
0,soularty,6652451661401948166,Art for the Soul,2019-02-02 03:46:06,False,-,70700,149,1400000,162,0,0.046565,0.160401,0.070733,0.195946
1,ricardo.p1nto,6566533986121826310,Ricardo Pinto,2018-06-13 04:02:23,False,-,27400,633,187000,209,0,0.130764,0.122916,0.17547,0.65534
2,dripnas,29014327,Naseer Johnson,2016-01-12 12:06:10,False,-,244600,743,4800000,146,0,0.18464,0.139831,0.207193,0.068493
3,miyaevarenae,6623756337799806982,Miyaeva Renae ♡,2018-11-23 02:21:27,False,https://beacons.page/miyaevarenae,4800000,164,142000000,475,0,0.178419,0.141785,0.191881,0.033755
4,linhbarbie,67563924000,✨ Linh Barbie ✨,2017-08-28 21:27:24,True,https://www.youtube.com/channel/UCDZ3_SK5RJgMm...,16200000,224,262400000,1070,0,0.149684,0.080241,0.121661,0.00188


In [15]:
user_dets_trimmed_df = user_dets_df[['user_name', 'recent_engagement', 'ad_engagement', 'non_ad_engagement', 'sponsored_vids_pct']].copy()

In [16]:
user_vids_df.head()

Unnamed: 0,video_id,description,creation_time,duration,author_id,username,nickname,music_id,song_title,music_author_name,diggs,shares,comments,plays,video_engagement,sponsored-ad_ht
0,6971501331164990725,WE SING A LOT HERE. BE MY BOYFRIEND FOR THE WE...,2021-06-08 19:05:45,22,6747459471943418885,chris,Chris Olsen,6.971501e+18,original sound,Chris Olsen,7274,4,32,52400,0.139504,0
1,6971222650995625221,HE ALWAYS TURNS IT AROUND @ THE END 😭 @ianpaget_,2021-06-08 01:04:21,41,6747459471943418885,chris,Chris Olsen,6.971223e+18,original sound,Chris Olsen,221900,1726,890,766700,0.292834,0
2,6971214894595099910,HE GOT SO MAD AT RHE END 💀 @ianpaget_ #couple,2021-06-08 00:34:15,36,6747459471943418885,chris,Chris Olsen,6.971215e+18,original sound,Chris Olsen,254000,602,387,1300000,0.196145,0
3,6970845224281066757,WHAT’D SHE SAYY AT THE ENDDD?? We love Lola ❤️🇵🇭,2021-06-07 00:39:44,33,6747459471943418885,chris,Chris Olsen,6.746993e+18,Monkeys Spinning Monkeys,Kevin MacLeod,328200,565,855,1400000,0.235443,0
4,6970796148659391749,#stitch with @zaytashon88 Lola’s rockin w it 😤🇵🇭,2021-06-06 21:29:17,14,6747459471943418885,chris,Chris Olsen,6.970796e+18,original sound,Chris Olsen,1900000,7270,6067,8200000,0.233334,0
