In [5]:
import datetime

import pandas as pd
import plotly.graph_objects as go
import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.sql.types import *


## View-to-subscriber ratio

Ranks channels by the mean views-per-subscriber of their videos and plots the top 10.

In [6]:
# Reuse the Spark session that is already active, or start one with the
# builder's default settings if none exists.
spark: SparkSession = SparkSession.builder.getOrCreate()

In [7]:

# Explicit schema for video.csv (one row per video).
# NOTE(review): fields are applied to the CSV by position, so this order is
# assumed to match the file's column order — confirm against the data.
v_schema = (
    StructType()
    .add('video_id', StringType())
    .add('channel_id', StringType())
    .add('title', StringType())
    .add('video_type', StringType())
    .add('published_date', DateType())
    .add('published_time', TimestampType())
    .add('duration', StringType())
)

# Explicit schema for video_log.csv (per-video counter snapshots).
# NOTE(review): fields are applied to the CSV by position, so this order is
# assumed to match the file's column order — confirm against the data.
vl_schema = (
    StructType()
    .add('video_id', StringType())
    .add('view_count', IntegerType())
    .add('like_count', IntegerType())
    .add('comment_count', IntegerType())
    .add('created_date', DateType())
    .add('created_at', TimestampType())
)


# Explicit schema for channel.csv (one row per channel).
# NOTE(review): fields are applied to the CSV by position, so this order is
# assumed to match the file's column order — confirm against the data.
c_schema = (
    StructType()
    .add('channel_id', StringType())
    .add('name', StringType())
    .add('customurl', StringType())
    .add('published_date', DateType())
    .add('thumbnail_url', StringType())
    # Free-text field: it was declared as TimestampType, which cannot hold
    # arbitrary text, so it is read as a plain string instead.
    .add('description', StringType())
    .add('country', StringType())
    .add('keywords', StringType())
    .add('topic', StringType())
    .add('created_at', TimestampType())
    .add('active', StringType())
)

# Explicit schema for channel_log.csv (per-channel counter snapshots).
# NOTE(review): fields are applied to the CSV by position, so this order is
# assumed to match the file's column order — confirm against the data
# (created_at precedes created_date here, the reverse of vl_schema).
cl_schema = (
    StructType()
    .add('channel_id', StringType())
    .add('view_count', IntegerType())
    .add('sub_count', IntegerType())
    .add('video_count', IntegerType())
    .add('created_at', TimestampType())
    .add('created_date', DateType())
)

In [12]:
# Parse each sample CSV with Spark using its explicit schema, then collect the
# result into a pandas DataFrame. The following cells use the pandas API
# (pd.merge, drop_duplicates, boolean-mask assignment), and pd.merge rejects
# Spark DataFrames with a TypeError.
# NOTE(review): no `header` option is passed and Spark's CSV reader defaults to
# header=False, so the first line of every file is read as data — confirm the
# CSVs have no header row (otherwise add header=True).
c = spark.read.csv('../sample_data/channel.csv', schema=c_schema).toPandas()
cl = spark.read.csv('../sample_data/channel_log.csv', schema=cl_schema).toPandas()
v = spark.read.csv('../sample_data/video.csv', schema=v_schema).toPandas()
vl = spark.read.csv('../sample_data/video_log.csv', schema=vl_schema).toPandas()

In [16]:
# This section is for pulling data from sample_data: it builds one wide frame
# of video-log rows enriched with video metadata, channel name and subscriber
# count.

# Keep only rows whose video_type is 'video'.
v = v[v['video_type']=='video']

# Right join: every video-log row is kept, even when its video is missing from v
# (those rows get NaN metadata).
video = pd.merge(v[['video_id','channel_id','published_date','video_type']],vl,how='right',on='video_id')
video = pd.merge(video,c[['channel_id','name']],how='left',on='channel_id')

# Attach only the last channel-log row per channel. Merging the whole log would
# repeat every video-log row once per log row of its channel; the later
# drop_duplicates(keep='last') on video_id selects this same last row anyway.
# NOTE(review): "last" means last in file order — assumed chronological.
video = pd.merge(
    video,
    cl[['channel_id','sub_count']].drop_duplicates(subset='channel_id',keep='last'),
    how='left',
    on='channel_id',
)

# Release the source frames; note this cell cannot be re-run without reloading them.
del cl,v,vl

In [17]:
# Reduce to one row per video and derive the views-per-subscriber ratio.
# Written as a single chain of non-mutating steps so no column is ever assigned
# on a filtered slice (which can raise SettingWithCopyWarning).
video = (
    video
    # Keep the last log row per video (last in row order — assumed chronological).
    .drop_duplicates(subset='video_id', keep='last')
    .assign(
        published_date=lambda df: pd.to_datetime(df['published_date']),
        created_date=lambda df: pd.to_datetime(df['created_date']),
    )
    # Only videos whose kept log row is more than 30 days after publication.
    .loc[lambda df: (df['created_date'] - df['published_date']) > pd.Timedelta(days=30)]
    # A subscriber count of 0 would produce an infinite ratio that dominates the
    # per-channel mean; map it to NaN so such rows are ignored by the mean.
    .assign(
        view_sub_ratio=lambda df: df['view_count'] / df['sub_count'].where(df['sub_count'] > 0)
    )
)

In [18]:
# Channel names removed from the analysis before ranking.
exclude_list = ['周杰倫 Jay Chou','凱文羊','福茂唱片']
is_excluded = video['name'].isin(exclude_list)
video = video.loc[~is_excluded]

In [19]:
# Mean view/subscriber ratio per channel, highest first, truncated to ten channels.
videos_only = video.loc[video['video_type'] == 'video']
mean_ratio_by_channel = videos_only.groupby('name')['view_sub_ratio'].mean()
video_ratio_top_10 = mean_ratio_by_channel.sort_values(ascending=False).head(10)
video_ratio_top_10.index

Index(['HOOK', '好棒Bump', 'Ku's dream酷的夢-', '人生肥宅x尊', 'cheap', '欸你這週要幹嘛',
       '這群人TGOP', '千千進食中', '胡子Huzi', '啾啾鞋'],
      dtype='object', name='name')

In [20]:
# Channel names in ranking order; they are the x categories of the scatter trace
# and the names of the individual box traces.
top_names = video_ratio_top_10.index

# One subscriber count per top channel: the first row found for each name,
# re-ordered to follow the ranking.
top_channel_rows = video[video['name'].isin(top_names)].drop_duplicates(subset='name')
subscriber_counts = top_channel_rows.set_index('name').reindex(top_names).sub_count.values

trace1 = go.Scatter(
    x=top_names.tolist(),
    y=subscriber_counts,
    name='Subscriber Count',
    yaxis='y1',
    mode='markers',
    marker_color="#ffe476",
    marker_size=8,
)

# One box per channel with the distribution of its per-video view_sub_ratio
# values, drawn against the second y axis.
is_video = video['video_type'] == 'video'
boxs = [
    go.Box(
        y=video[is_video & (video['name'] == channel)]['view_sub_ratio'],
        boxpoints=False,
        yaxis='y2',
        name=channel,
        marker_color="#7B4173",
        showlegend=False,
    )
    for channel in top_names.tolist()
]

# The real boxes are hidden from the legend, so this empty box in the same
# colour supplies a single legend entry for all of them.
dummy = go.Box(
    x=[None],
    y=[None],
    name="Video Views",
    marker_color="#7B4173",
)

data = [trace1, *boxs, dummy]

# NOTE(review): the boxes plot view_sub_ratio, while the y2 title and the legend
# entry read "Video views" — confirm the intended wording.
subscribers_axis = dict(title='Subscribers', overlaying='y2', side='right')
ratio_axis = dict(title='Video views')
layout = go.Layout(
    title='Video Performance (past 30 days)',
    yaxis=subscribers_axis,
    yaxis2=ratio_axis,
    template="plotly_dark",
)

fig = go.Figure(data=data, layout=layout)

# Fixed canvas size, with the legend pushed above and to the right of the plot area.
fig.update_layout(
    legend=dict(y=1.15, x=1.05),
    autosize=False,
    width=800,
    height=500,
)

fig.show()

In [21]:
# Export the figure as a static PNG.
# NOTE(review): assumes the images/ directory already exists relative to the
# working directory — confirm.
image_path = 'images/view_sub_ratio.png'
fig.write_image(image_path)