# DSCI 320: Project Milestone 2

### Read Tidied Data (from Global YouTube Statistics 2023)
The original dataset can be found at:
https://www.kaggle.com/datasets/nelgiriyewithana/global-youtube-statistics-2023 

In [1]:
import os

import altair as alt
import pandas as pd
from IPython.display import HTML, display
from vega_datasets import data

# Load the tidied subset of relevant data; the CSV's leftover 'Unnamed: 0'
# column is discarded straight after reading.
clean_df = (
    pd.read_csv('Data/clean_df.csv')
    .drop('Unnamed: 0', axis='columns')
)
clean_df.head()

Unnamed: 0,subscribers,youtuber,video_views,channel_category,channel_uploads,country,lowest_yearly_earnings,highest_yearly_earnings,created_year,tertiary_education_enrollment,population,unemployment_rate,channel_type,Latitude,Longitude,region
0,245000000,T-Series,228000000000.0,Music,20082,India,6800000.0,108400000.0,2006,28.1,1366418000.0,5.36,Music,20.593684,78.96288,Asia
1,170000000,YouTube Movies,0.0,Media,1,United States,0.04,0.58,2006,88.2,328239500.0,14.7,Games,37.09024,-95.712891,N.America
2,166000000,MrBeast,28368840000.0,Entertainment,741,United States,4000000.0,64700000.0,2012,88.2,328239500.0,14.7,Entertainment,37.09024,-95.712891,N.America
3,162000000,Cocomelon - Nursery Rhymes,164000000000.0,Education,966,United States,5900000.0,94800000.0,2006,88.2,328239500.0,14.7,Education,37.09024,-95.712891,N.America
4,159000000,SET India,148000000000.0,Media,116536,India,5500000.0,87500000.0,2006,28.1,1366418000.0,5.36,Entertainment,20.593684,78.96288,Asia


## Data Viz for Tasks

#### TASK 1: "What are the top earners among YouTube channel types on a yearly basis?"

In [2]:
# Task 1 heatmap: created year (x) by channel category (y). A dropdown chooses
# which numeric attribute is summed per cell and mapped to colour.
dropdown = alt.binding_select(
    options=['highest_yearly_earnings', 'subscribers', 'video_views'],
    name='Heatmap color attribute: ')

# Parameter bound to the dropdown; it starts on the earnings column.
xcol_param = alt.param(
    value='highest_yearly_earnings',
    bind=dropdown
)

chart1 = alt.Chart(clean_df).mark_rect().encode(
    alt.X('created_year:O', title = 'Created Year',axis=alt.Axis(tickCount=1)),
    alt.Y('channel_category', title = 'Channel Category'),
    color=alt.Color('sum(x):Q', title='Attribute Sum', scale=alt.Scale(scheme='orangered')),
).transform_calculate(
    # Index each row by the parameter so the derived field `x` holds the value
    # of whichever column is currently selected; the colour encoding can then
    # always refer to `sum(x)`.
    x=f'datum[{xcol_param.name}]'
).add_params(
    xcol_param
).properties(height=200, width = 250,
             title = 'YouTube Insight by Year')

# Inject a CSS rule that left-aligns elements with the `vega-bind` class
# (presumably the container of the dropdown widget -- confirm in the rendered page).
display(HTML("""
<style>
.vega-bind {
  text-align:left;
}
</style>
"""))

chart1

#### TASK 2: "Is there a trend between unemployment rates and highest yearly income for different countries?" 

In [3]:
# Wrangling specific to Task 2.
# Everything is derived from `clean_df` in a single chain, so `clean_df` itself
# is left unmodified and re-running this cell always yields the same frames.

# Country lookup table fetched from GitHub; its 'Country' column is renamed to
# match the `country` column of `clean_df`.
country_ids_url='https://raw.githubusercontent.com/joelostblom/teaching-datasets/main/country-ids-and-continents.csv'
country_ids = pd.read_csv(country_ids_url).rename(columns = {'Country':'country'})

# Regions kept for the Task 2 maps.
listregions = ['N.America','S.America']

combined_df = (
    clean_df
    .assign(
        # Left-closed bins (right=False): [0, 1M), [1M, 10M), [10M, 20M), [20M, inf).
        highest_yearly_earnings_group=lambda df: pd.cut(
            df['highest_yearly_earnings'],
            bins=[0, 1000000, 10000000, 20000000, float('inf')],
            labels=['0-1M', '1M-10M', '10M-20M', '20M+'],
            right=False,
        )
    )
    # No `on=` is given, so pandas joins on every column name the two frames
    # share (default inner join).
    # NOTE(review): presumably only `country` is shared -- confirm against the CSV.
    .merge(country_ids)
    # Keep only the rows whose region is one of `listregions`.
    .loc[lambda df: df['region'].isin(listregions)]
)

# Country outlines from the world_110m TopoJSON, used by the map charts.
countries = alt.topo_feature(data.world_110m.url, 'countries')

In [4]:
# Task 2 Viz
def _country_map(color, tooltip, width):
    """Return a geoshape map of `combined_df` using the given encodings.

    Each row is joined to a country outline by looking up its `ID` value
    against the `id` key of `countries`. The map uses the 'equalEarth'
    projection, the given `width`, and a fixed height of 200.
    """
    outlines = alt.LookupData(countries, key='id', fields=["type", "properties", "geometry"])
    return (
        alt.Chart(combined_df)
        .mark_geoshape()
        .transform_lookup(lookup='ID', from_=outlines)
        .encode(color, tooltip)
        .project('equalEarth')
        .properties(width=width, height=200)
    )


# Left map: coloured by unemployment rate.
chart2a = _country_map(
    alt.Color('unemployment_rate', title='Unemployment Rate (%)', scale=alt.Scale(scheme='lightmulti')),
    alt.Tooltip(['unemployment_rate', 'country:N']),
    width=250,
)

# Right map: coloured by the binned highest yearly earnings.
chart2b = _country_map(
    alt.Color('highest_yearly_earnings_group:O', title = "Highest Yearly Earnings", scale=alt.Scale(scheme='lightmulti')),
    alt.Tooltip(['country:N','highest_yearly_earnings']),
    width=180,
)

# Both maps side by side under one centred title.
chart2 = alt.hconcat(chart2a, chart2b).properties(
    spacing=0,
    title=alt.TitleParams(
        text='YouTube Earnings and Unemployment (N. & S. America)',
        anchor='middle',
    ),
)
chart2

### TASK 3 
#### "How does the number of subscribers and video views correlate with the channel's earnings, and what are the top 10 YouTube channels within each category based on subscribers?"

In [5]:
# Point selection keyed on channel category; it is shared by the scatter plot
# and the bar chart below.
selection = alt.selection_point(fields=["channel_category"])
tooltip = alt.Tooltip(['youtuber', 'subscribers', 'video_views', 'highest_yearly_earnings', 'channel_category'])

# Scatter: subscribers vs. earnings, sized by video views. Points outside the
# selected category are drawn in light gray.
scatter3 = alt.Chart(clean_df).mark_circle(opacity=0.7).encode(
    alt.X('subscribers:Q', title = 'Subscribers Count' ,axis=alt.Axis(format='~s')),
    alt.Y('average(highest_yearly_earnings):Q', title = 'Highest Average Yearly Earnings', axis=alt.Axis(format='~s')),
    alt.Size("video_views:Q",legend=alt.Legend(orient='bottom',format='.2s'), title = 'Video Views Count'),
    color=alt.condition(selection, "channel_category", alt.value('lightgray'), scale=alt.Scale(scheme='set2')),
    tooltip = tooltip
).add_params(selection).properties(title = 'Earnings by Subscribers', width=250,height=200).interactive()

# Bar chart: the ten highest-subscriber channels among the selected rows.
hist3 = alt.Chart(clean_df).mark_bar().encode(
    alt.Y("youtuber:N",title='YouTuber',
       # Shorten long names to 12 characters and add "..." only when something
       # was actually cut off; short names are shown unchanged.
       axis=alt.Axis(labelExpr='length(datum.value) > 12 ? substring(datum.value, 0, 12) + "..." : datum.value')).sort('-x'),
    alt.X("subscribers", title = 'Subscribers Count', axis=alt.Axis(format='~s')),
    color=alt.condition(selection, "channel_category", alt.value('lightgray'), scale=alt.Scale(scheme='set2'),
                        title='Channel Category'),
    tooltip = tooltip
).transform_filter(selection).transform_window(
    rank='rank(subscribers)',
    # Rank by subscriber count, largest first. Without an explicit sort the
    # ranking does not depend on `subscribers`, so the top-10 filter would only
    # be right if the data happened to be pre-sorted.
    # Channels tied on subscribers share a rank, so ties at the cut-off can
    # let slightly more than ten bars through.
    sort=[alt.SortField('subscribers', order='descending')],
).transform_filter(
    alt.datum.rank <= 10
).properties(title = 'Top 10 Channels by Subscribers', width=250, height=200)

chart3 = alt.hconcat(
    scatter3, hist3
)

chart3

#### TASK 4: “Are there distinct patterns/trends in YouTube channel creation within each category over time?”

In [6]:
# Centre-stacked area chart: number of channels created per year, split by
# channel category. `selection` is the point selection defined in the Task 3
# cell; areas outside it are faded to 20% opacity.
category_fade = alt.condition(selection, alt.value(1), alt.value(0.2))

chart4 = (
    alt.Chart(clean_df)
    .mark_area()
    .encode(
        x=alt.X('created_year:O', title='Created Year'),
        y=alt.Y('count(youtuber):Q', title='YouTube Channels Count').stack('center'),
        color=alt.Color(
            'channel_category:N',
            title='Channel Category',
            scale=alt.Scale(scheme='set2'),
        ),
        tooltip=alt.Tooltip(['count(youtuber)', 'channel_category']),
        opacity=category_fade,
    )
    .add_params(selection)
    .properties(
        width=250,
        height=200,
        title='YouTube Channels by Category Over Time',
    )
)

chart4

### DASHBOARD

In [7]:
# Dashboard variant of the Task 4 chart: only rows in the current selection are
# kept. It is bound to its own name so that `chart4` is not overwritten and
# re-running this cell does not stack another filter onto it.
chart4_dash = chart4.transform_filter(selection)

# NOTE(review): `dash3hist` and `dash3` are built here but not referenced by
# the dashboard below -- confirm whether `top` was meant to use them.
dash3hist = hist3.encode(color=alt.condition(selection, "channel_category", alt.value('lightgray'), legend=None, scale=alt.Scale(scheme='spectral'), title='Channel Category'),)

dash3 = alt.hconcat(
    scatter3, hist3
)

# Top row: Task 3 scatter and bar chart plus the filtered Task 4 chart, with
# size legends resolved independently.
top = alt.hconcat(scatter3, hist3, chart4_dash).resolve_legend(size='independent')
# Bottom row: Task 1 heatmap and Task 2 maps, each keeping its own colour legend.
bottom = alt.hconcat(chart1 , chart2).resolve_legend(color='independent')

# Stack the two rows under one centred dashboard title.
dashboard = alt.vconcat(top ,bottom).properties(
    title=alt.TitleParams(text='DASHBOARD: YouTube Data Analysis',
                          anchor='middle', fontSize=20))
dashboard