## 1. Introduction
### 1.1 Problem definition
### 1.2 Dataset

## 2. Environment
### 2.1 Packages

In [1]:
import os
import pandas as pd
import numpy as np

# own utils
from data_eng_utils import calc_df_size, schema_output, type_cast_df

In [2]:
os.listdir()

['.ipynb_checkpoints',
 'Aggregated_Metrics_By_Country_And_Subscriber_Status.csv',
 'Aggregated_Metrics_By_Video.csv',
 'All_Comments_Final.csv',
 'data_eng_utils.py',
 'kengee_youtube_data-Copy1.ipynb',
 'kengee_youtube_data.ipynb',
 'prelim_data_eng_template.ipynb',
 'Video_Performance_Over_Time.csv',
 '__pycache__']

### 2.2 Parameters

In [3]:
# file names
COUNTRY_SUB_METRICS = 'Aggregated_Metrics_By_Country_And_Subscriber_Status.csv'
VIDEO_METRICS = 'Aggregated_Metrics_By_Video.csv'
ALL_COMMENTS = 'All_Comments_Final.csv'
VIDEO_PERFORMANCE = 'Video_Performance_Over_Time.csv'

### 2.3 Data

In [4]:
# Reading data
country_metrics = pd.read_csv(COUNTRY_SUB_METRICS, encoding='utf-8')
video_metrics = pd.read_csv(VIDEO_METRICS, encoding='utf-8')
comments = pd.read_csv(ALL_COMMENTS, encoding='utf-8')
video_performance = pd.read_csv(VIDEO_PERFORMANCE, encoding='utf-8')

In [5]:
# File size prior to data engineering
size = calc_df_size(country_metrics)
print(f'Size is {size} MB')

Size is 6.248124 MB


## 3. Wrangling

In [6]:
# Analyis of country_metrics 
country_metrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55292 entries, 0 to 55291
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Video Title                 55292 non-null  object 
 1   External Video ID           55292 non-null  object 
 2   Video Length                55292 non-null  int64  
 3   Thumbnail link              55292 non-null  object 
 4   Country Code                54906 non-null  object 
 5   Is Subscribed               55292 non-null  bool   
 6   Views                       55292 non-null  int64  
 7   Video Likes Added           55292 non-null  int64  
 8   Video Dislikes Added        55292 non-null  int64  
 9   Video Likes Removed         55292 non-null  int64  
 10  User Subscriptions Added    55292 non-null  int64  
 11  User Subscriptions Removed  55292 non-null  int64  
 12  Average View Percentage     53854 non-null  float64
 13  Average Watch Time          538

In [7]:
country_metrics.head(2)

Unnamed: 0,Video Title,External Video ID,Video Length,Thumbnail link,Country Code,Is Subscribed,Views,Video Likes Added,Video Dislikes Added,Video Likes Removed,User Subscriptions Added,User Subscriptions Removed,Average View Percentage,Average Watch Time,User Comments Added
0,🌶 Hot Topics in Tech: Data Science Explained #...,OtqQYqRNDGI,59,https://i.ytimg.com/vi/OtqQYqRNDGI/hqdefault.jpg,HK,True,23,1,0,0,2,0,0.67187,39.640348,0
1,🌶 Hot Topics in Tech: Data Science Explained #...,OtqQYqRNDGI,59,https://i.ytimg.com/vi/OtqQYqRNDGI/hqdefault.jpg,ME,True,3,0,0,0,0,0,0.49887,29.433333,0


In [8]:
country_metrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55292 entries, 0 to 55291
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Video Title                 55292 non-null  object 
 1   External Video ID           55292 non-null  object 
 2   Video Length                55292 non-null  int64  
 3   Thumbnail link              55292 non-null  object 
 4   Country Code                54906 non-null  object 
 5   Is Subscribed               55292 non-null  bool   
 6   Views                       55292 non-null  int64  
 7   Video Likes Added           55292 non-null  int64  
 8   Video Dislikes Added        55292 non-null  int64  
 9   Video Likes Removed         55292 non-null  int64  
 10  User Subscriptions Added    55292 non-null  int64  
 11  User Subscriptions Removed  55292 non-null  int64  
 12  Average View Percentage     53854 non-null  float64
 13  Average Watch Time          538

In [15]:
schema_output(country_metrics)

_SCHEMA = {
	"Video Title": {
		"title": "",
		"data_type": ""
	},
	"External Video ID": {
		"title": "",
		"data_type": ""
	},
	"Video Length": {
		"title": "",
		"data_type": ""
	},
	"Thumbnail link": {
		"title": "",
		"data_type": ""
	},
	"Country Code": {
		"title": "",
		"data_type": ""
	},
	"Is Subscribed": {
		"title": "",
		"data_type": ""
	},
	"Views": {
		"title": "",
		"data_type": ""
	},
	"Video Likes Added": {
		"title": "",
		"data_type": ""
	},
	"Video Dislikes Added": {
		"title": "",
		"data_type": ""
	},
	"Video Likes Removed": {
		"title": "",
		"data_type": ""
	},
	"User Subscriptions Added": {
		"title": "",
		"data_type": ""
	},
	"User Subscriptions Removed": {
		"title": "",
		"data_type": ""
	},
	"Average View Percentage": {
		"title": "",
		"data_type": ""
	},
	"Average Watch Time": {
		"title": "",
		"data_type": ""
	},
	"User Comments Added": {
		"title": "",
		"data_type": ""
	}
}


In [10]:
COUNTRY_METRICS_SCHEMA = {
    "Video Title": {
        "title": "title",
        "data_type": str
    },
    "External Video ID": {
        "title": "video_id",
        "data_type": str
    },
    "Video Length": {
        "title": "length_sec",
        "data_type": int
    },
    "Thumbnail link": {
        "title": "thumbnail",
        "data_type": str
    },
    "Country Code": {
        "title": "coutry",
        "data_type": str
    },
    "Is Subscribed": {
        "title": "subscribed",
        "data_type": bool
    },
    "Views": {
        "title": "views",
        "data_type": int
    },
    "Video Likes Added": {
        "title": "likes_added",
        "data_type": int
    },
    "Video Dislikes Added": {
        "title": "dislikes_added",
        "data_type": int
    },
    "Video Likes Removed": {
        "title": "likes_removed",
        "data_type": int
    },
    "User Subscriptions Added": {
        "title": "subscription_added",
        "data_type": int
    },
    "User Subscriptions Removed": {
        "title": "subscription_removed",
        "data_type": int
    },
    "Average View Percentage": {
        "title": "avg_views_percent",
        "data_type": float
    },
    "Average Watch Time": {
        "title": "avg_watch_time",
        "data_type": float
    },
    "User Comments Added": {
        "title": "comments",
        "data_type": int
    }
}

### 3.1 Typecasting and 3.2 Renaming data

In [11]:
country_metrics_upt = type_cast_df(country_metrics, COUNTRY_METRICS_SCHEMA)

In [12]:
size_upt = calc_df_size(country_metrics_upt)
print(f'Before: {size}, After: {size_upt}')
print(f'Decrease in size = {round((size - size_upt) / size * 100, 2)}')

Before: 6.248124, After: 4.036444
Decrease in size = 35.4


### 3.3 Sorting data

In [13]:
# country_metrics_upt.sort_values(by="date", inplace=True)

## 4. Conclusion
### 4.1 Data Observation

In [14]:
country_metrics_upt.describe(include='all')

Unnamed: 0,title,video_id,length_sec,thumbnail,coutry,subscribed,views,likes_added,dislikes_added,likes_removed,subscription_added,subscription_removed,avg_views_percent,avg_watch_time,comments
count,55292,55292,55292.0,55292,54906,55292,55292.0,55292.0,55292.0,55292.0,55292.0,55292.0,53854.0,53854.0,55292.0
unique,223,223,,223,233,2,,,,,,,,,
top,How I Would Learn Data Science (If I Had to St...,4OZip0cgOho,,https://i.ytimg.com/vi/4OZip0cgOho/hqdefault.jpg,GB,False,,,,,,,,,
freq,428,428,,428,445,28588,,,,,,,,,
mean,,,896.678073,,,,100.573157,4.239311,0.106055,0.175577,2.365279,0.053389,0.347821,205.29834,0.0
std,,,927.380801,,,,1704.966002,65.291451,2.507351,3.601171,65.889539,0.766945,0.259732,206.541626,0.0
min,,,47.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,375.0,,,,2.0,0.0,0.0,0.0,0.0,0.0,0.145948,85.442894,0.0
50%,,,545.0,,,,7.0,0.0,0.0,0.0,0.0,0.0,0.320983,170.139725,0.0
75%,,,934.0,,,,27.0,1.0,0.0,0.0,0.0,0.0,0.486153,259.449951,0.0


### 4.2 Wrap up
1. Video metrics's column titles include unicode characters which are not read properly by pandas
    - function to update column data types considers only the updated titles from each individual dataframe's schema
2. Video metrics includes a row describing total of each column, the final total row is saved under the `video_metrics_tot` variable, and is dropped from our dataframe
3. There exists missing values in all of the data