In [2]:
import requests
import pandas as pd
import matplotlib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from datetime import datetime, timedelta

In [4]:
# Scrape title, view count, upload age and duration for every video listed on
# the channel's "Videos" tab. Results are left in `genshin_yt` (list of dicts)
# and `duration_list` (list of strings) for the cells below.
driver = webdriver.Chrome()
try:
    driver.get('https://www.youtube.com/c/GenshinImpact/videos')

    # Scrolling chrome page to end to get max results
    # https://stackoverflow.com/a/64777192
    # Keep scrolling until the document height stops growing.
    height = driver.execute_script("return document.documentElement.scrollHeight")
    previousHeight = -1

    while previousHeight < height:
        previousHeight = height
        driver.execute_script(f'window.scrollTo(0,{height + 10000})')
        time.sleep(1)  # give the page a moment to load the next batch
        height = driver.execute_script("return document.documentElement.scrollHeight")

    videos = driver.find_elements(By.CLASS_NAME, 'style-scope ytd-grid-video-renderer')
    length_thumb = driver.find_elements(By.XPATH,
                '//*[name() = "ytd-grid-video-renderer" and @class="style-scope ytd-grid-renderer"]')

    # Looping to get information about each video.
    # The element text is read here, while the browser session is still alive.
    genshin_yt = []
    for video in videos:
        genshin_yt.append({
            'Title': video.find_element(By.XPATH, './/*[@id="video-title"]').text,
            'Views': video.find_element(By.XPATH, './/*[@id="metadata-line"]/span[1]').text,
            'Upload Date': video.find_element(By.XPATH, './/*[@id="metadata-line"]/span[2]').text,
        })

    # Getting length of videos
    duration_list = [
        thumb.find_element(By.XPATH, './/span[contains(@class,"time-status")]').text
        for thumb in length_thumb
    ]
finally:
    # Always shut the browser down, even if scraping fails part-way, so no
    # Chrome/chromedriver process is left running.
    driver.quit()

# Number of records collected by each pass.
count = len(genshin_yt)
count2 = len(duration_list)

### Raw (Unmodified) DataFrame, Updated June 1st, 2022

In [27]:
# Build the raw frame from the scraped records and attach the durations as an
# extra column, then preview the first rows.
genshin_df = pd.DataFrame(genshin_yt).assign(Duration=duration_list)
genshin_df.head(5)

Unnamed: 0,Title,Views,Upload Date,Duration
0,"Collected Miscellany - ""Yelan: Traceless Steal...",652K views,1 day ago,6:10
1,"Character Demo - ""Yelan: Shadow in the Rain"" |...",2.4M views,2 days ago,2:46
2,"Character Teaser - ""Yelan: Inevitable Justice""...",3M views,7 days ago,1:29
3,Version 2.7 Special Program｜Genshin Impact,1.3M views,12 days ago,36:06
4,"Version 2.7 ""Hidden Dreams in the Depths"" Trai...",2.2M views,12 days ago,3:48


In [None]:
# Converting to a csv file
# genshin_df.to_csv(r'C:\Users\Roast\Desktop\genshin_yt.csv', index = False, header=True)

In [29]:
# Load the fixed CSV export into a new frame and preview it.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that file exists.
fixed_csv_path = r'C:\Users\Roast\Desktop\genshin_yt.csv'
genshin_df1 = pd.read_csv(fixed_csv_path)
genshin_df1.head()

Unnamed: 0,Title,Views,Upload Date,Duration
0,"Collected Miscellany - ""Yelan: Traceless Steal...",652K views,1 day ago,6:10
1,"Character Demo - ""Yelan: Shadow in the Rain"" |...",2.4M views,2 days ago,2:46
2,"Character Teaser - ""Yelan: Inevitable Justice""...",3M views,7 days ago,1:29
3,Version 2.7 Special Program｜Genshin Impact,1.3M views,12 days ago,36:06:00
4,"Version 2.7 ""Hidden Dreams in the Depths"" Trai...",2.2M views,12 days ago,3:48


### Cleaning DataFrame

In [30]:
# Store every column with pandas' dedicated "string" dtype before cleaning.
for text_column in ('Title', 'Views', 'Upload Date', 'Duration'):
    genshin_df1[text_column] = genshin_df1[text_column].astype('string')

In [None]:
# Removing "views" from Views column
# Multiplier for each abbreviation suffix used in the view counts.
tens = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}


def conv(x):
    """Convert an abbreviated view count into an integer.

    Parameters
    ----------
    x : str
        Count such as "652K", "2.4M", "1.3B", "532" or "1,234". A trailing
        " views" / " view" label is tolerated, and "No" (from "No views")
        is read as zero.

    Returns
    -------
    int
        The count as a whole number.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed.
    """
    text = str(x).strip()

    # Drop the label if the caller has not removed it yet.
    lowered = text.lower()
    for label in (' views', ' view'):
        if lowered.endswith(label):
            text = text[:-len(label)].strip()
            break

    if not text or text.lower() == 'no':
        return 0

    text = text.replace(',', '')
    suffix = text[-1].upper()
    if suffix in tens:
        # round() rather than int(): the float product can land just below
        # the intended whole number and int() would truncate it.
        return round(float(text[:-1]) * tens[suffix])

    # No suffix: the text is already a plain number.
    return round(float(text))

# Strip the " views" label and convert each abbreviated count with `conv`.
# The whole column is assigned at once: the previous element-wise chained
# assignment (genshin_df1['Views'][i] = ...) raises SettingWithCopyWarning and
# does not modify the frame at all when pandas Copy-on-Write is active.
genshin_df1['Views'] = (
    genshin_df1['Views']
    .str.replace(' views', '', regex=False)
    .astype('object')
    .map(conv)
    .astype('object')  # keep the column as plain Python objects, as before
)

In [32]:
# Finalise the Views column as a 64-bit integer and inspect the frame.
genshin_df1['Views'] = genshin_df1['Views'].astype('int64')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
genshin_df1.info()
genshin_df1.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        228 non-null    string
 1   Views        228 non-null    int64 
 2   Upload Date  228 non-null    string
 3   Duration     228 non-null    string
dtypes: int64(1), string(3)
memory usage: 7.2 KB
None


Unnamed: 0,Title,Views,Upload Date,Duration
0,"Collected Miscellany - ""Yelan: Traceless Steal...",652000,1 day ago,6:10
1,"Character Demo - ""Yelan: Shadow in the Rain"" |...",2400000,2 days ago,2:46
2,"Character Teaser - ""Yelan: Inevitable Justice""...",3000000,7 days ago,1:29
3,Version 2.7 Special Program｜Genshin Impact,1300000,12 days ago,36:06:00
4,"Version 2.7 ""Hidden Dreams in the Depths"" Trai...",2200000,12 days ago,3:48


### Initial EDA Queries for Views

In [33]:
# Rank every video from most to least viewed (fresh 0..n-1 index) and show
# the ten most viewed.
top_views = genshin_df1.sort_values('Views', ascending=False, ignore_index=True)
top_views.head(10)

Unnamed: 0,Title,Views,Upload Date,Duration
0,Genshin Impact Story Teaser: We Will Be Reunit...,29000000,1 year ago,2:15
1,"Character Demo - ""Zhongli: The Listener"" | Gen...",27000000,1 year ago,1:55
2,"Collected Miscellany - ""Qiqi: Fortune-Preservi...",26000000,1 year ago,4:32
3,"New Character Demo - ""Eula: Flickering Candlel...",22000000,1 year ago,1:56
4,"""Pyro Chapter"" - Chef de Cuisine Xiangling｜Gen...",21000000,2 years ago,1:03
5,TGA 2021 Genshin Impact Entry Video｜Genshin Im...,19000000,5 months ago,1:41
6,"Character Demo - ""Hu Tao: Let the Living Bewar...",17000000,1 year ago,1:41
7,Teyvat Chapter Storyline Preview: Travail｜Gens...,16000000,1 year ago,4:19
8,"New Character Demo - ""Kaedehara Kazuha: Wander...",16000000,11 months ago,2:04
9,"Version 2.0 ""The Immovable God and the Eternal...",16000000,10 months ago,5:10


In [34]:
# Rank every video from least to most viewed (fresh 0..n-1 index) and show
# the ten least viewed.
least_views = genshin_df1.sort_values('Views', ignore_index=True)
least_views.head(10)

Unnamed: 0,Title,Views,Upload Date,Duration
0,#13 Knighthood Excellence｜Genshin Impact,105000,1 year ago,1:42
1,#11 A Tale of Two Dragons｜Genshin Impact,122000,1 year ago,2:16
2,#10 The Edge of the Prairie｜Genshin Impact,139000,1 year ago,1:20
3,Islands of the Lost and Forgotten - Disc 2: Be...,140000,1 month ago,44:03:00
4,#12 Rite of Battle｜Genshin Impact,150000,1 year ago,4:29
5,#07 A Day in Mondstadt｜Genshin Impact,156000,1 year ago,1:11
6,#09 Pure Sky｜Genshin Impact,162000,1 year ago,1:14
7,#15 The Wind Catcher from a Foreign Land｜Gensh...,180000,1 year ago,1:59
8,#06 Lone Sojourner｜Genshin Impact,199000,1 year ago,0:58
9,Islands of the Lost and Forgotten - Disc 1: Is...,235000,1 month ago,45:42:00


In [35]:
# Average view count across all videos, reported to two decimals.
mean_views = genshin_df1.loc[:, 'Views'].mean()
print('Mean: %.2f' % mean_views)

Mean: 3901192.97


In [36]:
# Spread of the view counts.
std_views = genshin_df1['Views'].std()

# Count videos strictly above and strictly below the mean. The comparisons
# are strict so a video whose view count equals the mean is not counted in
# both groups (which would let the two percentages add up to more than 100).
more_views_mean = len(genshin_df1.query('Views > @mean_views'))
less_views_mean = len(genshin_df1.query('Views < @mean_views'))

total_videos = len(genshin_df1)
print('Overall percentage above mean: %.2f' % (more_views_mean/total_videos*100))
print('Overall percentage below mean: %.2f' % (less_views_mean/total_videos*100))


Overall percentage above mean: 32.46
Overall percentage below mean: 67.54


In [37]:
# Report how many videos fall on each side of the mean.
for label, total in (
    ("Number of videos above mean: ", more_views_mean),
    ("Number of videos below mean: ", less_views_mean),
):
    print(label, total)


Number of videos above mean:  74
Number of videos below mean:  154


###  Cleaning Duration and Upload Date columns

In [38]:
# The imported CSV shows some durations with an extra trailing ":00"
# (e.g. "36:06:00" for a video that was scraped as "36:06"). Values of eight
# or more characters therefore lose their last three characters, which puts
# them back in mm:ss form.
#
# A boolean mask with .loc replaces the element-wise chained assignment
# (genshin_df1['Duration'][i] = ...), which raises SettingWithCopyWarning and
# is a silent no-op when pandas Copy-on-Write is active.
too_long = genshin_df1['Duration'].str.len() >= 8
genshin_df1.loc[too_long, 'Duration'] = genshin_df1.loc[too_long, 'Duration'].str[:-3]

genshin_df1['Duration'].iloc[0:10:2] # output check

0     6:10
2     1:29
4     3:48
6     3:12
8    27:14
Name: Duration, dtype: string

In [39]:
# Pad m:ss and mm:ss durations out to hh:mm:ss:
#   4 characters ("6:10")  -> "00:06:10"
#   5 characters ("36:06") -> "00:36:06"
# Anything else is left untouched.
#
# Lengths are measured once, before any value changes, and the updates use
# .loc masks instead of chained assignment (genshin_df1['Duration'][i] = ...),
# which does not reliably write back to the frame.
duration_lengths = genshin_df1['Duration'].str.len()
is_m_ss = duration_lengths == 4
is_mm_ss = duration_lengths == 5

genshin_df1.loc[is_m_ss, 'Duration'] = '00:0' + genshin_df1.loc[is_m_ss, 'Duration']
genshin_df1.loc[is_mm_ss, 'Duration'] = '00:' + genshin_df1.loc[is_mm_ss, 'Duration']

genshin_df1

Unnamed: 0,Title,Views,Upload Date,Duration
0,"Collected Miscellany - ""Yelan: Traceless Steal...",652000,1 day ago,00:06:10
1,"Character Demo - ""Yelan: Shadow in the Rain"" |...",2400000,2 days ago,00:02:46
2,"Character Teaser - ""Yelan: Inevitable Justice""...",3000000,7 days ago,00:01:29
3,Version 2.7 Special Program｜Genshin Impact,1300000,12 days ago,00:36:06
4,"Version 2.7 ""Hidden Dreams in the Depths"" Trai...",2200000,12 days ago,00:03:48
...,...,...,...,...
223,New Area Announcement: Of the Land Amidst Mono...,8800000,2 years ago,00:01:12
224,Genshin Impact - Nintendo Switch (Official Jap...,1900000,2 years ago,00:00:34
225,Come explore Teyvat with Amber! (Japanese Vers...,509000,2 years ago,00:02:03
226,Come explore Teyvat with Amber!｜Genshin Impact,523000,2 years ago,00:04:30


In [40]:
# Convert duration to seconds only
def get_sec(time_str):
    """Convert a colon-separated duration into a total number of seconds.

    Parameters
    ----------
    time_str : str
        Duration written as "ss", "mm:ss" or "hh:mm:ss" (for example
        "00:06:10", "36:06" or "45").

    Returns
    -------
    int
        Total duration in seconds.

    Raises
    ------
    ValueError
        If the text has more than three fields or a field is not an integer.
    """
    fields = str(time_str).strip().split(':')
    if len(fields) > 3:
        raise ValueError(f'expected at most hh:mm:ss, got {time_str!r}')

    # Fold left to right: every additional field multiplies the running
    # total by 60, so the same loop handles one, two or three fields.
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds

# Replace every duration with its length in seconds, still stored as text
# (the integer cast happens in the next cell). The whole column is assigned at
# once instead of the element-wise chained assignment
# (genshin_df1['Duration'][i] = ...), which does not reliably write back to
# the frame.
genshin_df1['Duration'] = genshin_df1['Duration'].map(get_sec).astype('string')

In [41]:
# Store the duration (seconds) as a 64-bit integer and preview the result.
duration_seconds = genshin_df1['Duration'].astype('int64')
genshin_df1['Duration'] = duration_seconds
genshin_df1.head()

Unnamed: 0,Title,Views,Upload Date,Duration
0,"Collected Miscellany - ""Yelan: Traceless Steal...",652000,1 day ago,370
1,"Character Demo - ""Yelan: Shadow in the Rain"" |...",2400000,2 days ago,166
2,"Character Teaser - ""Yelan: Inevitable Justice""...",3000000,7 days ago,89
3,Version 2.7 Special Program｜Genshin Impact,1300000,12 days ago,2166
4,"Version 2.7 ""Hidden Dreams in the Depths"" Trai...",2200000,12 days ago,228


In [52]:
# Transform upload date column to be "Month/Year" fashion
# Calculated on June 1st, 2022
# The module is imported under its own alias because the bare name `datetime`
# is bound to the class by the top-of-notebook import and to the module by a
# later cell; `datetime.datetime.today()` therefore only worked when the cells
# were run out of order.
import datetime as dt

now = dt.datetime.today()
print("Today date is: ", now)

Today date is:  2022-06-01 20:02:34.925480


In [43]:
import datetime
from dateutil.relativedelta import relativedelta

def get_past_date(str_days_ago, reference_date=None):
    """Convert a relative age such as "3 days ago" into an ISO date string.

    Parameters
    ----------
    str_days_ago : str
        Relative age: "today", "yesterday", or "<number> <unit> ago" where
        unit is second(s), minute(s), hour(s), day(s), week(s), month(s) or
        year(s). Words before the number (e.g. "Streamed 2 weeks ago") are
        ignored.
    reference_date : datetime.date or datetime.datetime, optional
        Moment the age is measured from. Defaults to the current date and
        time. Pass the date the data was collected to get the same result
        whenever the notebook is re-run. When a plain date (no time of day)
        is given, sub-day ages ("5 hours ago") resolve to that same date.

    Returns
    -------
    str
        The resulting date formatted as "YYYY-MM-DD".

    Raises
    ------
    ValueError
        If no "<number> <unit>" pair with a known unit can be found.
    """
    if reference_date is None:
        now = datetime.datetime.now()
        has_clock = True
    else:
        now = reference_date
        # datetime.datetime is a subclass of datetime.date, so test for it
        # explicitly to find out whether a time of day is available.
        has_clock = isinstance(reference_date, datetime.datetime)
    today = now.date() if has_clock else now

    tokens = str_days_ago.lower().split()
    if tokens == ['today']:
        return today.isoformat()
    if tokens == ['yesterday']:
        return (today - relativedelta(days=1)).isoformat()

    # Locate the first "<number> <unit>" pair; the last token is skipped as a
    # candidate number so that tokens[position + 1] always exists.
    for position, token in enumerate(tokens[:-1]):
        if token.isdigit():
            amount = int(token)
            unit = tokens[position + 1].rstrip('s')  # "days" -> "day"
            break
    else:
        raise ValueError(f'cannot parse relative date: {str_days_ago!r}')

    if unit in ('second', 'minute', 'hour'):
        if not has_clock:
            # Without a time of day a sub-day offset cannot be resolved.
            return today.isoformat()
        moment = now - relativedelta(**{unit + 's': amount})
        return moment.date().isoformat()

    if unit in ('day', 'week', 'month', 'year'):
        return (today - relativedelta(**{unit + 's': amount})).isoformat()

    raise ValueError(f'unknown time unit in relative date: {str_days_ago!r}')

get_past_date(genshin_df1['Upload Date'][0])


'2022-05-31'

In [44]:
# Replace every relative upload age with its ISO date, kept as text for now.
# The whole column is assigned at once instead of the element-wise chained
# assignment (genshin_df1['Upload Date'][i] = ...), which raises
# SettingWithCopyWarning and is a silent no-op when pandas Copy-on-Write is
# active.
genshin_df1['Upload Date'] = genshin_df1['Upload Date'].map(get_past_date).astype('string')

In [46]:
# Spot-check rows 5 through 11 after the date conversion.
genshin_df1.iloc[5:12]

Unnamed: 0,Title,Views,Upload Date,Duration
5,Genshin Impact EP - The Shirasagi's Gentle Con...,715000,2022-05-01,259
6,"""Tsubaki in Thawing Snow"" Short Trailer | Gens...",4200000,2022-05-01,192
7,Story Teaser: Tale of the Five Kasen | Genshin...,737000,2022-05-01,138
8,Islands of the Lost and Forgotten - Disc 3: Ba...,261000,2022-05-01,1634
9,Islands of the Lost and Forgotten - Disc 2: Be...,140000,2022-05-01,2643
10,Islands of the Lost and Forgotten - Disc 1: Is...,235000,2022-05-01,2742
11,"""Song of Innocence"": Inazuma Chapter OST Album...",660000,2022-05-01,187


In [47]:
# Parse the ISO date text into real datetime values, then confirm the dtypes.
upload_dates = pd.to_datetime(genshin_df1['Upload Date'])
genshin_df1['Upload Date'] = upload_dates
genshin_df1.dtypes

Title                  string
Views                   int64
Upload Date    datetime64[ns]
Duration                int64
dtype: object

### Final DataFrame

In [53]:
# First ten rows of the cleaned frame.
genshin_df1.iloc[:10]

Unnamed: 0,Title,Views,Upload Date,Duration
0,"Collected Miscellany - ""Yelan: Traceless Steal...",652000,2022-05-31,370
1,"Character Demo - ""Yelan: Shadow in the Rain"" |...",2400000,2022-05-30,166
2,"Character Teaser - ""Yelan: Inevitable Justice""...",3000000,2022-05-25,89
3,Version 2.7 Special Program｜Genshin Impact,1300000,2022-05-20,2166
4,"Version 2.7 ""Hidden Dreams in the Depths"" Trai...",2200000,2022-05-20,228
5,Genshin Impact EP - The Shirasagi's Gentle Con...,715000,2022-05-01,259
6,"""Tsubaki in Thawing Snow"" Short Trailer | Gens...",4200000,2022-05-01,192
7,Story Teaser: Tale of the Five Kasen | Genshin...,737000,2022-05-01,138
8,Islands of the Lost and Forgotten - Disc 3: Ba...,261000,2022-05-01,1634
9,Islands of the Lost and Forgotten - Disc 2: Be...,140000,2022-05-01,2643


In [48]:
# Save the cleaned frame. index=False keeps the default row index out of the
# file, so reading it back does not produce an extra unnamed column.
genshin_df1.to_csv('genshin_yt.csv', index=False)