In [27]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib import cm
from datetime import datetime
import glob
import os
import json
import pickle
import six
import charset_normalizer

# Apply seaborn's default plot styling
sns.set()
# Lift the pandas display limits so frames are shown with every column and row
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Turn off the pandas chained-assignment warning for the whole notebook
pd.set_option('mode.chained_assignment', None)

In [28]:
# Collect every CSV file in the working directory.
# glob returns matches in file-system order, so sort them: later cells address
# the loaded datasets by position and need a stable, reproducible ordering.
AllCSV = sorted(glob.glob('*.csv'))
AllCSV

['GBvideos.csv', 'USvideos.csv']

In [29]:
# One DataFrame per CSV file, stored in the same order as AllCSV
all_dataframes = []

for csv in AllCSV:
    print(f'csv: {csv}')
    # Tag every row with the first two characters of the file name
    # (e.g. 'GB' for 'GBvideos.csv') so each dataset stays identifiable
    df = pd.read_csv(csv).assign(country=csv[:2])
    all_dataframes.append(df)

# Preview the first loaded dataset; positions in all_dataframes follow AllCSV
all_dataframes[0].head()

csv: GBvideos.csv
csv: USvideos.csv


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,country
0,Jw1Y-zhQURU,17.14.11,John Lewis Christmas Ad 2017 - #MozTheMonster,John Lewis,26,2017-11-10T07:38:29.000Z,"christmas|""john lewis christmas""|""john lewis""|...",7224515,55681,10247,9479,https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg,False,False,False,Click here to continue the story and make your...,GB
1,3s1rvMFUweQ,17.14.11,Taylor Swift: …Ready for It? (Live) - SNL,Saturday Night Live,24,2017-11-12T06:24:44.000Z,"SNL|""Saturday Night Live""|""SNL Season 43""|""Epi...",1053632,25561,2294,2757,https://i.ytimg.com/vi/3s1rvMFUweQ/default.jpg,False,False,False,Musical guest Taylor Swift performs …Ready for...,GB
2,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10T17:00:03.000Z,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",17158579,787420,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,GB
3,PUTEiSjKwJU,17.14.11,Goals from Salford City vs Class of 92 and Fri...,Salford City Football Club,17,2017-11-13T02:30:38.000Z,"Salford City FC|""Salford City""|""Salford""|""Clas...",27833,193,12,37,https://i.ytimg.com/vi/PUTEiSjKwJU/default.jpg,False,False,False,Salford drew 4-4 against the Class of 92 and F...,GB
4,rHwDegptbI4,17.14.11,Dashcam captures truck's near miss with child ...,Cute Girl Videos,25,2017-11-13T01:45:13.000Z,[none],9815,30,2,30,https://i.ytimg.com/vi/rHwDegptbI4/default.jpg,False,False,False,Dashcam captures truck's near miss with child ...,GB


In [30]:
# Normalise the column dtypes of every loaded dataset in place.
for df in all_dataframes:
    # video_id
    df['video_id'] = df['video_id'].astype('str')

    # trending_date: raw values are 'YY.DD.MM' strings (e.g. '17.14.11' -> 2017-11-14)
    date_pieces = df['trending_date'].astype('str').str.split('.')
    date_parts = pd.DataFrame({
        # Two-digit year -> four-digit year for the whole column at once.
        # (A per-row `df['Year'].replace(..., inplace=True)` loop is quadratic and
        # is a chained in-place mutation that does not write back to the frame
        # when pandas Copy-on-Write is active.)
        'year': date_pieces.str[0].astype(int) + 2000,
        'month': date_pieces.str[2].astype(int),
        'day': date_pieces.str[1].astype(int),
    })
    # Drop and re-create the column so that it ends up as the last column of the
    # frame; code that inserts columns by position relies on this layout.
    del df['trending_date']
    df['trending_date'] = pd.to_datetime(date_parts)

    # title
    df['title'] = df['title'].astype('str')

    # channel_title
    df['channel_title'] = df['channel_title'].astype('str')

    # category_id is an identifier, not a quantity, so keep it as text
    df['category_id'] = df['category_id'].astype(str)

    # tags
    df['tags'] = df['tags'].astype('str')

    # views, likes, dislikes, comment_count are not converted here
    # (original author's note: they are already int64 after read_csv)

    # thumbnail_link
    df['thumbnail_link'] = df['thumbnail_link'].astype('str')

    # description
    df['description'] = df['description'].astype('str')

    # Changing comments_disabled, ratings_disabled, video_error_or_removed from bool to categorical
    df['comments_disabled'] = df['comments_disabled'].astype('category')
    df['ratings_disabled'] = df['ratings_disabled'].astype('category')
    df['video_error_or_removed'] = df['video_error_or_removed'].astype('category')

    # publish_time: timestamps such as '2017-11-10T07:38:29.000Z';
    # values that do not match the format become NaT (errors='coerce')
    df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce', format='%Y-%m-%dT%H:%M:%S.%fZ')
    


0       2017-11-14
1       2017-11-14
2       2017-11-14
3       2017-11-14
4       2017-11-14
5       2017-11-14
6       2017-11-14
7       2017-11-14
8       2017-11-14
9       2017-11-14
10      2017-11-14
11      2017-11-14
12      2017-11-14
13      2017-11-14
14      2017-11-14
15      2017-11-14
16      2017-11-14
17      2017-11-14
18      2017-11-14
19      2017-11-14
20      2017-11-14
21      2017-11-14
22      2017-11-14
23      2017-11-14
24      2017-11-14
25      2017-11-14
26      2017-11-14
27      2017-11-14
28      2017-11-14
29      2017-11-14
30      2017-11-14
31      2017-11-14
32      2017-11-14
33      2017-11-14
34      2017-11-14
35      2017-11-14
36      2017-11-14
37      2017-11-14
38      2017-11-14
39      2017-11-14
40      2017-11-14
41      2017-11-14
42      2017-11-14
43      2017-11-14
44      2017-11-14
45      2017-11-14
46      2017-11-14
47      2017-11-14
48      2017-11-14
49      2017-11-14
50      2017-11-14
51      2017-11-14
52      2017

0       2017-11-14
1       2017-11-14
2       2017-11-14
3       2017-11-14
4       2017-11-14
5       2017-11-14
6       2017-11-14
7       2017-11-14
8       2017-11-14
9       2017-11-14
10      2017-11-14
11      2017-11-14
12      2017-11-14
13      2017-11-14
14      2017-11-14
15      2017-11-14
16      2017-11-14
17      2017-11-14
18      2017-11-14
19      2017-11-14
20      2017-11-14
21      2017-11-14
22      2017-11-14
23      2017-11-14
24      2017-11-14
25      2017-11-14
26      2017-11-14
27      2017-11-14
28      2017-11-14
29      2017-11-14
30      2017-11-14
31      2017-11-14
32      2017-11-14
33      2017-11-14
34      2017-11-14
35      2017-11-14
36      2017-11-14
37      2017-11-14
38      2017-11-14
39      2017-11-14
40      2017-11-14
41      2017-11-14
42      2017-11-14
43      2017-11-14
44      2017-11-14
45      2017-11-14
46      2017-11-14
47      2017-11-14
48      2017-11-14
49      2017-11-14
50      2017-11-14
51      2017-11-14
52      2017

In [31]:
# Split the publish timestamp of every dataset into a date part and a time part.
for df in all_dataframes:
    published = df['publish_time']
    # New 'publish_date' column goes in at column position 4
    df.insert(loc=4, column='publish_date', value=published.dt.date)
    # 'publish_time' keeps only the time of day
    df['publish_time'] = published.dt.time
    # Changing data type for 'publish_date' from object to 'datetime64[ns]'
    df['publish_date'] = pd.to_datetime(df['publish_date'], format="%Y-%m-%d")