# Predict Success of Youtube Influencers
#### Date: 2017-11-21
#### User: Julianne

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext version_information
%version_information numpy, scipy, matplotlib, pandas

Software,Version
Python,3.6.0 64bit [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]
IPython,5.1.0
OS,Darwin 16.1.0 x86_64 i386 64bit
numpy,1.11.3
scipy,0.18.1
matplotlib,2.0.0
pandas,0.19.2
Tue Nov 21 21:35:11 2017 EST,Tue Nov 21 21:35:11 2017 EST


In [5]:
from __future__ import division
from datetime import datetime 
import requests
from lxml import html, etree
import json
from textblob import TextBlob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from collections import Counter

#never print matching warnings
warnings.filterwarnings('ignore') 

#set how pandas displays data
pd.options.display.max_columns = 100
pd.options.display.max_rows = 35
pd.options.display.width = 120

#change the max # of rows pandas will display
pd.options.display.max_rows = 9000

#set to display all output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#import seaborn to draw pretty graphs
import seaborn as sns
sns.set_style('whitegrid')

## Define Functions

In [3]:
def _search_list(q="", publishedAfter=None, publishedBefore=None, pageToken="", channelId="", maxResults=""):
    parameters = {
                  "channelId": channelId,
                  "part": "id",
                  "maxResults": maxResults,
                  "order": "date", #will sort based on ViewCount
                  "pageToken": pageToken,
                  "q": q,
                  "type": "video",
                  "key": api_key,
                  }
    url = "https://www.googleapis.com/youtube/v3/search"
    
    if publishedAfter: parameters["publishedAfter"] = publishedAfter
    if publishedBefore: parameters["publishedBefore"] = publishedBefore
    
    page = requests.request(method="get", url=url, params=parameters)
    return json.loads(page.text)

def search_list(q="", publishedAfter=None, publishedBefore=None, channelId="", maxResults=""):
    more_results = True
    pageToken=""
    results = []
    
    for counter in range(maxResults):
        j_results = _search_list(q=q, 
                                 publishedAfter=publishedAfter, 
                                 publishedBefore=publishedBefore, 
                                 pageToken=pageToken, 
                                 channelId=channelId,
                                 maxResults=maxResults)
        items = j_results.get("items", None)
        
        #if there are items in list
        if items:
            # adds video ID to a list
            results += [item["id"]["videoId"] for item in j_results["items"]]
            if "nextPageToken" in j_results:
                pageToken = j_results["nextPageToken"]
            else:
                return results
        else:
            return results
    return results ## returns list of video IDs

def _video_list(video_id_list, maxResults):
    parameters = {"part": "statistics",
                  "id": ",".join(video_id_list),
                  "key": api_key,
                  "maxResults": maxResults
                  }
    url = "https://www.googleapis.com/youtube/v3/videos"
    page = requests.request(method="get", url=url, params=parameters)
    j_results = json.loads(page.text)
    df = pd.DataFrame([item["statistics"] for item in j_results["items"]])
    df["video_id"] = [item["id"] for item in j_results["items"]]
    
    parameters["part"] = "snippet"
    page = requests.request(method="get", url=url, params=parameters)
    j_results = json.loads(page.text)
    df["publishedAt"] = [item["snippet"]["publishedAt"] for item in j_results["items"]]
    df["publishedAt"] = df["publishedAt"].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.000Z"))
    df["date"] = df["publishedAt"].apply(lambda x: x.date())
    df["week"] = df["date"].apply(lambda x: x.isocalendar()[1])
    df["channelId"] = [item["snippet"]["channelId"] for item in j_results["items"]]
    df["title"] = [item["snippet"]["title"] for item in j_results["items"]]
    df["description"] = [item["snippet"]["description"] for item in j_results["items"]]
    df["channelTitle"] = [item["snippet"]["channelTitle"] for item in j_results["items"]]
    df["categoryId"] = [item["snippet"]["categoryId"] for item in j_results["items"]]
    return df

def video_list(video_id_list, maxResults):
    values = []
    # goes to first item in result
    for index, item in enumerate(video_id_list[::maxResults]):
        t_index = index * 50
        values.append(_video_list(video_id_list[t_index:t_index+maxResults],maxResults))
    return pd.concat(values)

In [4]:
def get_data(keyword, publishedAfter, publishedBefore, channelId, maxResults, vidtype):
    results_list = []
    for channel in channelId:
        results = search_list(q=keyword,
                              publishedAfter=publishedAfter,
                              publishedBefore=publishedBefore,
                              channelId=channel,
                              maxResults=maxResults)

        stat_data_set = video_list(results, maxResults)
        stat_data_set["key_word"] = keyword*len(stat_data_set)
        stat_data_set["type"] = vidtype*len(stat_data_set)
        results_list.append(stat_data_set)
    data_set = pd.concat(results_list)
    return data_set

In [5]:
def get_data_YYYYbegin(keyword, year, channelId, maxResults, vidtype):
    start=str(year)+"-01-01T00:00:00Z"
    end=str(year)+"-6-30T00:00:00Z"
    return get_data(keyword, publishedAfter=start, publishedBefore=end, channelId=channelId, maxResults=maxResults, vidtype=vidtype)

def get_data_YYYYend(keyword, year, channelId, maxResults, vidtype):
    start=str(year)+"-07-01T00:00:00Z"
    end=str(year)+"-12-31T00:00:00Z"
    return get_data(keyword, publishedAfter=start, publishedBefore=end, channelId=channelId, maxResults=maxResults, vidtype=vidtype)

### Define channel IDs

In [6]:
#YOGA
YWA_channelId = ["UCFKE7WVJfvaHW5q283SxchA"] #Yoga with Adriene
YWK_channelId = ["UCX32D3gKXENrhOXdZjWWtMA"] #Yoga with Kassandra

#BEAUTY
PatrickStarrr_channelId = ["UCDHQbU57NZilrhbuZNbQcRA"] #PatrickStarrr*
JaclynHill_channelId = ["UC6jgzx2g3nlbaYkd8EMweKA"] #Jaclyn Hill
LauraLee_channelId = ["UCKMugoa0uHpjUuq14yOpagw"] #Laura Lee
SimplyNailogical_channelId = ["UCGCVyTWogzQ4D170BLy2Arw"] #Simply Nailogical
MannyMua_channelId = ["UCbO9bltbkYwa56nZFQx6XJg"] #Manny Mua

#FASHION
TessChristine_channelId = ["UCVsTboAhpnuL6j-tDePvNwQ"] #Tess Christine*
AmandaSteele_channelId = ["UCZ90tZPn-ue2BmRAcDD88tA"] #Amanda Steele
FENTYPUMA_channelId = [""] #FENTY PUMA by Rihanna
KelseySimone_channelId = ["UCSeeUM-1TJjWfxFQfbyg6eA"] #Kelsey Simone
SneakerShopping_channelId = ["UCE_--R1P5-kfBzHTca0dsnw"] #Sneaker Shopping (series by Complex)

#FOOD
Babish_channelId = ["UCJHA_jMfCvEnv-3kRjTCQXw"] #Binging with Babish
HotOnes_channelId = ["UCPD_bxCRGpmmeQcbe2kpPaA"] #Hot Ones (series by First We Feast)
NerdyNummies_channelId = ["UCjwmbv6NE4mOh8Z8VhPUx1Q"] #Nerdy Nummies (series by Rosanna Pansino)
Tasty_channelId = ["UC0n9yiP-AD2DpuuYCDwlNxQ"] #Tasty
WorthIt_channelId = ["UCpko_-a4wgz2u_DgDgd9fqA"] #Worth It* (video series by Buzzfeed)

#GAMING
markiplier_channelId = ["UC7_YxT-KID8kRbqZo7MyscQ"] #Markiplier
Achievement_channelId = ["UCsB0LwkHPWyjfZ-JwvtwEXw"] #Achievement Hunter
iHasCupquake_channelId = ["UCqg2eLFNUu3QN3dttNeOWkw"] #iHasCupquake
JuegaGerman_channelId = ["UCYiGq8XF7YQD00x7wAd62Zg"] #JuegaGerman
Smosh_channelId = ["UCJ2ZDzMRgSrxmwphstrm8Ww"] #Smosh Games*

#KIDS AND FAMILY
Annie_channelId = ["UCuQ9PbS08dQJNWfwjQE-Fnw"] #Annie LeBlanc
Brooklyn_channelId = ["UC6QWhGQqf0YDYdRb0n6ojWw"] #Brooklyn & Bailey*
Guava_channelId = ["UCMNmwqCtCSpftrbvR3KkHDA"] #Guava Juice
Roman_channelId = ["UC-SV8-bUJfXjrRMnp7F8Wzw"] #Roman Atwood Vlogs
WhatsInside_channelId = ["UCSrPuHtKbst7Zy8pyWn_3Cg"] #What’s Inside?

#LIFESTYLE
Andrea_channelId = ["UCJLCmVUYSbyMGtB2pHOi_QQ"] #Andrea Russett
Baby_channelId = ["UCrv1Jwqqo_Xc7iJiwTGi5mg"] #Baby Ariel
LaurDIY_channelId = ["UCap97Ue8K_BpKlrvQRYd6JA"] #LaurDIY*
MyLifeAsEva_channelId = ["UCAoMPWcQKA_9Af5YhWdrZgw"] #MyLifeAsEva
Niki_channelId = ["UCuVHOs0H5hvAHGr8O4yIBNQ"] #Niki and Gabi

#NEWS AND CULTURE
Cheddar_channelId = [""] #Cheddar
Complex_channelId = ["UCpFHkjOa7ia6bH5_6cDsDXg"] #Complex News
NowThis_channelId = ["UCn4sPeUomNGIr26bElVdDYg"] #NowThis
Philip_channelId = ["UClFSU9_bUb4Rc6OYfTt5SPw"] #The Philip DeFranco Show
Turks_channelId = ["UC1yBKRuGpC1tSM73A0ZjYjQ"] #The Young Turks*

#SCIENCE
Everything_channelId = ["UCfIqCzQJXvYj9ssCoHq327g"] #How To Make Everything
Random_channelId = ["UC1zZE_kJ8rQHgLTVfobLi_g"] #The King of Random
Marques_channelId = ["UCBJycsmduvYEL83R_U4JriQ"] #Marques Brownlee
MindField_channelId = ["UC6nSFpj9HTCZ5t-N3Rm3-HA"] #Mind Field (series of vsauce)
Veritasium_channelId = ["UCHnyfMqiRRG1u-2MsSQLbXA"] #Veritasium*

#SPORTS AND WELLNESS
thirty_channelId = ["UCiWLfSweyRNmLpgEHekhoAg"] #30 for 30 Shorts (series on ESPN)
blogilates_channelId = ["UCIJwWYOfsCfz6PjxbONYXSg"] #blogilates*
KevinDurant_channelId = [""] #Kevin Durant
QB1_channelId = ["UCa9gVui7SBECUX61bN1i2bg"] #QB1: Beyond the Lights (series from G0x90 zone)
TannerFox_channelId = ["UCDLmS9vkPcTz3cAc-c9QIzg"] #Tanner Fox

## Pull data using Youtube's API

In [7]:
#youtube API key
api_key = "AIzaSyAKuyO0o3iU7aga6xlge-WRtesFMAksl6I"

In [8]:
maxResults=50

In [9]:
vidtype=["beauty"]
keyword = [""]
channelId=[
        PatrickStarrr_channelId,
        JaclynHill_channelId,
        LauraLee_channelId,
        SimplyNailogical_channelId,
        MannyMua_channelId
        ]
beauty_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
beauty_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
beauty_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

In [10]:
vidtype=["fashion"]
keyword = [""]
channelId=[
        TessChristine_channelId,
        AmandaSteele_channelId,
        #FENTYPUMA_channelId,
        KelseySimone_channelId,
        #SneakerShopping_channelId (series by Complex)
        ]
fashion1_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
fashion1_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
fashion1_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

keyword = ["Sneaker Shopping"]
channelId=[SneakerShopping_channelId] #(series by Complex)]
fashion2_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
fashion2_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
fashion2_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

In [11]:
vidtype=["food"]
keyword = [""]
channelId=[
        Babish_channelId,
        #HotOnes_channelId,
        #NerdyNummies_channelId,
        Tasty_channelId
        #WorthIt_channelId
        ]
food1_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
food1_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
food1_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

keyword = ["Hot Ones"]
channelId=[HotOnes_channelId]
food2_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
food2_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
food2_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

keyword = ["Nerdy Nummies"]
channelId=[NerdyNummies_channelId]
food3_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
food3_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
food3_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

keyword = ["Worth It"]
channelId=[WorthIt_channelId] ## didnt start until end of 2016
food4_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
food4_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

In [12]:
vidtype=["gaming"]
keyword = [""]
channelId=[
        markiplier_channelId,
        Achievement_channelId,
        iHasCupquake_channelId,
        JuegaGerman_channelId,
        Smosh_channelId
        ]
gaming_2016_begin = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
gaming_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
gaming_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

In [13]:
vidtype=["family"]
keyword = [""]
channelId=[
        Annie_channelId,
        Brooklyn_channelId,
        Guava_channelId,
        Roman_channelId,
        WhatsInside_channelId
        ]
family_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
family_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
family_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

In [14]:
vidtype=["lifestyle"]
keyword = [""]
channelId=[
        Andrea_channelId,
        Baby_channelId,
        LaurDIY_channelId,
        MyLifeAsEva_channelId,
        Niki_channelId
        ]
lifestyle_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
lifestyle_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
lifestyle_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

In [15]:
vidtype=["news"]
keyword = [""]
channelId=[
        #Cheddar_channelId,
        Complex_channelId,
        NowThis_channelId,
        Philip_channelId,
        Turks_channelId
        ]
news_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
news_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
news_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

In [16]:
vidtype=["science"]
keyword = [""]
channelId=[
        Everything_channelId,
        Random_channelId,
        Marques_channelId,
        #MindField_channelId,
        Veritasium_channelId
        ]
science1_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
science1_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
science1_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

keyword = ["Mind Field"]
channelId=[MindField_channelId] ## didn't start until 2017
science2_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

In [17]:
vidtype=["sports"]
keyword = [""]
channelId=[
        #thirty_channelId, (series on ESPN))
        blogilates_channelId,
        #KevinDurant_channelId,
        #QB1_channelId, (series from G0x90 zone)
        TannerFox_channelId
        ]
sports1_2016_begin = get_data_YYYYbegin(keyword, 2016, channelId, maxResults, vidtype)
sports1_2016_end = get_data_YYYYend(keyword, 2016, channelId, maxResults, vidtype)
sports1_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

keyword = ["30 shorts"]
channelId=[thirty_channelId] ## didn't start until 2017
sports2_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

keyword = ["QB1"]
channelId=[QB1_channelId] #(series from G0x90 zone) didn't start until 2017
sports3_2017_begin = get_data_YYYYbegin(keyword, 2017, channelId, maxResults, vidtype)

## Compile data into single dataframe

In [63]:
list = [
beauty_2016_begin,
beauty_2016_end,
beauty_2017_begin,
    
fashion1_2016_begin,
fashion1_2016_end,
fashion1_2017_begin,
fashion2_2016_begin,
fashion2_2016_end,
fashion2_2017_begin,
    
food1_2016_begin,
food1_2016_end,
food1_2017_begin,
food2_2016_begin,
food2_2016_end,
food2_2017_begin,
food3_2016_begin,
food3_2016_end,
food3_2017_begin,
food4_2016_end,
food4_2017_begin,

gaming_2016_begin,
gaming_2016_end,
gaming_2017_begin,

family_2016_begin,
family_2016_end,
family_2017_begin,
    
lifestyle_2016_begin,
lifestyle_2016_end,
lifestyle_2017_begin,

news_2016_begin,
news_2016_end,
news_2017_begin,

science1_2016_begin,
science1_2016_end,
science1_2017_begin,
science2_2017_begin,
    
sports1_2016_begin,
sports1_2016_end,
sports1_2017_begin,
sports2_2017_begin,
sports3_2017_begin   
]

for i in range(0,len(list)):
    tmp=list[i]
    tmp=tmp.reset_index()
    del tmp['index']
    if i==0:
        df=tmp
    else:
        df2=tmp
        df=df.append(df2, ignore_index=True) ## need to use ignore_index=True in order to make new index
df.shape[0]

11509

### Add column with video length

In [64]:
def get_video_duration(video_id):
    parameters = {"part": "contentDetails",
              "id": video_id,
              "key": api_key
              }
    url = "https://www.googleapis.com/youtube/v3/videos"
    page = requests.request(method="get", url=url, params=parameters)
    j_results = json.loads(page.text)
    items = j_results.get("items", None)
    if items is None:
        duration='NA'
    else:
        duration=[]
        duration += [item["contentDetails"]["duration"] for item in j_results["items"]]
    return duration[0]

In [65]:
df['duration'] = df['video_id'].apply(func=get_video_duration)
df.head(3)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration
0,739,191,0,11178,274412,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M
1,1010,224,0,11737,319477,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S
2,1230,284,0,27857,544192,mz2e0niwZhU,2016-06-19 21:14:21,2016-06-19,24,UCDHQbU57NZilrhbuZNbQcRA,BACK TO SCHOOL DANCE MAKEUP TUTORIAL | Patric...,OMG! IT HAS BEEN TOO LONG! I have been low key...,PatrickStarrr,26,,beauty,PT10M27S


### Define functions to convert duration into hours, mins, secs, total_min

In [66]:
def convert_time(df):
    
    df['hours']=0
    df['minutes']=0
    df['seconds']=0
    df['total_minutes']=0
    df['total_seconds']=0

    for i in range(len(df)): # using 0 in range(0,len(df)) is not necessary
        
        #print(i)
        
        hours=0
        minutes=0
        seconds=0
        
        curr_str=df['duration'][i]
        
        split_str1=curr_str.split('T')
        curr_str=split_str1[1]

        if 'H' in curr_str:
            split_str =curr_str.split('H')
            hours     =int(split_str[0])
            curr_str  =split_str[1]
        
        if 'M' in curr_str:
            split_str =curr_str.split('M')
            minutes   =int(split_str[0])
            curr_str  =split_str[1]

        if 'S' in curr_str:
            split_str = curr_str.split('S')
            seconds   = int(split_str[0])
            
        df['hours'][i]=hours
        df['minutes'][i]=minutes
        df['seconds'][i]=seconds
        df['total_minutes'][i]=hours*60+minutes
        df['total_seconds'][i]=hours*60*60+minutes*60+seconds
    
    return df

In [67]:
def convert_hours(duration):
        
    hours=0
    minutes=0
    seconds=0
    
    if 'PT' in duration:
        duration=duration.split('PT')[1]

    if 'H' in duration:
        hours = duration.split('H')[0]
        hours = int(hours)
    
    return hours

In [68]:
def convert_minutes(duration):
        
    hours=0
    minutes=0
    seconds=0
    
    if 'PT' in duration:
        duration=duration.split('PT')[1]

    if 'H' in duration:
        duration = duration.split('H')[1]

    if 'M' in duration:
        minutes = duration.split('M')[0]
        minutes = int(minutes)
    
    return minutes

In [69]:
def convert_seconds(duration):
        
    hours=0
    minutes=0
    seconds=0
    
    if 'PT' in duration:
        duration=duration.split('PT')[1]

    if 'H' in duration:
        duration = duration.split('H')[1]

    if 'M' in duration:
        duration = duration.split('M')[1]
        
    if 'S' in duration:
        seconds = duration.split('S')[0]
        seconds = int(seconds)
    
    return seconds

In [70]:
def convert_totalmins(df):
    
    hours=df[0]
    minutes=df[1]
    total_mins=hours*60+minutes
    
    return total_mins

In [71]:
def convert_totalsecs(df):
    
    hours=df[0]
    minutes=df[1]
    seconds=df[2]
    total_mins=hours*60*60+minutes*60+seconds
    
    return total_mins

In [72]:
df['hours'] = df['duration'].apply(func=convert_hours)
df['minutes'] = df['duration'].apply(func=convert_minutes)
df['seconds'] = df['duration'].apply(func=convert_seconds)
df['total_mins'] = df[['hours','minutes']].apply(func=convert_totalmins, axis=1)
df['total_secs'] = df[['hours','minutes','seconds']].apply(func=convert_totalsecs, axis=1)

df.head(3)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs
0,739,191,0,11178,274412,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900
1,1010,224,0,11737,319477,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712
2,1230,284,0,27857,544192,mz2e0niwZhU,2016-06-19 21:14:21,2016-06-19,24,UCDHQbU57NZilrhbuZNbQcRA,BACK TO SCHOOL DANCE MAKEUP TUTORIAL | Patric...,OMG! IT HAS BEEN TOO LONG! I have been low key...,PatrickStarrr,26,,beauty,PT10M27S,0,10,27,10,627


### Add month, year and monthyear column

In [73]:
from datetime import datetime

df['date'] = pd.to_datetime(df['date'])
df['year'], df['month'] = df['date'].dt.year, df['date'].dt.month, 
df.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month
0,739,191,0,11178,274412,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6
1,1010,224,0,11737,319477,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6


In [74]:
df['yearmonth']=[0]*len(df)
for i in range(0,len(df)):
    df['yearmonth'][i]=df['year'][i]*100+df['month'][i]
df.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth
0,739,191,0,11178,274412,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606
1,1010,224,0,11737,319477,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606


## Characterize if video is vlog or not

In [75]:
def determine_vlog(title):
    
    if 'vlog' in title.lower():
        vlog='yes'
    else:
        vlog='no'
    
    if vlog=='yes':
        return 'vlog'
    else:
        return 'notvlog'

In [76]:
df['vlog'] = df['title'].apply(func=determine_vlog)
df.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog
0,739,191,0,11178,274412,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog
1,1010,224,0,11737,319477,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog


## Determine length of title

In [77]:
def determine_titlelength(title):
    return len(title)

In [78]:
df['title_length'] = df['title'].apply(func=determine_titlelength)
df.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog,title_length
0,739,191,0,11178,274412,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog,39
1,1010,224,0,11737,319477,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog,42


## Determine length of description

In [79]:
def determine_desclength(description):
    return len(description)

In [80]:
df['description_length'] = df['description'].apply(func=determine_desclength)
df.head(3)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog,title_length,description_length
0,739,191,0,11178,274412,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog,39,1516
1,1010,224,0,11737,319477,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog,42,1595
2,1230,284,0,27857,544192,mz2e0niwZhU,2016-06-19 21:14:21,2016-06-19,24,UCDHQbU57NZilrhbuZNbQcRA,BACK TO SCHOOL DANCE MAKEUP TUTORIAL | Patric...,OMG! IT HAS BEEN TOO LONG! I have been low key...,PatrickStarrr,26,,beauty,PT10M27S,0,10,27,10,627,2016,6,201606,notvlog,53,1718


## Determine if title contains username

In [8]:
def determine_CT_in_title(df):
    
    title=str(df[0])
    channel=str(df[1])
    
    if channel in title:
        output= 'includes_CT'
    else:
        output= 'excludes_CT'
        
    return output

In [9]:
df['title_channel'] = df[['title','channelTitle']].apply(func=determine_CT_in_title, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog,title_length,description_length,title_channel
0,0,739.0,191.0,0,11178.0,274412.0,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog,39,1516,includes_CT
1,1,1010.0,224.0,0,11737.0,319477.0,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog,42,1595,includes_CT


## Determine month when video was published

In [10]:
def determine_month(month):
    
    if month == 1:
        curr_month='Jan'
    if month == 2:
        curr_month='Feb'
    if month == 3:
        curr_month='Mar'
    if month == 4:
        curr_month='Apr'
    if month == 5:
        curr_month='May'
    if month == 6:
        curr_month='June'
    if month == 7:
        curr_month='July'
    if month == 8:
        curr_month='Aug'
    if month == 9:
        curr_month='Sept'
    if month == 10:
        curr_month='Oct'
    if month == 11:
        curr_month='Nov'
    if month == 12:
        curr_month='Dec'

    return curr_month

In [11]:
df['month_name'] = df['month'].apply(func=determine_month)
df.head(2)

Unnamed: 0.1,Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog,title_length,description_length,title_channel,month_name
0,0,739.0,191.0,0,11178.0,274412.0,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog,39,1516,includes_CT,June
1,1,1010.0,224.0,0,11737.0,319477.0,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog,42,1595,includes_CT,June


## Determine season

In [75]:
def determine_season(month):
    
    if month == 'Dec' or month == 'Jan' or month == 'Feb':
        season='winter'
    if month == 'Mar' or month == 'Apr' or month == 'May':
        season='spring'
    if month == 'June' or month == 'July' or month == 'Aug':
        season='summer'
    if month == 'Sept' or month == 'Oct' or month == 'Nov':
        season='Fall'

    return season

In [76]:
df['season'] = df['month_name'].apply(func=determine_season)
df.head(2)

Unnamed: 0.1,Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog,title_length,description_length,title_channel,month_name,season,day_of_week
0,0,739.0,191.0,0,11178.0,274412.0,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog,39,1516,includes_CT,June,summer,Fri
1,1,1010.0,224.0,0,11737.0,319477.0,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog,42,1595,includes_CT,June,summer,Thurs


## Determine day of the week

In [49]:
def get_day_code(output):
    if output==0:
        return 'Mon'
    if output==1:
        return 'Tues'
    if output==2:
        return 'Wed'
    if output==3:
        return 'Thurs'
    if output==4:
        return 'Fri'
    if output==5:
        return 'Sat'
    if output==6:
        return 'Sun'

def get_day_of_week(date):
    day=datetime.strptime(date, '%Y-%m-%d %H:%M:%S').weekday()
    weekday=get_day_code(day)
    return weekday

In [59]:
df['day_of_week']=df['date'].apply(func=get_day_of_week)
df.head(2)

Unnamed: 0.1,Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog,title_length,description_length,title_channel,month_name,season,day_of_week
0,0,739.0,191.0,0,11178.0,274412.0,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog,39,1516,includes_CT,June,Fall,Fri
1,1,1010.0,224.0,0,11737.0,319477.0,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog,42,1595,includes_CT,June,Fall,Thurs


## Determine if weekday or not

In [82]:
def determine_weekday(day_of_week):
    if day_of_week == 'Mon' or day_of_week == 'Tues' or day_of_week == 'Wed' or day_of_week == 'Thurs' or day_of_week == 'Fri':
        return 'weekday'
    if day_of_week == 'Sat' or day_of_week == 'Sun':
        return 'weekend'  

In [80]:
df['type_of_day']=df['day_of_week'].apply(func=determine_weekday)
df.head(2)

Unnamed: 0.1,Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog,title_length,description_length,title_channel,month_name,season,day_of_week,type_of_day
0,0,739.0,191.0,0,11178.0,274412.0,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog,39,1516,includes_CT,June,summer,Fri,weekday
1,1,1010.0,224.0,0,11737.0,319477.0,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog,42,1595,includes_CT,June,summer,Thurs,weekday


## Determine if video is part of a series

In [98]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def determine_series(title):
    if ' part ' in title.lower() and hasNumbers(title):
        return 'yes'
    else:
        return 'no' 

In [101]:
df['series']=df['title'].apply(func=determine_series)
df.head(2)

#note not 100% accurate but will provide good estimate on if this variable is impactful or not

Unnamed: 0.1,Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,key_word,type,duration,hours,minutes,seconds,total_mins,total_secs,year,month,yearmonth,vlog,title_length,description_length,title_channel,month_name,season,day_of_week,type_of_day,series
0,0,739.0,191.0,0,11178.0,274412.0,zfmi2JatvlY,2016-06-24 23:22:46,2016-06-24 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,WHAT'S IN MY TRAVEL BAG | PatrickStarrr,Hey everyone! I've been traveling a lot lately...,PatrickStarrr,26,,beauty,PT15M,0,15,0,15,900,2016,6,201606,notvlog,39,1516,includes_CT,June,summer,Fri,weekday,no
1,1,1010.0,224.0,0,11737.0,319477.0,XlawoU0xCwQ,2016-06-23 02:46:20,2016-06-23 00:00:00,25,UCDHQbU57NZilrhbuZNbQcRA,I AM A GEISHA | JAPAN VLOG | PatrickStarrr,"Discount on TATCHA use code ""PSTARRR"" \nHuge t...",PatrickStarrr,26,,beauty,PT28M32S,0,28,32,28,1712,2016,6,201606,vlog,42,1595,includes_CT,June,summer,Thurs,weekday,no


## Save to csv

In [102]:
df.to_csv('Youtube_Project_20171121.csv')