# Notes on how to use YouTube's API
#### Date: 2017-10-04
#### User: Julianne

## Plan for today
- [done] Part 1: download top 50 videos
    * sorted by viewcount
    * title contains word 'yoga'
    * from years 2010-2016

## record server information

In [7]:
# Magics first (server issues)
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline

# Pick up edits to imported modules without restarting the kernel; docs:
#https://ipython.org/ipython-doc/dev/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

# Record the software versions this run used (table printed below).
# Old installation route, kept for reference:
#install_ext http://raw.github.com/jrjohansson/version_information/master/version_information
# The extension has to be installed first: pip install version_information
%load_ext version_information
%version_information numpy, scipy, matplotlib, pandas

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Software,Version
Python,3.6.0 64bit [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]
IPython,5.1.0
OS,Darwin 16.1.0 x86_64 i386 64bit
numpy,1.11.3
scipy,0.18.1
matplotlib,2.0.0
pandas,0.19.2
Wed Oct 04 12:32:43 2017 EDT,Wed Oct 04 12:32:43 2017 EDT


## Record the website the code was adapted from

In [27]:
# Embed the tutorial notebook that the functions below were taken from,
# so the provenance of the code is visible next to it.
from IPython.display import IFrame #show website
url='http://nbviewer.jupyter.org/github/twistedhardware/mltutorial/blob/master/notebooks/data-mining/2.%20YouTube%20Data.ipynb'
# Last expression of the cell: the IFrame is what gets rendered as output.
IFrame(url, width=900, height=350)

## Set-up libraries

In [2]:
from __future__ import division

import json
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import requests
from lxml import html, etree
from textblob import TextBlob

#never print matching warnings
warnings.filterwarnings('ignore') 

#set how pandas displays data
pd.options.display.max_columns = 100
pd.options.display.max_rows = 35
pd.options.display.width = 120



# Part 1: download top 50 videos with word 'yoga'

### Record API key

In [3]:
# YouTube Data API key.
# Read it from the YOUTUBE_API_KEY environment variable instead of writing the
# secret into the notebook, where it would be shared and committed along with
# the outputs. Export the variable before starting Jupyter.
# NOTE(review): the key that used to be hardcoded here has been published with
# this notebook and should be revoked in the Google API console.
api_key = os.environ.get("YOUTUBE_API_KEY", "")
if not api_key:
    # Say so right away rather than letting the requests below fail obscurely.
    print("YOUTUBE_API_KEY is not set; YouTube API requests will fail.")

### Define functions that will be used

In [4]:
def _search_list(q="", publishedAfter=None, publishedBefore=None, pageToken=""):
    parameters = {"part": "id",
                  "maxResults": 50,
                  "order": "viewCount", #will sort based on ViewCount
                  "pageToken": pageToken,
                  "q": q,
                  "type": "video",
                  "key": api_key,
                  }
    url = "https://www.googleapis.com/youtube/v3/search"
    
    if publishedAfter: parameters["publishedAfter"] = publishedAfter
    if publishedBefore: parameters["publishedBefore"] = publishedBefore
    
    page = requests.request(method="get", url=url, params=parameters)
    return json.loads(page.text)

def search_list(q="", publishedAfter=None, publishedBefore=None, max_requests=10):
    """Collect video ids for a query by walking the paginated search results.

    Calls ``_search_list`` repeatedly, following ``nextPageToken``, and stops
    when a page has no items, when there is no further page, or after
    ``max_requests`` pages.

    Parameters
    ----------
    q : str
        Search query.
    publishedAfter, publishedBefore : str or None
        Optional publish-time bounds, passed through to ``_search_list``.
    max_requests : int
        Upper bound on the number of pages requested.

    Returns
    -------
    list
        The ``videoId`` of every item seen, in the order returned.

    Raises
    ------
    RuntimeError
        If a response carries an ``"error"`` entry. Without this check a
        failed request (it has no ``"items"``) would look like the end of the
        results and the list would be silently cut short.
    """
    video_ids = []
    page_token = ""

    for _ in range(max_requests):
        page = _search_list(q=q,
                            publishedAfter=publishedAfter,
                            publishedBefore=publishedBefore,
                            pageToken=page_token)
        if "error" in page:
            raise RuntimeError("YouTube search request failed: {}".format(page["error"]))

        items = page.get("items")
        if not items:
            break
        video_ids.extend(item["id"]["videoId"] for item in items)

        # No (or empty) token means this was the last page.
        page_token = page.get("nextPageToken")
        if not page_token:
            break

    return video_ids

def _video_list(video_id_list):
    parameters = {"part": "statistics",
                  "id": ",".join(video_id_list),
                  "key": api_key,
                  "maxResults": 50
                  }
    url = "https://www.googleapis.com/youtube/v3/videos"
    page = requests.request(method="get", url=url, params=parameters)
    j_results = json.loads(page.text)
    df = pd.DataFrame([item["statistics"] for item in j_results["items"]])
    df["video_id"] = [item["id"] for item in j_results["items"]]
    
    parameters["part"] = "snippet"
    page = requests.request(method="get", url=url, params=parameters)
    j_results = json.loads(page.text)
    df["publishedAt"] = [item["snippet"]["publishedAt"] for item in j_results["items"]]
    df["publishedAt"] = df["publishedAt"].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.000Z"))
    df["date"] = df["publishedAt"].apply(lambda x: x.date())
    df["week"] = df["date"].apply(lambda x: x.isocalendar()[1])
    df["channelId"] = [item["snippet"]["channelId"] for item in j_results["items"]]
    df["title"] = [item["snippet"]["title"] for item in j_results["items"]]
    df["description"] = [item["snippet"]["description"] for item in j_results["items"]]
    df["channelTitle"] = [item["snippet"]["channelTitle"] for item in j_results["items"]]
    df["categoryId"] = [item["snippet"]["categoryId"] for item in j_results["items"]]
    return df

def video_list(video_id_list):
    """Fetch video details for any number of ids, 50 ids per request.

    Splits ``video_id_list`` into consecutive batches of at most 50, passes
    each batch to ``_video_list`` and concatenates the resulting frames. Each
    batch keeps its own row index, so index values repeat across batches.

    Parameters
    ----------
    video_id_list : list of str
        Video ids to look up.

    Returns
    -------
    pandas.DataFrame
        The concatenated batch frames, or an empty frame when
        ``video_id_list`` is empty (``pd.concat`` rejects an empty list).
    """
    batch_size = 50
    frames = [_video_list(video_id_list[start:start + batch_size])
              for start in range(0, len(video_id_list), batch_size)]
    if not frames:
        # A search with no hits must not crash the whole download.
        return pd.DataFrame()
    return pd.concat(frames)

In [5]:
def get_data(candidates, publishedAfter, publishedBefore):
    """Download video details for every search term in ``candidates``.

    For each term, ``search_list`` is asked for matching video ids (up to 50
    result pages) within the given publish-time bounds, ``video_list`` turns
    those ids into a frame, and the term is recorded in a
    ``candidate_name`` column.

    Parameters
    ----------
    candidates : iterable of str
        Search terms.
    publishedAfter, publishedBefore : str
        Publish-time bounds forwarded to ``search_list``.

    Returns
    -------
    pandas.DataFrame
        The per-term frames concatenated in the order of ``candidates``.
    """
    def fetch_term(term):
        # One search term -> one labelled frame.
        video_ids = search_list(q=term,
                                publishedAfter=publishedAfter,
                                publishedBefore=publishedBefore,
                                max_requests=50)
        frame = video_list(video_ids)
        frame["candidate_name"] = term
        return frame

    return pd.concat([fetch_term(term) for term in candidates])

def get_year_data(candidates, year):
    """Download data for videos published during one calendar year (UTC).

    The window runs from Jan 1 00:00:00 to Dec 31 23:59:59 of ``year``. The
    end bound used to be Dec 30 00:00:00, which left out the last two days
    of every year.

    Parameters
    ----------
    candidates : iterable of str
        Search terms, forwarded to ``get_data``.
    year : int
        Four-digit calendar year.

    Returns
    -------
    Whatever ``get_data`` returns for that window.
    """
    return get_data(candidates,
                    publishedAfter="{:04d}-01-01T00:00:00Z".format(year),
                    publishedBefore="{:04d}-12-31T23:59:59Z".format(year))


# Per-year entry points, kept so existing calls continue to work.

def get_2010_data(candidates):
    """Download data for videos published in 2010."""
    return get_year_data(candidates, 2010)

def get_2011_data(candidates):
    """Download data for videos published in 2011."""
    return get_year_data(candidates, 2011)

def get_2012_data(candidates):
    """Download data for videos published in 2012."""
    return get_year_data(candidates, 2012)

def get_2013_data(candidates):
    """Download data for videos published in 2013."""
    return get_year_data(candidates, 2013)

def get_2014_data(candidates):
    """Download data for videos published in 2014."""
    return get_year_data(candidates, 2014)

def get_2015_data(candidates):
    """Download data for videos published in 2015."""
    return get_year_data(candidates, 2015)

def get_2016_data(candidates):
    """Download data for videos published in 2016."""
    return get_year_data(candidates, 2016)

### Run code to extract data

In [29]:
# Search term(s) for this part; each term becomes a `candidate_name` label.
candidates = ["yoga"]
# One download per publication year, 2010 through 2016. Every call goes to
# the YouTube API, so this cell is slow and consumes API quota.
yoga_2010 = get_2010_data(candidates)
yoga_2011 = get_2011_data(candidates)
yoga_2012 = get_2012_data(candidates)
yoga_2013 = get_2013_data(candidates)
yoga_2014 = get_2014_data(candidates)
yoga_2015 = get_2015_data(candidates)
yoga_2016 = get_2016_data(candidates)


In [30]:
yoga_2010.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,candidate_name
0,3493,3823,0,75362,17142504,0o0kNeOyH98,2010-12-06 09:58:49,2010-12-06,49,UCsVQi0GCmZZq0soNhm93P1A,Yoga for Complete Beginners - Yoga Class 20 M...,Yoga for complete beginners. 20 minute gentle ...,Yoga Practice Videos - Yoga Vidya,24,yoga
1,209,959,0,1288,12216798,lrvVvMXA5k0,2010-04-30 06:51:17,2010-04-30,17,UCZH9UwnGUOpLDTJVdF7SwUw,SherlyN chopra's HOT YOGA,"sherlyn Chopra's ""HOT YOGA"" Bollywood's Bombsh...",takisawant,43,yoga


In [31]:
yoga_2011.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,candidate_name
0,2012,10049,0,18798,34325008,8wwRM6z3_Co,2011-03-23 19:48:19,2011-03-23,12,UCG7OjYz1GuhijsTASme53VQ,yoga for relaxation,more beginning yoga,HotYogaAtHome,24,yoga
1,2729,922,0,65910,11582259,loszrEZvS_k,2011-12-27 15:18:46,2011-12-27,52,UCnY3kkkuVmN2F-rUoiaAP2A,Yoga By Equinox,Eager to master the arm balance? Equinox's Bri...,Furthermore from Equinox,27,yoga


In [32]:
yoga_2012.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,candidate_name
0,1384,1111,0,52942,23682410,xi6wOaZ61-U,2012-02-10 03:23:14,2012-02-10,6,UCCsUQXwhDj4iVlhG4VCq6Kg,林宥嘉 Yoga Lin [傻子 Fool] Official MV (電影｢愛LOVE｣主題曲),線上音樂收聽「LOVE電影原聲帶」-\n【KKBOX】http://kkbox.fm/1a0...,華研國際,10,yoga
1,4694,7171,0,58211,21010556,dfeqKsgTjPo,2012-12-21 15:30:12,2012-12-21,51,UCPDXXXJj9nax0fr0Wfc048g,Hardly Working: Yoga Teachers,LIKE us on: http://www.facebook.com/collegehum...,CollegeHumor,23,yoga


In [33]:
yoga_2013.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,candidate_name
0,82,,0,,39338818,BH-_R1rHI3M,2013-12-19 06:56:24,2013-12-19,51,UCByl_CQNnNu5ZGONUJ-X2lw,Sheer Lululemon Yoga Pants - chicks get laid,***** Please SUBSCRIBE over at the to youtube...,Hollie Lee,23,yoga
1,4196,7175.0,0,94461.0,21420855,f77SKdyn-1Y,2013-08-07 12:00:12,2013-08-07,32,UCBK-IXazHYcYit0K3ESnaxg,Calming Seas #1 - 11 Hours Ocean Waves Sounds ...,Black Screen Version: https://youtu.be/HU3KTa5...,Relaxing Music & Yoga,10,yoga


In [34]:
yoga_2014.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,candidate_name
0,27387,15825,0,398509,46450667,wngsO-Z4n7A,2014-03-02 22:57:29,2014-03-02,9,UC1r4VtVE__5K6c_L_3Vlxxg,YOGA PANTS PRANK!,Click Here For My NEW YouTube Channel: \nhttps...,fouseyTUBE,24,yoga
1,5805,14736,0,150478,41009648,QZbuj3RJcjI,2014-08-14 16:49:53,2014-08-14,33,UCb_kshGodseYhLPcDtxWv5w,"3 HOURS Relaxing Music ""Evening Meditation"" Ba...","3 HOURS Relaxing Music ""Evening Meditation"". R...",Meditation Relax Music,24,yoga


In [35]:
yoga_2015.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,candidate_name
0,13696,17818,0,334936,39789314,0OkB6p_FZAw,2015-04-13 07:00:00,2015-04-13,16,UCF16SjpLvjwG8O6_LZvj8Xw,"Janelle Monáe, Jidenna - Yoga","Download ""Wondaland Presents: The Eephus"" at i...",WondalandVEVO,10,yoga
1,1505,2024,0,76092,37841448,Mqr-kjvXsk8,2015-11-09 04:00:00,2015-11-09,46,UCCsUQXwhDj4iVlhG4VCq6Kg,林宥嘉 Yoga Lin [ 兜圈 ] Official Music Video (偶像劇｢...,**歡迎透過下方連結為這支MV製作不同語言的歌詞字幕**\nhttp://www.youtu...,華研國際,10,yoga


In [36]:
yoga_2016.head(2)

Unnamed: 0,commentCount,dislikeCount,favoriteCount,likeCount,viewCount,video_id,publishedAt,date,week,channelId,title,description,channelTitle,categoryId,candidate_name
0,5182,7844,0,48739,22328793,pd2gAJsvwLU,2016-04-20 18:08:31,2016-04-20,16,UC8iOxoMyDKmcVO8TL_20kyg,SSG vs. SGG Ultimate Yoga Challenge,SevenSuperGirls and SevenGymnasticsGirls chall...,SevenGymnasticsGirls,17,yoga
1,4617,8476,0,179901,17314797,hN3ClgNDH30,2016-11-17 06:30:00,2016-11-17,46,UC-TFz4MD1ZzcY6NuZAP3Ndw,EXTREME YOGA Challenge TWINS Vs Sisters!,Hey guys! We attempted some really tricky yoga...,The Rybka Twins,1,yoga


# Part 2: download top 50 videos matching the phrase 'Yoga With Adriene'

### Record api key

In [37]:
# YouTube Data API key.
# Read it from the YOUTUBE_API_KEY environment variable instead of writing the
# secret into the notebook, where it would be shared and committed along with
# the outputs. Export the variable before starting Jupyter.
# NOTE(review): the key that used to be hardcoded here has been published with
# this notebook and should be revoked in the Google API console.
api_key = os.environ.get("YOUTUBE_API_KEY", "")
if not api_key:
    # Say so right away rather than letting the requests below fail obscurely.
    print("YOUTUBE_API_KEY is not set; YouTube API requests will fail.")

### Define functions that will be used

In [40]:
def _search_list(q="", publishedAfter=None, publishedBefore=None, pageToken=""):
    parameters = {"part": "id",
                  "maxResults": 50,
                  "order": "viewCount", #will sort based on ViewCount
                  "pageToken": pageToken,
                  "q": q,
                  "type": "video",
                  "key": api_key,
                  }
    url = "https://www.googleapis.com/youtube/v3/search"
    
    if publishedAfter: parameters["publishedAfter"] = publishedAfter
    if publishedBefore: parameters["publishedBefore"] = publishedBefore
    
    page = requests.request(method="get", url=url, params=parameters)
    return json.loads(page.text)

def search_list(q="", publishedAfter=None, publishedBefore=None, max_requests=10):
    """Collect video ids for a query by walking the paginated search results.

    Calls ``_search_list`` repeatedly, following ``nextPageToken``, and stops
    when a page has no items, when there is no further page, or after
    ``max_requests`` pages.

    Parameters
    ----------
    q : str
        Search query.
    publishedAfter, publishedBefore : str or None
        Optional publish-time bounds, passed through to ``_search_list``.
    max_requests : int
        Upper bound on the number of pages requested.

    Returns
    -------
    list
        The ``videoId`` of every item seen, in the order returned.

    Raises
    ------
    RuntimeError
        If a response carries an ``"error"`` entry. Without this check a
        failed request (it has no ``"items"``) would look like the end of the
        results and the list would be silently cut short.
    """
    video_ids = []
    page_token = ""

    for _ in range(max_requests):
        page = _search_list(q=q,
                            publishedAfter=publishedAfter,
                            publishedBefore=publishedBefore,
                            pageToken=page_token)
        if "error" in page:
            raise RuntimeError("YouTube search request failed: {}".format(page["error"]))

        items = page.get("items")
        if not items:
            break
        video_ids.extend(item["id"]["videoId"] for item in items)

        # No (or empty) token means this was the last page.
        page_token = page.get("nextPageToken")
        if not page_token:
            break

    return video_ids

def _video_list(video_id_list):
    parameters = {"part": "statistics",
                  "id": ",".join(video_id_list),
                  "key": api_key,
                  "maxResults": 50
                  }
    url = "https://www.googleapis.com/youtube/v3/videos"
    page = requests.request(method="get", url=url, params=parameters)
    j_results = json.loads(page.text)
    df = pd.DataFrame([item["statistics"] for item in j_results["items"]])
    df["video_id"] = [item["id"] for item in j_results["items"]]
    
    parameters["part"] = "snippet"
    page = requests.request(method="get", url=url, params=parameters)
    j_results = json.loads(page.text)
    df["publishedAt"] = [item["snippet"]["publishedAt"] for item in j_results["items"]]
    df["publishedAt"] = df["publishedAt"].apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.000Z"))
    df["date"] = df["publishedAt"].apply(lambda x: x.date())
    df["week"] = df["date"].apply(lambda x: x.isocalendar()[1])
    df["channelId"] = [item["snippet"]["channelId"] for item in j_results["items"]]
    df["title"] = [item["snippet"]["title"] for item in j_results["items"]]
    df["description"] = [item["snippet"]["description"] for item in j_results["items"]]
    df["channelTitle"] = [item["snippet"]["channelTitle"] for item in j_results["items"]]
    df["categoryId"] = [item["snippet"]["categoryId"] for item in j_results["items"]]
    return df

def video_list(video_id_list):
    """Fetch video details for any number of ids, 50 ids per request.

    Splits ``video_id_list`` into consecutive batches of at most 50, passes
    each batch to ``_video_list`` and concatenates the resulting frames. Each
    batch keeps its own row index, so index values repeat across batches.

    Parameters
    ----------
    video_id_list : list of str
        Video ids to look up.

    Returns
    -------
    pandas.DataFrame
        The concatenated batch frames, or an empty frame when
        ``video_id_list`` is empty (``pd.concat`` rejects an empty list).
    """
    batch_size = 50
    frames = [_video_list(video_id_list[start:start + batch_size])
              for start in range(0, len(video_id_list), batch_size)]
    if not frames:
        # A search with no hits must not crash the whole download.
        return pd.DataFrame()
    return pd.concat(frames)

In [41]:
def get_data(yoga, publishedAfter, publishedBefore):
    """Download video details for every search term in ``yoga``.

    For each term, ``search_list`` is asked for matching video ids (up to 50
    result pages) within the given publish-time bounds, ``video_list`` turns
    those ids into a frame, and the term is recorded in a ``key_word``
    column.

    Parameters
    ----------
    yoga : iterable of str
        Search terms.
    publishedAfter, publishedBefore : str
        Publish-time bounds forwarded to ``search_list``.

    Returns
    -------
    pandas.DataFrame
        The per-term frames concatenated in the order of ``yoga``.
    """
    def fetch_term(term):
        # One search term -> one labelled frame.
        video_ids = search_list(q=term,
                                publishedAfter=publishedAfter,
                                publishedBefore=publishedBefore,
                                max_requests=50)
        frame = video_list(video_ids)
        frame["key_word"] = term
        return frame

    return pd.concat([fetch_term(term) for term in yoga])

def get_year_data(yoga, year):
    """Download data for videos published during one calendar year (UTC).

    The window runs from Jan 1 00:00:00 to Dec 31 23:59:59 of ``year``. The
    end bound used to be Dec 30 00:00:00, which left out the last two days
    of every year.

    Parameters
    ----------
    yoga : iterable of str
        Search terms, forwarded to ``get_data``.
    year : int
        Four-digit calendar year.

    Returns
    -------
    Whatever ``get_data`` returns for that window.
    """
    return get_data(yoga,
                    publishedAfter="{:04d}-01-01T00:00:00Z".format(year),
                    publishedBefore="{:04d}-12-31T23:59:59Z".format(year))


# Per-year entry points, kept so existing calls continue to work.

def get_2010_data(yoga):
    """Download data for videos published in 2010."""
    return get_year_data(yoga, 2010)

def get_2011_data(yoga):
    """Download data for videos published in 2011."""
    return get_year_data(yoga, 2011)

def get_2012_data(yoga):
    """Download data for videos published in 2012."""
    return get_year_data(yoga, 2012)

def get_2013_data(yoga):
    """Download data for videos published in 2013."""
    return get_year_data(yoga, 2013)

def get_2014_data(yoga):
    """Download data for videos published in 2014."""
    return get_year_data(yoga, 2014)

def get_2015_data(yoga):
    """Download data for videos published in 2015."""
    return get_year_data(yoga, 2015)

def get_2016_data(yoga):
    """Download data for videos published in 2016."""
    return get_year_data(yoga, 2016)

### Run code to extract data

In [42]:
# Search phrase(s) for this part; each one becomes a `key_word` label.
yoga = ["Yoga With Adriene"]
# One download per publication year, 2010 through 2016.
# NOTE: the names yoga_2010 ... yoga_2016 are the same ones the 'yoga' search
# earlier in this notebook assigned, so those earlier frames are overwritten.
yoga_2010 = get_2010_data(yoga)
yoga_2011 = get_2011_data(yoga)
yoga_2012 = get_2012_data(yoga)
yoga_2013 = get_2013_data(yoga)
yoga_2014 = get_2014_data(yoga)
yoga_2015 = get_2015_data(yoga)
yoga_2016 = get_2016_data(yoga)

In [1]:
yoga_2010.head(2)

NameError: name 'yoga_2010' is not defined