# Twitch Data Collection

## Testing the API

In [2]:
# Import necessary libraries
import pandas as pd
import requests
import json

# 
from dotenv import load_dotenv
import os 


The secrets are read from a `.env` file in the root of the project folder. The token was generated by following the Twitch API documentation: https://dev.twitch.tv/docs/api

In [3]:
# Load CLIENT_ID, CLIENT_SECRET and ACCESS_TOKEN from the project's .env file
# into the process environment.
load_dotenv()

url = "https://api.twitch.tv/helix/games/top?first=40"
CLIENT_ID = os.environ.get('CLIENT_ID')
CLIENT_SECRET = os.environ.get('CLIENT_SECRET')
ACCESS_TOKEN = os.environ.get('ACCESS_TOKEN')

# Fail fast with a readable message: without this check a missing ACCESS_TOKEN
# surfaces as "can only concatenate str (not NoneType)" when building the
# Authorization header, which hides the real cause.
_missing_env = [
    name
    for name, value in (('CLIENT_ID', CLIENT_ID), ('ACCESS_TOKEN', ACCESS_TOKEN))
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_env)
        + ". Add them to the .env file in the project root."
    )

# Headers sent with the test request: client id plus the stored bearer token.
headers = {
    'Client-ID': CLIENT_ID,
    'Authorization': 'Bearer ' + ACCESS_TOKEN
}

In [4]:
# Smoke test: call the top-games URL defined above and display the parsed JSON body.
requests.get(url, headers=headers).json()

{'data': [{'id': '21779',
   'name': 'League of Legends',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/21779-{width}x{height}.jpg'},
  {'id': '29452',
   'name': 'Virtual Casino',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/29452_IGDB-{width}x{height}.jpg'},
  {'id': '509658',
   'name': 'Just Chatting',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/509658-{width}x{height}.jpg'},
  {'id': '33214',
   'name': 'Fortnite',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/33214-{width}x{height}.jpg'},
  {'id': '32982',
   'name': 'Grand Theft Auto V',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/32982_IGDB-{width}x{height}.jpg'},
  {'id': '516575',
   'name': 'VALORANT',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/516575-{width}x{height}.jpg'},
  {'id': '512980',
   'name': 'Fall Guys: Ultimate Knockout',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/512980-{width}x{height}.jpg'},
  {'id': '29595'

Generating a new token for each call to the API

In [5]:
AUTH_URL = "https://id.twitch.tv/oauth2/token"
CLIENT_ID = os.environ.get('CLIENT_ID')
CLIENT_SECRET = os.environ.get('CLIENT_SECRET')
GRANT_TYPE = "client_credentials"

# Fields for the OAuth client-credentials request (client id + client secret).
params = {
  "client_id": CLIENT_ID,
  "client_secret": CLIENT_SECRET,
  "grant_type": GRANT_TYPE
}

# Send the credentials as a form-encoded POST body (data=) instead of URL query
# parameters (params=), so the client secret does not end up in the request URL.
r1 = requests.post(url=AUTH_URL, data=params, timeout=30)
# Surface a rejected request as an HTTPError rather than a KeyError on
# "access_token" in the next line.
r1.raise_for_status()
token = r1.json()["access_token"]

In [6]:
# Look up a channel by name through the Helix search-channels endpoint.
URL = "https://api.twitch.tv/helix/search/channels?query=reisu1337"

# Rebuild the auth headers so they carry the freshly generated token.
headers = {'Client-ID': CLIENT_ID}
headers['Authorization'] = 'Bearer ' + token

requests.get(URL, headers=headers).json()

{'data': [{'broadcaster_language': 'en',
   'broadcaster_login': 'reign1337',
   'display_name': 'Reign1337',
   'game_id': '509658',
   'game_name': 'Just Chatting',
   'id': '95707013',
   'is_live': False,
   'tag_ids': [],
   'thumbnail_url': 'https://static-cdn.jtvnw.net/jtv_user_pictures/reign1337-profile_image-77df4c4bcf4e390e-300x300.png',
   'title': 'quick workout',
   'started_at': ''},
  {'broadcaster_language': 'en',
   'broadcaster_login': 'reiss1337',
   'display_name': 'reiss1337',
   'game_id': '33214',
   'game_name': 'Fortnite',
   'id': '246146116',
   'is_live': False,
   'tag_ids': [],
   'thumbnail_url': 'https://static-cdn.jtvnw.net/jtv_user_pictures/54834c0d-31f1-4c89-9b53-c4c0cab7a958-profile_image-300x300.jpg',
   'title': '[PT-BR] My life is a fucking shit ;(',
   'started_at': ''},
  {'broadcaster_language': 'en',
   'broadcaster_login': 'reisu1337',
   'display_name': 'Reisu1337',
   'game_id': '27471',
   'game_name': 'Minecraft',
   'id': '576638480',
  

## Framing the project

In this section I will outline what I am aiming to achieve with this piece of work. In particular, I will describe:
- The questions that I hope to answer
- What data I will use to answer these questions

### Questions/Hypothesis

#1 There is a positive relationship between minutes streamed and followers

#2 Mature content is more popular

#3 What is the best time for a streamer to go live?

#4 What are the top games on Twitch?

#5 Who is the most popular streamer on Twitch in terms of followers/subscriptions/views?


### Data requirements

I will collect the data from the Twitch API

I will separate the data into two dataframes

1. Channels
- Channel Name
- Channel Start Date (?)
- Watch Time (minutes)
- Stream Time (minutes)
- Peak Viewers
- Average Viewers
- Followers
- Followers gained
- Views gained
- Subscription count
- Max subscription count (?)
- Partnered
- Mature

2. Games
- Game Name
- Month (?)
- Year (?)
- Rank in the month
- Hours watched
- Hours streamed on twitch
- Peak viewers
- Peak Channels
- Streamers
- Average viewers

## Channel dataset

### Get streams

[Search channels](https://dev.twitch.tv/docs/api/reference#search-channels) gives us the channel ID and the channel name.
[Get Users](https://dev.twitch.tv/docs/api/reference#get-users) gives us Views gained and stream created date.

Note: The Twitch API has the [Get Streams](https://dev.twitch.tv/docs/api/reference#get-streams) endpoint. However, it only returns live streams. There is no endpoint that returns all streams (live and offline). Therefore, I could collect a dataset by getting the top followed streams over time and storing them in a CSV. 

For now, I will build out the analysis using just the live streams.

In [73]:
# Auth headers used by the Helix requests in this section:
# the app's client id plus the generated bearer token.
headers = dict([
    ('Client-ID', CLIENT_ID),
    ('Authorization', 'Bearer ' + token),
])

In [168]:
def get_streams(max_pages=10, page_size=100):
    """Collect the user id and user name of currently live streams.

    Pages through the Helix ``/streams`` endpoint using the notebook-level
    ``headers`` for authentication. Paging stops after ``max_pages`` requests
    or as soon as the response carries no pagination cursor, whichever comes
    first.

    Parameters
    ----------
    max_pages : int, default 10
        Maximum number of requests (pages) to make.
    page_size : int, default 100
        Value sent as the ``first`` query parameter (items per page).

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``stream_name``, one row per distinct
        ``user_id``, in the order the API returned them.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. an expired token).
    """
    data = []
    cursor = None

    for _ in range(max_pages):
        # Only send `after` once we actually hold a cursor; the first request
        # must not carry an empty one.
        params = {'first': page_size}
        if cursor:
            params['after'] = cursor

        response = requests.get(
            "https://api.twitch.tv/helix/streams",
            headers=headers,
            params=params,
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()

        for stream in payload.get('data', []):
            data.append([stream['user_id'], stream['user_name']])

        # The last page comes back with an empty `pagination` object, so the
        # cursor has to be read defensively instead of indexed directly.
        cursor = payload.get('pagination', {}).get('cursor')
        if not cursor:
            break

    df = pd.DataFrame(data, columns=['user_id', 'stream_name'])

    # Streams move between pages while we are paging (viewer counts change),
    # so the same broadcaster can show up twice; keep the first occurrence.
    return df.drop_duplicates(subset='user_id').reset_index(drop=True)

    
    
    

In [169]:
# Fetch the live streams once and keep the frame for the cells that follow.
streams_df = get_streams()

In [170]:
# Preview the first rows of the collected streams.
streams_df.head()

Unnamed: 0,user_id,stream_name
0,124422593,LEC
1,459331509,auronplay
2,57781936,RocketLeague
3,622498423,otplol_
4,100814397,Dota2RuHub


In [171]:
# (rows, columns) of the collected streams frame.
streams_df.shape

(995, 2)

### Get additional stream info

Now that we have the user_id of the top 1000 live streams, we can use the user ID to pull other information.

In [175]:
def get_view_count(user_ids=None, batch_size=100):
    """Fetch view count and account creation date for a set of users.

    Queries the Helix ``/users`` endpoint in batches (several ``id`` query
    parameters per request) instead of issuing one request per user, using
    the notebook-level ``headers`` for authentication.

    Parameters
    ----------
    user_ids : iterable, optional
        User ids to look up. Defaults to ``streams_df['user_id']`` so a bare
        ``get_view_count()`` call keeps working as before.
    batch_size : int, default 100
        Number of ids per request; clamped to the range 1..100.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``view_count`` and ``created_at_date``, in the
        order of ``user_ids``. Ids the API returns no record for are skipped
        rather than raising ``IndexError``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    if user_ids is None:
        user_ids = streams_df['user_id']
    ids = [str(uid) for uid in user_ids]

    # NOTE(review): the Get Users reference documents a 100-id limit per
    # request and marks `view_count` as deprecated - confirm before relying
    # on the values.
    batch_size = max(1, min(int(batch_size), 100))

    # Ask for each id only once, even if the input contains repeats.
    unique_ids = list(dict.fromkeys(ids))
    users_by_id = {}
    for start in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start:start + batch_size]
        response = requests.get(
            "https://api.twitch.tv/helix/users",
            headers=headers,
            # A list of pairs is encoded as repeated params: id=1&id=2&...
            params=[('id', uid) for uid in batch],
            timeout=30,
        )
        response.raise_for_status()
        for user in response.json().get('data', []):
            users_by_id[user['id']] = user

    # Emit rows in input order; the API is not assumed to preserve it.
    data = [
        [user['id'], user.get('view_count'), user.get('created_at')]
        for user in (users_by_id.get(uid) for uid in ids)
        if user is not None
    ]
    df = pd.DataFrame(data, columns=['user_id', 'view_count', 'created_at_date'])
    return df

In [176]:
# Keep the result: the merge cell further down reads `view_count_df`, which was
# never assigned, so the notebook failed on Restart & Run All.
view_count_df = get_view_count()
view_count_df

Unnamed: 0,user_id,view_count,created_at_date
0,124422593,133495781,2016-05-16T18:00:12Z
1,459331509,223614599,2019-09-03T14:02:49Z
2,57781936,295285770,2014-02-26T16:42:48Z
3,622498423,77706610,2020-12-18T10:17:00Z
4,100814397,330402074,2015-08-28T09:10:34Z
...,...,...,...
990,43071273,339127,2013-05-01T10:12:22Z
991,51529432,8424969,2013-11-14T20:38:01Z
992,109593463,18837731,2015-12-15T20:26:42Z
993,76149772,5019534,2014-11-29T00:01:06Z


In [177]:
# Attach view count and creation date to each stream; the inner join keeps only
# user ids present in both frames.
pd.merge(streams_df, view_count_df, on="user_id", how="inner")

Unnamed: 0,user_id,stream_name,view_count,created_at_date
0,124422593,LEC,133495781,2016-05-16T18:00:12Z
1,459331509,auronplay,223614599,2019-09-03T14:02:49Z
2,57781936,RocketLeague,295285770,2014-02-26T16:42:48Z
3,622498423,otplol_,77706610,2020-12-18T10:17:00Z
4,100814397,Dota2RuHub,330402074,2015-08-28T09:10:34Z
5,52091823,Castro_1021,130624729,2013-11-24T08:42:28Z
6,39426641,Evelone192,93580063,2013-01-18T07:41:13Z
7,45044816,MontanaBlack88,147073500,2013-06-21T23:14:20Z
8,22346597,LVPes,229606073,2011-05-11T10:25:51Z
9,49207184,fps_shaka,118681672,2013-09-19T13:21:29Z
