# Twitch Data Collection

## Testing the API

In [1]:
# Import necessary libraries
import pandas as pd
import requests
import json

# 
from dotenv import load_dotenv
import os 


Getting secrets from .env file in root of project folder. Token generated from here https://dev.twitch.tv/docs/api

In [2]:
# load environment
load_dotenv()

url = "https://api.twitch.tv/helix/games/top?first=40"
CLIENT_ID = os.environ.get('CLIENT_ID')
CLIENT_SECRET = os.environ.get('CLIENT_SECRET')
ACCESS_TOKEN = os.environ.get('ACCESS_TOKEN')

headers = {
    'Client-ID': CLIENT_ID,
    'Authorization': 'Bearer ' + ACCESS_TOKEN
}

In [3]:
requests.get(url, headers=headers).json()

{'data': [{'id': '509658',
   'name': 'Just Chatting',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/509658-{width}x{height}.jpg'},
  {'id': '32982',
   'name': 'Grand Theft Auto V',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/32982_IGDB-{width}x{height}.jpg'},
  {'id': '33214',
   'name': 'Fortnite',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/33214-{width}x{height}.jpg'},
  {'id': '21779',
   'name': 'League of Legends',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/21779-{width}x{height}.jpg'},
  {'id': '941530474',
   'name': 'Mario Kart 8 Deluxe',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/941530474_IGDB-{width}x{height}.jpg'},
  {'id': '561013832',
   'name': 'Pokémon Legends: Arceus',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/561013832_IGDB-{width}x{height}.jpg'},
  {'id': '518203',
   'name': 'Sports',
   'box_art_url': 'https://static-cdn.jtvnw.net/ttv-boxart/518203-{width}x{height}.jpg'},

Generating a new token for each call to the API

In [4]:
AUTH_URL = "https://id.twitch.tv/oauth2/token"
CLIENT_ID = os.environ.get('CLIENT_ID')
CLIENT_SECRET = os.environ.get('CLIENT_SECRET')
GRANT_TYPE = "client_credentials"

# post a request to get a token using my client id and client secret
params = {
  "client_id": CLIENT_ID,
  "client_secret": CLIENT_SECRET,
  "grant_type": GRANT_TYPE
}

r1 = requests.post(url=AUTH_URL, params=params)
token = r1.json()["access_token"]

In [5]:
# get request

URL = "https://api.twitch.tv/helix/search/channels?query=reisu1337"

headers = {
    'Client-ID': CLIENT_ID,
    'Authorization': 'Bearer ' + token
}

requests.get(URL, headers=headers).json()

{'data': [{'broadcaster_language': 'en',
   'broadcaster_login': 'retsu1337',
   'display_name': 'Retsu1337',
   'game_id': '0',
   'game_name': '',
   'id': '45370373',
   'is_live': False,
   'tag_ids': [],
   'thumbnail_url': 'https://static-cdn.jtvnw.net/user-default-pictures-uv/de130ab0-def7-11e9-b668-784f43822e80-profile_image-300x300.png',
   'title': '',
   'started_at': ''},
  {'broadcaster_language': 'en',
   'broadcaster_login': 'reign1337',
   'display_name': 'Reign1337',
   'game_id': '509658',
   'game_name': 'Just Chatting',
   'id': '95707013',
   'is_live': False,
   'tag_ids': [],
   'thumbnail_url': 'https://static-cdn.jtvnw.net/jtv_user_pictures/reign1337-profile_image-77df4c4bcf4e390e-300x300.png',
   'title': 'quick workout',
   'started_at': ''},
  {'broadcaster_language': 'en',
   'broadcaster_login': 'reisk1337',
   'display_name': 'reisk1337',
   'game_id': '0',
   'game_name': '',
   'id': '117996547',
   'is_live': False,
   'tag_ids': [],
   'thumbnail_url'

## Framing the project

In this section I will outline what I am aiming to achieve with this piece of work. In particular, I will describe:
- The questions that I hope to answer
- What data I will use to answer these questions

### Questions/Hypothesis

#1 There is a positive relationship between minutes streamed and followers

#2 Mature content is more popular

#3 What is the best time for a streamer to go live?

#4 What are the top games on Twitch?

#5 Who is the most popular streamer on Twitch in terms of followers/subscriptions/views?


### Data requirements

I will collect the data from the Twitch API

I will separate the data into two dataframes

1. Channels
- Channel Name
- Channel Start Date (?)
- Watch Time (minutes)
- Stream Time (minutes)
- Peak Viewers
- Average Viewers
- Followers
- Followers gained
- Views gained
- Subscription count
- Max subscription count (?)
- Partnered
- Mature

2. Games
- Game Name
- Month (?)
- Year (?)
- Rank in the month
- Hours watched
- Hours streamed on twitch
- Peak viewers
- Peak Channels
- Streamers
- Average viewers

## Channel dataset

### Get streams

[Search channels](https://dev.twitch.tv/docs/api/reference#search-channels) gives us the channel ID and the channel name.
[Get Users](https://dev.twitch.tv/docs/api/reference#get-users) gives us Views gained and stream created date.

Note: Twitch API has the [Get Streams](https://dev.twitch.tv/docs/api/reference#get-streams) endpoint. However, it only returns live streams. There is not an endpoint that returns all streams (live and offline). Therefore, I could collect a dataset by getting the top followed streams overtime and storing them in a CSV. 

For now, I will build out the analysis using just the live streams.

In [6]:
headers = {
    'Client-ID': CLIENT_ID,
    'Authorization': 'Bearer ' + token
}

In [7]:
def get_streams():
    data=[]
    cursor=''
    i=0
    
    while i < 10: 
        URL = f"https://api.twitch.tv/helix/streams?first=100&after={cursor}"
        response = requests.get(URL, headers=headers).json()

        # append to df
        for stream in response['data']:
            user_id = stream['user_id']
            stream_name = stream['user_name']
            is_mature = stream['is_mature']
            data.append([user_id, stream_name, is_mature])

        # cursor
        cursor = response['pagination']['cursor']
        
        i+=1
    
    df = pd.DataFrame(data, columns=['user_id', 'stream_name', 'is_mature'])
        
    return df

    
    
    

In [8]:
streams_df = get_streams()

In [9]:
streams_df.head()

Unnamed: 0,user_id,stream_name,is_mature
0,459331509,auronplay,False
1,71092938,xQcOW,False
2,22346597,LVPes,False
3,121510236,juansguarnizo,False
4,71190292,Trainwreckstv,True


In [10]:
streams_df.shape

(998, 3)

### Get stream additional stream info

Now that we have the user_id of the top 1000 live streams, we can user the user id to pull other information.

In [11]:
def get_view_count():
    data=[]
    for user_id in streams_df['user_id']:
        URL = f"https://api.twitch.tv/helix/users?id={user_id}"
        response = requests.get(URL, headers=headers).json()
        user_Fk = response['data'][0]['id']
        view_count = response['data'][0]['view_count']
        created_at_date = response['data'][0]['created_at']
        data.append([user_Fk, view_count, created_at_date])
    df = pd.DataFrame(data, columns=['user_id', 'view_count', 'created_at_date'])
    return df

In [12]:
get_view_count()

IndexError: list index out of range

In [None]:
streams_df.merge(view_count_df, how="inner", on="user_id")

In [13]:
def get_sub_data():
    data=[]
    for user_id in streams_df['user_id']:
        URL = f"https://api.twitch.tv/helix/subscriptions?broadcaster_id={user_id}"
        response = requests.get(URL, headers=headers).json()
        user_Fk = response['data'][0]['broadcaster_id']
        total_subs = response['data'][0]['total']
        data.append([user_Fk, total_subs])
    df = pd.DataFrame(data, columns=['user_id', 'total_subs'])
    return df

In [14]:
def get_follower_data():
    data=[]
    for user_id in streams_df['user_id']:
        URL = f"https://api.twitch.tv/helix/users/follows?to_id={user_id}"
        response = requests.get(URL, headers=headers).json()
        user_Fk = response['data'][0]['to_id']
        total_followers = response['total']
        data.append([user_Fk, total_followers])
    df = pd.DataFrame(data, columns=['user_id', 'total_followers'])
    return df

In [None]:
get_follower_data()