# Setup

In [127]:
import json
import re
import urllib
import urllib.request
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import twitch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Data Cleaning

In [128]:
# Bring in subsamples of streamers
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
SAMPLE_SIZE = 10000  # chat messages drawn per streamer
SAMPLE_SEED = 200    # same seed the shuffle/split cells use, so Restart & Run All is repeatable


def load_chat_sample(channel, n=SAMPLE_SIZE, seed=SAMPLE_SEED):
    """Read ``Data/<channel>.pkl`` and return a seeded random subsample of ``n`` rows."""
    return pd.read_pickle('Data/' + channel + '.pkl').sample(n, random_state=seed)


xqc = load_chat_sample('xqcow')
tim = load_chat_sample('timthetatman')
tyler1 = load_chat_sample('loltyler1')

qtpie = load_chat_sample('imaqtpie')
myth = load_chat_sample('tsm_myth')
ninja = load_chat_sample('ninja')

In [129]:
# Get Twitch API keys (stored locally)
with open('api_keys.json') as key_file:
    keys = json.load(key_file)

# Helix client built from the stored client id / secret; used to look up channel ID's
helix = twitch.Helix(keys['client_id'], keys['client_secret'])
streamers = ['xqcow', 'timthetatman', 'loltyler1', 'imaqtpie', 'myth', 'ninja']

# Get all streamer ID's, in the same order as `streamers`
streamer_ids = []
for login in streamers:
    streamer_ids.append(helix.user(login).data['id'])

In [130]:
# Create list of Twitch official global emotes
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request blocks the notebook indefinitely


def fetch_json(address, timeout=REQUEST_TIMEOUT):
    """Download ``address`` and return its body parsed as JSON.

    The connection is closed when the ``with`` block exits.
    """
    with urllib.request.urlopen(address, timeout=timeout) as response:
        return json.loads(response.read().decode())


requested = fetch_json("https://api.twitchemotes.com/api/v4/channels/0")
emote_list = requested['emotes']

# match id's to emotes (keys keep the type the API returned; id_list holds them as ints)
id_emote = {emote['id']: emote['code'] for emote in emote_list}
id_list = [int(emote['id']) for emote in emote_list]

# Get global and channel specific BTTV emotes to throw out of our analysis
requested = fetch_json("https://api.betterttv.net/3/cached/emotes/global")
bttv_emotes = [emote['code'].lower() for emote in requested]

for s in streamer_ids:
    requested = fetch_json("https://api.betterttv.net/3/cached/users/twitch/" + s)
    streamer_emotes = [emote['code'].lower() for emote in requested['channelEmotes']]
    bttv_emotes += streamer_emotes

print("Twitch emote ID subset: " + str(id_list[:5]))
print("BTTV Emote subset: " + str(bttv_emotes[:5]))

Twitch emote ID subset: [12, 17, 10, 864205, 47]
BTTV Emote subset: [':tf:', 'cigrip', 'datsauce', 'foreveralone', 'gaben']


In [131]:
# Emote function (take only one of each global emote, ignore rest)
def make_emote_list(x, valid_ids=None):
    """Return the distinct global emote IDs used in one chat message.

    Parameters
    ----------
    x : iterable of dict
        Message fragments. Only fragments that have an ``'emoticon_id'`` key are considered.
    valid_ids : container of int, optional
        IDs accepted as global emotes. When omitted, the notebook-level ``id_list`` is used.

    Returns
    -------
    list of int
        Each accepted emote ID once; the order of the list is not significant.
    """
    allowed = id_list if valid_ids is None else valid_ids
    found = set()
    for fragment in x:
        # Skip plain-text fragments
        if 'emoticon_id' not in fragment.keys():
            continue
        try:
            emote_id = int(fragment['emoticon_id'])
        except (TypeError, ValueError):
            # A non-numeric ID can never be in the integer allow-list, so skip it
            # rather than aborting the whole Series.apply().
            continue
        if emote_id in allowed:
            found.add(emote_id)

    # The set already holds one copy of each emote
    return list(found)

# Apply function to dataset fragments (adds an 'emotes' column to every streamer frame)
for chat_df in (xqc, tim, tyler1, qtpie, myth, ninja):
    chat_df['emotes'] = chat_df.fragments.apply(make_emote_list)

# Example output
test_fragments = [{'emoticon_id': '4'}, {'text': 'sample text'}, {'emoticon_id': '6'}]
print(make_emote_list(test_fragments))

[4, 6]


In [132]:
# Take only text (no emotes)
def get_text_only(x):
    """Join the ``'text'`` values of the fragments in ``x`` into one lowercase string.

    Fragments without a ``'text'`` key are skipped. The pieces are separated by a
    single space, and leading/trailing whitespace is trimmed from the result.
    """
    pieces = []
    for fragment in x:
        if 'text' in fragment.keys():
            pieces.append(fragment['text'])

    joined = " ".join(pieces)
    return joined.lower().strip()

# Add a 'text_only' column (emote-free, lowercased text) to every streamer frame
for chat_df in (xqc, tim, tyler1, qtpie, myth, ninja):
    chat_df['text_only'] = chat_df.fragments.apply(get_text_only)

# Example output
print(get_text_only(test_fragments))

sample text


In [133]:
def _strip_from_all(pattern):
    """Delete every match of ``pattern`` from the text_only column of each streamer frame."""
    for chat_df in (xqc, tim, tyler1, qtpie, myth, ninja):
        chat_df['text_only'] = chat_df.text_only.str.replace(pattern, '', regex=True)


# Regex out URLs
regex_str = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
_strip_from_all(regex_str)

# Example output (the URL is removed here, the command in the step below)
sample_text = re.sub(
    regex_str,
    '',
    'check out https://github.com/COGS108/group013_sp21 for our project repo, or use !link',
)

# Regex out commands ("!" followed by word characters)
regex_str = r'\!\w+'
_strip_from_all(regex_str)

# Example Output
sample_text = re.sub(regex_str, '', sample_text)
print(sample_text)

check out  for our project repo, or use 


In [134]:
# Remove identifying information
def _alias_pattern(aliases):
    """Build a regex that matches any of ``aliases`` as a whole token.

    An optional leading '@' is consumed with the name. The look-arounds stop the
    pattern from firing inside longer words: a bare 'tim' alternative would
    otherwise turn "time" into "e" in that streamer's chat only, which both
    corrupts the text and leaves a channel-specific artifact behind.
    Trade-off: tokens that merely start with a name (e.g. "ninjas") are now kept.
    """
    alternatives = '|'.join(re.escape(alias) for alias in aliases)
    return r'(?<!\w)@?(?:' + alternatives + r')(?!\w)'


xqc_identifying_info = _alias_pattern(['xqcow', 'xqc'])
tim_identifying_info = _alias_pattern(['timthetatman', 'tatman', 'tim'])
tyler1_identifying_info = _alias_pattern(['loltyler1', 'tyler1', 'tyler'])

qtpie_identifying_info = _alias_pattern(['imaqtpie', 'qtpie', 'qt'])
myth_identifying_info = _alias_pattern(['tsm_myth', 'myth'])
ninja_identifying_info = _alias_pattern(['ninja'])

# Each frame is scrubbed with its own streamer's aliases
for chat_df, identifying_info in (
    (xqc, xqc_identifying_info),
    (tim, tim_identifying_info),
    (tyler1, tyler1_identifying_info),
    (qtpie, qtpie_identifying_info),
    (myth, myth_identifying_info),
    (ninja, ninja_identifying_info),
):
    chat_df['text_only'] = chat_df.text_only.str.replace(identifying_info, '', regex=True)

# Example Output
re.sub(xqc_identifying_info, '', 'i love you @xqcow please respond')

'i love you  please respond'

In [135]:
# Replace every non-word character with a space (\W keeps letters, digits and underscore)
regex_str = r'\W'
for chat_df in (xqc, tim, tyler1, qtpie, myth, ninja):
    chat_df['text_only'] = chat_df.text_only.str.replace(regex_str, ' ', regex=True)

# Example Output
example = re.sub(
    regex_str,
    ' ',
    'oh man i need help!!! please @ me for help#!@!#!# karappa',
)
example

'oh man i need help    please   me for help        karappa'

In [136]:
# Stopwords + bttv emotes + extra spaces
nltk.download('stopwords')
sw = stopwords.words('english') + bttv_emotes
# Set copy for constant-time membership tests; `sw` itself stays a list as before.
sw_lookup = set(sw)


def drop_stopwords(text):
    """Remove tokens found in ``sw`` from ``text``.

    The text is split on whitespace and re-joined with single spaces, so runs of
    spaces left by the earlier cleaning steps collapse as well.
    """
    return ' '.join(token for token in text.split() if token not in sw_lookup)


for chat_df in (xqc, tim, tyler1, qtpie, myth, ninja):
    chat_df['text_only'] = chat_df['text_only'].apply(drop_stopwords)

example = drop_stopwords(example)
example

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'oh man need help please help'

In [137]:
# Count, per channel, the sampled messages whose raw body contains "alpha"
for label, chat_df in (
    ('xqcOW: ', xqc),
    ('TimTheTatMan: ', tim),
    ('loltyler1: ', tyler1),
    ('imaqtpie: ', qtpie),
    ('TSM_Myth: ', myth),
    ('Ninja: ', ninja),
):
    matching_rows = chat_df[chat_df['body'].str.contains("alpha")]
    print(label + str(len(matching_rows)))

xqcOW: 0
TimTheTatMan: 1
loltyler1: 14
imaqtpie: 5
TSM_Myth: 1
Ninja: 0


In [138]:
# Word-level TF-IDF vectorizer using NLTK's word_tokenize, capped at 2000 features
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data that this notebook
# never downloads (only 'stopwords' is fetched) - confirm on a fresh environment.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    sublinear_tf=True,
    max_features=2000,
)

In [139]:
# Remove empty string ('') from text_only for each of the datasets
def _non_empty(values):
    """Return a new Series with only the truthy entries of ``values``, in their original order."""
    return pd.Series(list(filter(None, values)))


xqc_final = _non_empty(xqc['text_only'])
tim_final = _non_empty(tim['text_only'])
tyler1_final = _non_empty(tyler1['text_only'])
qtpie_final = _non_empty(qtpie['text_only'])
myth_final = _non_empty(myth['text_only'])
ninja_final = _non_empty(ninja['text_only'])

In [None]:
# Combine the 'text_only' columns from each of the streamers into one dataset with labels
# Column payloads, one per streamer
xqc_frame = {'Chat Text': xqc_final}
tim_frame = {'Chat Text': tim_final}
tyler1_frame = {'Chat Text': tyler1_final}
qtpie_frame = {'Chat Text': qtpie_final}
myth_frame = {'Chat Text': myth_final}
ninja_frame = {'Chat Text': ninja_final}

# One labelled DataFrame per streamer
xqc_result = pd.DataFrame(xqc_frame).assign(label='xqc')
tim_result = pd.DataFrame(tim_frame).assign(label='tim')
tyler1_result = pd.DataFrame(tyler1_frame).assign(label='tyler1')
qtpie_result = pd.DataFrame(qtpie_frame).assign(label='qtpie')
myth_result = pd.DataFrame(myth_frame).assign(label='myth')
ninja_result = pd.DataFrame(ninja_frame).assign(label='ninja')

# Stack them; the per-streamer indices are kept as-is here
frames = [xqc_result, tim_result, tyler1_result, qtpie_result, myth_result, ninja_result]
text_frame = pd.concat(frames)

In [None]:
# Shuffle the dataset (seeded) and renumber the rows from 0
shuffled_rows = text_frame.sample(frac=1, random_state=200)
text_frame = shuffled_rows.reset_index(drop=True)

# NOTE(review): the vectorizer is fitted on every row before the train/test split in the
# next cell, so the vocabulary and IDF weights also see the test messages.
tfidf_matrix = tfidf.fit_transform(text_frame['Chat Text'])
tfidf_X = tfidf_matrix.toarray()

tfidf_Y = text_frame['label'].values

# NOTE(review): the index was already reset above, so this adds an 'index' column and
# makes a second run of this cell fail - confirm whether a later cell needs that column.
text_frame = text_frame.reset_index()

In [None]:
# Split the dataset: 80% train / 20% test, seeded
(train_tfidf_X, test_tfidf_X,
 train_tfidf_Y, test_tfidf_Y) = train_test_split(
    tfidf_X,
    tfidf_Y,
    test_size=0.2,
    random_state=200,
)

In [140]:
def train_SVM(X, y, kernel='linear'):
    """Fit a support-vector classifier on ``X`` / ``y`` and return the fitted model.

    ``kernel`` is passed straight through to ``SVC`` and defaults to ``'linear'``.
    """
    # fit() returns the estimator itself, so it can be chained onto the constructor
    return SVC(kernel=kernel).fit(X, y)

In [141]:
# Train the model on the training split (default linear kernel)
tfidf_clf = train_SVM(X=train_tfidf_X, y=train_tfidf_Y)

In [None]:
# Make predictions based on the model, first for the training split, then for the test split
prediction_train_tfidf_Y, prediction_test_tfidf_Y = (
    tfidf_clf.predict(features) for features in (train_tfidf_X, test_tfidf_X)
)

In [None]:
# Assess the model with training data
train_report = classification_report(train_tfidf_Y, prediction_train_tfidf_Y)
print(train_report)

In [None]:
# Assess the model with test data
test_report = classification_report(test_tfidf_Y, prediction_test_tfidf_Y)
print(test_report)

# Emote TF-IDF

In [None]:
# Keep only the messages that used at least one global emote (empty lists are falsy)
xqc_emote_final = pd.Series([i for i in xqc['emotes'] if i])
tim_emote_final = pd.Series([i for i in tim['emotes'] if i])
tyler1_emote_final = pd.Series([i for i in tyler1['emotes'] if i])
qtpie_emote_final = pd.Series([i for i in qtpie['emotes'] if i])
myth_emote_final = pd.Series([i for i in myth['emotes'] if i])
ninja_emote_final = pd.Series([i for i in ninja['emotes'] if i])

# Combine the 'emotes' columns from each of the streamers into one dataset with labels
# NOTE: apart from xqc_emote_frame, the *_frame / *_result names below are the same
# ones the chat-text cell assigned, so they are overwritten here.
xqc_emote_frame = {'Emote': xqc_emote_final}
# Build from the emote payload; this previously read the chat-text `xqc_frame`
xqc_result = pd.DataFrame(xqc_emote_frame)
xqc_result['label'] = 'xqc'

tim_frame = {'Emote': tim_emote_final}
tim_result = pd.DataFrame(tim_frame)
tim_result['label'] = 'tim'


tyler1_frame = {'Emote': tyler1_emote_final}
tyler1_result = pd.DataFrame(tyler1_frame)
tyler1_result['label'] = 'tyler1'


qtpie_frame = {'Emote': qtpie_emote_final}
qtpie_result = pd.DataFrame(qtpie_frame)
qtpie_result['label'] = 'qtpie'


myth_frame = {'Emote': myth_emote_final}
myth_result = pd.DataFrame(myth_frame)
myth_result['label'] = 'myth'


ninja_frame = {'Emote': ninja_emote_final}
ninja_result = pd.DataFrame(ninja_frame)
ninja_result['label'] = 'ninja'


emote_frames = [xqc_result, tim_result, tyler1_result, qtpie_result, myth_result, ninja_result]
# Concatenate the emote frames; this previously concatenated `frames`, the chat-text list
emote_frame = pd.concat(emote_frames)



In [None]:
# Display the xqc frame, which now carries the 'emotes' and 'text_only' columns added above
xqc