# Dataset Creation
---
This notebook builds the dataset used for modeling. The original data imported below comes from [this Harvard study](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VE0IVQ). The notebook is included in the GitHub repository for reproducibility.

In [1]:
#Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk

In [3]:
#Loads the pickled chat log of each of the four streamers used in this notebook
#NOTE(review): unpickling can execute arbitrary code, so these files should only
#come from a trusted copy of the source dataset
kingrichard, scarra, xchocobars, tfue = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/ICWSM19_data/kingrichard.pkl',
        '../data/ICWSM19_data/scarra.pkl',
        '../data/ICWSM19_data/xchocobars.pkl',
        '../data/ICWSM19_data/tfue.pkl',
    )
)

Selects nine streams

In [30]:
#Pulls each chosen kingrichard stream out of the full log by its video id
kr_stream1 = kingrichard.loc[kingrichard['video_id'].eq('264485130')]
kr_stream2 = kingrichard.loc[kingrichard['video_id'].eq('269543679')]
kr_stream3 = kingrichard.loc[kingrichard['video_id'].eq('270350156')]
#Stacks the three streams row-wise and renumbers the index from 0
kr_short = pd.concat(objs = [kr_stream1, kr_stream2, kr_stream3], ignore_index = True)

In [4]:
#Pulls each chosen scarra stream out of the full log by its video id
sc_stream1 = scarra.loc[scarra['video_id'].eq('265494216')]
sc_stream2 = scarra.loc[scarra['video_id'].eq('262866347')]
#Stacks the two streams row-wise and renumbers the index from 0
sc_short = pd.concat(objs = [sc_stream1, sc_stream2], ignore_index = True)

In [5]:
#Pulls each chosen xchocobars stream out of the full log by its video id
cb_stream1 = xchocobars.loc[xchocobars['video_id'].eq('275994445')]
cb_stream2 = xchocobars.loc[xchocobars['video_id'].eq('274073677')]
#Stacks the two streams row-wise and renumbers the index from 0
cb_short = pd.concat(objs = [cb_stream1, cb_stream2], ignore_index = True)

In [6]:
#Pulls each chosen tfue stream out of the full log by its video id
tf_stream1 = tfue.loc[tfue['video_id'].eq('265017626')]
tf_stream2 = tfue.loc[tfue['video_id'].eq('266069120')]
#Stacks the two streams row-wise and renumbers the index from 0
tf_short = pd.concat(objs = [tf_stream1, tf_stream2], ignore_index = True)

Merges stream subsets together

In [40]:
#Per-streamer subsets, listed in the order they are stacked
dfs = [
    kr_short,
    sc_short,
    cb_short,
    tf_short,
]
#Row-wise concatenation into one frame with a fresh continuous index
final_df = pd.concat(objs = dfs, ignore_index = True)

Keep only comments made by users, dropping admin/staff comments

In [42]:
#Keeps the rows whose commenter_type is 'user', then removes that column
#since every remaining row holds the same value
df = (
    final_df
    .loc[final_df['commenter_type'] == 'user']
    .drop(columns = 'commenter_type')
)

Drop any edited Twitch chats (there is no way to tell whether the commenter or a moderator made the edit)

In [44]:
#Converts columns to datetime for easy use
df['created_at'] = pd.to_datetime(df['created_at'])
df['updated_at'] = pd.to_datetime(df['updated_at'])
#Time elapsed between creation and last update of each chat. Subtracting the
#two columns directly gives the same values as a row-by-row apply without
#running a Python-level call per row
df['edited'] = df['updated_at'] - df['created_at']
#Only keeps rows that weren't edited (zero elapsed time). The .copy() makes
#no_edits an independent frame, so adding columns to it later does not raise
#SettingWithCopyWarning
no_edits = df[df['edited'] == pd.Timedelta("0 days 00:00:00")].copy()

Create twitch_chat column that has no emoticons in it

In [48]:
#Function used to replace chats
def replace_text(fragments):
    """Join the text of a chat's fragments into one space-separated string.

    Parameters
    ----------
    fragments : iterable of dict
        Fragments of a single chat message. Only fragments that contain a
        'text' key contribute to the result.

    Returns
    -------
    str
        The 'text' values in order, separated by single spaces, or '' when
        no fragment has a 'text' key.
    """
    #NOTE(review): a fragment that also carries 'emoticon_id' still contributes
    #its 'text' here - confirm whether emote text is meant to be excluded
    twitch_chat = ''
    for d in fragments:
        #Fragments without text contribute nothing
        if 'text' not in d:
            continue
        #Looks the value up by key so the result does not depend on the order
        #of the keys inside the fragment
        text = d['text']
        #The first piece starts the chat, later pieces are appended after a space
        if len(twitch_chat) == 0:
            twitch_chat = text
        else:
            twitch_chat = twitch_chat + ' ' + text
    return twitch_chat

#Builds the text column by running replace_text over every chat's fragments
no_edits['twitch_chat'] = no_edits['fragments'].map(replace_text)

Break emoticons out into a list in their own column

In [50]:
#Function used to isolate emotes
def emoticon_list(fragments):
    """Collect one entry for every emote fragment of a chat message.

    Parameters
    ----------
    fragments : iterable of dict
        Fragments of a single chat message. A fragment counts as an emote
        when it has an 'emoticon_id' key.

    Returns
    -------
    list
        The first value stored in each emote fragment, in order, or
        ['None'] (the string, not the None object) when there are no emotes.
    """
    #NOTE(review): the entry taken is the first value in the dict, whichever
    #key that belongs to - confirm it is the field intended for this column
    emotes = [
        next(iter(d.values()))
        for d in fragments
        if 'emoticon_id' in d.keys()
    ]
    #Chats without emotes get a placeholder entry instead of an empty list
    return emotes if emotes else ['None']

#Builds the emotes column by running emoticon_list over every chat's fragments
no_edits['emotes'] = no_edits['fragments'].map(emoticon_list)

Get rid of any rows that are chat commands (start with !)

In [53]:
#Boolean mask that is True for chats whose text begins with '!'
command = no_edits['twitch_chat'].str.startswith('!')
#Keeps every chat that is not flagged and renumbers the rows from 0
no_edits = no_edits.loc[~command].reset_index(drop = True)

Drops the columns just used for creating other features that are no longer needed

In [54]:
#Removes the helper columns that were only needed to derive other features
no_edits = no_edits.drop(columns = ['fragments', 'updated_at', 'edited'])

Saves the final dataframe

In [56]:
#Writes the finished dataset as CSV without the row index. The target file name
#has no extension, and list-valued columns are stored as their text form
no_edits.to_csv(path_or_buf = '../data/small_merged_chats', index = False)