### Acquire & Clean Data

In [154]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
import json
import requests
import os
import warnings
import time
from dotenv import load_dotenv
load_dotenv()

# Bot token sent in the Authorization header of every Discord API call.
# Read from the process environment; the value is None when the variable is unset.
access_token = os.getenv("access_token")

# Ids of the channels whose history is downloaded (None when unset).
workshop_channel_id = os.getenv("workshop_channel_id")
vc_channel_id = os.getenv("vc_channel_id")
weebchat_channel_id = os.getenv("weebchat_channel_id")
lol_channel_id = os.getenv("lol_channel_id")

In [None]:
def jsonToDataframe(messages_json, channel_name):
    """Flatten a list of decoded message objects into a DataFrame.

    Parameters
    ----------
    messages_json : iterable of dict
        Each item must provide the keys ``'timestamp'``, ``'content'``,
        ``'id'`` and a nested ``'author'`` mapping with a ``'username'`` key.
    channel_name : str
        Label written to every row of the ``channel`` column.

    Returns
    -------
    pandas.DataFrame
        One row per message with the columns ``username``, ``timestamps``,
        ``content``, ``message_id`` and ``channel``.

    Raises
    ------
    KeyError
        If a message lacks one of the required keys.
    """
    # One record per message; ``columns=`` below fixes the column order.
    records = [
        {
            'timestamps': msg['timestamp'],
            'username': msg['author']['username'],
            'content': msg['content'],
            'message_id': msg['id'],
        }
        for msg in messages_json
    ]

    frame = pd.DataFrame(
        records,
        columns=['username', 'timestamps', 'content', 'message_id'],
    )
    frame['channel'] = channel_name
    return frame

def getChannelMessages(access_token, channel_id, verify=True):
    """Download the message history of one channel from the Discord REST API.

    Requests pages of up to 100 messages and walks backwards through the
    channel: the id of the last message received so far is sent as the
    ``before`` cursor of the next request, and paging stops at the first
    empty page. Each message is included exactly once.

    Parameters
    ----------
    access_token : str
        Bot token; sent as ``Authorization: Bot <token>``.
    channel_id : str or int
        Id of the channel to read.
    verify : bool or str, optional
        Forwarded to ``requests.get``. Defaults to ``True`` so the bot token
        is only sent over a connection whose TLS certificate was validated.
        Pass ``False`` (or a CA bundle path) explicitly if the local
        environment requires it.

    Returns
    -------
    list
        The decoded message objects of all pages, in the order received.

    Raises
    ------
    ValueError
        If ``access_token`` or ``channel_id`` is empty or ``None`` (for
        example because the environment variable was not set).
    requests.HTTPError
        If the API answers with an error status (bad token, missing
        permission, rate limit, ...), instead of treating the error body as a
        page of messages.
    """
    if not access_token or not channel_id:
        raise ValueError("access_token and channel_id must both be set")

    api_url = "https://discord.com/api/channels/" + str(channel_id) + "/messages"
    api_call_headers = {'Authorization': 'Bot ' + access_token}

    def fetch_page(before=None):
        # One request == one page; ``before`` is omitted for the first page.
        query = {'limit': 100}
        if before is not None:
            query['before'] = before
        api_call_response = requests.get(
            api_url,
            headers=api_call_headers,
            params=query,
            verify=verify,
            timeout=30,  # do not let a stalled connection hang the notebook
        )
        api_call_response.raise_for_status()
        return json.loads(api_call_response.text)

    # Start from an empty list: the first page is added inside the loop, so it
    # is not stored twice.
    messages_json = []
    messages_json_new = fetch_page()

    while len(messages_json_new) > 0:
        messages_json.extend(messages_json_new)
        starting_id = messages_json[-1]['id']

        print("Working on channel_id: " + str(channel_id))
        time.sleep(.25)  # small pause between requests

        messages_json_new = fetch_page(before=starting_id)

    return messages_json

In [185]:
# Messages pull and JSON -> DataFrame conversion.
# For each channel: download the full history, then flatten it into a
# DataFrame labelled with the channel name.
# NOTE(review): the filter below silences every warning category for the rest
# of the kernel session, not only warnings raised by the calls in this cell.
warnings.filterwarnings("ignore")

# workshop
workshop_json = getChannelMessages(access_token, workshop_channel_id)
workshop_df = jsonToDataframe(workshop_json, 'workshop')

# vc-discussion
vc_json = getChannelMessages(access_token, vc_channel_id)
vc_df = jsonToDataframe(vc_json, 'vc-discussion')

# weebchat
weebchat_json = getChannelMessages(access_token, weebchat_channel_id)
weebchat_df = jsonToDataframe(weebchat_json, 'weebchat')

# lol
lol_json = getChannelMessages(access_token, lol_channel_id)
lol_df = jsonToDataframe(lol_json, 'lol')

Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 392898909267165188
Working on channel_id: 39289890926

In [161]:
# Cleaning process
# Done in this cell:
#   1. drop messages written by bot accounts
#   2. drop messages whose content is the empty string (GIF posts, per the original note)
# TODO: strip mention tags (<@!...>) from the content and fix the timestamps.

# Bot accounts to exclude; compared against the lower-cased username.
# NOTE(review): names come from the original note ("Pepper-potts & groovy") —
# confirm the exact usernames against the data.
bot_usernames = ['pepper-potts', 'groovy']

# Boolean mask built from workshop_df itself, so the cell does not depend on
# names defined nowhere in the notebook.
is_bot_message = workshop_df['username'].str.lower().isin(bot_usernames)
workshop_df1 = workshop_df[~is_bot_message]

workshop_df2 = workshop_df1[workshop_df1['content'] != ""]

In [188]:
# Allow up to 1000 rows in displayed output. The option key is written out in
# full: abbreviated keys are resolved by pattern matching and stop working if
# they ever match more than one option.
pd.set_option('display.max_rows', 1000)
# Distinct usernames left after the cleaning steps above.
workshop_df2['username'].unique()


array(['626', 'Gremlin', 'Rohnji', 'Prime', 'IRON', 'suchadad', 'Pikachi',
       'aaliyah', 'Floseidon', 'nuna_d', 'Yang', 'Tenecchi', 'grogu',
       'Siddartha', 'Naughty Roti', 'fancypants', 'Chucher', 'Raederle',
       'kumomi18', 'DrewTheDon', 'CrabRaveDad', 'ellie', 'DownToPho',
       'Rythm', 'Bennay', 'jacksmores', 'Kaivitz', 'rina', 'archerja',
       'gambling bot', 'MEE6', 'kinglenyx', 'chimkin nugget', 'mattyv43',
       'Simple Poll', 'NatRae', 'Pnoyy-', '.ryan', 'peepeepoopoo', 'Den~',
       'ericle0n', 'neely', 'lael54', 'Zo', 'Maalikin', 'NotoriousCAT',
       'Alfredo', 'TurkeyManLaFlare', 'ricki', 'cass', 'alpha',
       'MethodAct', 'thegreatgabola', 'star.duhst', 'LoreVerne', 'danny',
       'Zandophen', 'Sidekick'], dtype=object)

In [None]:
# Second pass: drop the bot accounts that are still present in workshop_df2.
# NOTE(review): this list was picked from the username listing shown above
# (the entries that look like bots); the original cell referenced an undefined
# name here, so confirm the intended set of usernames.
remaining_bot_usernames = ['Rythm', 'MEE6', 'gambling bot', 'Simple Poll', 'Sidekick']

workshop_df3 = workshop_df2[~workshop_df2['username'].isin(remaining_bot_usernames)]
workshop_df3['username'].unique()