### Imports

In [2]:
import ast
import json
import os
import re

import pandas as pd
from IPython.display import display

import config
from fetch_messages import retrieve_messages

### Scrape messages
- Set the `channel_id` you want to scrape from in your `config.py`
- `to_fetch` defaults to -1, which tells the scraper to grab as much as it can

In [7]:
# Scrape the channel configured in config.py; to_fetch=-1 asks the scraper
# to grab as much as it can.
retrieve_messages(
    channel_id=config.general_channel_id,
    to_fetch=-1,
)

Messages fetched: 50
Messages fetched: 100
Messages fetched: 150
Messages fetched: 200
Messages fetched: 250
Messages fetched: 300
Messages fetched: 350
Messages fetched: 400
Messages fetched: 450
Messages fetched: 500
Messages fetched: 550
Messages fetched: 600
Messages fetched: 650
Messages fetched: 700
Messages fetched: 750
Messages fetched: 800
Messages fetched: 850
Messages fetched: 900
Messages fetched: 950
Messages fetched: 1000
Messages fetched: 1050
Messages fetched: 1100
Messages fetched: 1150
Messages fetched: 1200
Messages fetched: 1250
Messages fetched: 1300
Messages fetched: 1350
Messages fetched: 1400
Messages fetched: 1450
Messages fetched: 1500
Messages fetched: 1550
Messages fetched: 1600
Messages fetched: 1650
Messages fetched: 1700
Messages fetched: 1750
Messages fetched: 1800
Messages fetched: 1850
Messages fetched: 1900
Messages fetched: 1950
Messages fetched: 2000
Messages fetched: 2050
Messages fetched: 2100
Messages fetched: 2150
Messages fetched: 2200
Messages

### Load messages into dataframe
- Also writes the messages to a .csv

In [8]:
# Collect the scraped dumps found in messages/. Entries that are not regular
# files are skipped, and so is any .csv: a later cell writes messages.csv into
# this same folder, and handing that file to read_json on a re-run would break
# this cell. Sorting makes the row order reproducible, since os.listdir returns
# names in arbitrary order.
message_files = sorted(
    filename
    for filename in os.listdir('messages')
    if os.path.isfile(os.path.join('messages', filename))
    and not filename.lower().endswith('.csv')
)
if not message_files:
    # pd.concat would raise a less helpful ValueError on an empty list.
    raise ValueError("No message files found in 'messages/'.")

dfs = []
for filename in message_files:
    data = pd.read_json(f'messages/{filename}')
    dfs.append(data)

# One frame holding every message, with a fresh 0..n-1 index.
messages_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Show the combined frame, then save it as messages/messages.csv.
display(messages_df)
messages_df.to_csv('messages/messages.csv')

### Reload dataframe from .csv

In [14]:
# Read the saved CSV back in and report how many rows it holds.
messages_df = pd.read_csv('messages/messages.csv')
print(f'Loaded {messages_df.shape[0]} messages into messages_df.')

Loaded 10867 messages into messages_df.


### Remove attachment, embedded, and application messages

In [15]:
# Each step records the row count, applies one filter, and reports how many
# rows that filter dropped.

# 1) Keep only rows whose 'attachments' value is the string '[]'.
with_attachments = len(messages_df)
messages_df = messages_df.loc[messages_df['attachments'].eq('[]')]
print(f'Removed {with_attachments - len(messages_df)} attachment messages.')

# 2) Keep only rows whose 'embeds' value is the string '[]'.
with_embeds = len(messages_df)
messages_df = messages_df.loc[messages_df['embeds'].eq('[]')]
print(f'Removed {with_embeds - len(messages_df)} embedded messages.')

# 3) Keep only rows with no 'application_id'.
with_application_msgs = len(messages_df)
messages_df = messages_df.loc[messages_df['application_id'].isna()]
print(f'Removed {with_application_msgs - len(messages_df)} application messages.')

print(f'{len(messages_df)} remaining messages.')

# Work on an independent copy from here on.
cleaned_df = messages_df.copy()

Removed 1285 attachment messages.
Removed 1261 embedded messages.
Removed 104 application messages.
8217 remaining messages.


### Profile users
- Removes bot messages, then creates an `{'id': 'username'}` dictionary containing each user that has sent a message

In [16]:
with_bot_msg = len(cleaned_df)

# The 'author' column holds text at this point (presumably the repr of a dict
# after the CSV round-trip -- confirm against the scraper's output). Parse it
# with ast.literal_eval instead of eval: the text originates from scraped data,
# and eval would execute any expression found in it. Values that are already
# parsed are passed through so this cell can be re-run without raising.
cleaned_df['author'] = cleaned_df['author'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Remove bots. The flag's value is tested rather than the key's mere presence,
# so an author carrying an explicit 'bot': False is not dropped.
cleaned_df = cleaned_df[cleaned_df['author'].apply(lambda author: not author.get('bot', False))]
print(f'Removed {with_bot_msg - len(cleaned_df)} bot messages.')
print(f'{len(cleaned_df)} messages remaining.')

# Map each author id to the username found on that author's first row in the
# frame. A single pass with setdefault gives the same mapping as repeatedly
# filtering the frame once per user, without the repeated scans.
users_dict = {}
for author in cleaned_df['author']:
    users_dict.setdefault(author['id'], author['username'])

Removed 139 bot messages.
8078 messages remaining.


In [7]:
# Report how many distinct authors ended up in the mapping.
user_total = len(users_dict)
print(f'Generated users_dict with {user_total} users.')
# Uncomment to inspect the full mapping:
# display(users_dict)

Generated users_dict with 22 users.


### Replace mention tags with usernames

In [24]:
# Keep messages that mention someone ('mentions' is not the string '[]') and
# have no 'message_reference'. The copy is taken AFTER filtering so that
# with_mentions_df is an independent frame; assigning a column to a filtered
# selection can otherwise trigger pandas' SettingWithCopyWarning.
with_mentions_df = cleaned_df[
    (cleaned_df['mentions'] != '[]') & (cleaned_df['message_reference'].isna())
].copy()

def replace_mentions(msg, users=None):
    """Replace user-mention tags in a message with the matching usernames.

    Both mention spellings are recognised: ``<@ID>`` and the nickname form
    ``<@!ID>``. A tag whose ID is not in the lookup is left exactly as written.

    Args:
        msg: Message text. Non-string values (e.g. NaN for empty content read
            back from a CSV) are returned unchanged.
        users: Optional ``{id: username}`` mapping with string IDs. When
            omitted, the notebook-level ``users_dict`` is used.

    Returns:
        The message with every known mention tag replaced by a username.
    """
    if not isinstance(msg, str):
        return msg
    lookup = users_dict if users is None else users

    def extract_and_replace_id(match):
        # group(1) is the numeric ID; group(0) is the whole tag, returned
        # untouched when the ID is unknown.
        return lookup.get(match.group(1), match.group(0))
    return re.sub(r'<@!?(\d+)>', extract_and_replace_id, msg)
    
# Rewrite the content of each selected message, then report how many rows
# were processed.
resolved_content = with_mentions_df['content'].apply(replace_mentions)
with_mentions_df['content'] = resolved_content
print(f'Replaced mentions with corresponding usernames in {len(with_mentions_df)} messages.')

Replaced mentions with corresponding usernames in 287 messages.
