In [1]:
import pandas as pd
import numpy as np
import uuid
import datetime
import random
import time

# Initialize and clean chat dataset

In [2]:
df = pd.read_csv('../data/healthygamer_gg_testdata.csv')
df

Unnamed: 0,user,channel,message,timestamp
0,itztony1702,healthygamer_gg,BibleThump BibleThump,2021-07-16 14:05:22
1,flaredrip,healthygamer_gg,SUPERHERO BibleThump BibleThump,2021-07-16 14:05:23
2,modxta23,healthygamer_gg,GOOD DAD FeelsGoodMan,2021-07-16 14:05:23
3,reaperdiff,healthygamer_gg,FeelsStrongMan,2021-07-16 14:05:23
4,3rdkira,healthygamer_gg,drhgWeird,2021-07-16 14:05:25
...,...,...,...,...
7725,tmi,,tmi.twitch.tv,2021-07-16 15:40:08
7726,0v3rki1192,healthygamer_gg,like does he actually now what content she mak...,2021-07-16 15:41:12
7727,wizz4e,healthygamer_gg,he 100% didn't knew/didn't check,2021-07-16 15:42:48
7728,tmi,,tmi.twitch.tv,2021-07-16 15:44:16


In [3]:
# Remove unnecessary data
# df.drop(['channel'],axis=1,inplace=True)
df.dropna(axis=0,subset=['user', 'channel'],inplace=True)
df.reset_index(inplace=True,drop=True)
# Anonymize users and channels
df['user'] = 'user_' + pd.Series(pd.factorize(df['user'])[0] + 1).astype(str)
df['channel'] = 'channel_' + pd.Series(pd.factorize(df['channel'])[0] + 1).astype(str)
df

Unnamed: 0,user,channel,message,timestamp
0,user_1,channel_1,BibleThump BibleThump,2021-07-16 14:05:22
1,user_2,channel_1,SUPERHERO BibleThump BibleThump,2021-07-16 14:05:23
2,user_3,channel_1,GOOD DAD FeelsGoodMan,2021-07-16 14:05:23
3,user_4,channel_1,FeelsStrongMan,2021-07-16 14:05:23
4,user_5,channel_1,drhgWeird,2021-07-16 14:05:25
...,...,...,...,...
7704,user_1394,channel_1,no he def new,2021-07-16 15:38:51
7705,user_1328,channel_148,P,2021-07-16 15:39:05
7706,user_1394,channel_1,if u have big bobs. then make money with it,2021-07-16 15:39:29
7707,user_1351,channel_1,like does he actually now what content she mak...,2021-07-16 15:41:12


In [4]:
# Remove invalid dates
df = df[df.timestamp.str.len() == 19]
# Add extra column for just time
df['time'] = df['timestamp'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").strftime('%H:%M:%S'))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,user,channel,message,timestamp,time
0,user_1,channel_1,BibleThump BibleThump,2021-07-16 14:05:22,14:05:22
1,user_2,channel_1,SUPERHERO BibleThump BibleThump,2021-07-16 14:05:23,14:05:23
2,user_3,channel_1,GOOD DAD FeelsGoodMan,2021-07-16 14:05:23,14:05:23
3,user_4,channel_1,FeelsStrongMan,2021-07-16 14:05:23,14:05:23
4,user_5,channel_1,drhgWeird,2021-07-16 14:05:25,14:05:25
...,...,...,...,...,...
7704,user_1394,channel_1,no he def new,2021-07-16 15:38:51,15:38:51
7705,user_1328,channel_148,P,2021-07-16 15:39:05,15:39:05
7706,user_1394,channel_1,if u have big bobs. then make money with it,2021-07-16 15:39:29,15:39:29
7707,user_1351,channel_1,like does he actually now what content she mak...,2021-07-16 15:41:12,15:41:12


# Create randomized donation dataset to add to chat dataset

In [5]:
# Functions to generate random datetime in given range
def str_time_prop(start, end, time_format, prop):
    """Get a time at a proportion of a range of two formatted times.

    start and end should be strings specifying times formatted in the
    given format (strftime-style), giving an interval [start, end].
    prop specifies how a proportion of the interval to be taken after
    start.  The returned time will be in the specified format.
    """

    stime = time.mktime(time.strptime(start, time_format))
    etime = time.mktime(time.strptime(end, time_format))

    ptime = stime + prop * (etime - stime)

    return time.strftime(time_format, time.localtime(ptime))


def random_date(start, end, prop):
    return str_time_prop(start, end, '%Y-%m-%d %H:%M:%S', prop)

In [6]:
# Add donation column
df['donation'] = 0
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,user,channel,message,timestamp,time,donation
0,user_1,channel_1,BibleThump BibleThump,2021-07-16 14:05:22,14:05:22,0
1,user_2,channel_1,SUPERHERO BibleThump BibleThump,2021-07-16 14:05:23,14:05:23,0
2,user_3,channel_1,GOOD DAD FeelsGoodMan,2021-07-16 14:05:23,14:05:23,0
3,user_4,channel_1,FeelsStrongMan,2021-07-16 14:05:23,14:05:23,0
4,user_5,channel_1,drhgWeird,2021-07-16 14:05:25,14:05:25,0
...,...,...,...,...,...,...
7704,user_1394,channel_1,no he def new,2021-07-16 15:38:51,15:38:51,0
7705,user_1328,channel_148,P,2021-07-16 15:39:05,15:39:05,0
7706,user_1394,channel_1,if u have big bobs. then make money with it,2021-07-16 15:39:29,15:39:29,0
7707,user_1351,channel_1,like does he actually now what content she mak...,2021-07-16 15:41:12,15:41:12,0


In [7]:
donation_df = pd.DataFrame(columns=["user","channel","message","timestamp","time","donation"])

# Configurable variables
num_donations = 500
donation_pool = [1,5,10,25,100,500,1000]
message_pool = ['Hi','Love your content!','You\'re the best!','']
start_date = "2021-07-16 14:05:22"
end_date = "2021-07-16 15:48:30"
# Static variables
total_users = len(df['user'].unique())
total_channels = len(df['channel'].unique())

for i in range(num_donations):
    random_time = random_date(start_date, end_date, random.random())
    donation_df.loc[len(donation_df.index)] = [
        'user_' + str(random.randint(1,total_users - 1)),
        'channel_' + str(random.randint(1,total_channels - 1)),
        random.choice(message_pool),
        random_time,
        pd.to_datetime(random_time).strftime('%H:%M:%S'),
        random.choice(donation_pool)
    ]
    
donation_df

Unnamed: 0,user,channel,message,timestamp,time,donation
0,user_1082,channel_116,Love your content!,2021-07-16 14:59:07,14:59:07,5
1,user_221,channel_33,Hi,2021-07-16 14:45:23,14:45:23,10
2,user_816,channel_97,Hi,2021-07-16 15:18:02,15:18:02,100
3,user_134,channel_10,Love your content!,2021-07-16 15:32:20,15:32:20,500
4,user_738,channel_17,,2021-07-16 14:23:42,14:23:42,1
...,...,...,...,...,...,...
495,user_1005,channel_15,Hi,2021-07-16 15:10:13,15:10:13,25
496,user_848,channel_51,Love your content!,2021-07-16 14:25:52,14:25:52,25
497,user_845,channel_134,,2021-07-16 14:46:56,14:46:56,25
498,user_448,channel_115,Hi,2021-07-16 14:43:18,14:43:18,1


In [8]:
# Concatenate and sort the DataFrames
combined_df = pd.concat([df,donation_df],ignore_index=True)
combined_df.sort_values(by=['timestamp'],ignore_index=True,inplace=True)
combined_df

Unnamed: 0,user,channel,message,timestamp,time,donation
0,user_1,channel_1,BibleThump BibleThump,2021-07-16 14:05:22,14:05:22,0
1,user_2,channel_1,SUPERHERO BibleThump BibleThump,2021-07-16 14:05:23,14:05:23,0
2,user_3,channel_1,GOOD DAD FeelsGoodMan,2021-07-16 14:05:23,14:05:23,0
3,user_4,channel_1,FeelsStrongMan,2021-07-16 14:05:23,14:05:23,0
4,user_5,channel_1,drhgWeird,2021-07-16 14:05:25,14:05:25,0
...,...,...,...,...,...,...
8202,user_483,channel_54,Hi,2021-07-16 15:47:26,15:47:26,5
8203,user_796,channel_67,,2021-07-16 15:47:33,15:47:33,500
8204,user_1238,channel_81,,2021-07-16 15:47:37,15:47:37,1
8205,user_341,channel_31,Hi,2021-07-16 15:47:42,15:47:42,1


In [9]:
combined_df.to_csv('../data/chat_dataset.csv',index=False)