-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_analysis.py
369 lines (279 loc) · 12 KB
/
data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
import numpy as np
from matplotlib.pyplot import figure, show
import pandas as pd
import re
import datetime
# Configuration
# Length, in days, of the rolling windows used by the rolling plots
rolling_window_days = 90
# Only these users are drawn in the plots
use_names = [
    "aldebaran#4477",
    "generalmelchett",
    "jespertjee",
    "thethymos",
    "virtualhummingbird",
    "maxi_we",
    "1tsclassified",
    "versatilespice",
    "spocklan116",
    "kalmoire",
    "sigismundaugustus6042",
]
# Loading the data TODO: there must be a better way to do this instead of just loading it like this
data_original = pd.read_csv("Retihom.csv", encoding="utf8")
# Report success only after the file has actually been read, so the message
# is never printed when read_csv raises
print("loaded Retihom.csv")
# Working copy that the analysis functions add columns to; data_original stays untouched
# TODO: do this nicer by instead changing the names in the rest of the file (or copying it there)
data = data_original.copy()
def get_bots(unique_names):
    """
    Placeholder for a function that gets the list of bots.

    Not implemented yet: the argument is ignored and None is returned.
    :param unique_names : currently unused
    """
    return None
def add_delta_day(data, column_name: str = 'Date'):
    """
    Parse the date column and add a 'Delta_day' column holding the time elapsed
    since the date in the first row.

    The dataframe is modified in place and also returned.
    :param data : the pandas dataframe containing all the discord messages
    :param column_name : the name of the date column, 'Date' by default
    :returns data : the same dataframe, with the date column converted to
    datetimes (parsed as day/month/year) and the 'Delta_day' column added
    """
    # Parse once and reuse the parsed series for both columns
    parsed_dates = pd.to_datetime(data[column_name], format="%d/%m/%Y")
    data[column_name] = parsed_dates
    # Offset of every row relative to the first row, used for plotting
    data['Delta_day'] = parsed_dates - parsed_dates.iloc[0]
    return data
def get_unique_names(data, column_name: str = 'Author'):
    """
    Return the distinct values of the author column, i.e. every user that
    appears in the chat log.

    :param data : the pandas dataframe containing all the discord messages
    :param column_name : the name of the author column, 'Author' by default
    :returns names : the unique names, in order of first appearance
    """
    authors = data[column_name]
    return authors.unique()
def get_filter_word_count(filedata, filterword_query):
    """
    Get the cumulative count of filter-word matches per author over time.

    The input dataframe is left unmodified; the per-message counts live only
    in a temporary frame.
    :param filedata : dataframe with 'Author', 'Content' and 'Delta_day' columns
    :param filterword_query : regular expression to count in each message
    (matched case-insensitively)
    :returns data : dataframe indexed by 'Delta_day' whose columns are a
    two-level index ('filter_count', author), holding running totals
    """
    # Matches per message, kept out of the caller's dataframe
    match_counts = filedata['Content'].str.count(filterword_query, flags=re.IGNORECASE)
    data = filedata[["Author", "Delta_day"]].assign(filter_count=match_counts)
    # Total matches per author per day
    data = data.groupby(['Author', 'Delta_day']).sum()
    # One column per author; days on which an author wrote nothing become 0 so
    # the cumulative sum carries the previous total forward
    data = data.unstack(0).fillna(0)
    return data.cumsum()
def get_total_words(filedata):
    """
    Get the cumulative number of words per author over time.

    Words are whitespace-separated tokens of the 'Content' column. The input
    dataframe is left unmodified.
    :param filedata : dataframe with 'Author', 'Content' and 'Delta_day' columns
    :returns data : dataframe indexed by 'Delta_day' whose columns are a
    two-level index ('word_number', author), holding running totals
    """
    # Words per message, kept out of the caller's dataframe
    words_per_message = filedata['Content'].str.split().str.len()
    data = filedata[["Author", "Delta_day"]].assign(word_number=words_per_message)
    # Total words per author per day
    data = data.groupby(['Author', 'Delta_day']).sum()
    # One column per author; days on which an author wrote nothing become 0 so
    # the cumulative sum carries the previous total forward
    data = data.unstack(0).fillna(0)
    return data.cumsum()
def plot_general():
    """
    Placeholder for a generic plot function.

    Not implemented yet: does nothing and returns None.
    """
    return None
def plot_messages(filedata, plot_name):
    """
    Plot the cumulative number of messages per author over time and save it.

    Only authors listed in ``use_names`` are drawn, ordered by final total.
    :param filedata : dataframe with 'Author', 'Date' and 'Delta_day' columns;
    'Delta_day' is assumed to hold timedeltas relative to the first row's
    date, as produced by add_delta_day
    :param plot_name : target passed to ``fig.savefig``
    """
    # Number of messages per author per day
    data = filedata.groupby(['Author', 'Delta_day']).size()
    # One column per author, missing days filled with 0, then running totals
    data = data.unstack(0).fillna(0).cumsum()
    fig = figure(figsize=(15, 10))
    frame = fig.add_subplot(1, 1, 1)
    sortednames = data.iloc[-1].sort_values(ascending=False).index
    # Real calendar dates for the x-axis: days without any message are absent
    # from the index, so counting rows from the first day would shift points
    first_day = pd.to_datetime(filedata['Date'].iloc[0], format="%d/%m/%Y")
    days = first_day + data.index
    for column in sortednames:
        if column in use_names:
            # .iloc: the index is not integer-based, so [-1] would depend on
            # the positional fallback of Series.__getitem__
            number = str(int(data[column].iloc[-1]))
            frame.plot(days, data[column], label=(column + f': {number} messages'))
    frame.set_ylim(bottom=0)
    frame.grid()
    frame.set_ylabel("Messages")
    frame.set_xlabel("Date")
    fig.legend(loc=2)
    fig.suptitle('Data for sum of messages')
    fig.savefig(plot_name)
def plot_words(filedata, plot_name):
    """
    Plot the cumulative number of words per author over time and save it.

    Only authors listed in ``use_names`` are drawn, ordered by final total.
    :param filedata : dataframe with 'Author', 'Content', 'Date' and
    'Delta_day' columns; 'Delta_day' is assumed to hold timedeltas relative
    to the first row's date, as produced by add_delta_day
    :param plot_name : target passed to ``fig.savefig``
    """
    # Selecting the top column level leaves plain author names as columns
    data = get_total_words(filedata)['word_number']
    fig = figure(figsize=(15, 10))
    frame = fig.add_subplot(1, 1, 1)
    sortednames = data.iloc[-1].sort_values(ascending=False).index
    # Real calendar dates for the x-axis: days without any message are absent
    # from the index, so counting rows from the first day would shift points
    first_day = pd.to_datetime(filedata['Date'].iloc[0], format="%d/%m/%Y")
    days = first_day + data.index
    for column in sortednames:
        if column in use_names:
            # .iloc: the index is not integer-based, so [-1] would depend on
            # the positional fallback of Series.__getitem__
            number = str(int(data[column].iloc[-1]))
            frame.plot(days, data[column], label=(column + f': {number} words'))
    frame.set_ylim(bottom=0)
    frame.grid()
    frame.set_ylabel("Words")
    frame.set_xlabel("Date")
    fig.legend(loc=2)
    fig.suptitle('Data for sum of words')
    fig.savefig(plot_name)
# TODO: convert (custom) emojis into words first so they can be counted properly
def plot_filter_words(filedata, plot_name, filterwords, filterword_query):
    """
    Plot the cumulative count of filter-word matches per author and save it.

    Only authors listed in ``use_names`` are drawn, ordered by final total.
    :param filedata : dataframe with 'Author', 'Content', 'Date' and
    'Delta_day' columns; 'Delta_day' is assumed to hold timedeltas relative
    to the first row's date, as produced by add_delta_day
    :param plot_name : target passed to ``fig.savefig``
    :param filterwords : the filter words, only shown in the figure title
    :param filterword_query : the query handed to get_filter_word_count
    """
    # Selecting the top column level leaves plain author names as columns
    data = get_filter_word_count(filedata, filterword_query)['filter_count']
    fig = figure(figsize=(15, 10))
    frame = fig.add_subplot(1, 1, 1)
    sortednames = data.iloc[-1].sort_values(ascending=False).index
    # Real calendar dates for the x-axis: days without any message are absent
    # from the index, so counting rows from the first day would shift points
    first_day = pd.to_datetime(filedata['Date'].iloc[0], format="%d/%m/%Y")
    days = first_day + data.index
    for name in sortednames:
        if name in use_names:
            # .iloc: the index is not integer-based, so [-1] would depend on
            # the positional fallback of Series.__getitem__
            number = str(int(data[name].iloc[-1]))
            frame.plot(days, data[name], label=(name + f': {number} times said'))
    frame.set_ylim(bottom=0)
    frame.grid()
    frame.set_ylabel("Word count")
    frame.set_xlabel("Date")
    fig.legend(loc=2)
    fig.suptitle('Data for filtered words: ' + str(filterwords))
    fig.savefig(plot_name)
def plot_relative_filter_words(filedata, plot_name, filterwords, filterword_query):
    """
    Plot, per author, cumulative filter-word matches divided by cumulative
    words, on a logarithmic y-axis, and save the figure.

    Only authors listed in ``use_names`` are drawn, ordered by final fraction.
    :param filedata : dataframe with 'Author', 'Content', 'Date' and
    'Delta_day' columns; 'Delta_day' is assumed to hold timedeltas relative
    to the first row's date, as produced by add_delta_day
    :param plot_name : target passed to ``fig.savefig``
    :param filterwords : the filter words, only shown in the figure title
    :param filterword_query : the query handed to get_filter_word_count
    """
    # Getting filtered words and total words so we can divide them
    filterdata = get_filter_word_count(filedata, filterword_query)['filter_count']
    worddata = get_total_words(filedata)['word_number']
    # 0 / 0 gives NaN (author has not written anything yet); show that as 0
    fraction = filterdata.divide(worddata).fillna(0)
    fig = figure(figsize=(20, 10))
    frame = fig.add_subplot(1, 1, 1)
    sortednames = fraction.iloc[-1].sort_values(ascending=False).index
    # Real calendar dates for the x-axis: days without any message are absent
    # from the index, so counting rows from the first day would shift points
    first_day = pd.to_datetime(filedata['Date'].iloc[0], format="%d/%m/%Y")
    days = first_day + fraction.index
    for name in sortednames:
        if name in use_names:
            # .iloc: the index is not integer-based, so [-1] would depend on
            # the positional fallback of Series.__getitem__
            number = float(fraction[name].iloc[-1])
            # log10(0) is undefined, so a final fraction of 0 gets a plain label
            if number == 0:
                frame.plot(days, fraction[name], label=(name + r': final fraction: 0'))
            else:
                lognumber = str(round(np.log10(number), 2))
                text = name + r' Final fraction: $10^{' + lognumber + r'}$'
                frame.plot(days, fraction[name], label=text)
    frame.set_yscale('log')
    frame.grid()
    frame.set_ylabel("Fraction of words")
    frame.set_xlabel("Date")
    fig.legend(loc=2)
    fig.suptitle('Data for filtered words: ' + str(filterwords))
    # Leave room on the left for the legend
    fig.subplots_adjust(left=0.25)
    fig.savefig(plot_name)
def get_total_words_rolling(filedata, window_days=None):
    """
    Get the number of words per author summed over a rolling window of days.

    The input dataframe is left unmodified.
    :param filedata : dataframe with 'Author', 'Content' and 'Delta_day'
    columns; 'Delta_day' must be timedelta-like for the time-based window
    :param window_days : window length in days; when None the module-level
    ``rolling_window_days`` is used
    :returns data : dataframe indexed by 'Delta_day' whose columns are a
    two-level index ('word_number', author), holding rolling sums
    """
    if window_days is None:
        window_days = rolling_window_days
    # Words per message, kept out of the caller's dataframe
    words_per_message = filedata['Content'].str.split().str.len()
    data = filedata[["Author", "Delta_day"]].assign(word_number=words_per_message)
    # Total words per author per day
    data = data.groupby(['Author', 'Delta_day']).sum()
    # One column per author, days without messages from an author count as 0
    data = data.unstack(0).fillna(0)
    # Time-based rolling sum; an explicit Timedelta avoids depending on how
    # frequency-alias strings such as '90d' are parsed
    return data.rolling(pd.Timedelta(days=window_days)).sum()
def plot_relative_filter_words_week(filedata, plot_name, filterwords, filterword_query):
    """
    Plot, per author, the rolling count of filter-word matches divided by the
    rolling word count, and save the figure.

    The window is ``rolling_window_days`` days long. Only authors listed in
    ``use_names`` are drawn. The input dataframe is left unmodified by this
    function itself.
    :param filedata : dataframe with 'Author', 'Content', 'Date' and
    'Delta_day' columns; 'Delta_day' is assumed to hold timedeltas relative
    to the first row's date, as produced by add_delta_day
    :param plot_name : target passed to ``fig.savefig``
    :param filterwords : the filter words, only shown in the figure title
    :param filterword_query : regular expression counted in each message
    (matched case-insensitively)
    """
    # Matches per message, kept out of the caller's dataframe
    match_counts = filedata['Content'].str.count(filterword_query, flags=re.IGNORECASE)
    data = filedata[["Author", "Delta_day"]].assign(filter_count=match_counts)
    # Total matches per author per day
    data = data.groupby(['Author', 'Delta_day']).sum()
    # Unstacking such that we have one column per author
    data = data.unstack(0)
    data = data.rolling(pd.Timedelta(days=rolling_window_days)).sum()
    worddata = get_total_words_rolling(filedata)['word_number']
    # Dividing them and changing NaNs into 0s (0 / 0 occurs for empty windows)
    fraction = data.divide(worddata).fillna(0)
    fig = figure(figsize=(20, 10))
    frame = fig.add_subplot(1, 1, 1)
    sortednames = fraction.iloc[-1].sort_values(ascending=False).index
    # Real calendar dates for the x-axis: days without any message are absent
    # from the index, so counting rows from the first day would shift points
    first_day = pd.to_datetime(filedata['Date'].iloc[0], format="%d/%m/%Y")
    days = first_day + fraction.index
    for name in sortednames:
        # Column keys are (value name, author) pairs here
        if name[1] in use_names:
            frame.plot(days, fraction[name], label=name[1])
    frame.grid()
    frame.set_ylabel("Fraction of words")
    frame.set_xlabel("Date")
    fig.legend(loc=2)
    fig.suptitle(f'{rolling_window_days} day rolling fraction for filtered words: ' + str(filterwords))
    # Leave room on the left for the legend
    fig.subplots_adjust(left=0.25)
    fig.savefig(plot_name)
def plot_absolute_filter_words_week(filedata, plot_name, filterwords, filterword_query):
    """
    Plot, per author, the rolling count of filter-word matches and save it.

    The window is ``rolling_window_days`` days long. Only authors listed in
    ``use_names`` are drawn. The input dataframe is left unmodified.
    :param filedata : dataframe with 'Author', 'Content', 'Date' and
    'Delta_day' columns; 'Delta_day' is assumed to hold timedeltas relative
    to the first row's date, as produced by add_delta_day
    :param plot_name : target passed to ``fig.savefig``
    :param filterwords : the filter words, only shown in the figure title
    :param filterword_query : regular expression counted in each message
    (matched case-insensitively)
    """
    # Matches per message, kept out of the caller's dataframe
    match_counts = filedata['Content'].str.count(filterword_query, flags=re.IGNORECASE)
    data = filedata[["Author", "Delta_day"]].assign(filter_count=match_counts)
    # Total matches per author per day
    data = data.groupby(['Author', 'Delta_day']).sum()
    # Unstacking such that we have one column per author
    data = data.unstack(0)
    data = data.rolling(pd.Timedelta(days=rolling_window_days)).sum()
    fig = figure(figsize=(20, 10))
    frame = fig.add_subplot(1, 1, 1)
    sortednames = data.iloc[-1].sort_values(ascending=False).index
    # Real calendar dates for the x-axis: days without any message are absent
    # from the index, so counting rows from the first day would shift points
    first_day = pd.to_datetime(filedata['Date'].iloc[0], format="%d/%m/%Y")
    days = first_day + data.index
    for name in sortednames:
        # Column keys are (value name, author) pairs here
        if name[1] in use_names:
            frame.plot(days, data[name], label=name[1])
    frame.grid()
    frame.set_ylabel("Count of words")
    frame.set_xlabel("Date")
    fig.legend(loc=2)
    fig.suptitle(f'{rolling_window_days} day rolling count for filtered words: ' + str(filterwords))
    # Leave room on the left for the legend
    fig.subplots_adjust(left=0.25)
    fig.savefig(plot_name)
def analyse(choice, plot_name, filterwords=None):
    """
    Run one of the analyses on the module-level ``data`` and save the plot.

    :param choice : which plot to make:
    1 = messages, 2 = words, 3 = filter-word count,
    4 = filter-word fraction, 5 = rolling filter-word count,
    6 = rolling filter-word fraction
    :param plot_name : target passed on to the plot function for saving
    :param filterwords : non-empty sequence of words (regex fragments) to
    count; required for choices 3 to 6, ignored for 1 and 2
    :raises ValueError : if choice is unknown, or filterwords is missing or
    empty for a filter-word plot
    """
    plain_plots = {
        1: plot_messages,
        2: plot_words,
    }
    filter_plots = {
        3: plot_filter_words,
        4: plot_relative_filter_words,
        5: plot_absolute_filter_words_week,
        6: plot_relative_filter_words_week,
    }
    # Validate before touching the data so bad calls fail with a clear message
    if choice not in plain_plots and choice not in filter_plots:
        raise ValueError(f"unknown choice {choice!r}, expected an integer from 1 to 6")
    if choice in filter_plots and not filterwords:
        raise ValueError(f"choice {choice} needs a non-empty list of filterwords")

    filedata = add_delta_day(data)
    if choice in plain_plots:
        plain_plots[choice](filedata, plot_name)
        return
    # pandas' str.count takes a regex, so the words are combined with '|'.
    # The words are used as-is (not escaped), so regex syntax in them is live.
    filterword_query = "|".join(filterwords)
    filter_plots[choice](filedata, plot_name, filterwords, filterword_query)
if __name__ == "__main__":
    # analyse takes (choice, plot_name, filterwords); the output file name
    # must come before the filter words
    analyse(3, "filter_words.png", ['test', 'fun'])