In [1]:
import pandas as pd
import re
import os
import logging
import sys


def preprocess_dataset(day,nrow=-1):
    if nrow==-1:
        neg_data = pd.read_csv("/bigtemp/rm5tx/nlp_project/"+day+"_for_model_2_tuned.csv")
    else:
        neg_data = pd.read_csv("/bigtemp/rm5tx/nlp_project/"+day+"_for_model_2_tuned.csv",nrows=nrow)
    print("removing irrelevant data")
    neg_data = neg_data.dropna(subset=['author', 'data'])
    neg_data = neg_data[neg_data.author!='[deleted]']
    neg_data = neg_data[neg_data.author!='AutoModerator']

    return neg_data

def get_toxic_users(neg_data,min_threshold=5):
    toxic_df = neg_data[neg_data['label']==1.0]
    toxic_author = toxic_df['author'].value_counts().to_frame().reset_index()
    toxic_author.rename(columns = {'index':'author','author':'toxic_count'},inplace = True)
    
    top_toxic_author = toxic_author[toxic_author.toxic_count>=min_threshold]
    toxic_user_list = top_toxic_author['author'].tolist()
    #toxic_user_list.remove('[deleted]')
    print('extracted toxic users')
    
    return toxic_user_list

def get_adjacent_dataset(user_list,dummy_data,max_threshold=500):
    pos_data = dummy_data[dummy_data.author.isin(user_list)]
    pos_author = pos_data['author'].value_counts().to_frame().reset_index()
    pos_author.rename(columns = {'index':'author','author':'toxic_count'},inplace = True)
    pos_toxic_author = pos_author[pos_author.toxic_count<=max_threshold]
    toxic_user_list = pos_toxic_author['author'].tolist()
    pos_data = pos_data[pos_data.author.isin(toxic_user_list)]
    print('extracted comments of toxic users')
    
    neg_data = dummy_data[~dummy_data.author.isin(user_list)]
    print('extracted comments of non-toxic users')
    return pos_data,neg_data


day = "2016-06" #first parameter for notebook, for python code it will be sys.argv[1]
min_toxic_comment = 1 # second paramter for notebook, for python code it will be sys.argv[2]
max_toxic_comment = 100000 # second paramter for notebook, for python code it will be sys.argv[2]


dummy_df = preprocess_dataset(day)
print(dummy_df.shape)

pos_counts = dummy_df['author'][dummy_df['label']==1.0]].value_counts()

counts = pd.concat([ pos, neg['author'].value_counts()], axis=1)



toxic_users = get_toxic_users(dummy_df,min_threshold=min_toxic_comment)
print("number of toxic users ",len(toxic_users))
print(toxic_users[0:10])

pos_df,neg_df = get_adjacent_dataset(toxic_users,dummy_df,max_threshold=max_toxic_comment)
print(pos_df.shape)
print(neg_df.shape)
pos_df['label'] = 1.0
neg_df['label'] = 0.0


removing irrelevant data
(20130195, 22)
extracted toxic users
number of toxic users  157673
['GoTradeBot', 'TrumpTrain-bot', 'SilentOneBravo', 'CrookedPrisoner', 'autotldr', 'Trump-Tzu', 'ValorousVagabond', 'samacharbot2', 'xspacess', 'Mentioned_Videos']
extracted comments of toxic users
extracted comments of non-toxic users
(10912697, 22)
(9217498, 22)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [36]:

pos_counts = dummy_df['author'][dummy_df['label']==1.0].value_counts().rename("pos")
neg_counts = dummy_df['author'][dummy_df['label']==0.0].value_counts().rename("neg")

In [66]:

pos_counts = dummy_df['author'][dummy_df['label']==1.0].value_counts().rename("pos")
neg_counts = dummy_df['author'][dummy_df['label']==0.0].value_counts().rename("neg")
counts = pd.concat([pos_counts, neg_counts], axis=1)
counts['ratio'] = counts.pos / counts.neg
counts['total'] = counts.pos + counts.neg
counts = counts.dropna()

rat = .3
tot = 2
print(counts['total'][(counts['ratio'] >= rat) & (counts['total'] >= tot)].sum())
toxic_list = 

48193.0


In [82]:
rat = 1.0
tot = 0
print(counts['total'][(counts['ratio'] >= rat) & (counts['total'] >= tot)].sum())
print(len(counts[(counts['ratio'] >= rat) & (counts['total'] >= tot)].index))

12164.0
5948


In [44]:
counts.sort_values(by='ratio', ascending=False)

Unnamed: 0,pos,neg,ratio,total
HungAndSung,3.0,1.0,3.000000,4.0
namaloom,2.0,1.0,2.000000,3.0
the_lurking_turkey,2.0,1.0,2.000000,3.0
sparkyibew100,2.0,1.0,2.000000,3.0
ShadynastyS14,2.0,1.0,2.000000,3.0
...,...,...,...,...
xTRYPTAMINEx,1.0,530.0,0.001887,531.0
peppermind,1.0,554.0,0.001805,555.0
I_Dumped_Adele,1.0,571.0,0.001751,572.0
Reecey94,1.0,627.0,0.001595,628.0


In [7]:

a = pos_df.groupby('author')['data'].count()

author
----------_----          36
--------_-------         17
-----____---___          36
----Pyro----             10
----feelingporny----     12
                       ... 
zzzxxxccc1               43
zzzz_z                   48
zzzzcharliezzzz           2
zzzzz94                 108
zzzzzzzxxxxxxxxx         86
Name: data, Length: 157673, dtype: int64

In [22]:
a[a >= 1][a <= 4].count()

21412

In [18]:
# a[a > 500]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(a.value_counts())

1        6443
2        5698
3        4939
4        4332
5        4009
6        3798
7        3409
8        3288
9        2980
11       2735
10       2728
12       2544
13       2382
15       2260
14       2248
16       2172
17       2069
18       1997
19       1961
20       1823
23       1774
21       1745
22       1719
24       1617
26       1539
25       1516
27       1461
28       1431
29       1413
33       1329
31       1313
30       1292
34       1280
32       1270
36       1177
35       1146
37       1112
39       1095
38       1091
40       1029
42       1020
41       1001
46        946
45        931
44        921
43        920
47        917
48        870
50        861
51        833
49        820
56        765
55        761
54        758
52        753
53        729
57        726
59        707
60        695
62        666
61        664
58        661
65        660
66        613
67        610
63        582
64        571
69        559
71        556
68        554
72        542
75    