In [12]:
import pandas as pd
import re
import os
import logging
import sys

import warnings
warnings.filterwarnings("ignore")

def preprocess_dataset(day, nrow=-1, data_dir="/bigtemp/rm5tx/nlp_project/"):
    """Load one ``<day>_all.csv`` comment dump and normalise its text column.

    Steps performed:
      * read ``<data_dir>/<day>_all.csv`` (optionally only the first ``nrow`` rows),
      * rename the ``body`` column to ``data`` so the frame can later be
        concatenated with other datasets that use that name,
      * drop rows whose ``author`` or ``data`` is missing,
      * add a ``label`` column set to 0,
      * clean ``data``: newlines -> spaces, lowercase, keep only ``\\w+`` tokens
        joined by single spaces.

    Parameters
    ----------
    day : str
        File-name prefix of the dump, e.g. ``"2016-05"``.
    nrow : int, optional
        Number of rows to read. ``-1`` (default) or ``None`` reads the whole file.
    data_dir : str, optional
        Directory containing the dump. Defaults to the original hard-coded location.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the ``data`` and ``label`` columns described above.
    """
    csv_path = os.path.join(data_dir, day + "_all.csv")
    # nrows=None tells pandas to read every row, so one call covers both cases.
    rows_to_read = None if nrow is None or nrow == -1 else nrow
    raw = pd.read_csv(csv_path, nrows=rows_to_read)

    # Chained, non-inplace operations: each step returns a fresh frame, which
    # avoids assigning into a possibly-copied slice after dropna().
    neg_data = (
        raw.rename(columns={"body": "data"})
        .dropna(subset=["author", "data"])
        .assign(label=0)
    )

    # RNV uses a special preprocess step
    print("Preprocessing... 1. split new lines, 2. convert to lowercase, and 3. strip numbers and punct")

    # NOTE(review): the message above says numbers are stripped, but ``\w``
    # matches digits, so numbers are kept. Behaviour is left as it was --
    # confirm which is intended before changing the pattern.
    # Approach from https://stackoverflow.com/questions/47947438/preprocessing-string-data-in-pandas-dataframe
    neg_data["data"] = (
        neg_data["data"]
        .astype(str)  # non-string cells (e.g. purely numeric bodies) would break re.findall
        .str.replace("\n", " ", regex=False)
        .str.lower()
        .apply(lambda text: " ".join(re.findall(r"\w+", text)))
    )

    return neg_data

def get_toxic_users(neg_data,curse_word_list,threshold=5):
    """Return authors with more than ``threshold`` comments containing a curse word.

    A comment counts as toxic when its ``data`` text contains any entry of
    ``curse_word_list`` as a substring (so ``"crap"`` also matches ``"scrap"``).

    Parameters
    ----------
    neg_data : pandas.DataFrame
        Frame with at least the ``author`` and ``data`` columns.
    curse_word_list : list of str
        Words to search for. They are matched literally; regex metacharacters
        are escaped. An empty list yields no toxic users.
    threshold : int, optional
        An author is returned only if their toxic-comment count is strictly
        greater than this value.

    Returns
    -------
    list of str
        Author names ordered by descending toxic-comment count, with the
        placeholder author ``'[deleted]'`` excluded.
    """
    toxic_user_list = []

    # An empty pattern would match every comment, so skip matching entirely.
    if curse_word_list:
        pattern = '|'.join(re.escape(word) for word in curse_word_list)
        # na=False: rows with missing text are simply treated as non-toxic.
        is_toxic = neg_data['data'].str.contains(pattern, regex=True, na=False)

        # Work on the value_counts() Series directly: its index holds the
        # authors and its values the counts regardless of pandas version
        # (the column names produced by reset_index() changed in pandas 2.0).
        toxic_counts = neg_data.loc[is_toxic, 'author'].value_counts()
        over_threshold = toxic_counts[toxic_counts > threshold]

        # Filter instead of list.remove(), which raises ValueError when
        # '[deleted]' is not among the selected authors.
        toxic_user_list = [
            author for author in over_threshold.index if author != '[deleted]'
        ]

    print('extracted toxic users')

    return toxic_user_list

def get_adjacent_dataset(user_list,neg_data):
    """Select every row of ``neg_data`` whose ``author`` is in ``user_list``.

    Parameters
    ----------
    user_list : list
        Author names to keep.
    neg_data : pandas.DataFrame
        Frame with an ``author`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index and column layout.
    """
    written_by_listed_user = neg_data["author"].isin(user_list)
    adjacent_rows = neg_data[written_by_listed_user]
    print('extracted comments of negative users')
    return adjacent_rows




In [13]:
# Run parameters. In the notebook they are edited by hand; the script version
# of this code is meant to read them from sys.argv instead.
day = "2016-05"  # 1st parameter (sys.argv[1] in the script): file-name prefix of the dump
min_toxic_comment = 5  # 2nd parameter (sys.argv[2]): toxic-comment threshold per author
sample_size = 500000  # 3rd parameter (sys.argv[3]): number of rows to read from the dump
num_curse_words = 3  # 4th parameter (sys.argv[4]): how many curse words follow on the command line
curse_word_list = ['crap','damn','trash']  # 5th parameter onwards (sys.argv[5:]); comment this line out in the script

# The following block will be used in the python script instead of the list above
# curse_word_list = []

# for i in range(num_curse_words):
#     cur_word = sys.argv[5+i]
#     curse_word_list.append(cur_word)

logfilename = 'log_files/adjacent_'+day+'.log'

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate isdir() test.
os.makedirs('log_files/', exist_ok=True)

# filemode='w' truncates any previous log for the same day.
# NOTE: basicConfig() does nothing if the root logger already has handlers, so
# re-running this cell in the same kernel with a different `day` keeps writing
# to the first log file; restart the kernel to switch files.
logging.basicConfig(filename=logfilename, filemode='w', format='%(message)s', level=logging.INFO)
# Create the logger
# Admin_Client: The name of a logger defined in the config file
mylogger = logging.getLogger('Admin_Client')
mylogger.info('starting log....')

In [14]:
%%time
#dummy_df = preprocess_dataset(day)
dummy_df = preprocess_dataset(day,nrow=sample_size)
print(dummy_df.shape)
toxic_users = get_toxic_users(dummy_df,curse_word_list,threshold=min_toxic_comment)
print("number of toxic users ",len(toxic_users))
print(toxic_users[0:10])

adjacent_df = get_adjacent_dataset(toxic_users,dummy_df)
print(adjacent_df.shape)

adjacent_loc = "/bigtemp/rm5tx/nlp_project/adjacent_data/"

if not os.path.isdir(adjacent_loc):
    os.makedirs(adjacent_loc)

adjacent_df.to_csv(adjacent_loc+"adjacent_"+day+"_all.csv")

Preprocessing... 1. split new lines, 2. convert to lowercase, and 3. strip numbers and punct
(499910, 20)
extracted toxic users
number of toxic users  4
['KaitoGL', 'gokuuu', 'thedude0121', 'JexInfinite']
extracted comments of negative users
(311, 20)
CPU times: user 9.81 s, sys: 38.9 ms, total: 9.85 s
Wall time: 9.85 s


In [None]:
import pandas as pd
# Scratch inspection: load the complete 2016-07 comment dump and preview its
# first rows. Despite its name, `out_path` is the file being read here.
out_path="/bigtemp/rm5tx/nlp_project/2016-07_all.csv"
df = pd.read_csv(out_path)
df.head()

In [None]:
# (rows, columns) of whichever frame was most recently assigned to `df`.
df.shape

In [7]:
import pandas as pd

# Scratch inspection: load a previously saved adjacent-data CSV for 2016-06
# (file name suffix "min_5_positive") and preview its first rows.
# NOTE: this rebinds `df`, replacing any frame loaded by an earlier cell.
df = pd.read_csv('/bigtemp/rm5tx/nlp_project/adjacent_data/adjacent_2016-06min_5_positive.csv')

df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,author_flair_css_class,edited,id,controversiality,stickied,distinguished,...,score,data,link_id,author_flair_text,gilded,created_utc,parent_id,author,ups,label
0,16,16,16,16.0,,False,d3r38e6,0.0,False,,...,2.0,for sure P,t3_4lycfw,,0.0,1464739000.0,t1_d3r37pf,chocoholicbrunette,2.0,1.0
1,40,40,40,40.0,text-pcmr color-pcmr icon-steam,False,d3r38ga,0.0,False,,...,1.0,Yeah probably if not then ask again,t3_4lyceq,Potato!,0.0,1464739000.0,t1_d3r370t,Reckasta,1.0,1.0
2,55,55,55,55.0,,False,d3r38hf,0.0,False,,...,1.0,Tag team,t3_4ly3qi,,0.0,1464739000.0,t1_d3r33s3,That-nz-guy,1.0,1.0
3,62,62,62,62.0,,False,d3r38i0,0.0,False,,...,1.0,Fucking Genji man,t3_4lvvcd,,0.0,1464739000.0,t3_4lvvcd,crunch816,1.0,1.0
4,64,64,64,64.0,Warriors3,False,d3r38i6,0.0,False,,...,1.0,I heard he started laughing at all of Klay s j...,t3_4lt7yz,Warriors,0.0,1464739000.0,t1_d3q1gdp,NoseDragon,1.0,1.0


In [8]:
# Number of rows per author in `df`, most frequent author first.
df['author'].value_counts()

AutoModerator           243376
GoTradeBot               15988
TrumpTrain-bot            4221
Trump-Tzu                 4066
autotldr                  3883
                         ...  
themastercheif              74
SaintSixx                   69
Zippythewonderpoodle        66
coach_veratu                66
FuguofAnotherWorld          36
Name: author, Length: 3926, dtype: int64