In [1]:
%load_ext autoreload
%autoreload 2

In [158]:
import pandas as pd
import numpy as np
import utils
import csv
import re
from tqdm import tqdm
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn import preprocessing

from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)


from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

In [27]:
# From https://github.com/GBJim/age_gender_predictor/blob/master/predictor.py
# Bias terms added to the normalised lexicon scores in get_age / get_gender.
# The lexicon loaders below delete the '_intercept' row of each CSV, so these
# hard-coded values are what the predictions actually use.
# NOTE(review): confirm both constants match the '_intercept' rows of the lexicon files.
age_intercept = 23.2188604687
gender_intercept = -0.06724152


def load_age_lexica(file_name = "../data/datasets/age_gender_lexica/emnlp14age.csv"):
    """Load the age lexicon as a ``{term: weight}`` dictionary.

    The CSV must have a header with at least the columns ``term`` and
    ``weight``. A row whose term is ``_intercept`` is dropped from the
    result; a file without such a row is accepted as well.

    Args:
        file_name: Path to the lexicon CSV.

    Returns:
        dict mapping each term (str) to its weight (float).

    Raises:
        KeyError: if a row lacks the ``term`` or ``weight`` column.
        ValueError: if a weight cannot be parsed as a float.
    """
    # newline='' is the csv module's documented way to open files, so that
    # newlines embedded in quoted fields are handled by the reader itself.
    with open(file_name, mode='r', newline='') as infile:
        age_lexica = {row['term']: float(row['weight']) for row in csv.DictReader(infile)}

    # pop() with a default instead of `del`: a lexicon that ships without an
    # intercept row should load rather than raise KeyError.
    age_lexica.pop('_intercept', None)
    return age_lexica


def load_gender_lexica(file_name = "../data/datasets/age_gender_lexica/emnlp14gender.csv"):
    """Load the gender lexicon as a ``{term: weight}`` dictionary.

    The CSV must have a header with at least the columns ``term`` and
    ``weight``. A row whose term is ``_intercept`` is dropped from the
    result; a file without such a row is accepted as well.

    Args:
        file_name: Path to the lexicon CSV.

    Returns:
        dict mapping each term (str) to its weight (float).

    Raises:
        KeyError: if a row lacks the ``term`` or ``weight`` column.
        ValueError: if a weight cannot be parsed as a float.
    """
    # newline='' is the csv module's documented way to open files, so that
    # newlines embedded in quoted fields are handled by the reader itself.
    with open(file_name, mode='r', newline='') as infile:
        gender_lexica = {row['term']: float(row['weight']) for row in csv.DictReader(infile)}

    # pop() with a default instead of `del`: a lexicon that ships without an
    # intercept row should load rather than raise KeyError.
    gender_lexica.pop('_intercept', None)
    return gender_lexica

# Module-level lexicons ({term: weight}) read once from the default CSV paths;
# get_age / get_gender look terms up in these dictionaries.
age_lexica = load_age_lexica()
gender_lexica = load_gender_lexica()

# This function returns a float. Positive values represent female and negative values represent male.
def get_gender(text, lexica=None, intercept=None):
    """Score a text with the gender lexicon.

    The text is split on whitespace (no lower-casing or punctuation
    stripping). Every token found in the lexicon contributes
    ``count * weight``; the total is divided by ``1 + number of matched
    token occurrences`` and the intercept is added.

    Args:
        text: The text to score.
        lexica: Optional ``{term: weight}`` mapping. Defaults to the
            module-level ``gender_lexica``.
        intercept: Optional bias added to the normalised score. Defaults to
            the module-level ``gender_intercept``.

    Returns:
        float score; per the notebook's own comment, positive values
        represent female and negative values male.
    """
    if lexica is None:
        lexica = gender_lexica
    if intercept is None:
        intercept = gender_intercept

    term_counts = {}
    for word in text.split():
        term_counts[word] = term_counts.get(word, 0) + 1

    score = 0
    # The denominator starts at 1, exactly as in the original code; this also
    # keeps the division defined when no token matches the lexicon.
    matched = 1
    for word, count in term_counts.items():
        if word in lexica:
            matched += count
            score += count * lexica[word]

    return score / matched + intercept
    
# This function returns a float, representing the age. 

def get_age(text, lexica=None, intercept=None):
    """Estimate an age score for a text with the age lexicon.

    The text is split on whitespace (no lower-casing or punctuation
    stripping). Every token found in the lexicon contributes
    ``count * weight``; the total is divided by ``1 + number of matched
    token occurrences`` and the intercept is added.

    Args:
        text: The text to score.
        lexica: Optional ``{term: weight}`` mapping. Defaults to the
            module-level ``age_lexica``.
        intercept: Optional bias added to the normalised score. Defaults to
            the module-level ``age_intercept``.

    Returns:
        float representing the estimated age.
    """
    if lexica is None:
        lexica = age_lexica
    if intercept is None:
        intercept = age_intercept

    term_counts = {}
    for word in text.split():
        term_counts[word] = term_counts.get(word, 0) + 1

    score = 0
    # The denominator starts at 1, exactly as in the original code; this also
    # keeps the division defined when no token matches the lexicon.
    matched = 1
    for word, count in term_counts.items():
        if word in lexica:
            matched += count
            score += count * lexica[word]

    return score / matched + intercept


In [4]:
target_users_df = pd.read_csv('../data/target_users.csv', index_col=0)
# Only the "toxic" model column is used; the other model columns are kept
# commented out for reference.
users = target_users_df[[
# 'harassing_users_final_model_gboost_toxic_comments_identity_hate',
# 'harassing_users_final_model_gboost_toxic_comments_insult',
# 'harassing_users_final_model_gboost_toxic_comments_obscene',
# 'harassing_users_final_model_gboost_toxic_comments_severe_toxic',
# 'harassing_users_final_model_gboost_toxic_comments_threat',
'harassing_users_final_model_gboost_toxic_comments_toxic',
# 'harassing_users_final_model_gboost_hate_offensive_Hate',
# 'harassing_users_final_model_gboost_hate_offensive_Offensive',
# 'harassing_users_final_model_gboost_hate_abuse_abusive',
# 'harassing_users_final_model_gboost_hate_abuse_hateful'
]].values.flatten()

# Each cell is treated as a stringified list such as "['a', 'b']": brackets,
# quotes and commas are stripped and the remainder is split on spaces.
all_users = []
for s in users:
    # `s is not '[]'` tested object identity against a literal, which is
    # effectively always true and a SyntaxWarning on Python 3.8+; compare by
    # value. Non-string cells (e.g. NaN for a missing value) are skipped
    # instead of crashing re.sub.
    if isinstance(s, str) and s != '[]':
        # Raw string: same pattern, without invalid-escape warnings.
        all_users.extend(u for u in re.sub(r"\[|\]|\'|\,", '', s).split(' ') if len(u) > 0)
all_users = np.unique(all_users)
# NOTE(review): the shuffle is unseeded, so the users selected downstream
# differ between runs; seed NumPy's RNG in a config cell for reproducibility.
np.random.shuffle(all_users)

In [None]:
# Load the control cohort's user objects from disk.
# NOTE(review): min_tweets / limit are interpreted by utils.load_users (not
# shown here). A later recorded output reports 501 control users, so `limit`
# may not be a strict cap — confirm.
control_users = utils.load_users('../data/control_user_tweets2/', min_tweets=500, limit=500)

In [16]:
# Control users are identified by their stringified numeric id (harassers are
# identified by screen_name instead).
selected_control_users = [str(u.id) for u in control_users]

In [136]:
# Load user objects for the first 1000 names of the shuffled `all_users`
# array, i.e. a random sample that changes with every unseeded shuffle.
harassing_users = utils.load_users_by_name('../data/harassers_from_targetted/', all_users[:1000])
# Screen names of the users that were actually loaded.
selected_harassing_users = [u.screen_name for u in harassing_users]

In [17]:
# Tweets of the control cohort; used below as a mapping whose values are
# lists of tweet strings (they are joined with '\n').
# presumably keyed by the ids in selected_control_users — verify in utils.
control_user_tweets = utils.load_user_tweets_by_name('../data/control_user_tweets2/', selected_control_users)

In [18]:
# Sanity check of cohort sizes.
# NOTE(review): `harasser_tweets` is assigned in a later cell, so on a fresh
# kernel with Run All this line raises NameError — move that cell above this one.
len(selected_control_users), len(control_user_tweets), len(selected_harassing_users), len(harasser_tweets)

(501, 501, 482, 482)

In [137]:
# Tweets of the harasser cohort; used below as a mapping whose values are
# lists of tweet strings.
# presumably keyed by screen name — verify in utils.load_user_tweets_by_name.
harasser_tweets = utils.load_user_tweets_by_name('../data/harassers_from_targetted/', selected_harassing_users)

In [117]:
# Load the trained classifier; predict_list calls model.predict_proba on raw
# tweet texts.
# NOTE(security): pickle.load executes arbitrary code from the file — only
# load pickles produced by a trusted source.
with open('final_model_gboost_toxic_comments_toxic.p', 'rb') as f:
    model = pickle.load(f)

In [138]:
def predict_list(user_tweets, model):
    """Score every user's tweets with ``model``.

    Args:
        user_tweets: Mapping of user id -> sequence of tweet texts, passed
            as-is to ``model.predict_proba``.
        model: Object exposing ``predict_proba(texts)`` that returns a 2-D
            array; column 1 is used (presumably the positive class —
            confirm against the trained model).

    Returns:
        Tuple ``(user_scores, user_preds)``: ``user_scores`` maps each user
        id to the mean of its column-1 probabilities and ``user_preds`` maps
        each user id to the per-tweet probability array.
    """
    user_scores = {}
    user_preds = {}
    # The original kept a counter `i` that was never read; it is removed.
    for uid, tweets in tqdm(user_tweets.items()):
        probs = model.predict_proba(tweets)[:, 1]
        user_scores[uid] = probs.mean(axis=0)
        user_preds[uid] = probs
    return user_scores, user_preds

# Score every harasser's tweets. The recorded run took about 30 minutes for
# 965 users, so consider caching the result to disk.
user_scores, user_preds = predict_list(harasser_tweets, model)

100%|██████████| 965/965 [30:09<00:00,  1.90s/it]


In [142]:
def _estimate_demographics(user_tweets):
    """Estimate age and gender scores for every user that has tweet text.

    Args:
        user_tweets: Mapping of user id -> list of tweet strings.

    Returns:
        Tuple ``(ages, genders)`` of dicts keyed by user id. Users whose
        joined text is empty are absent from both dicts.
    """
    ages = {}
    genders = {}
    for uid, tweets in user_tweets.items():
        text = '\n'.join(tweets)
        if len(text) == 0:
            continue
        ages[uid] = get_age(text)
        genders[uid] = get_gender(text)
    return ages, genders


# Both cohorts now go through the same code path. The original control loop
# wrapped get_age in a bare `except:` that printed the text and broke out of
# the loop, which would silently truncate the control cohort; errors now
# propagate with a traceback instead.
harasser_ages, harasser_genders = _estimate_demographics(harasser_tweets)
control_ages, control_genders = _estimate_demographics(control_user_tweets)

In [69]:
# Account age in years at the time the cell runs.
SECONDS_PER_YEAR = 365 * 24 * 60 * 60  # 365-day year, as in the original expression
# Take "now" once so every account is measured against the same instant
# (the original called datetime.now() for each user).
now_ts = datetime.now().timestamp()
control_account_age = [(now_ts - u.created_at.timestamp()) / SECONDS_PER_YEAR for u in control_users]
harasser_account_age = [(now_ts - u.created_at.timestamp()) / SECONDS_PER_YEAR for u in harassing_users]

In [70]:
# Per-user activity counters, collected in one pass over each cohort.
# Each list is aligned with the order of its cohort's user list.
control_tweet_count, control_followers_count, control_friends_count = [], [], []
for account in control_users:
    control_tweet_count.append(account.statuses_count)
    control_followers_count.append(account.followers_count)
    control_friends_count.append(account.friends_count)

harasser_tweet_count, harasser_followers_count, harasser_friends_count = [], [], []
for account in harassing_users:
    harasser_tweet_count.append(account.statuses_count)
    harasser_followers_count.append(account.followers_count)
    harasser_friends_count.append(account.friends_count)

In [104]:
HARASSER_COLOR = '#1f77b4'  # muted blue
CONTROL_COLOR = '#ff7f0e'  # safety orange


def _add_cohort_histograms(fig, harasser_values, control_values, row, col, bingroup,
                           showlegend=False, harasser_nbinsx=None):
    """Add the harasser and control histograms for one subplot.

    Args:
        fig: Figure created by ``make_subplots``.
        harasser_values: x values of the 'Harasser' trace.
        control_values: x values of the 'Random' trace.
        row, col: 1-based subplot position.
        bingroup: Bin group shared by the two traces of this subplot.
        showlegend: When False, ``showlegend=False`` is set on both traces;
            when True the property is left unset.
        harasser_nbinsx: Optional ``nbinsx`` for the harasser trace only.
    """
    harasser_kwargs = dict(
        x=harasser_values,
        name='Harasser',
        histnorm='probability',
        marker_color=HARASSER_COLOR,
        bingroup=bingroup,
    )
    control_kwargs = dict(
        x=control_values,
        name='Random',
        histnorm='probability',
        marker_color=CONTROL_COLOR,
        bingroup=bingroup,
    )
    # Optional properties are only passed when they were set explicitly, so
    # every trace carries exactly the properties the hand-written version had.
    if not showlegend:
        harasser_kwargs['showlegend'] = False
        control_kwargs['showlegend'] = False
    if harasser_nbinsx is not None:
        harasser_kwargs['nbinsx'] = harasser_nbinsx

    fig.add_trace(go.Histogram(**harasser_kwargs), row=row, col=col)
    fig.add_trace(go.Histogram(**control_kwargs), row=row, col=col)


fig = make_subplots(rows=3, cols=2, subplot_titles=['Estimated Age', 'Estimated Gender', 'Account Age (Years)', 'Tweets Count', 'Followers Count', 'Friends Count'])

# One entry per subplot:
# (harasser values, control values, row, col, bingroup, extra options).
# Only the first subplot contributes legend entries.
panels = [
    (list(harasser_ages.values()), list(control_ages.values()), 1, 1, 1, dict(showlegend=True)),
    (list(harasser_genders.values()), list(control_genders.values()), 1, 2, 2, {}),
    (harasser_account_age, control_account_age, 2, 1, 3, {}),
    (harasser_tweet_count, control_tweet_count, 2, 2, 4, {}),
    (harasser_followers_count, control_followers_count, 3, 1, 5, dict(harasser_nbinsx=10000)),
    (harasser_friends_count, control_friends_count, 3, 2, 6, dict(harasser_nbinsx=1000)),
]
for harasser_values, control_values, row, col, bingroup, options in panels:
    _add_cohort_histograms(fig, harasser_values, control_values, row, col, bingroup, **options)

# Draw the two cohorts on top of each other, semi-transparent.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.65)
# Fixed x ranges; values outside them are not visible in the figure.
fig.update_xaxes(range=(0, 60), row=1, col=1)
fig.update_xaxes(range=(-10, 10), row=1, col=2)
fig.update_xaxes(range=(0, 2e5), row=2, col=2)
fig.update_xaxes(range=(0, 1e4), row=3, col=1)
fig.update_xaxes(range=(0, 1e4), row=3, col=2)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    legend=dict(x=0.25, y=-0.05),
    legend_orientation="h",
    margin=dict(l=20, r=20, t=20, b=20)
)

fig.show()

In [106]:
# Load previously saved prediction results.
# NOTE(review): `results` is only referenced by a commented-out line further
# down; this cell may be dead.
# NOTE(security): pickle.load executes arbitrary code from the file — only
# load pickles produced by a trusted source.
with open('predict_harasser_scores.p', 'rb') as f:
    results = pickle.load(f)

In [128]:
def compute_harassment_frac(user_tweets, threshold=0.5):
    """Return, per user, the fraction of tweets scored above ``threshold``.

    Args:
        user_tweets: Mapping of user id -> 1-D sequence of per-tweet
            probabilities (NumPy array or plain list).
        threshold: A tweet counts when its probability is strictly greater
            than this value.

    Returns:
        dict mapping each user id to a fraction in [0, 1]; NaN for a user
        with no tweets.
    """
    user_scores = {}
    for uid, tweet_probs in user_tweets.items():
        # asarray lets plain lists through; `list > float` raises TypeError.
        probs = np.asarray(tweet_probs)
        if probs.size == 0:
            # Explicit NaN instead of a 0/0 division and its RuntimeWarning.
            user_scores[uid] = float('nan')
            continue
        user_scores[uid] = (probs > threshold).sum() / len(probs)
    return user_scores

In [139]:
# Alternative source (disabled): scores previously saved in `results`.
# harasser_scores, harasser_preds = results['final_model_gboost_toxic_comments_toxic.p']
# Fraction of each harasser's tweets above the default 0.5 threshold, computed
# from the per-tweet probabilities returned by predict_list.
harasser_fracs = compute_harassment_frac(user_preds)

In [131]:
# NOTE(review): this displays the whole dict, i.e. per-user scores next to
# real screen names. Show a small sample and clear the output before sharing.
harasser_fracs

{'133DA115099': 0.09705882352941177,
 '19Bill84': 0.04779717373233583,
 '1GOPWIZARD': 0.09044193216855087,
 '1USA_Woman': 0.108348134991119,
 '237akhg': 0.09174311926605505,
 '2emesouffle': 0.09856262833675565,
 '2sidescoin': 0.19213081246806335,
 '325proit10': 0.07566539923954373,
 '3Chimp': 0.081203007518797,
 '3rian_3erk': 0.07807807807807808,
 '3toadvicki': 0.049798608568290006,
 '43017bg': 0.06060606060606061,
 '48Sezza': 0.04114490161001789,
 '4La_Volpe': 0.03680981595092025,
 '662Maz': 0.11707841031149302,
 '666david': 0.06017094017094017,
 '66stilllliberal': 0.016728624535315983,
 '6849_ft': 0.09888190076869321,
 '713HomeBuyer': 0.03412073490813648,
 '826irish': 0.13550135501355012,
 'A1wcc': 0.11268057784911717,
 'AJVGriff': 0.1509351892127012,
 'AJensen76': 0.016587677725118485,
 'ALANMCALEES': 0.11012433392539965,
 'AccountingDrone': 0.05530104712041885,
 'AdventureOfMoto': 0.10826319816373374,
 'Age_of_Mike': 0.07844574780058651,
 'AlanHajek': 0.08188585607940446,
 'AlanSie

In [140]:
len(harasser_fracs)

965

In [161]:
# Regression inputs: Y is the per-user harassment fraction, X is a
# feature-major list (one inner list per feature, aligned with Y).
# The demographics step skips users whose text is empty, so indexing
# harasser_ages / harasser_genders with every selected user can raise
# KeyError. Keep only users for which every input exists.
feature_rows = [
    (name, user)
    for name, user in zip(selected_harassing_users, harassing_users)
    if name in harasser_fracs and name in harasser_ages and name in harasser_genders
]
n_dropped = len(harassing_users) - len(feature_rows)
if n_dropped:
    print('Dropped %d users with missing features' % n_dropped)

# Take "now" once so every account is measured against the same instant.
now_ts = datetime.now().timestamp()
seconds_per_year = 365 * 24 * 60 * 60

Y = [harasser_fracs[name] for name, _ in feature_rows]
account_ages = [(now_ts - user.created_at.timestamp()) / seconds_per_year for _, user in feature_rows]
X = [
    # Account age (years)
    account_ages,
    # Total tweets
    [user.statuses_count for _, user in feature_rows],
    # Tweets per year of account age
    [user.statuses_count / age for (_, user), age in zip(feature_rows, account_ages)],
    # Followers
    [user.followers_count for _, user in feature_rows],
    # Friends
    [user.friends_count for _, user in feature_rows],
    # Estimated gender score
    [harasser_genders[name] for name, _ in feature_rows],
    # Estimated age
    [harasser_ages[name] for name, _ in feature_rows]
]

In [162]:
# Convert the feature-major list to an array of shape (n_features, n_users);
# it is transposed before being fed to the regression.
X = np.array(X)

In [145]:
X.shape, len(Y)

((6, 965), 965)

In [163]:
# Ordinary least squares of the harassment fraction on the user features.
# X.T puts users in rows; add_constant adds the intercept column, reported as
# `const` in the summary, alongside one coefficient per row of X.
X2 = sm.add_constant(X.T)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     4.192
Date:                Tue, 31 Dec 2019   Prob (F-statistic):           0.000146
Time:                        05:58:51   Log-Likelihood:                 1176.6
No. Observations:                 965   AIC:                            -2337.
Df Residuals:                     957   BIC:                            -2298.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1354      0.015      9.184      0.0

In [151]:
# Confidence interval per coefficient; column 0 is the lower bound and
# column 1 the upper bound.
ci = est2.conf_int()
# Half-width of each interval (the +/- margin around the coefficient).
cii = (ci[:, 1] - ci[:, 0])/2

In [157]:
cii

array([2.89383020e-02, 1.37599809e-03, 1.32999550e-07, 6.19064431e-07,
       8.67729863e-08, 2.28153127e-06, 3.36340129e-03, 8.17861822e-04])

In [152]:
cii

array([2.89383020e-02, 1.37599809e-03, 1.32999550e-07, 6.19064431e-07,
       8.67729863e-08, 2.28153127e-06, 3.36340129e-03, 8.17861822e-04])

In [93]:
# Raw (un-normalised) histograms of the estimated gender score per cohort.
gender_traces = [
    go.Histogram(x=list(harasser_genders.values()), name='Harasser'),
    go.Histogram(x=list(control_genders.values()), name='Control'),
]
fig = go.Figure(data=gender_traces)
fig.show()

In [5]:
# {term: weight} lookup built from the lexicon frame, including the
# '_intercept' row. Zipping the two columns avoids building a Series per row
# with iterrows(); on duplicate terms the last one still wins.
# NOTE(review): `age_df` is not defined in any cell visible here — the cell
# that creates it must run first.
age_weights = dict(zip(age_df['term'], age_df['weight']))

In [None]:
def predict(lexica, doc):
    """Look up the lexicon weight of every whitespace-separated token in ``doc``.

    The original body built this list and discarded it, so the function
    always returned None; the list is now returned.

    Args:
        lexica: Mapping of term -> weight.
        doc: Text to tokenise with ``str.split()``.

    Returns:
        List with one weight per token, in document order; tokens absent
        from ``lexica`` contribute 0.
    """
    # NOTE(review): the intended aggregation (sum, mean, plus intercept?) is
    # not visible in this notebook; callers can reduce the list as needed.
    return [lexica.get(t, 0) for t in doc.split()]

In [6]:
age_weights

{'_intercept': 23.218860468699997,
 'raining': 235.750747016,
 'yellow': -71.0739016775,
 'four': -27.8786758306,
 'gag': -79.9019763069,
 'woods': 35.3610249356,
 'spiders': -20.3825632808,
 "friend's": 47.1684119493,
 'hanging': 111.690159235,
 'woody': 88.2460568417,
 'marching': -112.773504559,
 'increase': 79.4046821209,
 'electricity': 12.2941812706,
 'funk': 38.2489728365,
 'lord': -128.780698841,
 'immature': -143.595143479,
 'shaving': 59.375953112299996,
 'sinking': -63.072350069799995,
 'callin': 37.723651898,
 'kent': 125.744998674,
 'retail': 74.9228353096,
 'oceans': -53.646211882299994,
 'dell': -87.7820945284,
 'crackin': -55.8040869226,
 'foul': 67.7656789338,
 'politician': -24.0928562221,
 'stabbed': -43.1788040116,
 'screaming': -188.445674889,
 'flys': -49.5297156225,
 'disturb': 29.6354578111,
 'basics': -18.214098201800002,
 'prize': 154.29348949299998,
 'wooden': -8.303064825369999,
 'wednesday': -0.5581416187839999,
 'crotch': -57.9077923954,
 'zzzzzz': -31.985

In [39]:
# Split the lexicon into the bias term and aligned vocabulary / weight vectors.
# The original sliced `[1:]`, which is only correct when '_intercept' happens
# to be the first key; excluding it by name gives the same result in that
# case and stays correct for any key order.
INTERCEPT_KEY = '_intercept'
W0 = age_weights[INTERCEPT_KEY]
vocab = [term for term in age_weights if term != INTERCEPT_KEY]
W = np.array([age_weights[term] for term in vocab])

In [61]:
W0

23.218860468699997

In [46]:
# One document per user: all of that user's tweets joined with newlines.
# NOTE(review): `blockbot_user_tweets` is not defined in any cell visible
# here — the cell that creates it must run first.
corpus = [ '\n'.join(tweets) for tweets in blockbot_user_tweets.values()]

In [48]:

# Term counts restricted to the lexicon vocabulary, without IDF weighting and
# L1-normalised per document.
# NOTE(review): the vectorizer's default tokenisation may differ from the
# plain str.split() used by get_age — confirm the two approaches agree.
vectorizer = TfidfVectorizer(use_idf=False, vocabulary=vocab, norm='l1')
# NOTE(review): this rebinds `X`, which earlier held the regression features.
X = vectorizer.fit_transform(corpus)

In [49]:
# Convert the sparse matrix to a dense one for the matrix product below.
# NOTE(review): rebinding `X` makes this cell non-idempotent; re-running it
# on the already-dense result will likely fail — consider a new name.
X = X.todense()

In [50]:
# Linear prediction per document: intercept plus the normalised term
# frequencies times the lexicon weights. W[:, None] makes W a column vector.
# NOTE(review): the recorded values (roughly 31 to 93) look high for ages, and
# get_age divides by 1 + matched tokens instead of using L1 normalisation;
# confirm which normalisation is intended.
W0 + np.matmul(X, W[:, None])

matrix([[65.54431592],
        [56.25398055],
        [92.81245409],
        [44.18686977],
        [69.27014134],
        [49.46402564],
        [53.95262672],
        [46.06870992],
        [33.74604071],
        [52.33847471],
        [46.04754869],
        [38.87521364],
        [34.32314563],
        [51.26673485],
        [40.43727983],
        [73.27138271],
        [43.7986585 ],
        [43.88834953],
        [56.22299287],
        [49.56556521],
        [37.91816635],
        [42.01996958],
        [41.72991032],
        [31.47680233],
        [49.05677908],
        [50.68967889],
        [49.80854625],
        [39.32244927],
        [62.06384697],
        [38.1455411 ],
        [42.03528992],
        [47.65674682],
        [32.5380781 ],
        [43.4860225 ],
        [43.91539844],
        [35.92088423],
        [46.21576725],
        [36.53598507],
        [62.97414079],
        [35.50880447],
        [48.18774385],
        [55.33656289],
        [41.066836  ],
        [39