## Introduction

In this kernel, we use sklearn's logistic regression to classify the given text into sub-categories with a probability of prediction for detecting identity exposure.

In [13]:
import boto3
import pandas as pd, numpy as np
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize
import os
from configparser import ConfigParser
from smart_open import smart_open

### Data retrieval from Amazon S3

In [14]:
config = ConfigParser()

config_file = ('config.ini')
config.read(config_file)
config['aws.data']

default = config['aws.data']
aws_key = default['accessKey']
aws_secret = default['secretAccessKey']

bucket_name = 'texttoxicity-train-test'
train_key = 'train.csv'
test_key = 'test.csv'

train_path = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, train_key)
test_path = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, test_key)

train = pd.read_csv(smart_open(train_path))
test = pd.read_csv(smart_open(test_path))

In [15]:
pd.set_option('display.max_columns', 500)
train.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


### Data Cleaning

In [16]:
train['male'] = train.target.apply(lambda x: 1 if x>0.45 else 0)
train['female'] = train.severe_toxicity.apply(lambda x: 1 if x>0.45 else 0)
train['homosexual_gay_or_lesbian'] = train.obscene.apply(lambda x: 1 if x>0.45 else 0)
train['christian'] = train.insult.apply(lambda x: 1 if x>0.45 else 0)
train['jewish'] = train.threat.apply(lambda x: 1 if x>0.45 else 0)
train['muslim'] = train.identity_attack.apply(lambda x: 1 if x>0.45 else 0)
train['black'] = train.identity_attack.apply(lambda x: 1 if x>0.45 else 0)
train['white'] = train.identity_attack.apply(lambda x: 1 if x>0.45 else 0)
train['psychiatric_or_mental_illness'] = train.identity_attack.apply(lambda x: 1 if x>0.45 else 0)
train = train[['id','comment_text','male','female','homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black','white', 'psychiatric_or_mental_illness']]

We'll create a list of all the labels to predict, and we'll also create a 'none' label so we can see how many comments have no labels. We can then summarize the dataset.

In [17]:
label_cols = ['male','female','homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black','white', 'psychiatric_or_mental_illness']
train['none'] = 1-train[label_cols].max(axis=1)
train.describe()

Unnamed: 0,id,male,female,homosexual_gay_or_lesbian,christian,jewish,muslim,black,white,psychiatric_or_mental_illness,none
count,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0,1804874.0
mean,3738434.0,0.08200849,9.972995e-06,0.005651918,0.0610846,0.002496573,0.007829909,0.007829909,0.007829909,0.007829909,0.9159986
std,2445187.0,0.2743777,0.00315799,0.07496651,0.2394855,0.04990332,0.0881397,0.0881397,0.0881397,0.0881397,0.27739
min,59848.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,796975.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,5223774.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,5769854.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,6334010.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
len(train),len(test)

(1804874, 97320)

In [19]:
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

### Building the model

TF-IDF Vectorizer is used to vectorise the comments

In [20]:
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=word_tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [21]:
trn_term_doc, test_term_doc

(<1804874x2304861 sparse matrix of type '<class 'numpy.float64'>'
 	with 170107570 stored elements in Compressed Sparse Row format>,
 <97320x2304861 sparse matrix of type '<class 'numpy.float64'>'
 	with 9124050 stored elements in Compressed Sparse Row format>)

In [22]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [23]:
x = trn_term_doc
test_x = test_term_doc

In [24]:
print(test_term_doc)

  (0, 2215173)	0.10006306272618043
  (0, 2212065)	0.05151512124954903
  (0, 2105952)	0.09996826803093134
  (0, 2105940)	0.06336574570502959
  (0, 1996994)	0.14831466813202215
  (0, 1983679)	0.13983321394374026
  (0, 1971172)	0.04160843146272801
  (0, 1798807)	0.1787242327120229
  (0, 1798727)	0.13399055560130785
  (0, 1716339)	0.19907968714630236
  (0, 1716332)	0.1263121417255389
  (0, 1599967)	0.2434632153655707
  (0, 1599775)	0.10649646500957297
  (0, 1506149)	0.17761794575083575
  (0, 1489429)	0.15217178170363835
  (0, 1489353)	0.126315536287002
  (0, 1477275)	0.08792959116969182
  (0, 1475035)	0.05775955002093855
  (0, 1454998)	0.10702980048215424
  (0, 1453821)	0.11963761528822553
  (0, 1433887)	0.05240820742605657
  (0, 1155529)	0.16509154857066993
  (0, 1155471)	0.14593373993520745
  (0, 1129089)	0.13363479822079352
  (0, 1128178)	0.032696420253560725
  :	:
  (97318, 84615)	0.09412734290309624
  (97318, 77325)	0.0912675136919613
  (97318, 62948)	0.02586068672312247
  (97318, 421

In [25]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [27]:
import pickle
preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    try:
        d = open(str(j) + '_model.p', 'wb')
        pickle.dump(m, d)
    finally:
        d.close()
    try:
        e = open(str(j) + '_r.p','wb')
        pickle.dump(r, e)
    finally:
        e.close()  

fit male
fit female
fit homosexual_gay_or_lesbian
fit christian
fit jewish
fit muslim
fit black
fit white
fit psychiatric_or_mental_illness


In [28]:
try:
    d = open('tf_idf_vectorizer.p', 'wb')
    pickle.dump(vec, d)
finally:
    d.close()

In [32]:
import pickle
import numpy as np
import tweepy
import pandas as pd

vectorizer = pickle.load(open('tf_idf_vectorizer.p','rb'))

male_model = pickle.load(open('male_model.p','rb'))
male_r = pickle.load(open('male_r.p','rb'))

female_model = pickle.load(open('female_model.p','rb'))
female_r = pickle.load(open('female_r.p','rb'))

homosexual_gay_or_lesbian_model = pickle.load(open('homosexual_gay_or_lesbian_model.p','rb'))
homosexual_gay_or_lesbian_r = pickle.load(open('homosexual_gay_or_lesbian_r.p','rb'))

christian_model = pickle.load(open('christian_model.p','rb'))
christian_r = pickle.load(open('christian_r.p','rb'))

jewish_model = pickle.load(open('jewish_model.p','rb'))
jewish_r = pickle.load(open('jewish_r.p','rb'))

muslim_model = pickle.load(open('muslim_model.p','rb'))
muslim_r = pickle.load(open('muslim_r.p','rb'))

black_model = pickle.load(open('black_model.p','rb'))
black_r = pickle.load(open('black_r.p','rb'))

white_model = pickle.load(open('white_model.p','rb'))
white_r = pickle.load(open('white_r.p','rb'))

psychiatric_or_mental_illness_model = pickle.load(open('psychiatric_or_mental_illness_model.p','rb'))
psychiatric_or_mental_illness_r = pickle.load(open('psychiatric_or_mental_illness_r.p','rb'))

In [33]:
label_cols =  ['male','female','homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black','white', 'psychiatric_or_mental_illness']
models = {"male" : [male_model,male_r],"female" : [female_model,female_r],
            "homosexual_gay_or_lesbian":[homosexual_gay_or_lesbian_model,homosexual_gay_or_lesbian_r],
          "christian":[christian_model,christian_r],"jewish":[jewish_model,jewish_r],
            "muslim":[muslim_model,muslim_r], "black":[black_model,black_r], "white":[white_model,white_r],
         "psychiatric_or_mental_illness":[psychiatric_or_mental_illness_model,psychiatric_or_mental_illness_r]}

In [34]:
def get_model(name):
    return models[name]

In [72]:
text = ['i will kill jews']
v = vectorizer.transform(text)
p = np.zeros((len(text), len(label_cols)))
for i, j in enumerate(label_cols):
    model = get_model(j)
    p[:,i] = model[0].predict_proba(v.multiply(model[1]))[:,1]
result = pd.concat([pd.DataFrame(p, columns = label_cols)], axis=1)

In [73]:
print(result)

       male   female  homosexual_gay_or_lesbian  christian    jewish  \
0  0.998483  0.00006                    0.00106   0.042003  0.998086   

     muslim     black     white  psychiatric_or_mental_illness  
0  0.990945  0.964633  0.971019                       0.983536  


In [74]:
exposure_count = 0
for index, row in result.iterrows():
    for col in label_cols:
        if(row[col] > 0.1):
            exposure_count += 1

In [84]:
score = round(exposure_count/len(label_cols)*100,2)
print(score)

66.67
