[Jigsaw Unintended Bias in Toxicity Classification](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification)

# 导入工具

In [38]:
import warnings
warnings.filterwarnings('ignore')
import sys
import os
import json

TEMP_FOLDER = os.path.abspath('./temp/')
print(f"文件夹 '{TEMP_FOLDER}' 将被用来存储语料和临时性字典")

import logging
logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s", level=logging.INFO)

import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd

import seaborn as sns
sns.set()
import matplotlib.pyplot as  plt
import matplotlib.patches as patches
plt.rcParams['font.sans-serif'] = ['SimHei']
%matplotlib inline

from keras.models import Sequential, Model
from keras.layers import Input, Dense, SpatialDropout1D, add, concatenate
from keras.layers import Embedding
from keras.layers import Bidirectional, LSTM
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence

from gensim.models import KeyedVectors

文件夹 'D:\Github\NLP\Artificial_Intelligence_for_NLP\Week_13_0928_RNN\assignments\temp' 将被用来存储语料和临时性字典


In [19]:
# embedding files
EMBEDDING_FILES = ['data/crawl-300d-2M.gensim', 'data/glove.840B.300d.gensim']

In [20]:
# model params
NUM_MODELS = 2
BATCH_SIZE = 512
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
EMBEDDING_DIM = 300
MAX_LEN = 220

In [21]:
IDENTITY_COLUMNS = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 
                  'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
AUX_COLUMS = ['severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
STOP_CHARS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'

# 数据

In [3]:
df = pd.read_csv('data/train.csv', iterator=True)

In [4]:
df_chunk = df.get_chunk(10000)

In [5]:
df_chunk.tail(3)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
9997,254559,0.0,Is that a “Taurus revolver known as “The Judge...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,51627,approved,0,0,0,0,0,0.0,4,4
9998,254560,0.0,Motives matter. Is Mueller posting as private ...,0.0,0.0,0.0,0.0,0.0,,,...,51876,approved,0,0,0,2,0,0.0,0,4
9999,254561,0.0,"[... continued]\nMueller is at his best with ""...",0.0,0.0,0.0,0.0,0.0,,,...,51876,approved,0,0,0,2,0,0.0,0,4


In [6]:
df_chunk.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count'],
      dtype='object')

In [7]:
# data params
# train_df = pd.csv('data/train.csv') # to much data, just run a demo
train_df = df_chunk

In [8]:
test_df = pd.read_csv('data/test.csv')

In [9]:
test_df.tail()

Unnamed: 0,id,comment_text
97315,7194635,He should lose his job for promoting mis-infor...
97316,7194636,"""Thinning project is meant to lower fire dange..."
97317,7194637,I hope you millennials are happy that you put ...
97318,7194638,I'm thinking Kellyanne Conway (a.k.a. The Trum...
97319,7194639,I still can't figure why a pizza in AK cost mo...


In [11]:
x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMS].values
x_test = test_df[TEXT_COLUMN].astype(str)

In [12]:
for column in IDENTITY_COLUMNS + [TARGET_COLUMN]:
    train_df[column] = np.where(train_df[column] >= .5, True, False)

In [13]:
x_test.tolist()[0]

'[ Integrity means that you pay your debts.]\n\nDoes this apply to President Trump too?'

In [14]:
tokenizer = text.Tokenizer(filters=STOP_CHARS, lower=False)
tokenizer.fit_on_texts(x_train.tolist() + x_test.tolist())

In [15]:
word2index = tokenizer.word_index

In [16]:
x_train = tokenizer.texts_to_sequences(x_train)

In [22]:
padded_x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)

In [23]:
x_test = tokenizer.texts_to_sequences(x_test)
padded_x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [24]:
sample_weights = np.ones(len(x_train), dtype='float32')

In [25]:
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
sample_weights /= sample_weights.mean()

I1027 17:31:44.629886  5696 utils.py:141] NumExpr defaulting to 8 threads.


# 模型

In [26]:
def build_matrix(word_index, path):
    embedding_file = KeyedVectors.load(path, mmap='r')
    embedding_matrix = np.zeros((len(word_index)+1, EMBEDDING_DIM))
    for word, index in word_index.items():
        try:
            embedding_matrix[index] = embedding_file.wv[word.lower()]
        except:
            pass
    return embedding_matrix

In [131]:
# np.concatenate(axis=-1) align to last axis
embedding_matrix = np.concatenate([build_matrix(word2index, f) for f in EMBEDDING_FILES],
                                 axis=-1)

I1027 16:52:46.610595  2524 utils.py:422] loading Word2VecKeyedVectors object from data/crawl-300d-2M.gensim
I1027 16:52:56.458011  2524 utils.py:461] loading vectors from data/crawl-300d-2M.gensim.vectors.npy with mmap=r
I1027 16:52:56.521013  2524 utils.py:494] setting ignored attribute vectors_norm to None
I1027 16:52:56.522010  2524 utils.py:428] loaded data/crawl-300d-2M.gensim
I1027 16:58:47.366861  2524 utils.py:422] loading Word2VecKeyedVectors object from data/glove.840B.300d.gensim
I1027 16:58:58.194862  2524 utils.py:461] loading vectors from data/glove.840B.300d.gensim.vectors.npy with mmap=r
I1027 16:58:58.252863  2524 utils.py:494] setting ignored attribute vectors_norm to None
I1027 16:58:58.253862  2524 utils.py:428] loaded data/glove.840B.300d.gensim


In [27]:
embedding_matrix = build_matrix(word2index, EMBEDDING_FILES[0])

I1027 17:32:19.032387  5696 utils.py:422] loading Word2VecKeyedVectors object from data/crawl-300d-2M.gensim
I1027 17:32:24.629389  5696 utils.py:461] loading vectors from data/crawl-300d-2M.gensim.vectors.npy with mmap=r
I1027 17:32:24.634388  5696 utils.py:494] setting ignored attribute vectors_norm to None
I1027 17:32:24.635388  5696 utils.py:428] loaded data/crawl-300d-2M.gensim


In [28]:
embedding_matrix.shape

(101242, 300)

In [76]:
def build_model(embedding_matrix, num_aux_targets):
#     tf.reset_default_graph()
    model = Sequential([
        Embedding(*embedding_matrix.shape, input_length=MAX_LEN, 
                  weights=[embedding_matrix], trainable=False),
        SpatialDropout1D(.2),
        Bidirectional(LSTM(LSTM_UNITS, return_sequences=True)),
        Bidirectional(LSTM(LSTM_UNITS, return_sequences=True)),
#         concatenate([GlobalMaxPooling1D(), GlobalAveragePooling1D()]),
        Dense(DENSE_HIDDEN_UNITS, activation='relu'),
        Dense(DENSE_HIDDEN_UNITS, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [77]:
model = build_model(embedding_matrix, 5)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 220, 300)          30372600  
_________________________________________________________________
spatial_dropout1d_12 (Spatia (None, 220, 300)          0         
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 220, 256)          439296    
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 220, 256)          394240    
_________________________________________________________________
dense_30 (Dense)             (None, 220, 512)          131584    
_________________________________________________________________
dense_31 (Dense)             (None, 220, 512)          262656    
_________________________________________________________________
dense_32 (Dense)             (None, 220, 1)            513       
Total para

In [33]:
def build_model_new(embedding_matrix, num_aux_targets):
    words = Input(shape=(None,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x)
    ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)
    
    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

In [39]:
checkpoint_predictions = []
weights = []

for model_idx in range(NUM_MODELS):
    
    model = build_model_new(embedding_matrix, y_aux_train.shape[-1])
    
    for global_epoch in range(EPOCHS):
        model.fit(padded_x_train, 
                  [y_train, y_aux_train], 
                  batch_size=BATCH_SIZE, 
                  epochs=1,
                  verbose=2, 
                  sample_weight=[sample_weights.values, np.ones_like(sample_weights)])
        
        checkpoint_predictions.append(model.predict(padded_x_test, 
                                                    batch_size=2048)[0].flatten())
        
        weights.append(2 ** global_epoch)
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)

W1027 17:50:05.526159  5696 deprecation_wrapper.py:119] From C:\Users\Administrator\Anaconda3\envs\nlp\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1027 17:50:05.556161  5696 deprecation.py:323] From C:\Users\Administrator\Anaconda3\envs\nlp\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/1
 - 797s - loss: 0.7635 - dense_12_loss: 0.5660 - dense_13_loss: 0.1975
Epoch 1/1
 - 944s - loss: 0.5961 - dense_12_loss: 0.5020 - dense_13_loss: 0.0940
Epoch 1/1
 - 1461s - loss: 0.5440 - dense_12_loss: 0.4569 - dense_13_loss: 0.0871
Epoch 1/1
 - 1664s - loss: 0.5102 - dense_12_loss: 0.4294 - dense_13_loss: 0.0808
Epoch 1/1
 - 1912s - loss: 0.7564 - dense_16_loss: 0.5606 - dense_17_loss: 0.1958
Epoch 1/1
 - 2124s - loss: 0.5939 - dense_16_loss: 0.4981 - dense_17_loss: 0.0958
Epoch 1/1
 - 2361s - loss: 0.5417 - dense_16_loss: 0.4552 - dense_17_loss: 0.0864
Epoch 1/1
 - 2554s - loss: 0.5139 - dense_16_loss: 0.4323 - dense_17_loss: 0.0815


In [67]:
data = np.arange(8).reshape((4,-1))
np.average(data, axis=1, weights=[1./4, 3./4])

array([0.75, 2.75, 4.75, 6.75])

In [70]:
np.average(data, axis=1, weights=[2./4, 2./4])

array([0.5, 2.5, 4.5, 6.5])

In [71]:
np.average(data, axis=1)

array([0.5, 2.5, 4.5, 6.5])

In [68]:
data

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [42]:
(797+944+1461+1664+1921+2124+2361+2554)/60/60

3.8405555555555555

In [50]:
checkpoint_predictions

[array([0.27149367, 0.25406474, 0.2540115 , ..., 0.27283764, 0.2295815 ,
        0.16598171], dtype=float32),
 array([0.11059389, 0.10830662, 0.30880198, ..., 0.20929837, 0.10653675,
        0.0228349 ], dtype=float32),
 array([0.10825571, 0.21817312, 0.47729072, ..., 0.47704375, 0.1608783 ,
        0.0623906 ], dtype=float32),
 array([0.04029423, 0.10366952, 0.27328348, ..., 0.40838522, 0.06009936,
        0.02198204], dtype=float32),
 array([0.31031466, 0.30524558, 0.32145077, ..., 0.33725137, 0.28382364,
        0.20742115], dtype=float32),
 array([0.24327421, 0.25667733, 0.5568447 , ..., 0.44904125, 0.29117572,
        0.10760072], dtype=float32),
 array([0.16006488, 0.32627714, 0.5602039 , ..., 0.61189336, 0.25650308,
        0.18900347], dtype=float32),
 array([0.05497602, 0.12771028, 0.38922197, ..., 0.5216385 , 0.10533962,
        0.03966653], dtype=float32)]

In [51]:
weights

[1, 2, 4, 8, 1, 2, 4, 8]

just run a demo with 10000 train_data spend nearly 4 hours...

In [40]:
submission = pd.DataFrame.from_dict({
    'id': test_df.id,
    'prediction': predictions
})
submission.to_csv('submission.csv', index=False)