# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import tensorflow as tf
import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, Bidirectional, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
import re
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of each file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Names of the label columns selected from the training frame.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Training split: comment text plus one label column per entry in `classes`.
data = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
# read_csv parses cells such as "NA" or "null" as NaN; swap those for an empty
# string so the regex-based preprocess() below always receives a str.
train_text = data['comment_text'].fillna('').str.lower()
train_label = data[classes]

# Test split: normalised exactly like the training text (NaN-safe, lower-cased).
data_test = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
test_text = data_test['comment_text'].fillna('').str.lower()
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

def preprocess(text):
    """Reduce a raw comment to space-separated alphabetic words.

    Args:
        text: The comment as a ``str``.

    Returns:
        The text with every non-letter character turned into a space, single
        letters that sit between whitespace removed (matches do not overlap,
        so a run of consecutive single letters is only partly removed),
        whitespace runs collapsed to one space, and no leading or trailing
        whitespace.
    """
    # Substitute a space, not the empty string: deleting the separators would
    # glue every word of the comment into one token and leave the two
    # whitespace-based passes below with nothing to match.
    txt = re.sub(r'[^a-zA-Z]', ' ', text)
    # Drop stray single letters (e.g. the "t" left over from "don't").
    txt = re.sub(r"\s+[a-zA-Z]\s+", ' ', txt)
    # Collapse the whitespace runs created by the substitutions above.
    txt = re.sub(r"\s+", ' ', txt)
    return txt.strip()
        
# Clean every comment with preprocess() before tokenising.
train_txt = [preprocess(text) for text in train_text]
test_txt = [preprocess(text) for text in test_text]

# Hold out 25% of the labelled data; the fixed seed keeps the split reproducible.
y = train_label.values
x_train, x_test, y_train, y_test = train_test_split(train_txt, y, test_size=0.25, random_state=45)

# Build the vocabulary from the training portion only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
test = tokenizer.texts_to_sequences(test_txt)

# The model's Input layer is declared with shape (250,), so every set fed to it
# must be padded/truncated to 250. The submission set used to be padded to 200,
# which does not match that input shape at predict time.
MAX_LEN = 250
x_train = pad_sequences(x_train, padding='post', maxlen=MAX_LEN)
x_test = pad_sequences(x_test, padding='post', maxlen=MAX_LEN)
test_pad = pad_sequences(test, padding='post', maxlen=MAX_LEN)

# One embedding row per tokenizer index, plus row 0 (the Keras Tokenizer never
# assigns index 0 to a word, and pad_sequences pads with 0 by default).
vocab_size = len(tokenizer.word_index) + 1

# Parse the GloVe text file: each line is a word followed by its vector values.
# The context manager closes the file even if a line fails to parse.
word_embedding = dict()
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt', encoding='utf8') as glove:
    for line in glove:
        word, *values = line.split()
        word_embedding[word] = np.asarray(values, dtype='float32')

# Copy the pretrained vector of every vocabulary word that GloVe knows;
# words without a pretrained vector keep an all-zero row.
matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = word_embedding.get(word)
    if embedding_vector is not None:
        matrix[index] = embedding_vector
        
# Frozen pretrained embedding -> bidirectional LSTM -> one output unit per label.
# Named `inputs` so the builtin input() is not shadowed; the sequence length is
# taken from the padded training array so the two cannot drift apart.
inputs = Input(shape=(x_train.shape[1],))
x = Embedding(vocab_size, 100, weights=[matrix], trainable=False)(inputs)
x = Bidirectional(LSTM(64, return_sequences=False))(x)
# Sigmoid rather than softmax: binary_crossentropy scores each output as its
# own yes/no target, whereas softmax would force the label scores to sum to 1
# and so could never rate several labels (or none) as likely at once.
output = Dense(len(classes), activation='sigmoid')(x)
model = Model(inputs, output)
model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
# validation_split holds back a further 25% of x_train for per-epoch validation.
history = model.fit(x_train, y_train, batch_size=150, epochs=10, verbose=1, validation_split=0.25)

# Score the padded submission set in batches of 100 and show the raw predictions.
y_pred = model.predict(test_pad, batch_size=100, verbose=1)
print(y_pred)

# Comment ids of the test set as a single-column 2-D array.
test_ids = data_test['id'].values.reshape(-1, 1)

# Start from the sample submission, overwrite its label columns with the
# predictions, then display and save the result.
submit_file = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
)
submit_file[classes] = y_pred
print(submit_file)
submit_file.to_csv('submission_2.csv', index=False)