In [None]:
#!pip install keras_self_attention

In [1]:
# Load Libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.layers.recurrent import LSTM, GRU,SimpleRNN

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model , Sequential
from keras.layers import Dense,Embedding,Dropout,Bidirectional,Input,BatchNormalization
from keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
#from keras_self_attention import SeqSelfAttention
import tensorflow as tf


from numpy.random import seed
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
def reset_random_seeds():
   os.environ['PYTHONHASHSEED']=str(42)
   np.random.seed(42)
reset_random_seeds()
np.random.get_state()[1][0]

42

In [3]:
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [4]:
import os
print(os.listdir("../input"))

['test.csv', 'challenge_data.csv', 'train.csv']


In [5]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
train.shape , test.shape

((903916, 4), (397320, 4))

In [6]:
# Create labels
label = train[train.challenge_sequence > 10][['user_id','challenge']]
label.rename(columns={'challenge':'label'},inplace=True)

label.head()

Unnamed: 0,user_id,label
10,4576,CI24958
11,4576,CI23667
12,4576,CI23691
23,4580,CI24915
24,4580,CI25727


In [7]:
# Treat the sequence of challenges as text
df = train[train.challenge_sequence <= 10].groupby('user_id').challenge.aggregate(lambda x: ' '.join(x)).reset_index()
df.shape

(69532, 2)

In [8]:
df.head()

Unnamed: 0,user_id,challenge
0,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...
1,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...
2,4581,CI26155 CI26156 CI26157 CI26158 CI26159 CI2616...
3,4582,CI23855 CI24915 CI24917 CI23933 CI23663 CI2495...
4,4585,CI23855 CI23975 CI24917 CI25135 CI23848 CI2371...


In [9]:
# Merge Labels
df = df.merge(label)
df.head()

Unnamed: 0,user_id,challenge,label
0,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI24958
1,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI23667
2,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI23691
3,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...,CI24915
4,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...,CI25727


In [10]:
df.label.value_counts()

CI25135    1191
CI23848    1169
CI24958    1105
CI23663    1089
CI23714    1070
           ... 
CI25478       1
CI28524       1
CI25226       1
CI26326       1
CI27713       1
Name: label, Length: 4538, dtype: int64

In [11]:
# Validation split for early stopping
df_train, df_validation = train_test_split(df.sample(frac=1), test_size=0.1, shuffle=True)
df_train.shape , df_validation.shape

((187736, 3), (20860, 3))

In [12]:
df_train.head()

Unnamed: 0,user_id,challenge,label
40014,25639,CI23763 CI23765 CI23909 CI23956 CI23703 CI2391...,CI23924
76486,44635,CI24188 CI24527 CI24958 CI24531 CI24915 CI2418...,CI24052
142104,79141,CI26886 CI26889 CI26898 CI26899 CI26900 CI2690...,CI26221
45106,28263,CI28173 CI28177 CI28182 CI28190 CI28188 CI2818...,CI28194
107324,60864,CI25142 CI25143 CI25179 CI25180 CI25136 CI2517...,CI24440


In [13]:
# Load all the challenges
challenges = pd.read_csv('../input/challenge_data.csv')
challenges.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [14]:
# Encode challenges
encoder = LabelEncoder()
encoder.fit(challenges['challenge_ID'])
df_train['brand_id_encoded'] = encoder.transform(df_train.label)
df_validation['brand_id_encoded'] = encoder.transform(df_validation.label)
    
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts((df_train['challenge']))

df_train.head()

Unnamed: 0,user_id,challenge,label,brand_id_encoded
40014,25639,CI23763 CI23765 CI23909 CI23956 CI23703 CI2391...,CI23924,446
76486,44635,CI24188 CI24527 CI24958 CI24531 CI24915 CI2418...,CI24052,574
142104,79141,CI26886 CI26889 CI26898 CI26899 CI26900 CI2690...,CI26221,2743
45106,28263,CI28173 CI28177 CI28182 CI28190 CI28188 CI2818...,CI28194,4716
107324,60864,CI25142 CI25143 CI25179 CI25180 CI25136 CI2517...,CI24440,962


In [15]:
# Constants
NB_WORDS = len(tokenizer.word_index)+1
MAX_SEQUENCE_LENGTH = 10
N_CATEGORIES = challenges.shape[0]


print("NB_WORDS",NB_WORDS)
print("MAX_SQUENCE",MAX_SEQUENCE_LENGTH)
print("N_CATEGORIES",N_CATEGORIES)
    
# Create sequences
sequences_train = tokenizer.texts_to_sequences(df_train['challenge'])
sequences_validation = tokenizer.texts_to_sequences(df_validation['challenge'])
    
# Pad sequences
x_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
x_validation = pad_sequences(sequences_validation, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    
# Set Labels
y_train = df_train['brand_id_encoded'].values
y_validation= df_validation['brand_id_encoded'].values


x_train

NB_WORDS 4962
MAX_SQUENCE 10
N_CATEGORIES 5606


array([[ 730,  813,  581, ...,  476,  851,  872],
       [ 147,   15,    5, ...,   51,    1,   21],
       [ 956,  725,  416, ...,  217,  298,  211],
       ...,
       [   2,   75,  641, ...,  269, 1141, 1339],
       [ 173,   35,   42, ...,  133,   87,  122],
       [ 126,   61,   96, ...,   38,   26,  107]], dtype=int32)

In [16]:
x_train.shape

(187736, 10)

In [17]:
with strategy.scope():
    model = Sequential()
    model.add(Embedding(NB_WORDS,256,
                     input_length=MAX_SEQUENCE_LENGTH))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.7))
    model.add(Dense(N_CATEGORIES, activation='softmax'))
    

    model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 256)           1270272   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               142800    
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 5606)              566206    
Total params: 1,979,278
Trainable params: 1,979,278
Non-trainable params: 0
_________________________________________________________________


In [18]:
cd /kaggle/working/

/kaggle/working


In [19]:

    
# Model callbacks
path = 'best_model_weights'
es_callback = EarlyStopping(monitor="val_loss", patience=5,verbose=1)

mc_callback = ModelCheckpoint('{}.hdf5'.format(path), monitor='val_loss',
                              verbose=0, save_best_only=True, save_weights_only=True, mode='auto', period=1)
callbacks = [es_callback,mc_callback]

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 256)           1270272   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               142800    
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 5606)              566206    
Total params: 1,979,278
Trainable params: 1,979,278
Non-trainable params: 0
_________________________________________________________________


In [20]:
%%time
# Fit the model

model.fit(x_train,
              y_train,
              epochs=100,
              batch_size=64*strategy.num_replicas_in_sync,
              validation_data=(x_validation, y_validation),
              callbacks = callbacks
             )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 00077: early stopping
CPU times: user 2min 48s, sys: 31.1 s, total: 3min 19s
Wall time: 6min 31s


<tensorflow.python.keras.callbacks.History at 0x7f6f0b84dc10>

In [21]:
cd /kaggle/working/

/kaggle/working


In [22]:
# Test preprocessing
def padding(text):
    return pad_sequences(tokenizer.texts_to_sequences(text), maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
test_text = test[test.challenge_sequence <= 10].groupby('user_id').challenge.aggregate(lambda x: ' '.join(x)).reset_index()
x_test = padding(test_text.challenge)

# Get top 3 predictions for each user
pred = model.predict(x_test,batch_size=2048)
pred = pred.argsort(axis=1)[:,-3:][:,::-1]

In [None]:
# Test preprocessing
def padding(text):
    return pad_sequences(tokenizer.texts_to_sequences(text), maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
test_text = test[test.challenge_sequence <= 10].groupby('user_id').challenge.aggregate(lambda x: ' '.join(x)).reset_index()
x_test = padding(test_text.challenge)

# Get top 3 predictions for each user
pred = model.predict(x_test,batch_size=2048)
pred = pred.argsort(axis=1)[:,-3:][:,::-1]

In [None]:
pred.shape

In [23]:
df_list = []
for i in range(3):
    test_11 = test_text[['user_id']]
    test_11['user_sequence'] = test_11.user_id.astype(str) + '_'+str(i+11)
    test_11['challenge'] = encoder.inverse_transform(pred[:,i])
    df_list.append(test_11[['user_sequence','challenge']])
pd.concat(df_list).to_csv('bes2_submission.csv',index=False)

ValueError: Length of values does not match length of index

In [None]:
pwd

In [None]:
pd.concat(df_list).to_csv('submission.csv',index=False)

In [None]:
model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim=10000,
                                 output_dim=300,
                                 mask_zero=True))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128,
                                                       return_sequences=True)))
model.add(SeqSelfAttention(attention_activation='sigmoid'))
model.add(keras.layers.Dense(units=5))
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
model.summary()

In [None]:
with strategy.scope():
    # GRU with glove embeddings and two dense layers
     model = Sequential()
     model.add(Embedding(len(word_index) + 1,
                     300,
                     input_length=max_len,
                     trainable=False))
     model.add(SpatialDropout1D(0.3))
     model.add(GRU(300))
     model.add(Dense(1, activation='sigmoid'))

     model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])   
    