In [1]:
!wget https://datahack-prod.s3.amazonaws.com/test_file/test_HLxMpl7.zip
!wget https://datahack-prod.s3.amazonaws.com/train_file/train_mddNHeX.zip
!wget https://datahack-prod.s3.amazonaws.com/sample_submission/sample_submission_J0OjXLi_DDt3uQN.csv

--2020-07-22 05:53:45--  https://datahack-prod.s3.amazonaws.com/test_file/test_HLxMpl7.zip
Resolving datahack-prod.s3.amazonaws.com (datahack-prod.s3.amazonaws.com)... 52.219.64.48
Connecting to datahack-prod.s3.amazonaws.com (datahack-prod.s3.amazonaws.com)|52.219.64.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2686933 (2.6M) [application/zip]
Saving to: ‘test_HLxMpl7.zip’


2020-07-22 05:53:47 (1.57 MB/s) - ‘test_HLxMpl7.zip’ saved [2686933/2686933]

--2020-07-22 05:53:51--  https://datahack-prod.s3.amazonaws.com/train_file/train_mddNHeX.zip
Resolving datahack-prod.s3.amazonaws.com (datahack-prod.s3.amazonaws.com)... 52.219.62.80
Connecting to datahack-prod.s3.amazonaws.com (datahack-prod.s3.amazonaws.com)|52.219.62.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4566884 (4.4M) [application/zip]
Saving to: ‘train_mddNHeX.zip’


2020-07-22 05:53:54 (2.37 MB/s) - ‘train_mddNHeX.zip’ saved [4566884/4566884]

--2020-07-22 05:53

In [2]:
!unzip train_mddNHeX.zip
!unzip test_HLxMpl7.zip

Archive:  train_mddNHeX.zip
  inflating: challenge_data.csv      
  inflating: train.csv               
Archive:  test_HLxMpl7.zip
  inflating: test.csv                
   creating: __MACOSX/
  inflating: __MACOSX/._test.csv     


In [3]:
# Load Libraries
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,Input,BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint

# Do not truncate long string columns when a DataFrame is displayed.
# `None` is the supported way to lift the width limit; passing -1 is deprecated
# since pandas 1.0.
pd.set_option('display.max_colwidth', None)

Using TensorFlow backend.
  app.launch_new_instance()


In [5]:
# Read the extracted CSV files into DataFrames (same order as they are named here).
train, chal, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'challenge_data.csv', 'test.csv')
)

In [6]:
# Create labels: the rows with challenge_sequence > 10 supply the target
# challenge, exposed under the column name `label`.
label = (
    train.loc[train['challenge_sequence'] > 10, ['user_id', 'challenge']]
    .rename(columns={'challenge': 'label'})
)

In [7]:
# Treat the sequence of challenges as text: for every user, join the challenge
# values of the rows with challenge_sequence <= 10 into one space-separated string.
df = (
    train.loc[train['challenge_sequence'] <= 10]
    .groupby('user_id')['challenge']
    .agg(' '.join)
    .reset_index()
)

In [8]:
# Attach the labels to the per-user sequences (joins on the columns the two frames share).
df = pd.merge(df, label)

In [9]:
# Validation split for early stopping: shuffle the rows with a fixed seed,
# then hold out 5% of them for validation.
df_train, df_validation = train_test_split(
    df.sample(frac=1, random_state=123),
    test_size=0.05,
    random_state=123,
)


In [10]:
# Encode challenges: map every challenge ID to an integer class index.
# The challenge table was loaded into `chal`.
encoder = LabelEncoder()
encoder.fit(chal['challenge_ID'])
# `assign` returns new frames, so the encoded column is not written into a slice
# of `df` (which would raise SettingWithCopyWarning).
df_train = df_train.assign(brand_id_encoded=encoder.transform(df_train['label']))
df_validation = df_validation.assign(brand_id_encoded=encoder.transform(df_validation['label']))

In [11]:
# Tokenize text: the vocabulary is learned from the training sequences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=df_train.challenge)

In [12]:
# Constants
# Tokenizer word indices start at 1 (0 is reserved and is also the padding value),
# so the Embedding layer needs room for len(word_index) + 1 distinct ids.
NB_WORDS = len(tokenizer.word_index) + 1
# Sequences are padded/truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 10
# One output unit per class known to the label encoder.
N_CATEGORIES = len(encoder.classes_)

In [14]:
# Create sequences: turn every challenge string into a list of token ids.
sequences_train, sequences_validation = (
    tokenizer.texts_to_sequences(frame['challenge'])
    for frame in (df_train, df_validation)
)

In [15]:
# Pad sequences: bring every sequence to MAX_SEQUENCE_LENGTH, padding and
# truncating at the end.
x_train, x_validation = (
    pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    for token_ids in (sequences_train, sequences_validation)
)


In [20]:
# Set Labels: integer class indices as plain arrays for Keras.
y_train, y_validation = (
    frame['brand_id_encoded'].values
    for frame in (df_train, df_validation)
)

In [16]:
# NN architecture
def get_model(path='', lr=0.001):
    """Build and compile the bidirectional-LSTM sequence classifier.

    Args:
        path: Weights file to load before compiling; the default '' skips loading.
        lr: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` whose input has shape (MAX_SEQUENCE_LENGTH,)
        and whose output is a softmax over N_CATEGORIES units.
    """
    optimizer = Adam(lr=lr)

    # Embedding -> batch norm -> BiLSTM -> dropout -> softmax.
    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH, ))
    embedded = Embedding(NB_WORDS, 1024)(token_ids)
    normalized = BatchNormalization()(embedded)
    encoded = Bidirectional(LSTM(512, dropout=0.1, recurrent_dropout=0.1))(normalized)
    regularized = Dropout(0.4)(encoded)
    class_probs = Dense(N_CATEGORIES, activation="softmax")(regularized)

    model = Model(inputs=token_ids, outputs=class_probs)
    if path != '':
        # Restore previously saved weights before compiling.
        model.load_weights(path)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return model

In [17]:
# Initialize the model: an empty path means no weights file is loaded.
model = get_model(path='')

In [18]:
path = 'best_model_weights'
es_callback = EarlyStopping(monitor="val_loss", patience=5)
mc_callback = ModelCheckpoint('{}.hdf5'.format(path), monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='auto', period=1)
callbacks = [es_callback,mc_callback]

In [None]:
# Fit the model: at most 15 epochs; the callbacks handle early stopping and
# checkpointing against the validation set.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=4096,
    epochs=15,
    callbacks=callbacks,
    validation_data=(x_validation, y_validation),
)

In [None]:
# Load best weights: rebuild the network and restore the checkpoint written during training.
model = get_model(path='{}.hdf5'.format(path))

In [None]:
# Test preprocessing
def padding(text):
    """Tokenize `text` with the fitted tokenizer and pad/truncate (at the end) to MAX_SEQUENCE_LENGTH."""
    token_sequences = tokenizer.texts_to_sequences(text)
    return pad_sequences(token_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Same per-user "sentence" construction as for the training data.
test_text = (
    test.loc[test['challenge_sequence'] <= 10]
    .groupby('user_id')['challenge']
    .agg(' '.join)
    .reset_index()
)
x_test = padding(test_text['challenge'])

In [None]:
# Get top 3 predictions for each user
pred = model.predict(x_test, batch_size=4096)
# argsort is ascending, so walk each row backwards from the end and stop after
# three columns: the three highest-scoring class indices, best first.
pred = np.argsort(pred, axis=1)[:, :-4:-1]

In [None]:
# Write Predictions
# One frame per predicted rank: rank i is written with the suffix i + 11
# (user_sequence = "<user_id>_11", "_12", "_13").
brain = []
for i in range(3):
    # Build each frame directly instead of adding columns to a slice of
    # `test_text`, which triggers SettingWithCopyWarning.
    test_11 = pd.DataFrame({
        'user_sequence': test_text['user_id'].astype(str) + '_' + str(i + 11),
        # Map the predicted class indices back to challenge IDs.
        'challenge': encoder.inverse_transform(pred[:, i]),
    })
    brain.append(test_11)
pd.concat(brain).to_csv('final_csv.csv', index=False)