In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing import text,sequence
from tqdm import tqdm
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from fastai.train import Learner
from fastai.train import DataBunch
from fastai.callbacks import *
from fastai.metrics import *
import os
import random

Using TensorFlow backend.


In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [3]:
train = pd.read_csv('../input/incedohackerearth/incedo_nlpcadad7d/incedo_participant/train_dataset.csv')
test = pd.read_csv('../input/incedohackerearth/incedo_nlpcadad7d/incedo_participant/test_dataset.csv')
submission = pd.read_csv('../input/incedohackerearth/incedo_nlpcadad7d/incedo_participant/sample_submission.csv')

In [4]:
def preprocess(data):
    '''
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
    '''
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    def clean_special_chars(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text

    data = data.astype(str).apply(lambda x: clean_special_chars(x, punct))
    return data

In [5]:
train_text = preprocess(train['EssayText'])
test_text = preprocess(test['EssayText'])

In [6]:
MAX_LEN = max([len(x.split()) for x in train_text] + [len(x.split()) for x in test_text])
MAX_LEN

334

In [7]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(train_text) + list(test_text))

x_train = tokenizer.texts_to_sequences(train_text)
x_test = tokenizer.texts_to_sequences(test_text)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [8]:
train[['coherent','clarity']] = train[['coherent','clarity']].fillna('Unknown')
test[['coherent','clarity']] = test[['coherent','clarity']].fillna('Unknown')

In [9]:
rev = lambda x : {v:k for k,v in x.items()}
index_1 = rev(dict(enumerate(list(train['clarity'].value_counts().keys()))))
index_2 = rev(dict(enumerate(list(train['coherent'].value_counts().keys()))))

train['clarity'] = train['clarity'].apply(lambda x : index_1[x])
train['coherent'] = train['coherent'].apply(lambda x : index_2[x])
test['clarity'] = test['clarity'].apply(lambda x : index_1[x])
test['coherent'] = test['coherent'].apply(lambda x : index_2[x])

In [10]:
train['Essayset'] = train['Essayset'].fillna(-1)
test['Essayset'] = test['Essayset'].fillna(-1)

In [11]:
score = []

for _,row in train.iterrows():
    total = 5
    total -= 1 if row['score_1'] == np.nan else 0
    total -= 1 if row['score_2'] == np.nan else 0
    total -= 1 if row['score_3'] == np.nan else 0
    total -= 1 if row['score_4'] == np.nan else 0
    total -= 1 if row['score_5'] == np.nan else 0
    row = row.fillna(0)
    raw = (row['score_1'] +row['score_2'] +row['score_3'] +row['score_4'] +row['score_5'])/total
    score.append(raw)
score = np.asarray(score)

In [12]:
score.min(),score.max(),score.mean()

(0.0, 3.0, 0.9460717009916095)

In [13]:
LSTM_UNITS = 75
DENSE_HIDDEN_UNITS = 2*LSTM_UNITS

In [14]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x
    
class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix):
        super(NeuralNet, self).__init__()
        embed_size = embedding_matrix.shape[1]
        
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)
    
        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear3 = nn.Linear(4, DENSE_HIDDEN_UNITS)
        
        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        
    def forward(self, x):
        inp1 = x[:,:-4].clone().detach().long()
        inp2 = x[:,-4:].clone().detach()
        h_embedding = self.embedding(inp1)
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        pool = torch.mean(h_lstm2, 1)
        poool,_ = torch.max(h_lstm2, 1)
        
        h_conc_linear1  = F.relu(self.linear1(pool))
        h_conc_linear2  = F.relu(self.linear2(poool))
        h_conc_linear3  = F.relu(self.linear3(inp2))        

        hidden = h_conc_linear1 + h_conc_linear2  + h_conc_linear3
        
        result = self.linear_out(hidden)
        out = torch.zeros_like(result)
        
        for i in range(len(out)):
            out[i,0] = torch.clamp(result[i,0],max=inp2[i,3])
        
        return out

In [15]:
import gensim
import re
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")

# https://www.kaggle.com/cpmpml/spell-checker-using-word2vec
fast_text = gensim.models.KeyedVectors.load_word2vec_format('../input/wikinews300d1mvec/wiki-news-300d-1M.vec')
words = fast_text.index2word
w_rank = {}
for i,word in enumerate(words):
    w_rank[word] = i
WORDS = w_rank
def get_ft_word(word):
    try:
        return fast_text.get_vector(word)
    except:
        return None

# Use fast text as vocabulary
def words(text): return re.findall(r'\w+', text.lower())
def P(word): 
    "Probability of `word`."
    # use inverse of rank as proxy
    # returns 0 if the word isn't in the dictionary
    return - WORDS.get(word, 0)
def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)
def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or [word])
def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)
def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
def singlify(word):
    return "".join([letter for i,letter in enumerate(word) if i == 0 or letter != word[i-1]])

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def load_glove(word_dict):
    EMBEDDING_FILE = '../input/glove840b300dtxt/glove.840B.300d.txt'
    embeddings_index = load_embeddings(EMBEDDING_FILE)
    embed_size = 300
    nb_words = len(word_dict)+1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.zeros((embed_size,), dtype=np.float32) - 1.
    print(unknown_vector[:5])
    for key in tqdm(word_dict):
        word = key
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.lower()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.upper()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.capitalize()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = ps.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = lc.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = sb.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        if len(key) > 1:
            word = correction(key)
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[word_dict[key]] = embedding_vector
                continue
        embedding_matrix[word_dict[key]] = unknown_vector  
        nb_words -= 1
    return embedding_matrix, nb_words

def load_fasttext(word_dict):
    embed_size = 300
    nb_words = len(word_dict)+1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.zeros((embed_size,), dtype=np.float32) - 1.
    print(unknown_vector[:5])
    for key in tqdm(word_dict):
        word = key
        embedding_vector = get_ft_word(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.lower()
        embedding_vector = get_ft_word(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.upper()
        embedding_vector = get_ft_word(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.capitalize()
        embedding_vector = get_ft_word(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = ps.stem(key)
        embedding_vector = get_ft_word(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = lc.stem(key)
        embedding_vector = get_ft_word(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = sb.stem(key)
        embedding_vector = get_ft_word(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        if len(key) > 1:
            word = correction(key)
            embedding_vector = get_ft_word(word)
            if embedding_vector is not None:
                embedding_matrix[word_dict[key]] = embedding_vector
                continue
        embedding_matrix[word_dict[key]] = unknown_vector
        nb_words -= 1
    return embedding_matrix, nb_words 

In [16]:
max_features = len(tokenizer.word_index) + 1
max_features

18799

In [17]:
matrix_1, words = load_glove(tokenizer.word_index)
print('words: ', words)
matrix_2, words = load_fasttext(tokenizer.word_index)
print('words: ', words)

2196018it [02:51, 12834.67it/s]
 32%|███▏      | 5987/18798 [00:00<00:00, 59869.06it/s]

[-1. -1. -1. -1. -1.]


100%|██████████| 18798/18798 [00:01<00:00, 11630.51it/s]
 29%|██▊       | 5388/18798 [00:00<00:00, 53759.44it/s]

words:  16978
[-1. -1. -1. -1. -1.]


100%|██████████| 18798/18798 [00:02<00:00, 8748.31it/s]

words:  16908





In [18]:
matrix = np.concatenate((matrix_1,matrix_2),axis=1)
matrix.shape

(18799, 600)

In [19]:
x_train = np.hstack((x_train,train['coherent'][:,np.newaxis],train['clarity'][:,np.newaxis],train['Essayset'][:,np.newaxis], train['max_score'][:,np.newaxis]))
x_test = np.hstack((x_test,test['coherent'][:,np.newaxis],test['clarity'][:,np.newaxis],test['Essayset'][:,np.newaxis], test['max_score'][:,np.newaxis]))

In [20]:
x_train_torch = torch.tensor(x_train, dtype=torch.float32)
y_train_torch = torch.tensor(score[:,np.newaxis], dtype=torch.float32)

In [21]:
x_test_torch = torch.tensor(x_test, dtype=torch.float32)

In [22]:
batch_size = 512

train_dataset = data.TensorDataset(x_train_torch[batch_size:], y_train_torch[batch_size:])
valid_dataset = data.TensorDataset(x_train_torch[:batch_size], y_train_torch[:batch_size])
test_dataset = data.TensorDataset(x_test_torch)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,num_workers=2)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

databunch = DataBunch(train_dl=train_loader,valid_dl=valid_loader)

In [23]:
model = NeuralNet(matrix)
learn = Learner(databunch,model=model,loss_func=nn.L1Loss())

In [24]:
learn.fit_one_cycle(5,max_lr=1e-2)

epoch,train_loss,valid_loss,time
0,0.463114,0.448623,00:09
1,0.376793,0.410535,00:09
2,0.327455,0.361204,00:09
3,0.293735,0.344698,00:09
4,0.272296,0.345047,00:09


In [25]:
learn.fit_one_cycle(20,max_lr=1e-2)

epoch,train_loss,valid_loss,time
0,0.24743,0.345446,00:09
1,0.245114,0.359905,00:09
2,0.24255,0.33288,00:09
3,0.248319,0.327566,00:09
4,0.246187,0.3221,00:09
5,0.244638,0.323693,00:09
6,0.233423,0.33124,00:09
7,0.231161,0.303655,00:09
8,0.221937,0.309278,00:09
9,0.210581,0.302956,00:09


In [26]:
test_preds = np.zeros((len(test), 1))    
for i, x_batch in enumerate(test_loader):
    X = x_batch[0].cuda()
    y_pred = learn.model(X).detach().cpu().numpy()
    test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

In [27]:
col = []

for idx,row in test.iterrows():
    if test_preds[idx][0] <= row['max_score']:
        col.append(test_preds[idx][0])
    else:
        print('bs')
        col.append(row['max_score'])
col = np.asarray(col)
test['essay_score'] = col.round().astype(np.int8)
test.drop(columns=['min_score','max_score','clarity','coherent','EssayText'],inplace=True)

In [28]:
test.columns = submission.columns
test

Unnamed: 0,id,essay_set,essay_score
0,1673,1,1
1,1674,1,1
2,1675,1,3
3,1676,1,0
4,1677,1,0
5,1678,1,2
6,1679,1,1
7,1680,1,1
8,1681,1,3
9,1682,1,2


In [29]:
test.to_csv('submission.csv',index=False)