In [1]:
import numpy as np
from preprocess import load_data,load_dict
import pandas as pd

train_path = './essay_dataset/training_set_rel3.tsv'

# Build the training frame in a single non-mutating chain:
#   1. drop rows missing the essay set, the domain-1 score or the essay text,
#   2. keep only the columns used later in the notebook,
#   3. make the regression target float64.
# Chaining avoids `inplace=True` and avoids assigning a column on a frame that
# was produced by column slicing (the usual source of SettingWithCopyWarning).
training_data = (
    load_data(train_path)
    .dropna(subset=['essay_set', 'domain1_score', 'essay'], how='any')
    .loc[:, ['essay_set', 'domain1_score', 'domain2_score', 'essay']]
    .astype({'domain1_score': 'float64'})
)

# NOTE(review): the '.pkl' suffix suggests load_dict unpickles this file;
# if so, only load embedding files from a trusted source.
glove_dict = load_dict('glove.840B.300d.pkl')
print(len(training_data))

12976


In [2]:
from string import punctuation
# Translation table that pads every ASCII punctuation mark with a space on each
# side, so that splitting on whitespace turns each mark into its own token.
pad_punctuation = str.maketrans({mark: ' ' + mark + ' ' for mark in punctuation})

# word2ind maps each distinct lower-cased token to a consecutive integer id,
# assigned in order of first appearance across the training essays.
word2ind = {}
for essay_text in training_data.essay:
    for token in essay_text.lower().translate(pad_punctuation).split():
        # setdefault only inserts unseen tokens; len() is evaluated before the
        # insert, so ids run 0, 1, 2, ...
        word2ind.setdefault(token, len(word2ind))
      

In [3]:
# Vocabulary size: number of distinct tokens that received an id in word2ind.
len(word2ind)

39759

In [4]:
# Validation essays, read with the project loader.
valid_path = './essay_dataset/valid_set.tsv'
valid_data = load_data(valid_path)

# Per-prediction-id scores for the validation essays, stored in a separate CSV.
# NOTE(review): the file is named "sample_submission" - confirm it holds real
# scores rather than placeholder predictions before trusting validation loss.
valid_label_path = './essay_dataset/valid_sample_submission_2_column.csv'
valid_label = pd.read_csv(valid_label_path)

In [5]:
# Lookup table: prediction id -> score, taken from the label CSV.
label_dict = dict(zip(valid_label.prediction_id, valid_label.predicted_score))

# Every validation essay must have a score. The previous per-row dict lookup
# raised KeyError on an unknown id; keep that contract, but report the ids.
prediction_ids = valid_data['domain1_predictionid']
unmatched_ids = prediction_ids[~prediction_ids.isin(list(label_dict))]
if not unmatched_ids.empty:
    raise KeyError(
        'no score found for prediction ids: %s' % unmatched_ids.unique()[:10].tolist()
    )

# Attach the scores as a float64 `domain1_score` column in one vectorized step
# (no row-by-row writes into the id column), keep only the columns used later,
# and convert the index labels to strings as the original rename(index=str) did.
valid_data = (
    valid_data
    .assign(domain1_score=prediction_ids.map(label_dict).astype('float64'))
    .loc[:, ['essay_set', 'essay', 'domain1_score']]
    .rename(index=str)
)

In [6]:
def process_scores(data, score_domain):
    """Rescale the scores in ``data[score_domain]`` according to each row's essay set.

    The transformation applied per ``essay_set`` value is:

    ====== =====================
    set    new score
    ====== =====================
    1      ``score - 2``
    2      ``(score - 1) * 2``
    3, 4   ``score / 3.0 * 10``
    5, 6   ``score / 4.0 * 10``
    7      ``score / 3.0``
    8      ``score / 6.0``
    other  unchanged
    ====== =====================

    Presumably this brings every prompt onto a common scale - confirm against
    the score ranges of the dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``essay_set`` column and the ``score_domain`` column.
    score_domain : str
        Name of the score column to rescale.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object; the score column is overwritten in place and
        is always float64 afterwards.

    Notes
    -----
    The column that is read is also the column that is written, so calling
    this twice on the same frame rescales the already-rescaled scores.
    """
    # (essay sets, rescaling function) pairs; the sets are disjoint, so the
    # order in which the rules are applied does not matter.
    rescaling_rules = (
        ((1,), lambda s: s - 2),
        ((2,), lambda s: (s - 1) * 2),
        ((3, 4), lambda s: s / 3.0 * 10),
        ((5, 6), lambda s: s / 4.0 * 10),
        ((7,), lambda s: s / 3.0),
        ((8,), lambda s: s / 6.0),
    )

    essay_sets = data['essay_set']
    # Cast up front so integer score columns cannot truncate fractional results.
    raw_scores = data[score_domain].astype('float64')

    rescaled = raw_scores.copy()
    for set_ids, rescale in rescaling_rules:
        # Each rule is computed on the whole column and selected by mask, which
        # replaces the per-row iterrows()/.at loop with vectorized operations.
        rescaled = rescaled.mask(essay_sets.isin(list(set_ids)), rescale(raw_scores))

    data[score_domain] = rescaled
    return data

In [7]:
# Rescale domain1_score in both splits with the per-essay-set rules of process_scores.
# NOTE: process_scores overwrites the column it reads, so running this cell a
# second time rescales the already-rescaled scores - run it exactly once per
# freshly loaded frame.
training_data = process_scores(training_data, 'domain1_score')
valid_data = process_scores(valid_data, 'domain1_score')

In [19]:
from configs import cfg

# Override selected entries of the shared cfg object; cfg is passed to both
# LSTM_Score and DataLoader further down in this notebook.
cfg['dropout'] = 0.005 # dropout rate between stacked LSTM layers (between 0 and 1); only useful when layers > 1
cfg['bidirectional'] = True # True -> use a bidirectional LSTM, False -> unidirectional
cfg['batch_size'] = 100 # batch size of input
cfg['learning_rate'] = 1e-4 # passed as lr= when the optimizer is created
cfg['L2_penalty'] = 1e-4 # weight of the L2 regularization term; passed as weight_decay= when the optimizer is created
cfg['epochs'] = 15 # number of epochs the training loop runs
cfg['embed'] = False # NOTE(review): meaning of 'embed' is not visible in this notebook - confirm in models/dataloader

In [20]:
from dataloader import *
from models import *
import time

# Build the scoring network and move it to the configured device in one step.
model = LSTM_Score(cfg, True).to(torch.device(cfg['device']))

# Mean-squared-error objective, optimized with Adam; the L2 penalty is applied
# through weight_decay. (An RMSprop variant with the same settings was tried
# and is no longer used.)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=cfg['learning_rate'],
    weight_decay=cfg['L2_penalty'],
)

# Every row of each frame is used: training covers all of training_data and
# validation covers all of valid_data. The earlier train_valid_split over
# training_data (and saving of its indices) is disabled.
train_indices = [*range(len(training_data))]
valid_indices = [*range(len(valid_data))]
print('ready')

ready


In [None]:
 for epoch in range(cfg['epochs']):
    # Fresh loaders every epoch.
    # NOTE(review): presumably DataLoader is a single-pass iterator (has_next /
    # get_next) - confirm in dataloader.py.
    tloader = DataLoader(training_data, train_indices, cfg, glove_dict)
    vloader = DataLoader(valid_data, valid_indices, cfg, glove_dict)

    # Number of batches between progress reports / checkpoints.
    report_every = 20

    # ---- training pass ----
    # Put the model in training mode so dropout is active; the validation pass
    # below switches it off again.
    model.train()
    count = 0
    avg_loss = 0
    while tloader.has_next():
        train, label = tloader.get_next()

        model.zero_grad()
        y = model(train)
        loss = criterion(y, label)
        loss.backward()
        optimizer.step()

        count += 1
        avg_loss += loss.item()
        if count % report_every == 0:
            # Mean loss over the last `report_every` batches, then checkpoint.
            print("count = %d, loss = %.5f" %(count, avg_loss / report_every))
            save_model(model, 'models_score/e' + str(epoch + 1) + 'b' + str(count) + '.pt')
            avg_loss = 0
        # Drop references to this batch's tensors before fetching the next one.
        del train, label, y, loss

    # ---- validation pass ----
    # Evaluation mode disables dropout, so the reported loss is deterministic
    # and comparable between epochs; no_grad() skips building autograd graphs.
    model.eval()
    count = 0
    avg_loss = 0
    with torch.no_grad():
        while vloader.has_next():
            train, label = vloader.get_next()
            y = model(train)
            loss = criterion(y, label)
            count += 1
            avg_loss += loss.item()
            del train, label, y, loss
    # Mean of the per-batch losses; nan (instead of ZeroDivisionError) when the
    # validation loader produced no batch.
    print('validation loss:', avg_loss / count if count else float('nan'))
    print('epoch finished:', epoch + 1)
    

count = 20, loss = 34.54701
count = 40, loss = 32.89048
count = 60, loss = 31.93664
count = 80, loss = 31.66302
count = 100, loss = 31.77691
count = 120, loss = 31.14269
validation loss: 30.69636903490339
epoch finished: 1
count = 20, loss = 30.64656
count = 40, loss = 30.23009
count = 60, loss = 28.48638
count = 80, loss = 28.53532
count = 100, loss = 27.80485
count = 120, loss = 26.82688
validation loss: 27.483848390125093
epoch finished: 2
count = 20, loss = 26.54094
count = 40, loss = 26.59741
count = 60, loss = 25.24757
count = 80, loss = 26.23231
count = 100, loss = 24.26094
count = 120, loss = 25.03706
validation loss: 24.546043305169967
epoch finished: 3
count = 20, loss = 23.83122
count = 40, loss = 23.77767
count = 60, loss = 23.34849
count = 80, loss = 22.39374
count = 100, loss = 22.99848
count = 120, loss = 22.09768


In [18]:
# Inspect the residuals (label - prediction) of the first validation batch only.
# NOTE: this changes the shared cfg; batch_size stays 5 for any cell run
# afterwards until the configuration cell is executed again.
cfg['batch_size'] = 5
vloader = DataLoader(valid_data, valid_indices, cfg, glove_dict)
count = 0
avg_loss = 0

# Evaluate with dropout disabled, then restore whatever mode the model was in
# so that re-running the training cell afterwards is unaffected.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        # Only one batch is examined (the original loop broke after its first
        # iteration), so the printed loss is for that single batch.
        if vloader.has_next():
            train, label = vloader.get_next()
            y = model(train)
            print(label - y)
            loss = criterion(y, label)
            count += 1
            avg_loss += loss.item()
            del train, label, y, loss
finally:
    model.train(was_training)

# nan (instead of ZeroDivisionError) when the loader produced no batch.
print('validation loss:', avg_loss / count if count else float('nan'))

tensor([[ 0.4052],
        [ 4.6292],
        [ 6.1769],
        [-1.2990],
        [ 0.8844]], device='cuda:0')
validation loss: 12.443446159362793
