In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import backend as K

import torchtext

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, random_split

import csv
import os
import pathlib
import random

from numpy import random
from scipy import spatial

from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, AdamW, get_linear_schedule_with_warmup

from tqdm import tqdm, trange

In [3]:
# Load the word table; the columns used below are 'c1', 'c2' and 'cmp'.
data = pd.read_csv('/kaggle/input/embeddings/all_embeddings_forML.csv')

# Every distinct entry found in any of the three columns. np.unique flattens
# its input when no axis is given, so no explicit reshape is needed.
voc = np.unique(data[['c1', 'c2', 'cmp']].values)

In [4]:
# Plain Python lists of the three columns, kept in row order so that
# c1[i], c2[i] and compounds[i] all come from the same row of `data`.
c1, c2, compounds = (list(data[column]) for column in ('c1', 'c2', 'cmp'))

In [5]:
# GloVe word vectors loaded through torchtext (name='6B', dim=50).
# Later cells look up a word's vector by indexing: vocab[word].
vocab = torchtext.vocab.GloVe(name='6B', dim=50)

In [6]:
# Look up the GloVe vector of every first and second constituent.
x_c1, x_c2 = ([vocab[word] for word in words] for words in (c1, c2))

# X: one row per (c1, c2) pair, the c1 vector followed by the c2 vector.
X = tf.stack([
    tf.concat([left, right], axis=0)
    for left, right in zip(x_c1, x_c2)
])

# y: the GloVe vector of the corresponding entry in `compounds`.
# NOTE(review): how GloVe handles words missing from its vocabulary is not
# visible here - confirm that out-of-vocabulary compounds are acceptable.
y = tf.stack([vocab[word] for word in compounds])

In [7]:
class WordsDataset(Dataset):
    """Token-id sequences for a collection of words, encoded with the GPT-2 tokenizer.

    Every word is wrapped as ``<|startoftext|>{word}<|endoftext|>`` before being
    encoded, and the resulting ids are stored as one tensor per word. No padding
    is applied, so items may differ in length.

    NOTE(review): ``<|startoftext|>`` is never registered as a special token in
    this class, so the tokenizer presumably splits it into ordinary sub-word
    pieces - confirm that this is intended.
    """

    def __init__(self, raw):
        """Encode every word of ``raw`` (an iterable of strings) up front."""
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        encode = self.tokenizer.encode
        self.inputs = [
            torch.tensor(encode(f"<|startoftext|>{word}<|endoftext|>"))
            for word in raw
        ]
        self.count = len(self.inputs)

    def __len__(self):
        """Number of encoded words."""
        return self.count

    def __getitem__(self, item):
        """Return the token-id tensor stored at position ``item``."""
        return self.inputs[item]

In [8]:
# Fixed seed so the train/validation partition is identical on every
# Restart & Run All instead of changing with the global RNG state.
SPLIT_SEED = 42

# One training example per word: first constituents, second constituents and
# compounds are concatenated into a single list and encoded together.
dataset = WordsDataset(c1 + c2 + compounds)

# 90% of the examples for training, the remainder for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [9]:
# Settings handed to the Hugging Face Trainer further down.
args = TrainingArguments(
    # Output location and reporting
    output_dir='./results',
    report_to="none",
    logging_steps=500,
    save_steps=500,
    # Schedule
    num_train_epochs=5,
    warmup_steps=10,
    # Batch sizes
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Regularisation
    weight_decay=0.1,
)

In [10]:
# GPT-2 language-model-head network initialised from the 'gpt2' checkpoint.
# This is the object passed to Trainer and used for generation in later cells.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
# GPT-2 has no dedicated padding token, so the end-of-text id is reused as
# filler. Padded positions are masked out of both attention and the loss.
PAD_TOKEN_ID = model.config.eos_token_id


def collate_lm_batch(batch):
    """Pad variable-length token-id tensors into one causal-LM training batch.

    The dataset yields a single 1-D tensor of token ids per example, so the
    batch has to be padded here; indexing an example with [0] / [1] would only
    pick out its first and second token id.

    Args:
        batch: list of 1-D integer tensors of token ids, one per example.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each of shape
        (len(batch), longest sequence in the batch). 'labels' equals
        'input_ids' with padded positions replaced by -100 so that they do not
        contribute to the language-modelling loss.
    """
    lengths = torch.tensor([seq.size(0) for seq in batch])
    input_ids = nn.utils.rnn.pad_sequence(
        batch, batch_first=True, padding_value=PAD_TOKEN_ID
    )
    # 1 where a real token sits, 0 on the padding added above.
    positions = torch.arange(input_ids.size(1)).unsqueeze(0)
    attention_mask = (positions < lengths.unsqueeze(1)).long()
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }


lm_trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=collate_lm_batch,
)

# As in the original cell, `trainer` holds the value returned by .train();
# the Trainer object itself stays reachable as `lm_trainer`.
trainer = lm_trainer.train()

In [None]:
# Reuse the tokenizer the training set was encoded with; no standalone
# `tokenizer` variable is defined anywhere earlier in this notebook.
tokenizer = dataset.tokenizer

# generate() does not switch the model out of training mode by itself, so
# disable dropout explicitly before sampling.
model.eval()

# Put the prompt on whatever device the model lives on instead of assuming
# that a GPU is available.
prompt = tokenizer("<|startoftext|>break", return_tensors="pt").to(model.device)
generated = prompt["input_ids"]

sample_outputs = model.generate(
    generated,
    attention_mask=prompt["attention_mask"],
    # GPT-2 has no pad token; name the filler id explicitly rather than
    # relying on the library's fallback.
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=5,
    top_p=0.95,
    temperature=1.9,
    max_length=300,
    num_return_sequences=20,
)

# Print each sampled continuation with its index.
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))