# Preprocessing

In [25]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from scipy import spatial

### Normalization method (1: divide each row by (sum(row^p))^(1/p), i.e. the plain sum when p = 1; 2: Softmax)

In [26]:
def get_dataset(method, p=2):
    """Load the atom-environment matrix and normalize every row.

    Reads ``atom_env_matrix.csv`` (first column used as the index) and
    transposes it, so each row of the result corresponds to one column label
    of the file. The labels are returned as ``atoms``.

    Args:
        method: ``1`` divides each row by ``(sum(row ** p)) ** (1 / p)``;
            ``2`` applies a softmax along each row.
        p: Exponent used when ``method == 1``; ignored when ``method == 2``.

    Returns:
        Tuple ``(atoms, dataset)`` where ``atoms`` is the list of row labels
        and ``dataset`` is a 2-D ``float32`` array holding one normalized row
        per label, in the same order.

    Raises:
        ValueError: If ``method`` is neither ``1`` nor ``2``.
    """
    atom_env = pd.read_csv('atom_env_matrix.csv', index_col=0).transpose()
    atoms = list(atom_env.index)
    values = atom_env.values.astype(np.float64)

    if method == 1:
        # ``1.0 / p`` keeps the exponent fractional even for an integer ``p``.
        norms = np.sum(values ** p, axis=1, keepdims=True) ** (1.0 / p)
        # Rows that are entirely zero stay zero instead of dividing by zero.
        norms[norms == 0] = 1.0
        dataset = values / norms
    elif method == 2:
        # A softmax over dim=1 of the whole matrix equals a per-row softmax.
        dataset = F.softmax(torch.from_numpy(np.float32(values)), dim=1).numpy()
    else:
        raise ValueError(
            'method must be 1 (norm scaling) or 2 (softmax), got {!r}'.format(method))

    return atoms, np.float32(dataset)

In [27]:
# Build the module-level `atoms` / `dataset` used by later cells (method 1, p = 2).
atoms, dataset = get_dataset(method=1, p=2)

### Check the similarity between two atoms in the same group before making atom2vec

In [28]:
# Map every atom label to its normalized vector (same order as `atoms`).
atom_dict = {label: vector for label, vector in zip(atoms, dataset)}

In [30]:
# Cosine similarity between the vectors stored for 'Rb' and 'Na'.
first_atom_vec, second_atom_vec = atom_dict['Rb'], atom_dict['Na']
cosine_distance = spatial.distance.cosine(first_atom_vec, second_atom_vec)
similarity = 1 - cosine_distance
print(similarity)

0.423652768135


# Make atom2vec using SVD, NMF, AE

In [31]:
from torch import nn
from torch.utils.data import DataLoader
from sklearn.decomposition import NMF

In [32]:
# CUDA device index handed to `.cuda(which_gpu)` when the model and batches are moved to the GPU.
which_gpu = 0

In [33]:
class autoencoder_phase1(nn.Module):
    """Fully connected autoencoder with ReLU after every linear layer.

    The encoder maps ``input_dim -> hidden_dims... -> vec_num`` and the decoder
    mirrors it back to ``input_dim``. With the default arguments the layer
    sizes are 23095 -> 8192 -> 4096 -> 2048 -> vec_num and back.

    Args:
        vec_num: Size of the bottleneck (embedding) layer.
        input_dim: Number of input features; defaults to 23095.
        hidden_dims: Widths of the hidden encoder layers, widest first; the
            decoder uses them in reverse order.
    """

    def __init__(self, vec_num, input_dim=23095, hidden_dims=(8192, 4096, 2048)):
        super(autoencoder_phase1, self).__init__()
        enc_dims = (input_dim,) + tuple(hidden_dims) + (vec_num,)
        # Encoder is built before the decoder and both keep the
        # Linear/ReLU alternation, so parameter names (enc.0, enc.2, ...)
        # and initialization order are unchanged for the default sizes.
        self.enc = self._relu_stack(enc_dims)
        self.dec = self._relu_stack(enc_dims[::-1])

        # TODO: consider adding skip connections.

    @staticmethod
    def _relu_stack(dims):
        """Return ``Linear -> ReLU`` pairs connecting consecutive sizes in ``dims``."""
        layers = []
        for n_in, n_out in zip(dims[:-1], dims[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Returns:
            Tuple ``(atom2vec, recon_x)``: the bottleneck activations and the
            decoder output. Both are non-negative because each stack ends in
            a ReLU; the reconstruction is not clamped from above.
        """
        atom2vec = self.enc(x)
        recon_x = self.dec(atom2vec)
        return atom2vec, recon_x

In [34]:
def set_AE(batch_size, vec_num, learning_rate):
    """Create everything needed to train the phase-1 autoencoder.

    Uses the module-level ``dataset`` as training data and places the model on
    CUDA device ``which_gpu``.

    Args:
        batch_size: Number of rows per batch (batches are served in order).
        vec_num: Bottleneck size passed to ``autoencoder_phase1``.
        learning_rate: Initial learning rate for Adam.

    Returns:
        Tuple ``(dataloader, model, optimizer, scheduler)``; the scheduler
        cuts the learning rate by 10x after 100 epochs without improvement.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    net = autoencoder_phase1(vec_num).cuda(which_gpu)
    adam = torch.optim.Adam(net.parameters(), lr=learning_rate)
    plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
        adam,
        mode='min',
        factor=0.1,
        patience=100,
        verbose=True,
    )
    return loader, net, adam, plateau

def run_AE(dataloader, model, optimizer, scheduler, num_epochs, loss_fn=None):
    """Train ``model`` as an autoencoder and return the learned embeddings.

    Args:
        dataloader: Iterable of input batches; it is also iterated once more
            after training to compute the returned embeddings.
        model: Module whose forward pass returns ``(embedding, reconstruction)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Scheduler stepped once per epoch with the mean batch loss.
        num_epochs: Number of passes over ``dataloader``.
        loss_fn: Callable ``loss_fn(x, recon_x, batch_size)`` returning a
            scalar tensor. Defaults to the module-level ``loss5``.

    Returns:
        2-D NumPy array with one embedding row per input row, computed with
        the final weights in the order served by ``dataloader``.
    """
    if loss_fn is None:
        loss_fn = loss5
    # Run batches on whatever device the model already lives on.
    device = next(model.parameters()).device
    # Log about five times per run; never 0, so short runs do not divide by zero.
    log_every = max(num_epochs // 5, 1)

    for epoch in range(num_epochs):
        epoch_loss = 0
        for data in dataloader:
            x = data.type(torch.FloatTensor).to(device)
            # ===================forward=====================
            _, recon_x = model(x)
            loss = loss_fn(x, recon_x, x.shape[0])
            epoch_loss += loss.item()
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # ===================log========================
        epoch_loss = epoch_loss / len(dataloader)
        scheduler.step(epoch_loss)
        if epoch % log_every == 0:
            print('epoch [{}/{}], loss:{:.20f}'.format(epoch + 1, num_epochs, epoch_loss))

    # Embed the data with the final weights. Collecting the vectors during the
    # last training epoch would mix model states from before each batch update
    # and keep autograd history attached to every stored tensor.
    was_training = model.training
    model.eval()
    with torch.no_grad():
        embeddings = [
            model(data.type(torch.FloatTensor).to(device))[0]
            for data in dataloader
        ]
    model.train(was_training)
    return torch.cat(embeddings, dim=0).cpu().numpy()

def loss1(x, recon_x, tmp_batch):
    """Cross-entropy in bits between ``x`` and the softmax of ``recon_x``.

    For each row the loss is ``-sum_j x_j * log2(softmax(recon_x)_j)``; the
    result is the mean over rows.

    Args:
        x: Target tensor of shape ``(batch, features)``.
        recon_x: Reconstruction logits with the same shape as ``x``.
        tmp_batch: Batch size; kept for a uniform loss signature and not
            needed because the row count is taken from the tensors.

    Returns:
        Scalar tensor with the mean per-row cross-entropy.
    """
    import math

    # log_softmax stays finite where log(softmax(.)) underflows to -inf,
    # which would turn ``0 * -inf`` terms into NaN.
    log2_recon_x = F.log_softmax(recon_x, dim=1) / math.log(2.0)
    # A row-wise dot product works for any feature width.
    per_row = torch.sum(x * log2_recon_x, dim=1)
    return -torch.mean(per_row)

def loss2(x, recon_x, tmp_batch):
    """Return one minus the batch-averaged cosine similarity of ``x`` and ``recon_x``.

    The per-row cosine similarities (along dim 1) are summed and divided by
    ``tmp_batch``.
    """
    per_row_cosine = F.cosine_similarity(x, recon_x, dim=1)
    mean_cosine = torch.sum(per_row_cosine) / tmp_batch
    return 1 - mean_cosine

def loss3(x, recon_x, tmp_batch):
    """Return the sum of per-row L2 distances between ``x`` and ``recon_x`` divided by ``tmp_batch``."""
    row_distances = F.pairwise_distance(x, recon_x, p=2)
    return torch.sum(row_distances) / tmp_batch
    
def loss4(x, recon_x, tmp_batch):
    """Weighted squared error that down-weights positions where ``x`` is zero.

    Each squared difference gets weight 1 where ``x != 0`` and 0.01 where
    ``x == 0``; the weighted errors are summed per row and averaged over rows.

    Args:
        x: Target tensor of shape ``(batch, features)``.
        recon_x: Reconstruction with the same shape as ``x``.
        tmp_batch: Batch size; kept for a uniform loss signature and not
            needed because the row count is taken from the tensors.

    Returns:
        Scalar tensor with the mean per-row weighted squared error.
    """
    squared_error = (recon_x - x) ** 2

    weight_x = torch.full_like(x, 0.01)
    weight_x[x != 0] = 1

    # A row-wise weighted sum works for any feature width.
    per_row = torch.sum(weight_x * squared_error, dim=1)
    return torch.mean(per_row)

def loss5(x, recon_x, tmp_batch):
    """Return ``sum(x * (log(x) - log(recon_x)) - x + recon_x)`` over all elements.

    A constant 1e-8 is added inside both logarithms so zero entries do not
    produce ``log(0)``. ``tmp_batch`` is accepted for a uniform loss signature
    but is not used; the sum is not divided by the batch size.
    """
    eps = 1e-8
    log_ratio = torch.log(x + eps) - torch.log(recon_x + eps)
    elementwise = x * log_ratio - x + recon_x
    return torch.sum(elementwise)

---

In [35]:
def SVD_or_NMF_or_AE(method, vec_num):
    """Reduce the module-level ``dataset`` to ``vec_num`` dimensions and save it.

    The result is written to ``'atom2vec_' + method + str(vec_num) + '.csv'``
    with the module-level ``atoms`` as the row index.

    Args:
        method: ``'SVD'`` keeps the first ``vec_num`` columns of the left
            singular vectors; ``'NMF'`` keeps the NMF transform of the data;
            ``'AE'`` trains the autoencoder built by ``set_AE`` for 2001
            epochs and keeps the embeddings returned by ``run_AE``.
        vec_num: Number of embedding dimensions.

    Raises:
        ValueError: If ``method`` is not one of ``'SVD'``, ``'NMF'``, ``'AE'``.
    """
    if method == 'SVD':
        # Only the left singular vectors are needed.
        u, _, _ = np.linalg.svd(dataset, full_matrices=False)
        vectors = u[:, :vec_num]

    elif method == 'NMF':
        model = NMF(n_components=vec_num, solver='cd', init='nndsvd', tol = 1e-6, max_iter=5000)
        vectors = np.array(model.fit_transform(dataset))

    elif method == 'AE':
        dataloader, model, optimizer, scheduler = set_AE(batch_size=90, vec_num=vec_num, learning_rate=1e-3)
        vectors = run_AE(dataloader, model, optimizer, scheduler, num_epochs=2000+1)

    else:
        # Without this branch an unknown method fell through to the save step
        # and failed on an unassigned local variable.
        raise ValueError("method must be 'SVD', 'NMF' or 'AE', got {!r}".format(method))

    file_name = 'atom2vec_' + method + str(vec_num) +'.csv'
    pd.DataFrame(vectors, index=atoms).to_csv(file_name)

In [11]:
# 20-dimensional SVD embeddings; writes 'atom2vec_SVD20.csv'.
SVD_or_NMF_or_AE('SVD', 20)

In [12]:
# 20-dimensional NMF embeddings; writes 'atom2vec_NMF20.csv'.
SVD_or_NMF_or_AE('NMF', 20)

In [13]:
# Phase 1: 1024-dimensional autoencoder embeddings; writes 'atom2vec_AE1024.csv'.
SVD_or_NMF_or_AE('AE', 1024)

epoch [1/2001], loss:16361.20703125000000000000
epoch [401/2001], loss:14188.54003906250000000000
Epoch   591: reducing learning rate of group 0 to 1.0000e-04.
Epoch   692: reducing learning rate of group 0 to 1.0000e-05.
Epoch   793: reducing learning rate of group 0 to 1.0000e-06.
epoch [801/2001], loss:14088.27441406250000000000
Epoch   894: reducing learning rate of group 0 to 1.0000e-07.
Epoch   995: reducing learning rate of group 0 to 1.0000e-08.
epoch [1201/2001], loss:14087.99414062500000000000
epoch [1601/2001], loss:14087.98339843750000000000
epoch [2001/2001], loss:14087.96972656250000000000


---

In [21]:
# Reload the embeddings stored in 'atom2vec_AE1024.csv' and rebind the
# module-level `atoms` / `dataset` so the following cells operate on them.
phase1_ae = pd.read_csv('atom2vec_AE1024.csv', index_col=0)
# Disabled step, kept for reference: drop embedding columns that sum to zero.
#     for i in range(1024):
#         idx = str(i)
#         if sum(phase1_ae[idx]) == 0:
#             phase1_ae = phase1_ae.drop(idx, 1)
phase1_ae = phase1_ae.reset_index().values

# Column 0 holds the label; the remaining columns are the embedding values.
atoms = [row[0] for row in phase1_ae]
dataset = np.float32([row[1:] for row in phase1_ae])

In [22]:
class autoencoder_phase2(nn.Module):
    """Fully connected autoencoder with ReLU after every linear layer.

    The encoder maps ``input_dim -> hidden_dims... -> vec_num`` and the decoder
    mirrors it back to ``input_dim``. With the default arguments the layer
    sizes are 1024 -> 256 -> 128 -> 64 -> vec_num and back.

    Args:
        vec_num: Size of the bottleneck (embedding) layer.
        input_dim: Number of input features; defaults to 1024.
        hidden_dims: Widths of the hidden encoder layers, widest first; the
            decoder uses them in reverse order.
    """

    def __init__(self, vec_num, input_dim=1024, hidden_dims=(256, 128, 64)):
        super(autoencoder_phase2, self).__init__()
        enc_dims = (input_dim,) + tuple(hidden_dims) + (vec_num,)
        # Encoder is built before the decoder and both keep the
        # Linear/ReLU alternation, so parameter names (enc.0, enc.2, ...)
        # and initialization order are unchanged for the default sizes.
        self.enc = self._relu_stack(enc_dims)
        self.dec = self._relu_stack(enc_dims[::-1])

        # TODO: consider adding skip connections.

    @staticmethod
    def _relu_stack(dims):
        """Return ``Linear -> ReLU`` pairs connecting consecutive sizes in ``dims``."""
        layers = []
        for n_in, n_out in zip(dims[:-1], dims[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Returns:
            Tuple ``(atom2vec, recon_x)``: the bottleneck activations and the
            decoder output. Both are non-negative because each stack ends in
            a ReLU; the reconstruction is not clamped from above.
        """
        atom2vec = self.enc(x)
        recon_x = self.dec(atom2vec)
        return atom2vec, recon_x

In [23]:
def set_AE(batch_size, vec_num, learning_rate):
    """Create everything needed to train the phase-2 autoencoder.

    Uses the module-level ``dataset`` as training data and places the model on
    CUDA device ``which_gpu``.

    Args:
        batch_size: Number of rows per batch (batches are served in order).
        vec_num: Bottleneck size passed to ``autoencoder_phase2``.
        learning_rate: Initial learning rate for Adam.

    Returns:
        Tuple ``(dataloader, model, optimizer, scheduler)``; the scheduler
        cuts the learning rate by 10x after 100 epochs without improvement.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    net = autoencoder_phase2(vec_num).cuda(which_gpu)
    adam = torch.optim.Adam(net.parameters(), lr=learning_rate)
    plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
        adam,
        mode='min',
        factor=0.1,
        patience=100,
        verbose=True,
    )
    return loader, net, adam, plateau

def SVD_or_NMF_or_AE(method, vec_num):
    """Reduce the module-level ``dataset`` to ``vec_num`` dimensions and save it.

    The result is written to ``'atom2vec_' + method + str(vec_num) + '.csv'``
    with the module-level ``atoms`` as the row index.

    Args:
        method: ``'SVD'`` keeps the first ``vec_num`` columns of the left
            singular vectors; ``'NMF'`` keeps the NMF transform of the data;
            ``'AE'`` trains the autoencoder built by ``set_AE`` for 20001
            epochs and keeps the embeddings returned by ``run_AE``.
        vec_num: Number of embedding dimensions.

    Raises:
        ValueError: If ``method`` is not one of ``'SVD'``, ``'NMF'``, ``'AE'``.
    """
    if method == 'SVD':
        # Only the left singular vectors are needed.
        u, _, _ = np.linalg.svd(dataset, full_matrices=False)
        vectors = u[:, :vec_num]

    elif method == 'NMF':
        model = NMF(n_components=vec_num, solver='cd', init='nndsvd', tol = 1e-6, max_iter=5000)
        vectors = np.array(model.fit_transform(dataset))

    elif method == 'AE':
        dataloader, model, optimizer, scheduler = set_AE(batch_size=90, vec_num=vec_num, learning_rate=1e-3)
        vectors = run_AE(dataloader, model, optimizer, scheduler, num_epochs=20000+1)

    else:
        # Without this branch an unknown method fell through to the save step
        # and failed on an unassigned local variable.
        raise ValueError("method must be 'SVD', 'NMF' or 'AE', got {!r}".format(method))

    file_name = 'atom2vec_' + method + str(vec_num) +'.csv'
    pd.DataFrame(vectors, index=atoms).to_csv(file_name)

In [24]:
# Phase 2: compress the reloaded 1024-dimensional embeddings to 20 dimensions; writes 'atom2vec_AE20.csv'.
SVD_or_NMF_or_AE('AE', 20)

epoch [1/20001], loss:9966.43847656250000000000
Epoch   913: reducing learning rate of group 0 to 1.0000e-04.
Epoch  3428: reducing learning rate of group 0 to 1.0000e-05.
Epoch  3529: reducing learning rate of group 0 to 1.0000e-06.
Epoch  3630: reducing learning rate of group 0 to 1.0000e-07.
Epoch  3731: reducing learning rate of group 0 to 1.0000e-08.
epoch [4001/20001], loss:3706.92944335937500000000
epoch [8001/20001], loss:3706.87548828125000000000
epoch [12001/20001], loss:3706.71606445312500000000
epoch [16001/20001], loss:3706.50390625000000000000
epoch [20001/20001], loss:3706.34497070312500000000
