# Word Segmentation Modelling (version : alam.1.2)

This is the modelling documentation of Word Segmentation

First Structured :


Input :

    INPUT LENGTH =15
    TRAIN = 90k data (Random state : 342)
    

output :

    Seq2Seq


## Plan

    [*] Get Data
    [*] Transform the data
    [ ] Prepare the model
    [ ] Train it

In [1]:
import sys
sys.path.append('../scr/')
import pandas as pd
import torch
import numpy as np
import os
import pickle
import WordSegmenTools as wst
from torch.autograd import Variable
import random
from sklearn.preprocessing import OneHotEncoder


In [2]:
PATH_DATA_RAW='../../data/raw/'
PATH_DATA_CLN='../../data/clean/'
PATH_MODEL='../../data/model/'

In [3]:
os.listdir(PATH_DATA_CLN)

['data_clean_100k.res']

## Get Data

In [4]:
INPUT_LENGTH=15


In [5]:
# data_cln=pd.read_csv(PATH_DATA_CLN+'data_clean_100k.res')
data_cln=pickle.load(open(PATH_DATA_CLN+'data_clean_100k.res','rb'))

In [6]:
data_cln['len_source']=data_cln.source.apply(len)
data_used=data_cln[data_cln.len_source==INPUT_LENGTH]
data_used=data_used.reset_index()[['source','target','index']]
data_used.columns=['source','target','ids']

In [7]:
data_used.sample(10)

Unnamed: 0,source,target,ids
70882,diiniselainyang,di ini selain yang,1735243
97532,sebelumnyadasar,sebelumnya dasar,1862347
86145,perhatianramsey,perhatian ramsey,1807857
78675,ceobagianmadrid,ceo bagian madrid,1772446
97129,yangrincitvbaik,yang rinci tv baik,1860404
1359,dansaksikanfoto,dan saksikan foto,1406416
88169,memperbaikiatas,memperbaiki atas,1817330
12582,adasebagaikalla,ada sebagai kalla,1458956
2204,sergibahwasenin,sergi bahwa senin,1410438
53157,pemulihanversus,pemulihan versus,1651357


## Transform Data

In [8]:
data_used['flag_space'] = data_used['target'].map(wst.get_flag_space)
word2idx, idx2word = wst.get_label_index(data_used.source)

In [9]:
data_used_train=data_used.sample(90000,random_state=342)
data_used_test=data_used[~(data_used.ids.isin(data_used_train.ids))]

In [10]:
## Target 
Y_tr=np.array(data_used_train.flag_space.apply(list).tolist(),dtype=np.double)
Y_tr=Y_tr.reshape(-1,INPUT_LENGTH,1)
Y_te=np.array(data_used_test.flag_space.apply(list).tolist(),dtype=np.double)
Y_te=Y_te.reshape(-1,INPUT_LENGTH,1)

encoder = OneHotEncoder()
encoder.fit(Y_tr.reshape(-1,1))

Y_tr=encoder.transform(Y_tr.reshape(-1,1)).toarray().reshape(-1,INPUT_LENGTH,len(encoder.categories_[0]))
Y_te=encoder.transform(Y_te.reshape(-1,1)).toarray().reshape(-1,INPUT_LENGTH,len(encoder.categories_[0]))

## Source
X_tr=wst.char_vectorizer(data_used_train.source.tolist(), word2idx, INPUT_LENGTH)
X_te=wst.char_vectorizer(data_used_test.source.tolist(), word2idx, INPUT_LENGTH)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
# word2idx

In [12]:
X_tr.shape

(90000, 15, 26)

In [13]:
Y_tr.shape

(90000, 15, 2)

## Prepare the Model

In [14]:
class Encoder(torch.nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.lstm = torch.nn.LSTM(input_dim, hid_dim, n_layers, dropout = dropout)
        self.dropout = torch.nn.Dropout(p=dropout)
        
    def forward(self, src):
        embedded = self.dropout(src)
        outputs, (hidden, cell) = self.lstm(src)
        return outputs,hidden, cell


class Decoder(torch.nn.Module):
    def __init__(self, input_dim ,output_dim, hid_dim, n_layers, dropout):
        super(Decoder, self).__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.lstm = torch.nn.LSTM(input_dim, hid_dim, n_layers, dropout = dropout)
        self.Lin = torch.nn.Linear(hid_dim, output_dim)
        self.out = torch.nn.Softmax()
        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):        
        input = input.unsqueeze(0)
        embedded = self.dropout(input)

        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.out(self.Lin(output.squeeze(0)))
        return prediction, hidden, cell


class Seq2Seqv1(torch.nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seqv1,self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def duplicate_hidden(self,h,n_layers,batch_size):
        list_h=[]
        for i in range(n_layers):
            list_h+=[h[:,-1,:][i]]*batch_size
        enc_h=torch.cat(list_h,0).view(n_layers,batch_size,-1)
        return enc_h
        
    def forward(self, src, trg=None, teacher_forcing_ratio = 0.5):  
        if teacher_forcing_ratio==0:
            trg=torch.zeros(batch_size,max_len,trg_vocab_size)
            
        batch_size = trg.shape[0]
        max_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        n_layers = self.decoder.n_layers
        
        #tensor to store decoder outputs
        outputs = torch.zeros(batch_size, max_len, trg_vocab_size)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        out_enc, h, c = self.encoder(src)
        enc_h=self.duplicate_hidden(h,n_layers,batch_size)
        enc_c=self.duplicate_hidden(c,n_layers,batch_size)          
        
        #first input to the decoder is the <sos> tokens
        input = trg[:,0]
        outputs[:,0]=input

        for t in range(1, max_len):
            output, enc_h, enc_c = self.decoder(input, enc_h, enc_c)
            outputs[:,t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            input = (trg[:,t] if teacher_force else output)

        return outputs
    


In [15]:
random.random()<0

False

In [16]:
# layers=2
# batch=3
# seq_len=5
# embed=10
# hidden_layer=30
# output_dim=1
# dropout=0.1

# model0=Encoder(embed,hidden_layer,layers,dropout)
# model1=Decoder(output_dim,output_dim,hidden_layer,layers,dropout)
# model_seq=Seq2Seqv1(model0,model1)

# X_source=torch.rand(batch,seq_len,embed)
# X_target=torch.rand(batch,seq_len,output_dim)

# model_seq(X_source,X_target,0)

In [17]:
def data_batching_torch(X,y,batch):
    batches=[(torch.from_numpy(X[i*(batch):i*(batch)+batch]).type('torch.DoubleTensor'),
              torch.from_numpy(y[i*(batch):i*(batch)+batch]).type('torch.DoubleTensor'))   
                  for i in range(int(len(X)/batch)+1)]
    return batches

In [18]:
BATCH_SIZE=25
training_samples=data_batching_torch(X_tr,Y_tr,BATCH_SIZE)
len(training_samples)

3601

In [19]:
# Get the model
LAYERS=1
EMBED=26
HIDDEN_LAYER=15
DROPO=0.1
OUT_DIM=2
learningRate=0.0009
teach_f=0.2

model0=Encoder(EMBED,HIDDEN_LAYER,LAYERS,DROPO)
model1=Decoder(OUT_DIM,OUT_DIM,HIDDEN_LAYER,LAYERS,DROPO)
model_seq=Seq2Seqv1(model0,model1)

  "num_layers={}".format(dropout, num_layers))


In [20]:
## Choose the optimizer and criterion
# criterion = torch.nn.CrossEntropyLoss()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model_seq.parameters(),lr=learningRate)
# optimizer = torch.optim.SGD(model.parameters(),lr=learningRate)

In [21]:
AA=training_samples[0][1]

In [22]:
AA.type()

'torch.DoubleTensor'

## Train it

In [24]:
epochs=10
running_loss_train = 0
model_seq.double()
model_seq.train()

for epoch in range(epochs):
    running_loss=[]
    # Converting inputs and labels to Variable
    
    for batch_idx, (data, target) in enumerate(training_samples, 1):
        if data.size()[0]>0:
            inputs = Variable(data)
            labels = Variable(target)
            optimizer.zero_grad()
            outputs= model_seq(inputs,labels,teach_f)
            # get loss for the predicted output
            # print(outputs.type(),labels.float().type())
            loss = criterion(outputs, labels.float())
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model_seq.parameters(), 1)
            # update parameters
            optimizer.step()

            running_loss.append(loss.item())
            running_loss_train += (loss.item() - running_loss_train) / batch_idx
        
    if epoch % 2 == 0:
        print("running loss train", running_loss_train)
        



running loss train 0.3063901637577338
running loss train 0.3063124970346681
running loss train 0.3064514522916733
running loss train 0.3063511881480616
running loss train 0.30615947727527937


In [20]:
model.eval()

ModelLstm(
  (lstm): LSTM(26, 25)
  (linear): Linear(in_features=25, out_features=15, bias=False)
  (sigmoid): Sigmoid()
)

In [21]:
## Training
XX=X_tr.copy()
XX=Variable(torch.from_numpy(XX).type('torch.DoubleTensor'))
yy=model(XX)

In [33]:
yy0=yy[0].data.numpy()
yy0=(yy0>0.5).astype(int).astype(str)
yy0=[''.join(i) for i in yy0]
yy0[:10]

['000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000']

In [34]:
data_used_train['y_pred']=yy0

In [36]:
data_used_train.sample(10)

Unnamed: 0,source,target,ids,flag_space,y_pred
50144,melainkandaerah,melainkan daerah,1637189,1000000,0
67640,menegaskantiket,menegaskan tiket,1719973,100000,0
87884,pelakusetiapini,pelaku setiap ini,1815936,1000001000,0
32800,tahunberbobotdi,tahun berbobot di,1554907,10000000100,0
60884,terpopulerdalam,terpopuler dalam,1687901,100000,0
15182,identikinisudah,identik ini sudah,1471301,100100000,0
65497,denganpekerjaan,dengan pekerjaan,1709857,1000000000,0
48114,mobiltahunsedan,mobil tahun sedan,1627587,10000100000,0
58791,terkaitterhadap,terkait terhadap,1678152,100000000,0
31506,sertatariftahap,serta tarif tahap,1548878,10000100000,0


## Test it

In [37]:
## Test it
XX=X_te.copy()
XX=Variable(torch.from_numpy(XX).type('torch.DoubleTensor'))
yy=model(XX)

In [38]:
yy0=yy[0].data.numpy()
yy0=(yy0>0.5).astype(int).astype(str)
yy0=[''.join(i) for i in yy0]
yy0[:10]

['000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000']

In [39]:
data_used_test['y_pred']=yy0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [40]:
data_used_train.sample(10)

Unnamed: 0,source,target,ids,flag_space,y_pred
12342,sepakperusahaan,sepak perusahaan,1457750,10000000000,0
9138,miliarmelakukan,miliar melakukan,1442854,1000000000,0
3403,asalpendaftaran,asal pendaftaran,1416167,100000000000,0
74342,jelasnyastadium,jelasnya stadium,1751813,10000000,0
12597,cmnphargaplakat,cmnp harga plakat,1459043,100001000000,0
86940,bahwadenganyang,bahwa dengan yang,1811573,10000010000,0
54344,terjadibniingin,terjadi bni ingin,1657235,100100000,0
8989,untuktetappiper,untuk tetap piper,1442103,10000100000,0
15108,ratelalusamping,rate lalu samping,1471008,100010000000,0
28477,danyangnokiadwi,dan yang nokia dwi,1534286,1000100001000,0
