# Word Segmentation Modelling (version : alam.1.1)

This is the modelling documentation of Word Segmentation

First Structured :


Input :

    INPUT LENGTH =15
    TRAIN = 90k data (Random state : 342)
    



output :


## Plan

    [*] Get Data
    [*] Transform the data
    [ ] Prepare the model
    [ ] Train it

In [1]:
import sys
sys.path.append('../scr/')
import pandas as pd
import torch
import numpy as np
import os
import pickle
import WordSegmenTools as wst
from torch.autograd import Variable


In [2]:
PATH_DATA_RAW='../../data/raw/'
PATH_DATA_CLN='../../data/clean/'
PATH_MODEL='../../data/model/'

In [3]:
os.listdir(PATH_DATA_CLN)

['data_clean_100k.res']

## Get Data

In [4]:
INPUT_LENGTH=15

In [5]:
# data_cln=pd.read_csv(PATH_DATA_CLN+'data_clean_100k.res')
data_cln=pickle.load(open(PATH_DATA_CLN+'data_clean_100k.res','rb'))

In [6]:
data_cln['len_source']=data_cln.source.apply(len)
data_used=data_cln[data_cln.len_source==INPUT_LENGTH]
data_used=data_used.reset_index()[['source','target','index']]
data_used.columns=['source','target','ids']

In [7]:
data_used.sample(10)

Unnamed: 0,source,target,ids
36198,sudahitupertama,sudah itu pertama,1571209
50419,diapoinlalubisa,dia poin lalu bisa,1638521
50095,kepadayanglepas,kepada yang lepas,1636987
54026,iniemasakanpssi,ini emas akan pssi,1655633
95416,saatharustampil,saat harus tampil,1852455
68649,kestadionsecara,ke stadion secara,1724854
14332,beberapaberarti,beberapa berarti,1467352
31970,ladiskonpranowo,la diskon pranowo,1551072
42282,satubeberapaitu,satu beberapa itu,1600506
38921,afckepadaantara,afc kepada antara,1584568


## Transform Data

In [8]:
data_used['flag_space'] = data_used['target'].map(wst.get_flag_space)
word2idx, idx2word = wst.get_label_index(data_used.source)

In [9]:
data_used_train=data_used.sample(90000,random_state=342)
data_used_test=data_used[~(data_used.ids.isin(data_used_train.ids))]

In [10]:
## Target 
Y_tr=np.array(data_used_train.flag_space.apply(list).tolist(),dtype=np.double)
Y_te=np.array(data_used_test.flag_space.apply(list).tolist(),dtype=np.double)

## Source
X_tr=wst.char_vectorizer(data_used_train.source.tolist(), word2idx, INPUT_LENGTH)
X_te=wst.char_vectorizer(data_used_test.source.tolist(), word2idx, INPUT_LENGTH)

In [11]:
X_tr.shape

(90000, 15, 26)

## Prepare the Model

In [12]:
## LSTM Example
class ModelLstm(torch.nn.Module):

    def __init__(self, input_size, output_size, hidden_dim):        
        super(ModelLstm, self).__init__()
        self.input_dim = input_size
        self.hidden_dim = hidden_dim

        # self.lstm = torch.nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
        self.lstm = torch.nn.LSTM(self.input_dim, self.hidden_dim, bidirectional=True)
        self.linear = torch.nn.Linear(self.hidden_dim*2, output_size)
        self.sigmoid = torch.nn.Sigmoid()

    # Why do sometimes need hidden init sometimes dont?
    def init_hidden(self,embed_len,batch):
        # This is what we'll initialise our hidden state as
        return (torch.zeros(embed_len, batch, self.hidden_dim),
               torch.zeros(embed_len, batch, self.hidden_dim))

    def forward(self, input):
        torch.manual_seed(200)
        batch_size=len(input)
        embedding_len=len(input[0][0])        
        input=input.view(batch_size,len(input[0]), -1)
        
        lstm_out, hidden2 = self.lstm(input)
        # lstm_out,h = self.lstm(input)
        
        # Only take the output from the final timetep
        out = lstm_out[:,-1]   
        out = out.view(batch_size,-1)
        y_pred = self.linear(out)
        y_pred = self.sigmoid(y_pred)
        
        return y_pred,'-'
    
## Remember : Batch, Seq, Embedding

In [13]:
def data_batching_torch(X,y,batch):
    batches=[(torch.from_numpy(X[i*(batch):i*(batch)+batch]).type('torch.DoubleTensor'),torch.from_numpy(y[i*(batch):i*(batch)+batch]).type('torch.DoubleTensor'))   
                  for i in range(int(len(X)/batch)+1)]
    return batches

In [14]:
BATCH_SIZE=25
training_samples=data_batching_torch(X_tr,Y_tr,BATCH_SIZE)
len(training_samples)

3601

In [15]:
# Get the model
outputDim = 1        # takes variable 'y'
hidden = 52
learningRate = 0.009
EMBED_SIZE=len(word2idx)
model = ModelLstm(input_size=EMBED_SIZE,output_size=INPUT_LENGTH,hidden_dim=hidden)
model=model.double()

##### For GPU #######
if torch.cuda.is_available():
    model.cuda()
    
## Input Length == Output Length

In [16]:
## Choose the optimizer and criterion
# criterion = torch.nn.CrossEntropyLoss()
criterion = torch.nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(),lr=learningRate)
# optimizer = torch.optim.SGD(model.parameters(),lr=learningRate)

In [17]:
AA=training_samples[0][1]

In [18]:
AA.type()

'torch.DoubleTensor'

## Train it

In [20]:
epochs=15
running_loss_train = 0
model.train()
for epoch in range(epochs):
    running_loss=[]
    # Converting inputs and labels to Variable
    
    for batch_idx, (data, target) in enumerate(training_samples, 1):
        if data.size()[0]>0:
            inputs = Variable(data)
            labels = Variable(target)
            optimizer.zero_grad()
            outputs, hidden = model(inputs)
            # get loss for the predicted output
            loss = criterion(outputs, labels)
            loss.backward()
            # update parameters
            optimizer.step()

            running_loss.append(loss.item())
            running_loss_train += (loss.item() - running_loss_train) / batch_idx
        
    if epoch % 3 == 0:
        print("running loss train", running_loss_train)

running loss train 0.2995820108231775
running loss train 0.2988295904131774
running loss train 0.2988753420897336
running loss train 0.29849424817887377
running loss train 0.2983588797584567


In [20]:
model.eval()

ModelLstm(
  (lstm): LSTM(26, 25)
  (linear): Linear(in_features=25, out_features=15, bias=False)
  (sigmoid): Sigmoid()
)

In [21]:
## Training
XX=X_tr.copy()
XX=Variable(torch.from_numpy(XX).type('torch.DoubleTensor'))
yy=model(XX)

In [33]:
yy0=yy[0].data.numpy()
yy0=(yy0>0.5).astype(int).astype(str)
yy0=[''.join(i) for i in yy0]
yy0[:10]

['000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000']

In [34]:
data_used_train['y_pred']=yy0

In [36]:
data_used_train.sample(10)

Unnamed: 0,source,target,ids,flag_space,y_pred
50144,melainkandaerah,melainkan daerah,1637189,1000000,0
67640,menegaskantiket,menegaskan tiket,1719973,100000,0
87884,pelakusetiapini,pelaku setiap ini,1815936,1000001000,0
32800,tahunberbobotdi,tahun berbobot di,1554907,10000000100,0
60884,terpopulerdalam,terpopuler dalam,1687901,100000,0
15182,identikinisudah,identik ini sudah,1471301,100100000,0
65497,denganpekerjaan,dengan pekerjaan,1709857,1000000000,0
48114,mobiltahunsedan,mobil tahun sedan,1627587,10000100000,0
58791,terkaitterhadap,terkait terhadap,1678152,100000000,0
31506,sertatariftahap,serta tarif tahap,1548878,10000100000,0


## Test it

In [37]:
## Test it
XX=X_te.copy()
XX=Variable(torch.from_numpy(XX).type('torch.DoubleTensor'))
yy=model(XX)

In [38]:
yy0=yy[0].data.numpy()
yy0=(yy0>0.5).astype(int).astype(str)
yy0=[''.join(i) for i in yy0]
yy0[:10]

['000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000',
 '000000000000000']

In [39]:
data_used_test['y_pred']=yy0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [40]:
data_used_train.sample(10)

Unnamed: 0,source,target,ids,flag_space,y_pred
12342,sepakperusahaan,sepak perusahaan,1457750,10000000000,0
9138,miliarmelakukan,miliar melakukan,1442854,1000000000,0
3403,asalpendaftaran,asal pendaftaran,1416167,100000000000,0
74342,jelasnyastadium,jelasnya stadium,1751813,10000000,0
12597,cmnphargaplakat,cmnp harga plakat,1459043,100001000000,0
86940,bahwadenganyang,bahwa dengan yang,1811573,10000010000,0
54344,terjadibniingin,terjadi bni ingin,1657235,100100000,0
8989,untuktetappiper,untuk tetap piper,1442103,10000100000,0
15108,ratelalusamping,rate lalu samping,1471008,100010000000,0
28477,danyangnokiadwi,dan yang nokia dwi,1534286,1000100001000,0
