In [1]:
# Notebook metadata: author name and the date the notebook was written.
__author__ = 'Daisuke Yoda'
__Date__ = 'January 2019'

## Loading Modules

In [2]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
# Path prefix that the later cells prepend to 'data/...' file names: the
# current working directory with its last 11 characters cut off.
# NOTE(review): presumably the 11 characters are the name of the folder the
# notebook was launched from, leaving a prefix that ends in a path separator;
# confirm. The prefix is wrong if the notebook is started from any other folder.
__dir__ = os.getcwd()[:-11]

from chainer import Chain, Variable, optimizers, serializers
import chainer.functions as F
import chainer.links as L

from gensim.models.keyedvectors import KeyedVectors

## Functions

In [3]:
def word_to_index(word):
    """Convert a word into a list of integer codes, one per character.

    Each code is the character's code point minus that of 'a', so 'a' maps
    to 0, 'b' to 1, ... and 'z' to 25. No validation is performed: any other
    character yields whatever the subtraction gives.
    """
    base = ord('a')
    codes = []
    for letter in word:
        codes.append(ord(letter) - base)
    return codes


def one_hot_encoding(indices, n_class=27):
    """Return one-hot rows for the given class indices.

    Row ``k`` of the ``n_class`` x ``n_class`` identity matrix is the one-hot
    vector of class ``k``; indexing that matrix with ``indices`` selects one
    row per index.
    """
    identity = np.identity(n_class)
    return identity[indices]

def padding(sentences, pad_value=26):
    """Right-pad index sequences to a common length.

    Args:
        sentences: iterable of integer sequences (one per word).
        pad_value: index appended to the shorter sequences. The default, 26,
            is the last class of the default 27-class one-hot encoding.

    Returns:
        An ``int32`` array of shape ``(len(sentences), max_len)``. An empty
        input yields an empty ``(0, 0)`` array instead of raising.

    The input sequences are left untouched: padded copies are built instead
    of extending the caller's lists in place.
    """
    sentences = list(sentences)
    if not sentences:
        return np.zeros((0, 0), dtype=np.int32)

    max_len = max(len(sentence) for sentence in sentences)
    padded = [
        list(sentence) + [pad_value] * (max_len - len(sentence))
        for sentence in sentences
    ]
    return np.array(padded, dtype=np.int32)

## LSTM model

In [4]:
class LSTM(Chain):
    """Single-layer NStepLSTM followed by a linear layer over all time steps.

    The LSTM outputs of every sequence are zero-padded to ``max_len`` steps,
    flattened to one vector per sequence and fed to the linear layer ``hy``.

    Args:
        in_size: size of each input vector.
        hidden_size: number of LSTM units.
        out_size: size of the output of the final linear layer.
        max_len: number of time steps the LSTM outputs are padded to
            (17 by default, the value that used to be hard-coded).
    """

    def __init__(self, in_size, hidden_size, out_size, max_len=17):
        super(LSTM, self).__init__(
            h1=L.NStepLSTM(
                n_layers=1,
                in_size=in_size,
                out_size=hidden_size,
                dropout=0.5),
            hy=L.Linear(hidden_size * max_len, out_size))
        self.max_len = max_len

    def __call__(self, input_data, hx=None):
        """Run the network on a batch of sequences.

        Args:
            input_data: list of arrays, one per sequence; each is wrapped in
                a ``Variable`` and passed to the LSTM.
            hx: optional initial hidden state. When given it is reshaped to
                ``(1, -1, hidden_size)`` before being passed to the LSTM.

        Returns:
            The output of the linear layer, one row per input sequence.
        """
        # Test for presence, not truthiness: an all-zero initial state must
        # still be reshaped to the layout the LSTM is given.
        if hx is not None:
            hx = hx.reshape(1, -1, self.h1.out_size)
        input_x = [Variable(x) for x in input_data]
        hx, cx, y = self.h1(hx, None, input_x)

        # Pad every sequence to max_len steps, then flatten the time and
        # feature axes into a single vector per sequence.
        padded = F.pad_sequence(y, length=self.max_len, padding=0.)
        flat = F.reshape(padded, (len(input_x), -1))

        return self.hy(flat)

    def predict(self, word, hx=None):
        """Return the argmax of the network output for a single word.

        The word is converted to indices, one-hot encoded as float32 and run
        through the network as a batch of one.
        """
        test_vec = one_hot_encoding(word_to_index(word)).astype(np.float32)
        res = self([test_vec], hx)[0]
        return F.argmax(res)


## Original Dataset (Random)
### (words and their split point)

In [5]:
# Read the split-point table; the first CSV column is used as the row index.
df = pd.read_csv(__dir__ + 'data/split_point_2.csv', index_col=0)
# Shuffle the column order (each column is one word).
# NOTE(review): no random seed is set in the visible cells, so the order
# differs from run to run.
df = df[np.random.permutation (df.columns)]
# Show the first rows of the transposed table (one word per row).
df.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
wearing,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,,,
societies,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,,,,,
list,0.0,0.0,0.0,1.0,,,,,,,,,,,,,
consultation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,,
regulated,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,,,,,,


## Loading the GloVe data and converting all words into GloVe vectors

In [6]:
# Load the pretrained word vectors from disk.
# NOTE(review): the file name ends in .bin but `binary=True` is not passed;
# confirm the file is stored in the text word2vec format.
word_vectors = KeyedVectors.load_word2vec_format(__dir__ + 'data/glove.6B.200d.bin')
# Look up the vector of every word in df.columns and stack them as float32.
# NOTE(review): word_vec is not used in the visible part of the notebook.
word_vec = np.array([word_vectors.get_vector(word) for word in df.columns], dtype=np.float32)

## Data arrangement

In [7]:
# Inputs: every word (column label of df) as a float32 one-hot matrix,
# one row per character.
original_data = [
    one_hot_encoding(word_to_index(word)).astype(np.float32)
    for word in df.columns
]

# Targets: the split-point table with NaN replaced by 0, transposed so that
# each row belongs to one word.
# The second positional argument of np.nan_to_num is `copy`, not the fill
# value, so the former call `np.nan_to_num(df, 0)` ran with copy=False and
# could overwrite df in place. NaN already becomes 0.0 by default; passing the
# plain values with the default copy leaves df untouched.
split_point = np.nan_to_num(df.values).T

dataX = original_data
dataY = split_point.astype(np.float32)

## Loading the pretrained model 

In [8]:
# Build the network with in_size=27, hidden_size=200 and out_size=17, then
# restore its parameters from the saved .npz file.
model = LSTM(27, 200, 17)
serializers.load_npz(__dir__ + 'data/model5.npz',model)

## Calculating the accuracy

### REMARK:
### This model is for Windows and Linux, not for macOS
### Also the dataset includes both training data and test data

In [9]:
# Accuracy in percent: the share of words for which the argmax of the model
# output equals the argmax of the corresponding row of dataY.
accuracy = 100*np.sum(np.argmax(model(dataX).data, axis=1)==np.argmax(dataY,axis=1))/len(dataX)
print('accuracy:',accuracy)

accuracy: 88.91904115784713


## The difference between model1 and model4

In [43]:
# Run the forward pass once and reuse the result, so the mismatch mask and the
# reported predictions come from the same evaluation.
predicted_split = np.argmax(model(dataX).data, axis=1)

# Positions of the words whose predicted split differs from the split in dataY.
ix = np.where(predicted_split != np.argmax(dataY, axis=1))[0]

# 'model_1': split position taken from the table df for each mismatching word.
model1_split = df[df.columns[ix]].apply(np.argmax)
model1_split.name = 'model_1'
split_result = pd.DataFrame(model1_split)

# 'model_4': split position predicted by `model` for the same words.
split_result['model_4'] = predicted_split[ix]
split_result.sample(10)

Unnamed: 0,model_1,model_4
had,1,2
code,3,2
pressures,7,6
suite,4,3
selection,5,8
varying,2,3
confrontation,12,10
island,5,3
terminate,8,7
pass,3,2


In [72]:
def print_split_point(dataframe):
    """Print a word split at the two positions stored in one result row.

    Args:
        dataframe: despite the name, a single row (a pandas Series) whose
            ``name`` is the word and whose first and second values are the
            split indices printed under the labels 'model1:' and 'model4:'.

    For each index ``k`` the word is printed as ``word[:k+1] + word[k+1:]``.
    Values are read by position with ``.iloc`` (``.ix`` was removed from
    pandas) and cast to ``int`` so they are always valid slice bounds.
    """
    word = dataframe.name
    for label, position in (('model1:', 0), ('model4:', 1)):
        cut = int(dataframe.iloc[position]) + 1
        print(label, word[:cut], '+', word[cut:])
    
# Print up to 40 randomly chosen mismatches. The sample size is capped at the
# number of available rows so that sampling cannot fail on a small table, and
# the loop runs over the actual sample length instead of a repeated literal.
split_result_sample = split_result.sample(min(40, len(split_result)))
for i in range(len(split_result_sample)):
    print_split_point(split_result_sample.iloc[i])
    print('=====================')

model1: solv + e
model4: solve + 
model1: africa + n
model4: african + 
model1: var + ying
model4: vary + ing
model1: prove + d
model4: prov + ed
model1: arche + d
model4: arch + ed
model1: leav + e
model4: leave + 
model1: episode + 
model4: episod + e
model1: yield + s
model4: yiel + ds
model1: pressure + s
model4: pressur + es
model1: propos + es
model4: propose + s
model1: promis + e
model4: promise + 
model1: tax + es
model4: taxe + s
model1: represent + ing
model4: represen + ting
model1: code + 
model4: cod + e
model1: base + 
model4: bas + e
model1: type + 
model4: typ + e
model1: operat + es
model4: operate + s
model1: estimate + d
model4: estimat + ed
model1: respond + 
model4: respon + d
model1: rifle + 
model4: rifl + e
model1: terminate + d
model4: terminat + ed
model1: giv + es
model4: give + s
model1: pursu + ed
model4: pursued + 
model1: deep + er
model4: deeper + 
model1: stay + 
model4: sta + y
model1: alleged + ly
model4: allegedl + y
model1: command + er
model4: com