# SENTENCE SEGMENTATION

In [48]:
# Mount Google Drive into the Colab runtime at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
cd drive/MyDrive/

[Errno 2] No such file or directory: 'drive/MyDrive/'
/content/drive/MyDrive


In [50]:
import tensorflow as tf
# tf.test.is_gpu_available() is deprecated; list the physical GPU devices
# instead. bool(...) keeps the cell output a plain True/False.
bool(tf.config.list_physical_devices('GPU'))

True

**IMPORT DATASET**

In [51]:
import pandas as pd
import numpy as np
# header=None: every line of the CSV (including the first one) is read as data
# and the columns receive the integer labels 0, 1, ...
df=pd.read_csv('/content/drive/MyDrive/COLAB/sentence_seg.csv',header=None)

In [52]:
# Remove, in place, the row labelled 0 (the first line read from the CSV).
# NOTE(review): presumably that line is the file's own header - confirm.
# Not re-runnable: a second run raises KeyError, and after the later
# set_index("indices") label 0 would refer to a whole sentence group.
df.drop(0, inplace=True)

In [53]:
df.dropna(inplace=True)  # drop, in place, every row with a missing value in any column

**CONVERT TEXT TO LABELS: 0 = ORDINARY WORD, 1 = CAPITALIZED (TITLE-CASE) WORD, 2 = COMMA, 3 = PERIOD**

In [54]:
def labels(row):
    """Tag every whitespace-separated token of the row's second value.

    Tags are strings: '3' for a lone '.', '2' for a lone ',', '1' for a
    title-cased token (``str.istitle``) and '0' for everything else.

    Parameters
    ----------
    row : object exposing ``.values``
        ``row.values[1]`` must be the text to tag (a ``str``).

    Returns
    -------
    list of str
        One tag per token, in token order.
    """
    punctuation_tags = {'.': '3', ',': '2'}

    def tag_for(token):
        # Punctuation is checked first, exactly like the if/elif order it replaces.
        if token in punctuation_tags:
            return punctuation_tags[token]
        return '1' if token.istitle() else '0'

    return [tag_for(token) for token in row.values[1].split()]
# Tag every row; each cell of the new column holds one list of tag strings.
# NOTE(review): labels() only emits the four tags '0'-'3' despite the column name.
df['five_class']=df.apply(labels,axis=1)

In [55]:
# Display the frame with the new tag column (pandas truncates the rendering).
df

Unnamed: 0,0,1,five_class
1,the jury further said in presentments that the...,The jury further said in presentments that the...,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 0, 0, ..."
2,the term jury had been charged by fulton super...,The term jury had been charged by Fulton Super...,"[1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, ..."
3,only a relative handful of such reports was re...,Only a relative handful of such reports was re...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, ..."
4,the jury said it did find that many of registr...,The jury said it did find that many of registr...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,it recommended that fulton legislators act to ...,It recommended that Fulton legislators act to ...,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
48839,first of all the six figures of the buddha you...,"First of all , the six figures of the Buddha y...","[1, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ..."
48840,now you probably share the widespread western ...,"Now , you probably share the widespread Wester...","[1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, ..."
48841,well ordinarily he is except as the wheel of t...,"Well , ordinarily he is , except as the Wheel ...","[1, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, ..."
48842,very peculiar retribution indeed seems to over...,Very peculiar retribution indeed seems to over...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 3]"


In [56]:
rows=df.shape[0]//5  # number of complete 5-sentence groups (integer division)

In [57]:
# One group id per sentence: 0,0,0,0,0,1,1,1,1,1,... (five sentences per group).
indices = [[group_id] * 5 for group_id in range(rows)]
# Flatten to a 1-D vector of length rows*5 before storing it as a column.
df['indices'] = np.asarray(indices).reshape(-1)

In [58]:
# Make the group id the index so that later cells can group with level=0.
df.set_index("indices",inplace=True)

In [59]:
# Inspect the tag lists together with their group id (the index).
df[['five_class']]

Unnamed: 0_level_0,five_class
indices,Unnamed: 1_level_1
0,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 0, 0, ..."
0,"[1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, ..."
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, ..."
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
0,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...
9767,"[1, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ..."
9767,"[1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, ..."
9767,"[1, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, ..."
9767,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 3]"


In [60]:
# Pivot the tag lists: every group id becomes one row and every sentence of the
# group gets its own column (all columns keep the label 'five_class').
_tags = df[['five_class']]
_slot = _tags.groupby(level=0).cumcount()  # position of each sentence inside its group
new_df = (
    _tags.assign(labels=_slot)
    .groupby([_tags.index, 'labels'])
    .first()
    .unstack('labels')
    .sort_index(axis=1, level=1)
    .droplevel(1, axis=1)
)

In [61]:
# Same pivot for the input text in column 0: one row per group id, one column
# per sentence of the group (all columns keep the label 0).
_sentences = df[[0]]
_position = _sentences.groupby(level=0).cumcount()  # position of each sentence inside its group
new_df1 = (
    _sentences.assign(labels=_position)
    .groupby([_sentences.index, 'labels'])
    .first()
    .unstack('labels')
    .sort_index(axis=1, level=1)
    .droplevel(1, axis=1)
)

In [62]:
def join_one(row):
    """Concatenate all values of ``row`` into one string, separated by single spaces.

    Parameters
    ----------
    row : object exposing ``.values``
        Sequence of strings; must contain at least one element.

    Returns
    -------
    str
        ``values[0] + " " + values[1] + ...``.
    """
    values = row.values
    joined = values[0]
    for piece in values[1:]:
        joined = joined + " " + piece
    return joined

def join_one1(row):
    """Concatenate all values of ``row`` with ``+`` and no separator.

    Parameters
    ----------
    row : object exposing ``.values``
        Sequence of values that support ``+`` with each other (the notebook
        passes lists of tag strings); must contain at least one element.

    Returns
    -------
    object
        ``values[0] + values[1] + ...``.
    """
    values = row.values
    merged = values[0]
    for piece in values[1:]:
        # Plain `+` builds a new object each time; `+=` would extend the
        # first list in place and must not be used here.
        merged = merged + piece
    return merged

# Collapse each 5-sentence group into a single cell.
# new_df holds tag lists, so they are concatenated without a separator (join_one1);
# new_df1 holds sentences, so they are joined with spaces (join_one).
# NOTE(review): re-running this cell would also feed the existing 'output'
# column into the join - run it once per fresh new_df / new_df1.
new_df['output']=new_df[:].apply(join_one1,axis=1)
new_df1['output']=new_df1[:].apply(join_one,axis=1)

In [63]:
# Inspect the joined input text, one row per group id.
new_df1[['output']]

Unnamed: 0_level_0,output
indices,Unnamed: 1_level_1
0,the jury further said in presentments that the...
1,the grand jury commented on a number of other ...
2,it urged that the next legislature provide ena...
3,failure to do this will continue to place a di...
4,the jury did not elaborate but it added that t...
...,...
9763,i merely draw an etymological distinction hopi...
9764,as doubtless forgotten the circumstances in th...
9765,you then descended one story glommed a televis...
9766,the foregoing aided by several clues withhold ...


**SET BATCH SIZE, EPOCHS...**

In [64]:
import numpy as np
import tensorflow as tf
from tensorflow import keras


batch_size = 64   # samples per gradient update, passed to model.fit
# Number of training epochs. The training cell assigns epochs = 10 before
# calling fit, so the former value of 20 was never used; keep both in sync.
epochs = 10
latent_dim = 128  # Latent dimensionality of the encoding space.


**VECTORIZE THE DATA**

In [65]:
# Collect the model inputs/targets and their vocabularies from the pivoted frames.
data_path = "/content/drive/MyDrive/COLAB/sentence_seg.csv"
input_texts = []
target_texts = []
input_words = set()
target_words = set()

# The CSV used to be read twice here and the result immediately overwritten;
# the texts come from the frames built above, so no file access is needed.
lines = new_df1[['output']].values         # shape (n, 1): joined input text
target_lines = new_df[['output']].values   # shape (n, 1): concatenated tag lists

for input_text, target_text in zip(lines, target_lines):
    input_texts.append(input_text[0])
    target_texts.append(target_text[0])
    # Each row is a 1-element array, so iterating it directly yields the whole
    # paragraph rather than its words; split the text to build the vocabulary.
    input_words.update(input_text[0].split())
    # Record every target token as well (previously only ' ' was ever added).
    target_words.update(target_text[0])

input_words.add(' ')
target_words.add(' ')

In [66]:
# Freeze both vocabularies into sorted lists.
input_words = sorted(input_words)
target_words = sorted(target_words)

# Longest input counted in whitespace-separated words; longest target counted with len().
max_encoder_seq_length = max(len(text.split()) for text in input_texts)
max_decoder_seq_length = max(len(text) for text in target_texts)

print("Number of samples:", len(input_texts))
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

Number of samples: 9768
Max sequence length for inputs: 360
Max sequence length for outputs: 385


In [67]:
# Length of the first target sequence (sanity check).
len(target_texts[0])

144

**TOKENIZE THE DATA**

In [68]:
from keras.preprocessing.text import Tokenizer

In [69]:
tokenizer = Tokenizer()  # Keras text tokenizer with default settings
# Build the word -> integer index from the input paragraphs.
tokenizer.fit_on_texts(input_texts)

In [70]:
encoded_docs = tokenizer.texts_to_sequences(input_texts) # one list of word ids per paragraph (no padding is applied here)

In [71]:
# Map every vocabulary entry to its position in the sorted vocabulary list.
input_token_index = {token: position for position, token in enumerate(input_words)}
target_token_index = {token: position for position, token in enumerate(target_words)}


In [72]:
# +1 because the Keras Tokenizer assigns word ids starting at 1 (id 0 is reserved).
vocab_size = len(tokenizer.word_index)+1
vocab_size

38726

**ENCODER AND DECODER**

In [73]:
# Zero-initialised float32 tensors; unfilled timesteps stay all-zero.
# Encoder: (samples, max input length, 300 embedding dims).
encoder_input_data = np.zeros(
    shape=(len(input_texts), max_encoder_seq_length, 300), dtype=np.float32
)
# Decoder input and target: (samples, max target length, 5 classes).
decoder_input_data = np.zeros(
    shape=(len(input_texts), max_decoder_seq_length, 5), dtype=np.float32
)
decoder_target_data = np.zeros_like(decoder_input_data)

**LOAD WORD2VEC**

In [74]:
# word embeddings
import gensim.downloader as api

# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader
# (a large download the first time it runs).
wv = api.load('word2vec-google-news-300')

In [75]:
# Embedding matrix indexed by tokenizer word id; words missing from word2vec
# (and the reserved id 0) keep an all-zero row.
pretrained_values = np.zeros((vocab_size, 300))
for word, word_id in tokenizer.word_index.items():
    if word in wv:
        pretrained_values[word_id, :] = wv[word]

In [76]:
np.save('pretrained.npy', pretrained_values) # cache the embedding matrix in the current working directory so word2vec need not be loaded again

In [77]:
# Reload the cached embedding matrix written by np.save above.
pretrained_values1=np.load('pretrained.npy',)

In [78]:
# word -> integer id mapping learned by the tokenizer (used for lookups at prediction time).
dict_words=tokenizer.word_index

In [79]:
# Copy the embedding row of every word id into the encoder tensor.
# One vectorised assignment per document replaces the former per-token Python
# loop; the values written are the same, positions past len(sample) stay zero.
for ii, sample in enumerate(encoded_docs):
    encoder_input_data[ii, :len(sample), :] = pretrained_values1[sample]

In [80]:
# Sanity check: (samples, max_decoder_seq_length, 5).
decoder_target_data.shape

(9768, 385, 5)

In [81]:
# Sanity check: must match decoder_target_data.shape.
decoder_input_data.shape

(9768, 385, 5)

In [82]:
# Show the word ids of the last paragraph. Index encoded_docs explicitly
# instead of relying on the loop variable `sample` leaking out of another cell.
print(encoded_docs[-1])

[71, 2, 35, 1, 410, 909, 2, 1, 11987, 41, 38713, 256, 14725, 2357, 5, 453, 38714, 38715, 6, 1, 372, 3, 5, 236, 5282, 433, 2, 1, 2979, 11987, 16, 5, 38716, 1204, 75, 41, 346, 1103, 1, 3345, 702, 1623, 7, 1, 1161, 11987, 8, 1, 80, 18367, 2, 1, 5744, 95, 46, 52, 56, 38717, 3, 38718, 3, 1, 593, 102, 5821, 10, 8, 500, 13, 1, 1813, 2, 1, 301, 14307, 219, 19389, 3288, 48, 11581, 7982, 27, 38719, 1, 1331, 110, 3596, 14437, 586, 340, 4, 15342, 74, 38720, 999, 27, 619, 94, 365, 5, 825, 2, 712, 23495, 7244, 5, 16251, 38721, 190, 24, 1, 38722, 38723, 6, 1615, 3, 5, 317, 212, 4667, 57, 1, 38724, 14, 9490, 6177, 2, 38725]


In [83]:
# One-hot encode the tag sequences for teacher forcing.
# The decoder input must lag the target by one timestep: at step t the decoder
# sees the tag of step t-1 and has to predict the tag of step t. Filling both
# tensors with the same values (as before) hands the answer to the model.
START_CLASS = 4  # labels() only emits '0'-'3', so slot 4 is free for the start marker
decoder_steps = decoder_input_data.shape[1]

# Reset first so that re-running the cell cannot leave stale ones behind.
decoder_input_data.fill(0.0)
decoder_target_data.fill(0.0)

for ii, sample in enumerate(target_texts):
    decoder_input_data[ii, 0, START_CLASS] = 1.0
    for jj, index in enumerate(sample):
        tag = int(index)
        decoder_target_data[ii, jj, tag] = 1.0
        # The tag of step jj is the decoder input of step jj + 1 (if it exists).
        if jj + 1 < decoder_steps:
            decoder_input_data[ii, jj + 1, tag] = 1.0

In [84]:
# Width of the one-hot tag vectors; must equal the last axis of decoder_input_data.
num_decoder_tokens=5

**BUILD THE MODEL**

In [85]:
# Encoder: variable-length sequences of 300-dimensional word vectors.
encoder_inputs = keras.Input(shape=(None, 300))  ## 300
# Bidirectional LSTM; return_state=True also returns the final h/c of each direction.
encoder = keras.layers.Bidirectional(keras.layers.LSTM(latent_dim, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
# Join forward and backward states -> vectors of size 2*latent_dim.
state_h = keras.layers.Concatenate()([forward_h, backward_h])
state_c = keras.layers.Concatenate()([forward_c, backward_c])


encoder_states = [state_h, state_c]# encoder_outputs is discarded; only the states are kept.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))# one-hot tag sequence fed to the decoder


# Decoder LSTM is 2*latent_dim wide so that it can start from the concatenated encoder states.
decoder_lstm = keras.layers.LSTM(2*latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Per-timestep softmax over the tag classes.
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# The model maps [encoder_input_data, decoder_input_data] to decoder_target_data.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)



In [86]:
# Print the layer table of the model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None, 300)]  0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 256), (None, 439296      input_3[0][0]                    
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None, 5)]    0                                            
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 256)          0           bidirectional_1[0][1]            
                                                                 bidirectional_1[0][3]      

**TRAIN THE MODEL**

In [87]:
opt = keras.optimizers.Adam(0.002)  # Adam optimizer, learning rate 0.002
model.compile(
    optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy",tf.keras.metrics.Recall(), tf.keras.metrics.Precision()] # loss plus accuracy / recall / precision metrics
)
epochs = 10 # re-assigns the notebook-level `epochs` from the configuration cell
model_data = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,  # hold out 20% of the samples for validation (this is not a separate test set)
)
# Save the trained model to the directory "sentence_seg" (relative to the current working directory)
model.save("sentence_seg")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: sentence_seg/assets


INFO:tensorflow:Assets written to: sentence_seg/assets


In [92]:
# Shape of a 10-sample slice: (10, max_encoder_seq_length, 300).
encoder_input_data[:10].shape

(10, 360, 300)

### New Data to predict

In [93]:
# Read the whole story. readline() only returned the first line, which silently
# truncates any file that contains line breaks; the text is split on whitespace
# later, so a single-line file gives the same result as before.
with open('/content/drive/MyDrive/COLAB/story2.txt', 'r', encoding='utf-8') as file:
    story = file.read().strip()

In [94]:
# Segment the story in fixed-size windows of words and print the model's guess.
window_len = decoder_input_data.shape[1]  # 385 in the recorded run; y_pred has this many timesteps
SENTENCE_END = 3                          # class that labels() assigns to the '.' token
all_text = story.split()

sentences = []
# Visit every window, including a shorter final one. The former
# range(0, len(all_text)-385, 385) dropped the tail of the story and did
# nothing at all for texts of 385 words or fewer.
for start in range(0, len(all_text), window_len):
    sentence1 = all_text[start:start + window_len]

    # Fresh zero tensor per window: reusing one buffer let unknown words
    # inherit the embedding stored at the same position by the previous window.
    new_word_np = np.zeros((1, window_len, 300))
    for jj, word in enumerate(sentence1):
        if word in dict_words:
            new_word_np[0, jj, :] = pretrained_values1[dict_words[word]]

    # NOTE(review): the decoder is fed decoder_input_data[:1], i.e. the tag
    # sequence of the first *training* sample, not anything derived from the
    # story. Proper inference needs step-by-step decoding - confirm and replace.
    y_pred = model.predict([new_word_np, decoder_input_data[:1]])
    y_pred = np.argmax(y_pred, axis=2)

    # argmax yields integers, so compare with an int: the former test against
    # the string '2' could never be true and no break was ever inserted.
    # NOTE(review): training targets contain punctuation tokens while the input
    # words do not, so timestep k need not correspond to word k - confirm.
    output_sentence = ""
    for word, value in zip(sentence1, y_pred[0]):
        output_sentence = output_sentence + " " + word
        if value == SENTENCE_END:
            output_sentence = output_sentence + '\n\n'

    print('\nInput Sentence: '," ".join(sentence1))
    print("\nOutput Sentence: ",output_sentence)
    sentences.append(output_sentence)


Input Sentence:  north richmond street being blind was a quiet street except at the hour when the christian brothers school set the boys freean uninhabited house of two storeys stood at the blindend detached from it sneigh bours in a square ground the other houses of the street conscious of decent lives within the mgazed at one another with brown imperturbable faces the formerten ant of our houseapriest had died in the backdrawing roomair musty from having been long enclosed hung in all the rooms and the waste room behind the kitchen was littered with olduseless papers among these i found a few paper covered books the pages of which we recurled and damp the abbot by walter scott the devout communicant and the memoirs of vidocqiliked the last best because its leaves were yel low the wild garden behind the house contained a central appletree and a few straggling bushe sunder one of which i found the latetenants rusty bicycle pumphe had been a very charitable priest in his will he had le

In [95]:
all_text=story.split()  # same whitespace split as in the prediction cell

# Number of timesteps in the last prediction (y_pred is left over from the prediction loop).
print(len(y_pred[0]))

385
