# Imports

In [0]:
%matplotlib inline
import math
import os
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence
from unidecode import unidecode

# Import data

In [0]:
# Load the labelled dataset from the Excel workbook (path is relative to the
# notebook's working directory).
Data = pd.read_excel('modeldata.xlsx')

# Preview the first rows: 'Nom' holds the raw names, 'Colonne1' the category label.
Data.head()

Unnamed: 0,Nom,Colonne1
0,10-TEC-San,Sanitaires
1,10-TEC-San,Sanitaires
2,40-TEC-Palier,Circulation
3,40-TEC-Palier,Circulation
4,Accès Foodcourt / RIE,Circulation


# Data treatment functions

The goal is to create functions that turn the raw data into clean, structured data that can be fed to the machine learning model. The data comes as strings, which are cleaned of special characters and numbers (every number is turned into a 0). The strings are then turned into arrays based on the characters they contain, as the model will learn from character sequences. Padding allows all the arrays to have the same length. We then wrap those functions in a single function that chains them together and takes the full dataset as input.

In [0]:

# Sample string mixing digits, punctuation and accented letters, used to try out the cleaner.
stringe = "22°C Degrés? Yes!  ÈÙæñÿÈÀÇ"

def stringcleaner(string) :
    """Normalise a raw string to the character set ' a-z.0'.

    Steps, in order:
      1. every character that is neither a word character nor a space becomes '.';
      2. the result is passed through ``unidecode`` (accented letters -> plain letters);
      3. every run of digits is collapsed into a single '0';
      4. the text is lower-cased;
      5. any character still outside ``a-z``, ``0``, space and '.' becomes '.'.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        Cleaned text containing only lowercase ASCII letters, '0', ' ' and '.'.
    """
    # Raw string literals: '\w' and '\d' are regex classes, not Python escapes.
    s = re.sub(r'[^\w ]', '.', string)
    s = unidecode(s)
    s = re.sub(r'\d+', '0', s)
    s = s.lower()
    # Final guard: '_' matches \w and transliteration may emit punctuation;
    # map every remaining out-of-alphabet character to '.' as well.
    s = re.sub(r'[^a-z0 .]', '.', s)
    return s


# Run the cleaner on the sample string to inspect the result.
stringcleaner(stringe)

'0.c degres. yes.  euaenyeac'

In [0]:
# Column-level cleaning
def series_cleaner(series) :
    """Apply ``stringcleaner`` to every value of a pandas Series.

    Parameters
    ----------
    series : pandas.Series
        Column of raw strings.

    Returns
    -------
    pandas.Series
        New Series holding the cleaned strings, carrying the same ``name`` as
        the input but a fresh default (0..n-1) index.
    """
    cleaned_values = [stringcleaner(value) for value in series.tolist()]
    cleaned = pd.Series(cleaned_values)
    cleaned.name = series.name
    return cleaned

In [0]:
# Check the cleaner on the first ten raw names.
series_cleaner(Data['Nom'])[0:10]

0                  0.tec.san
1                  0.tec.san
2               0.tec.palier
3               0.tec.palier
4      acces foodcourt . rie
5      acces foodcourt . rie
6    acces goupe electrogene
7              acces parking
8              acces parking
9              acces parking
Name: Nom, dtype: object

In [0]:
# DataFrame-level helpers
def retrieve_series(df) : 
    """Return the columns of ``df`` as a list, in column order.

    Each list element is whatever ``df[column_name]`` yields for that name.
    """
    return [df[column_name] for column_name in df.columns.values]

def prepare_DB_all (df) :
    """Clean every column of ``df`` and return the result as a new DataFrame.

    Each column is passed through ``series_cleaner`` and the cleaned columns
    are concatenated side by side, in the original column order.
    """
    cleaned_columns = [series_cleaner(column) for column in retrieve_series(df)]
    return pd.concat(cleaned_columns, axis=1)
        

In [0]:
# Clean every column of the raw dataset, then inspect rows 10 to 29.
PreparedData = prepare_DB_all(Data)
PreparedData[10:30]

Unnamed: 0,Nom,Colonne1
10,acces parking,parking
11,acces parking,parking
12,acces parking,parking
13,acces parking,parking
14,acces parking,parking
15,acces parking,parking
16,acces terrasse,circulation
17,acces terrasse,circulation
18,acces vestiaire,circulation
19,accueil,utilitaire


In [0]:
# Alphabet of the model: space, lowercase ASCII letters, point and zero.
# The position of a character in this string is its one-hot index.
acceptedcharacters = ' ' + string.ascii_lowercase + '.' + '0'
acceptedcharacters

' abcdefghijklmnopqrstuvwxyz.0'

In [0]:
# Size of the character alphabet; used as the width of each one-hot character vector.
num_attributes = len(acceptedcharacters)
num_attributes

29

In [0]:
# Map each accepted character to its position in `acceptedcharacters`
# (the characters are unique, so enumerate gives the same index as .index()).
dictionnary = {char: position for position, char in enumerate(acceptedcharacters)}
dictionnary['a']


1

In [0]:
#turn words into an array of shape (number of characters, number of features)
def converttotensor (string, dictionnary) :
    """One-hot encode a string, one row per character.

    Parameters
    ----------
    string : str
        Text to encode; every character must be a key of ``dictionnary``.
    dictionnary : dict
        Maps a character to its column index (0 <= index < ``num_attributes``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(string), num_attributes)`` with exactly one
        1 per row. An empty string yields an array of shape
        ``(0, num_attributes)``.

    Raises
    ------
    KeyError
        If ``string`` contains a character that is not in ``dictionnary``.

    Notes
    -----
    The row width comes from the module-level ``num_attributes``.
    """
    indices = np.array([dictionnary[c] for c in string], dtype=int)
    onehot = np.zeros((len(indices), num_attributes))
    # Set, for each character position (row), the column of that character.
    onehot[np.arange(len(indices)), indices] = 1
    return onehot
   
# Pad every encoded string with all-zero rows up to the length of the longest one
def padsequencesofcharacters (listofstrings, dictionnary):
    """One-hot encode each string and pad the results to a common length.

    Parameters
    ----------
    listofstrings : iterable of str
        Cleaned strings to encode.
    dictionnary : dict
        Character -> one-hot index mapping, forwarded to ``converttotensor``.

    Returns
    -------
    numpy.ndarray
        int32 array stacking all encoded strings, padded by Keras
        ``pad_sequences`` with ``maxlen=None`` (i.e. to the longest string)
        and otherwise default settings.
    """
    encoded = []
    for text in listofstrings:
        encoded.append(converttotensor(text, dictionnary))
    return tf.keras.preprocessing.sequence.pad_sequences(encoded, maxlen=None, dtype='int32')

# Encode and pad the first ten cleaned names as a quick check.
a = padsequencesofcharacters(PreparedData['Nom'][0:10], dictionnary)


# One-hot vector found at position 20 of the first padded name.
a[0][20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [0]:
# Cleaned category labels as an array, and the distinct classes among them.
targets = np.array(PreparedData['Colonne1'])
uniqueclasses = np.unique(targets)
uniqueclasses


array(['bureaux', 'circulation', 'circulation technique',
       'circulation verticale', 'commerce', 'commun', 'cuisine',
       'exterieur', 'gt', 'lt', 'parking', 'restauration', 'sanitaires',
       'utilitaire'], dtype=object)

In [0]:
#turn a column of a dataframe into a one-hot encoded array
def onehot_series (df, series) :
    """One-hot encode a single column of ``df``.

    Only the requested column is encoded, so a class label that happens to
    equal the name of another column of ``df`` can no longer be dropped from
    the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    series : str
        Name of the column to encode.

    Returns
    -------
    numpy.ndarray
        uint8 array of shape ``(len(df), number of distinct values)``; columns
        follow the order produced by ``pandas.get_dummies``.

    Raises
    ------
    KeyError
        If ``series`` is not a column of ``df``.
    """
    # dtype is pinned so the result is numeric 0/1 regardless of the pandas
    # version's default indicator dtype.
    dummies = pd.get_dummies(df[series], prefix='', prefix_sep='', dtype=np.uint8)
    return np.array(dummies)

#turn every column of a dataframe into one-hot columns, returned as one array
def onehot_df_array (df, series) :
    """One-hot encode all columns of ``df`` and stack the indicators side by side.

    Each column is encoded independently, so a value that happens to equal the
    name of another column of ``df`` no longer causes indicator columns to be
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    series : object
        Unused; kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        Array whose columns are the indicator columns of the first column of
        ``df``, followed by those of the second column, and so on.
    """
    encoded = [
        pd.get_dummies(df[name], prefix='', prefix_sep='', dtype=np.uint8)
        for name in df.columns.values
    ]
    if not encoded:
        # No columns to encode: nothing to concatenate.
        return np.array(df)
    return np.array(pd.concat(encoded, axis=1))

# One-hot targets of the first five rows.
onehot_series (PreparedData, 'Colonne1')[0:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint8)

In [0]:

def DatatoRNN (dataframe) :
    """Turn the raw labelled DataFrame into train/test arrays for the RNN.

    The frame is cleaned with ``prepare_DB_all``; column 'Nom' is encoded and
    padded with ``padsequencesofcharacters`` (using the module-level
    ``dictionnary``); column 'Colonne1' is one-hot encoded with
    ``onehot_series``. The result is split 80/20 with a fixed random seed.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` - note the order differs from
        the one returned by ``train_test_split``.
    """
    cleaned = prepare_DB_all(dataframe)
    # NOTE: duplicate rows are kept; no de-duplication is performed here.

    # padded one-hot encoding of the names (model inputs)
    features = padsequencesofcharacters(cleaned['Nom'], dictionnary)
    # one-hot encoding of the categories (model targets)
    labels = onehot_series(cleaned, 'Colonne1')

    # train_test_split yields (x_train, x_test, y_train, y_test)
    splits = train_test_split(features, labels, test_size = 0.2, random_state=42)
    x_train, x_test, y_train, y_test = splits

    return x_train, y_train, x_test, y_test

    
# Smoke test of the full pipeline: first position of the first training sequence.
tornn = DatatoRNN(Data)
tornn[0][0][0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int32)

# Actual data conversion for RNN
Directly reuses the functions written above to set up the data for training the machine learning model.

In [0]:
# Build the datasets; DatatoRNN returns (x_train, y_train, x_test, y_test).
dataforRNN = DatatoRNN(Data)
x_train, y_train, x_test, y_test = dataforRNN
len(y_train), y_train[0:10]

(3976, array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint8))

In [0]:
# Number of training samples and a preview of the first ten encoded inputs.
len(x_train), x_train[0:10]

(3976, array([[[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 1, 0],
         [1, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]],
 
        [[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 1, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]],
 
        [[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]],
 
        ...,
 
        [[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 1, 0],
         [0, 0, 0, ..., 0, 0, 1],
         [0, 1, 0, ..., 0, 0, 0]],
 
        [[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 

# Machine learning model construction
We used the Keras API to have TensorFlow build an LSTM-based model that takes sequences as inputs and categorizes them with the help of a fully connected layer on top. This version uses Bidirectional layers, but simple (unidirectional) layers work just fine. Dropout is added to regularize the model.

In [None]:
#hyperparameters of the model
# dataforRNN[0] holds the training inputs, indexed as [sample][timestep][feature]
lengthtrainingdata = len(dataforRNN[0])
# mini-batch size intended for training
batch_size = 25
# number of timesteps = padded sequence length (second axis of the inputs)
num_steps = len(dataforRNN[0][0])
num_epochs = 100
learning_rate = 0.01

# `learning_rate` is the supported keyword (`lr` is a deprecated alias); the
# remaining Adam arguments that only restated the defaults are left out.
_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)

In [0]:
#number of different characters (width of one one-hot character vector)
inputshape = len(acceptedcharacters)
#number of target classes
outputshape = uniqueclasses.size

#Model: two stacked bidirectional LSTM layers followed by a softmax classifier
model = Sequential()

# First recurrent layer returns the full sequence so that it can feed the second one.
# The sequence length is read from the training data instead of being hard-coded.
model.add(Bidirectional(LSTM(128, activation='relu', return_sequences=True),
                        input_shape=(x_train.shape[1], num_attributes),
                        merge_mode="concat"))
model.add(Dropout(0.3))

# Second recurrent layer only returns its final output (one vector per sample).
model.add(Bidirectional(LSTM(64, activation='relu', return_sequences=False),
                        merge_mode="concat"))
model.add(Dropout(0.3))

# One output unit per class, normalised to probabilities.
model.add(Dense(outputshape, activation='softmax'))





In [0]:
# Print the architecture, then compare the shapes the model expects with the
# shapes of the actual training arrays.
model.summary()
shape_reports = [
    ("Inputs: {}", model.input_shape),
    ("Outputs: {}", model.output_shape),
    ("Actual input: {}", x_train.shape),
    ("Actual output: {}", y_train.shape),
]
for template, reported_shape in shape_reports:
    print (template.format(reported_shape))

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_12 (Bidirectio (None, 50, 256)           161792    
_________________________________________________________________
dropout_17 (Dropout)         (None, 50, 256)           0         
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 128)               164352    
_________________________________________________________________
dropout_18 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 14)                1806      
Total params: 327,950
Trainable params: 327,950
Non-trainable params: 0
_________________________________________________________________
Inputs: (None, 50, 29)
Outputs: (None, 14)
Actual input: (3976, 50, 29)
Actual output: (3976, 14)


# Model training
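For the training, we added a callback that saves the model whenever the monitored validation metric improves, so the best model seen during training is kept. A manual save command at the end of training could also do the trick, but LSTMs are pretty unstable, so the model from the last epoch is not necessarily the best one.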

In [0]:
model.compile(loss='categorical_crossentropy', optimizer=_optimizer, metrics=['categorical_accuracy'])

# Checkpoint callback: keep the model whenever the validation accuracy improves.
# The monitored name must match the compiled metric, hence
# 'val_categorical_accuracy' rather than 'val_acc'.
checkpoint_dir = "SavedModels"
# Create the target directory up front so that saving does not fail on a fresh checkout.
os.makedirs(checkpoint_dir, exist_ok=True)
# os.path.join keeps the path valid on every operating system.
filepath = os.path.join(checkpoint_dir, "Onedenselayerandnoduplicates{epoch:02d}.hdf5")
checkpoint = ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=0, save_best_only=True, mode='max')

callbacks_list = [checkpoint]

# batch_size is passed explicitly; otherwise Keras falls back to its own default.
history = model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs= num_epochs,
          validation_data=(x_test, y_test),
          callbacks=callbacks_list)