In [38]:
import pandas as pd 
import gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras import Sequential 
from keras.layers import Dense, InputLayer
from keras.utils.np_utils import to_categorical
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
tf.config.run_functions_eagerly(True)
import numpy as np
import pickle

In [2]:
# Load the tab-separated sentence dump. The first file column becomes the
# index; the two remaining columns are exposed as 'lang' and 'text'.
df = pd.read_csv(
    './dataset/sentences.csv',
    sep='\t',
    encoding='utf8',
    index_col=0,
    names=['lang', 'text'],
)

In [3]:
# Language codes to classify, matched against the 'lang' column.
LanguageList = ['eng', 'fra', 'spa', 'ita', 'deu']

# Character n-gram vocabulary settings.
ngramLength = 3                       # every n-gram is exactly this many characters
maxFeatures = 500                     # n-grams kept per language when learning the vocabulary
sentencePerLanguageForVocab = 1_000   # sentences sampled per language to learn the vocabulary

# Dataset sizing.
sentencePerLanguageForDataSet = 50_000
validationSentences = 25_000
testSentences = 25_000

In [4]:
# Draw the same number of sentences for every target language and stack them
# with a single concat. Concatenating inside the loop re-copies the growing
# frame on every iteration, and seeding it with an empty frame triggers a
# FutureWarning on recent pandas versions.
# Note: .sample() raises ValueError if a language has fewer rows than requested.
filtereddf = pd.concat([
    df[df['lang'] == l].sample(sentencePerLanguageForDataSet)
    for l in LanguageList
])

filtereddf.describe()

Unnamed: 0,lang,text
count,250000,250000
unique,5,250000
top,deu,È la ragazza perfetta per lei.
freq,50000,1


In [5]:
# Shuffle every row, then carve the frame into consecutive
# validation / test / train partitions by position.
filtereddf = filtereddf.sample(frac=1)

val_end = validationSentences
test_end = val_end + testSentences

validation_df = filtereddf.iloc[:val_end]
test_df = filtereddf.iloc[val_end:test_end]
train_df = filtereddf.iloc[test_end:]

train_df.describe()

Unnamed: 0,lang,text
count,200000,200000
unique,5,200000
top,ita,È la ragazza perfetta per lei.
freq,40070,1


In [6]:
# The full and shuffled frames are not referenced again after the split;
# drop both names and trigger a garbage-collection pass.
del df
del filtereddf
gc.collect()

0

In [7]:
def get_feature_vectors(dataset):
    """Learn a character n-gram vocabulary from ``dataset``.

    A ``CountVectorizer`` is fitted on the given texts using character
    n-grams of exactly ``ngramLength`` characters and keeping at most
    ``maxFeatures`` of them (both are module-level settings).

    Parameters
    ----------
    dataset : iterable of str
        Raw sentences to learn the n-grams from.

    Returns
    -------
    list of str
        The n-grams in the fitted vectorizer's vocabulary.
    """
    vectorizer = CountVectorizer(
        analyzer='char',
        ngram_range=(ngramLength, ngramLength),
        max_features=maxFeatures,
    )
    # Only the learned vocabulary is used; the document-term matrix that
    # fit_transform() returns was being discarded.
    vectorizer.fit(dataset)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement and fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()

In [8]:
# Union of the per-language n-gram vocabularies, each learned from a random
# sample of that language's training sentences.
vocab = set()
for l in LanguageList:
    is_lang = train_df['lang'] == l
    sentences = train_df.loc[is_lang, 'text'].sample(sentencePerLanguageForVocab)
    vocab |= set(get_feature_vectors(sentences.to_list()))
print(len(vocab))

1463


In [9]:
# Fixed-vocabulary vectorizer used to turn sentences into n-gram count
# vectors. The vocabulary is passed in sorted order so the feature-column
# order is explicit rather than derived from a set.
word_vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(ngramLength, ngramLength),
    vocabulary=sorted(vocab),
)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back on older releases.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    feature_names = list(word_vectorizer.get_feature_names_out())
else:
    feature_names = word_vectorizer.get_feature_names()

# Maps language codes to integer class ids (and back).
langEncoder = LabelEncoder()
langEncoder.fit(LanguageList)

LabelEncoder()

In [10]:
# Per-feature minimum and maximum n-gram counts over the training split,
# indexed by feature name. The count matrix itself is dropped afterwards.
train_x = word_vectorizer.transform(train_df['text'].to_list())
min_df, max_df = (
    pd.Series(extreme.toarray()[0], index=feature_names)
    for extreme in (train_x.min(axis=0), train_x.max(axis=0))
)
del train_x

In [11]:
def data_generator(dataset, batch_size):
    """Endlessly yield ``(features, one_hot_labels)`` batches from ``dataset``.

    The rows are partitioned once into ``len(dataset) // batch_size``
    consecutive batches (at least one). Remainder rows are spread over the
    batches rather than dropped, so a batch may hold more than ``batch_size``
    rows. After the last batch the generator wraps around to the first.

    Relies on the module-level ``word_vectorizer``, ``feature_names``,
    ``min_df``, ``max_df``, ``langEncoder`` and ``LanguageList``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'text' column (sentences) and a 'lang' column (labels).
    batch_size : int
        Target number of rows per batch; must be at least 1.

    Yields
    ------
    tuple of numpy.ndarray
        ``x_num``: min-max scaled n-gram counts, one row per sentence.
        ``y_num``: one-hot encoded labels with ``len(LanguageList)`` columns.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or ``dataset`` is empty. Raised
        when the first batch is requested, since this is a generator.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    if len(dataset) == 0:
        raise ValueError('dataset is empty')

    # A dataset shorter than batch_size used to give zero sections, which
    # np.array_split rejects; fall back to a single batch instead.
    noOfBatches = max(1, len(dataset) // batch_size)

    # Split row positions instead of the frame itself: the partition is the
    # same, but no per-batch DataFrame is materialised up front and the
    # deprecated DataFrame code path inside np.array_split is avoided.
    batch_positions = np.array_split(np.arange(len(dataset)), noOfBatches)

    # Loop-invariant scaling range. A feature whose min equals its max would
    # divide by zero and put NaN/inf into the batch, so use 1 there.
    value_range = (max_df - min_df).replace(0, 1)

    i = 0
    while True:
        batch = dataset.iloc[batch_positions[i]]
        i = (i + 1) % noOfBatches

        # getting x
        x = word_vectorizer.transform(batch['text'].to_list())
        xdf = pd.DataFrame(data=x.toarray(), columns=feature_names)
        xdf = (xdf - min_df) / value_range
        x_num = xdf.to_numpy()

        # getting y
        y = batch['lang'].to_list()
        y_enc = langEncoder.transform(y)
        y_num = to_categorical(y_enc, num_classes=len(LanguageList))
        yield x_num, y_num

In [12]:
# testing generator
# Smoke-test the generator: pull one batch and show both array shapes.
# Only a cell's last expression is displayed, so a bare ``tempx.shape`` on its
# own line was silently discarded; return both shapes together instead.
# (A generator is already its own iterator, so no iter() wrapper is needed.)
gen = data_generator(train_df, 3)
tempx, tempy = next(gen)
tempx.shape, tempy.shape

In [14]:
# Batch size handed to data_generator for the train/validation/test splits
# and used to derive the steps-per-epoch counts.
batch_size = 5

In [15]:
# One endless batch generator per split, all using the same batch size.
train_generator, val_generator, test_generator = (
    data_generator(split, batch_size)
    for split in (train_df, validation_df, test_df)
)

In [16]:
# Number of generator steps that make up one full pass over each split.
train_steps_per_epoch, val_steps_per_epoch, test_steps_per_epoch = (
    len(split) // batch_size
    for split in (train_df, validation_df, test_df)
)

In [17]:
# Feed-forward classifier: n-gram count vector -> 256 -> 128 -> softmax with
# one output per language.
model = Sequential([
    Dense(256, input_dim=len(vocab), activation='relu'),
    Dense(128, activation='relu'),
    Dense(len(LanguageList), activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               374784    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 645       
Total params: 408,325
Trainable params: 408,325
Non-trainable params: 0
_________________________________________________________________


2021-07-22 17:57:08.192221: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-22 17:57:08.635031: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-07-22 17:57:08.635067: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ninjabox): /proc/driver/nvidia/version does not exist
2021-07-22 17:57:08.636279: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# One-hot targets, so categorical cross-entropy; accuracy is reported per epoch.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [19]:
# Both generators never terminate, so the step counts define where a training
# epoch and a validation pass end.
model.fit(
    train_generator,
    epochs=4,
    steps_per_epoch=train_steps_per_epoch,
    validation_data=val_generator,
    validation_steps=val_steps_per_epoch,
)

2021-07-22 17:57:18.670730: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-07-22 17:57:18.690288: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2699905000 Hz


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fb5d1b41b80>

In [36]:
# Persist the trained network to disk.
model.save('./models/basic-model-2.model')

INFO:tensorflow:Assets written to: ./models/basic-model-2.model/assets


In [24]:
# Manual sanity check: classify a single hand-written sentence.
sentence = 'Buenos dias'

# Vectorise with the fixed vocabulary and reuse ``feature_names`` for the
# column labels. Calling word_vectorizer.get_feature_names() again breaks on
# scikit-learn >= 1.2, where that method no longer exists.
man_test_x = word_vectorizer.transform([sentence])
man_test_df = pd.DataFrame(data=man_test_x.toarray(), columns=feature_names)

# Min-max scale with the training statistics. A feature whose min equals its
# max would divide by zero and feed NaN/inf to the model, so use a range of 1.
value_range = (max_df - min_df).replace(0, 1)
man_test_df = (man_test_df - min_df) / value_range
man_test_num = man_test_df.to_numpy()

# Highest-probability class id -> language code.
y = model.predict(man_test_num)
label = np.argmax(y)
prediction = langEncoder.inverse_transform([label])[0]
print(prediction)



spa


In [37]:
# Bundle everything needed to reproduce the preprocessing at inference time.
# (pickle is already imported in the notebook's import cell.)
preprocessing_objects = {
    'word_vectorizer': word_vectorizer,
    'lang_encoder': langEncoder,
    'min_df': min_df,
    'max_df': max_df
}

# Make sure the target directory exists even if the model-saving cell was skipped.
os.makedirs('./models', exist_ok=True)

# The context manager closes the file handle, which the previous bare open()
# call never did.
with open('./models/basic-model-2-preprocessing-objects.pkl', 'wb') as pkl_file:
    pickle.dump(preprocessing_objects, pkl_file)