In [1]:
import pandas as pd 
import gc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras import Sequential 
from keras.layers import Dense, InputLayer
from keras.utils.np_utils import to_categorical
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
tf.config.run_functions_eagerly(True)
import numpy as np
import pickle

2021-07-24 19:23:03.920169: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
df = pd.read_csv('./dataset/shortened-sentences.csv',index_col=0,names=['lang','text'])
# df = pd.read_csv('./dataset/sentences.csv', sep='\t', 
#                             encoding='utf8', 
#                             index_col=0,
#                             names=['lang','text'])

In [4]:
LanguageList = ['eng','fra', 'spa','ita','deu']
ngramLength = 3
maxFeatures = 500
sentencePerLanguageForVocab = 1000
sentencePerLanguageForDataSet = 50000
validationSentences = 25000
testSentences = 25000

In [5]:
filtereddf = pd.DataFrame(columns=df.columns)
for l in LanguageList:
    filtereddf = pd.concat([filtereddf, 
                            df[df['lang'] == l].sample(
                                sentencePerLanguageForDataSet)])

filtereddf.describe()

Unnamed: 0,lang,text
count,250000,250000
unique,5,250000
top,deu,Il nemico ha fatto saltare in aria il ponte.
freq,50000,1


In [6]:
filtereddf = filtereddf.sample(frac=1)
filtereddf.index = [i for i in range(len(filtereddf))]
validation_df = filtereddf[:validationSentences]
print(validationSentences + testSentences)
test_df = filtereddf[validationSentences:int(validationSentences + testSentences)]
train_df = filtereddf[int(validationSentences + testSentences):]
train_df.describe()

50000


Unnamed: 0,lang,text
count,200000,200000
unique,5,200000
top,fra,Il nemico ha fatto saltare in aria il ponte.
freq,40201,1


In [7]:
del df,filtereddf
gc.collect()

0

In [8]:
def get_feature_vectors(dataset):
    vectorizer = CountVectorizer(analyzer='char',ngram_range=(ngramLength,ngramLength),max_features=maxFeatures)
    vectorizer.fit_transform(dataset)
    trigrams = vectorizer.get_feature_names()
    return trigrams

In [9]:
vocab = set()
for l in LanguageList:
    currData = train_df[train_df['lang'] == l].sample(sentencePerLanguageForVocab)
    currFeatures = get_feature_vectors(currData['text'].to_list())
    vocab.update(currFeatures)
print(len(vocab))

1487


In [10]:
word_vectorizer = CountVectorizer(analyzer='char',ngram_range=(ngramLength,ngramLength), vocabulary=vocab)
feature_names = word_vectorizer.get_feature_names()
langEncoder = LabelEncoder()
langEncoder.fit(LanguageList)

LabelEncoder()

In [11]:
train_x = word_vectorizer.transform(train_df['text'].to_list())
min_value = train_x.min(axis=0).toarray()
min_df = pd.Series(min_value[0], index=feature_names)
max_value = train_x.max(axis=0).toarray()
max_df = pd.Series(max_value[0], index=feature_names)
del max_value,min_value,train_x

In [12]:
def data_generator(dataset, batch_size):
    print(len(dataset))
    noOfBatches = len(dataset)//batch_size
    print(noOfBatches)
    batches = np.array_split(dataset,noOfBatches)
    i = 0
    while True:
        batch = batches[i]
        i += 1
        if i == noOfBatches:
            i = 0
        # getting x
        x = word_vectorizer.transform(batch['text'].to_list())
        xdf = pd.DataFrame(data=x.toarray(), columns=feature_names)
        xdf = (xdf - min_df)/(max_df - min_df)
        x_num = xdf.to_numpy()
    
        # getting y
        y = batch['lang'].to_list()
        y_enc = langEncoder.transform(y)
        y_num = to_categorical(y_enc, num_classes=len(LanguageList))
        yield x_num,y_num    

In [13]:
# testing generator
gen = iter(data_generator(train_df,3))
tempx, tempy = next(gen)
print(tempx.shape)
print(tempy.shape)

200000
66666
(4, 1487)
(4, 5)


In [14]:
batch_size = 5

In [15]:
train_generator = data_generator(train_df, batch_size)
val_generator = data_generator(validation_df, batch_size)
test_generator = data_generator(test_df, batch_size)

In [16]:
train_steps_per_epoch = len(train_df)//batch_size
val_steps_per_epoch = len(validation_df)//batch_size
test_steps_per_epoch = len(test_df)//batch_size

In [17]:
model = Sequential()
model.add(Dense(256,input_dim=len(vocab), activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(len(LanguageList), activation='softmax'))
model.summary()

2021-07-24 19:23:54.308632: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-24 19:23:54.381850: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-24 19:23:54.382931: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce 940MX computeCapability: 5.0
coreClock: 1.2415GHz coreCount: 3 deviceMemorySize: 3.95GiB deviceMemoryBandwidth: 13.41GiB/s
2021-07-24 19:23:54.383010: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-07-24 19:23:54.390310: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-07-24 19:23:54.390413: I tensorflow/stream_executor/platfor

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               380928    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 645       
Total params: 414,469
Trainable params: 414,469
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

In [19]:
model.fit(train_generator,
          steps_per_epoch=train_steps_per_epoch,
          validation_data=val_generator,
          validation_steps=val_steps_per_epoch,
          epochs=4)

200000
40000


2021-07-24 19:24:04.303102: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-07-24 19:24:04.320685: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2699905000 Hz
2021-07-24 19:24:04.341630: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11


Epoch 1/4


2021-07-24 19:24:04.695284: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11


5000
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f99d3d91610>

In [20]:
model.save('./models/basic-model-shortened-sentence.model')

2021-07-24 21:23:41.935082: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./models/basic-model-shortened-sentence.model/assets


In [21]:
sentence = 'Buenos dias'
man_test_x = word_vectorizer.transform([sentence])
man_test_df = pd.DataFrame(data=man_test_x.toarray(), columns=word_vectorizer.get_feature_names())
man_test_df = (man_test_df - min_df)/(max_df - min_df)
man_test_num = man_test_df.to_numpy()
y = model.predict(man_test_num)
label = np.argmax(y)
prediction = langEncoder.inverse_transform([label])
prediction = prediction[0]
print(prediction)



spa


In [22]:
import pickle
preprocessing_objects = {
    'word_vectorizer': word_vectorizer,
    'lang_encoder': langEncoder,
    'min_df': min_df,
    'max_df': max_df
}
pickle.dump(preprocessing_objects,open('./models/basic-model-shortened-sentence-preprocessing-objects.pkl','wb'))