In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
train_df = pd.read_csv("/home/kishore/Data/train_languages.csv")
train_df.head()

Unnamed: 0.1,Unnamed: 0,sentence,language
0,0,"Jean Beauverie (Fontaines-sur-Saône, 18 febbra...",italian
1,1,Il pinguino saltarocce (Eudyptes chrysocome (F...,italian
2,2,Maison Ikkoku - Cara dolce Kyoko (めぞん一刻 Mezon ...,italian
3,3,La mia città è un singolo della cantante itali...,italian
4,4,L'Armata Rossa dei Lavoratori e dei Contadini ...,italian


In [3]:
def label_encoding(data):
    try:
        if data.empty!= True:
            Y = data['language']
            encoder = LabelEncoder()
            encoder.fit(Y)
            Y = encoder.transform(Y)
            Y = tf.keras.utils.to_categorical(Y,num_classes=4)
            return Y,encoder
    except:
        print("Label is empty")

        
        
def dataPreprocessing(train_df,max_features,maxlen):
   
    train_df['sentence'] = train_df["sentence"].str.lower()
    train_df['sentence_no_punctuation'] = train_df['sentence'].str.replace('[^\w\s]','')
    train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("fillna")
    tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step
    tok.fit_on_texts(list(train_df['sentence_no_punctuation'])) #fit to cleaned text
    vocab_size = len(tok.word_index) + 1 
    train_df = tok.texts_to_sequences(list(train_df['sentence_no_punctuation'])) #this is how we create sequences
    train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) #let's execute pad step
    return train_df,vocab_size
def data_sampling(train_df,Y):
    X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)
    return X_train, X_test, y_train, y_test

def model(vocab_size,maxlen):
    embedding_dim = 50 
    model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, #embedding input
                           output_dim=embedding_dim,#embedding output
                           input_length=maxlen), #maximum length of an input sequence
    #tf.keras.layers.Flatten(),#flatten layer
    #tf.keras.layers.LSTM(64,return_sequences=True),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.GRU(64),
    tf.keras.layers.Dense(128,activation=tf.nn.relu),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(4, activation=tf.nn.softmax)])
    
    model.compile(optimizer='adam',
              loss='categorical_crossentropy', #we recommend this loss function you
              metrics=['accuracy'])
    return model

def training(model,X_train,y_train):
    model.fit(np.array(X_train), np.array(y_train), epochs=10) #let's fit the model
    return model

def model_evaluation(model,X_test,y_test):
    evl=model.evaluate(np.array(X_test), np.array(y_test))
    return evl

def confusion_matrix_test(model,X_test,y_test):
    predictions = model.predict(X_test) #here we make predictions
    cm = confusion_matrix(predictions.argmax(axis=1), y_test.argmax(axis=1))
    print("confusion matix======================")
    print(cm)

In [4]:
train_df = pd.read_csv("/home/kishore/Data/train_languages.csv")
train_df.head()
max_features=5000
maxlen=400
# cleaning data and transforming it to sequence of tokens

clen_data,vocab_size=dataPreprocessing(train_df,max_features,maxlen)

# label encoding

encoded_label,encoder=label_encoding(train_df)

# sampling data into spliting into train=80% , test=20%

X_train, X_test, y_train, y_test=data_sampling(clen_data,encoded_label)

# model building 
languageModel=model(vocab_size,maxlen)

# training 
train_model=training(languageModel,X_train,y_train)
print("evaluation model accuracy")
accuracy=model_evaluation(train_model,X_test,y_test)
print("============accuracy=",accuracy)
confusion_matrix_test(train_model,X_test,y_test)

Train on 3269 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
evaluation model accuracy
[[ 91   0   0   0]
 [  0  88   0   0]
 [  0   0 102   0]
 [  0   0   0  83]]


#### Custom data prediction 

In [5]:
#these are the codes for each language in order to evaluate properly
print('english', encoder.transform(['english']))
print('french', encoder.transform(['french']))
print('italian', encoder.transform(['italian']))
print('spanish', encoder.transform(['spanish']))

english [0]
french [1]
italian [2]
spanish [3]


In [6]:
new_text = ["tensorflow is a great tool you can find a lot of tutorials from packt"]
#new_text = ["tensorflow est un excellent outil vous pouvez trouver beaucoup de tutoriels de packt"]
#new_text = ["tensorflow è un ottimo strumento puoi trovare molti tutorial di packt"]
#new_text = ["tensorflow es una gran herramienta puedes encontrar muchos tutoriales de packt"]

In [7]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
test_text = tok.texts_to_sequences(new_text) #this is how we create sequences
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=maxlen)

In [8]:
np.set_printoptions(suppress=True)
predictions = train_model.predict(test_text)
print(predictions.argmax())
print(predictions) #spanish you can get confused with italian which makes sense since they are more similar languages

0
[[0.38839382 0.16683602 0.2180193  0.2267509 ]]
