# Import the necessary libraries

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
%matplotlib inline

In [19]:
Model = tf.keras.models.Model
Tokenizer = tf.keras.preprocessing.text.Tokenizer
sequence = tf.keras.preprocessing.sequence
to_categorical = tf.keras.utils.to_categorical
EarlyStopping = tf.keras.callbacks.EarlyStopping


### Load the data into Pandas dataframe

In [20]:
df = pd.read_csv('../datasets/all.csv',delimiter=',',encoding='latin-1')
df.head()

Unnamed: 0,text,level
0,ArlÃ¶f also sold Peugeot cars under their own ...,A1
1,An FBI document describes him as shiftless a...,A1
2,Also called the Grapevine Dance and the Weaver...,A1
3,"After studying veterinary medicine in 1928 , f...",A1
4,Alexis also represented the United States at t...,A1


Understand the distribution better.

In [21]:
id_to_label_mapping = {0: 'A1', 1: 'A2', 2: 'B1', 3: 'B2', 4: 'C1', 5: 'C2'}
label_to_id_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
cefr_levels = ["A1", "A2", "B1", "B2", "C1", "C2"]

In [22]:
X = df.text
Y = df.level
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [23]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)

In [24]:
max_words = 1000
max_len = 350
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [25]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [26]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [27]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 105ms/step - accuracy: 0.1673 - loss: -36.0988 - val_accuracy: 0.1740 - val_loss: -178.2696
Epoch 2/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 99ms/step - accuracy: 0.1677 - loss: -230.5057 - val_accuracy: 0.1740 - val_loss: -440.9165
Epoch 3/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 100ms/step - accuracy: 0.1652 - loss: -511.0602 - val_accuracy: 0.1740 - val_loss: -800.3666
Epoch 4/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 103ms/step - accuracy: 0.1611 - loss: -890.6808 - val_accuracy: 0.1740 - val_loss: -1256.3964
Epoch 5/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 103ms/step - accuracy: 0.1683 - loss: -1374.3365 - val_accuracy: 0.1740 - val_loss: -1809.5756
Epoch 6/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 104ms/step - accuracy: 0.1665 - loss: -1917.3083 - val_accuracy: 0.1740 - val_loss: 

<keras.src.callbacks.history.History at 0x187ae223d10>

Process the test set data.

In [28]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

Evaluate the model on the test set.

In [29]:
accr = model.evaluate(test_sequences_matrix,Y_test)

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.1625 - loss: -5689.2065


In [30]:
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: -5616.319
  Accuracy: 0.167
