In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
def read_paragraphs(path):
    """Load a text file holding one sample per line into a one-column DataFrame.

    ``pd.read_fwf`` guesses fixed column widths from the first 100 rows, so
    any line longer than those rows is cut off and text may be split across
    several columns. Reading the file line by line keeps every sample whole.

    Parameters
    ----------
    path : str
        Location of a UTF-8 text file with one sample per line.

    Returns
    -------
    pandas.DataFrame
        A frame whose single column is labelled ``0``; each cell is one line
        with surrounding whitespace removed.
    """
    with open(path, encoding='utf-8') as handle:
        return pd.DataFrame([line.strip() for line in handle], columns=[0])


X_train = read_paragraphs('./wili dataset/x_train.txt')
X_test = read_paragraphs('./wili dataset/x_test.txt')

In [3]:
# One label code per line. pd.read_fwf sizes its column from the first 100
# rows and clips any longer code, so the files are read line by line instead.
# Each result is a one-column DataFrame (column label 0), matching the shape
# the following cells index into.
with open('./wili dataset/y_train.txt', encoding='utf-8') as handle:
    y_train = pd.DataFrame([line.strip() for line in handle], columns=[0])
with open('./wili dataset/y_test.txt', encoding='utf-8') as handle:
    y_test = pd.DataFrame([line.strip() for line in handle], columns=[0])

In [4]:
# Restore two label codes that arrive clipped (presumably by the fixed-width
# parsing of the label files). The same repair is applied to the test labels
# so both splits use identical codes. `replace` returns a new frame, which
# avoids assigning through a boolean mask on a slice of another DataFrame.
_clipped_labels = {'be-tara': 'be-tarask', 'roa-tar': 'roa-tara'}
y_train = y_train.replace(_clipped_labels)
y_test = y_test.replace(_clipped_labels)

In [5]:
# Label codes kept for this experiment (they look like ISO 639-3 codes —
# confirm against the dataset's label list). The position in this list
# becomes the integer class id further down.
languages = [
    'eng',
    'hin',
    'mar',
    'guj',
    'tam',
    'tcy',
    'tel',
    'pan',
    'ben',
    'mai',
]

In [6]:
# Index labels of the training rows whose language is one of the selected codes.
train_index = y_train[y_train[0].isin(languages)].index.tolist()

In [7]:
# Index labels of the test rows whose language is one of the selected codes.
test_index = y_test[y_test[0].isin(languages)].index.tolist()

In [8]:
# Keep only the text rows that belong to the selected languages.
# The index lists are used positionally here, which matches the labels only
# while the frames carry the default 0..n-1 index.
X_train_p = X_train.iloc[train_index]
X_test_p = X_test.iloc[test_index]

In [9]:
# Matching label rows for the selected languages (same positional selection
# as the text frames, so rows stay aligned).
y_train_p = y_train.iloc[train_index]
y_test_p = y_test.iloc[test_index]

### Tokenize Labels

In [10]:
# Two-way lookup between language code and integer class id.
# The id is the position of the code's first occurrence in `languages`.
label2int, int2label = {}, {}
for position, code in enumerate(languages):
    if code in label2int:
        continue
    label2int[code] = position
    int2label[position] = code

In [11]:
# Persist both lookup tables for inference. The `with` blocks close each
# file deterministically instead of leaving the handle to the garbage collector.
with open('label2int.pkl', 'wb') as handle:
    pickle.dump(label2int, handle)
with open('int2label.pkl', 'wb') as handle:
    pickle.dump(int2label, handle)

### Tokenize Target Variables

In [12]:
# Integer class id for every training label, in row order.
y_train_int = [label2int[code] for code in y_train_p[0]]

In [13]:
# Integer class id for every test label, in row order.
y_test_int = [label2int[code] for code in y_test_p[0]]

### Data Preprocessing

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Character-level TF-IDF features: text is lower-cased, characters seen in
# fewer than 25 documents are dropped, and each row is L2-normalised.
vectorizer = TfidfVectorizer(
    analyzer='char',
    min_df=25,
    lowercase=True,
    norm='l2',
)

In [16]:
# Learn the character vocabulary and IDF weights from the training text only.
vectorizer.fit(X_train_p[0])

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=25,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
# TF-IDF features for the training text; .toarray() converts the sparse
# result to a dense array for the scaler and PCA steps that follow.
X_train2int = vectorizer.transform(X_train_p[0]).toarray()

In [18]:
# Same transformation for the test text, using the vocabulary fitted on the
# training split.
X_test2int = vectorizer.transform(X_test_p[0]).toarray()

In [19]:
# Persist the fitted vectorizer for inference; `with` closes the file handle
# as soon as the dump finishes.
with open('vectorizer.pkl', 'wb') as handle:
    pickle.dump(vectorizer, handle)

### Scaler

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
# Feature scaler with default settings (centres each column and scales it to
# unit variance).
sc = StandardScaler()

In [22]:
# Estimate per-feature mean and standard deviation from the training features only.
sc.fit(X_train2int)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
# Apply the training-set statistics to both splits so the test data is
# scaled exactly like the data the model is trained on.
X_train2int_sc = sc.transform(X_train2int)
X_test2int_sc = sc.transform(X_test2int)

In [24]:
# Persist the fitted scaler for inference; `with` closes the file handle as
# soon as the dump finishes.
with open('scaler.pkl', 'wb') as handle:
    pickle.dump(sc, handle)

### PCA

In [25]:
from sklearn.decomposition import PCA

In [26]:
# Reduce the scaled features to 80 components. This number must equal the
# input_dim of the first Dense layer of the network defined below.
# NOTE(review): no random_state is set; if the solver chosen by 'auto' is a
# randomised one the projection may differ between runs — confirm.
pca = PCA(n_components=80)

In [27]:
# Learn the principal components from the scaled training features only.
pca.fit(X_train2int_sc)

PCA(copy=True, iterated_power='auto', n_components=80, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [28]:
# Project the scaled training features onto the fitted components.
X_train_pca = pca.transform(X_train2int_sc)

In [29]:
# Project the scaled test features onto the same components.
X_test_pca = pca.transform(X_test2int_sc)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Running total of explained variance, in percent, against component count.
cumulative_pct = np.cumsum(pca.explained_variance_ratio_) * 100
plt.plot(cumulative_pct)
plt.xlabel("No. of components")
plt.ylabel("cummulative explained Variance");

In [None]:
# Persist the fitted PCA for inference; `with` closes the file handle as soon
# as the dump finishes.
with open('pca.pkl', 'wb') as handle:
    pickle.dump(pca, handle)

### One Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Turn the lists of class ids into column vectors of shape (n_samples, 1),
# the 2-D layout the one-hot encoder below is fitted on.
# Note: this rebinds the names from Python lists to NumPy arrays.
y_train_int = np.array(y_train_int).reshape(-1, 1)
y_test_int = np.array(y_test_int).reshape(-1, 1)

In [None]:
# One-hot encoder for the integer class ids, created with default settings.
enc = OneHotEncoder()

In [None]:
# Learn the set of class ids from the training labels.
enc.fit(y_train_int)

In [None]:
# One-hot targets for the network. The encoder is created with default
# settings and therefore returns SciPy sparse matrices; they are converted to
# dense arrays because Keras expects array targets, and with one column per
# class the dense form is small.
y_train_enc = enc.transform(y_train_int).toarray()
y_test_enc = enc.transform(y_test_int).toarray()

## Design ANN

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import model_from_json

In [None]:
# Feed-forward classifier: three hidden layers of 60 ReLU units, each
# followed by 20% dropout, and a softmax output with one unit per language.
# `units` replaces the Keras 1 keyword `output_dim`, which current Keras
# releases no longer accept.
clf = Sequential()
# input_dim must equal the number of PCA components (80).
clf.add(Dense(units=60, kernel_initializer='uniform', activation='relu', input_dim=80))
clf.add(Dropout(rate=0.2))
clf.add(Dense(units=60, kernel_initializer='uniform', activation='relu'))
clf.add(Dropout(rate=0.2))
clf.add(Dense(units=60, kernel_initializer='uniform', activation='relu'))
clf.add(Dropout(rate=0.2))
# Ten output units: one per entry in `languages`.
clf.add(Dense(units=10, kernel_initializer='uniform', activation='softmax'))

In [None]:
# Multi-class setup: Adam optimiser, categorical cross-entropy on the one-hot
# targets, accuracy reported during training.
clf.compile(optimizer= 'adam', loss='categorical_crossentropy', metrics = ['accuracy'])

In [None]:
# Train for 50 epochs in mini-batches of 32 inside a device scope that
# requests the first GPU.
# NOTE(review): the test split is passed as validation_data, so the
# "validation" curves are test-set curves; no random seed is set, so results
# will vary between runs.
with tf.device('/gpu:0'):
    history = clf.fit(X_train_pca, y_train_enc, batch_size=32, epochs=50, validation_data=(X_test_pca,y_test_enc))

In [None]:
# Architecture goes to JSON, weights go to HDF5; the two files are read back
# together in the single-prediction section.
model_json = clf.to_json()
with open("model.json", mode="w") as json_file:
    json_file.write(model_json)

clf.save_weights("model.h5")

## Evaluation

In [None]:
# Raw network outputs for both splits. Despite the "_label" suffix these are
# the softmax rows returned by predict; class ids are derived from them in
# the next cell.
y_pred_test_label = clf.predict(X_test_pca)
y_pred_train_label = clf.predict(X_train_pca)

In [None]:
# Predicted class id = column with the largest network output in each row.
y_pred_test = y_pred_test_label.argmax(axis=1)
y_pred_train = y_pred_train_label.argmax(axis=1)

In [None]:
# Ground-truth class ids as flat integer vectors. The builtin `int` replaces
# the alias `np.int`, which was removed in NumPy 1.24, and `ravel()` turns the
# (n, 1) column into the 1-D shape the sklearn metrics expect.
y_train_true = y_train_int.astype(int).ravel()
y_test_true = y_test_int.astype(int).ravel()

## Graph Plot

### Model Accuracy Graph

In [None]:
# Training vs. validation accuracy per epoch.
# Older Keras releases store the metric under 'acc', newer ones under
# 'accuracy'; use whichever key this run produced instead of failing with a
# KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

### Model Loss Graph

In [None]:
# Training vs. validation loss per epoch.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

## Classification Report

In [None]:
from sklearn.metrics import classification_report

#### Train Data report

In [None]:
# Per-class precision, recall and F1 on the training split
# (rows are the integer class ids).
print(classification_report(y_train_true, y_pred_train))

#### Test Data Report

In [None]:
# Per-class precision, recall and F1 on the test split
# (rows are the integer class ids).
print(classification_report(y_test_true, y_pred_test))

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

### Train data

In [None]:
# confusion_matrix expects (y_true, y_pred). With the arguments in that order
# rows are the true classes and columns the predicted ones; the previous call
# had them swapped, which printed the transposed matrix.
print(confusion_matrix(y_train_true, y_pred_train))

### Test data

In [None]:
# confusion_matrix expects (y_true, y_pred). With the arguments in that order
# rows are the true classes and columns the predicted ones; the previous call
# had them swapped, which printed the transposed matrix.
print(confusion_matrix(y_test_true, y_pred_test))

### Making RNN

In [None]:
from keras.layers import Embedding
from keras.layers import SpatialDropout1D
from keras.layers import LSTM

In [None]:
# Recurrent variant: embedding -> spatial dropout -> LSTM -> dense -> softmax.
# NOTE(review): this rebinds `clf`, so after running this cell the name no
# longer refers to the dense network trained above. The Embedding layer looks
# up integer ids below 150, while its input_length is taken from the PCA
# feature matrix (floating-point values) — confirm the intended input before
# training this model.
clf = Sequential([
    Embedding(150, 128, input_length=X_train_pca.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(98, dropout=0.2, recurrent_dropout=0.2),
    Dense(100, activation='relu'),
    Dense(10, activation='softmax'),
])

clf.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Making Single Prediction

In [None]:
# Rebuild the saved network: architecture from JSON, then weights from HDF5,
# then compile with the same settings used for training.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()

model = model_from_json(loaded_model_json)
model.load_weights("model.h5")
model.compile(optimizer= 'adam', loss='categorical_crossentropy', metrics = ['accuracy'])

In [None]:
def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling fails.

    Security: unpickling can execute arbitrary code, so only load files that
    this notebook (or another trusted source) produced.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Fitted preprocessing steps and label lookups saved earlier in the notebook.
vectorizer = load_pickle('vectorizer.pkl')
scaler = load_pickle('scaler.pkl')
pca = load_pickle('pca.pkl')
label2int = load_pickle('label2int.pkl')
int2label = load_pickle('int2label.pkl')

In [None]:
# Sentence to classify, wrapped in a one-element Series so it can be passed
# to the vectorizer like the training text. The earlier version wrote the
# string to 'input.csv' and read it back with read_fwf(..., squeeze=True);
# that round trip is unnecessary, leaves a scratch file behind, and `squeeze`
# no longer exists in pandas 2.
input_single = "मेरा नाम जतिन है"
X_single = pd.Series([input_single])
X_single

In [None]:
# Run the sentence through the same pipeline as the training data:
# TF-IDF -> StandardScaler -> PCA -> network. The scaler step was missing
# here, so the PCA and the network received features on a different scale
# from the one they were fitted on.
# Each stage gets its own name so the cell can be re-run without feeding
# already-transformed data back into the vectorizer.
single_tfidf = vectorizer.transform(X_single).toarray()
single_scaled = scaler.transform(single_tfidf)
single_pca = pca.transform(single_scaled)
y_pred_single = model.predict(single_pca)

In [None]:
# Display the network's softmax output for the sentence (one value per class).
y_pred_single

In [None]:
# Class id with the highest output for each input row.
y_pred_single_val = y_pred_single.argmax(axis=1)

In [None]:
# Map the predicted class id of the (single) input row back to its language code.
int2label[y_pred_single_val[0]]