# Hindi Letter Recognition system

## IMPORTING REQUIRED MODULES

In [None]:
import numpy as np
from PIL import Image
import cv2
import glob
import csv
import os
import pandas as pd
from sklearn.utils import shuffle
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn.manifold import TSNE

from sklearn.model_selection import train_test_split
import joblib
from sklearn.svm import SVC
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

## CONVERTING IMAGE DATA TO CSV OF ARRAY OF PIXELS

In [None]:
# Build pixel_data.csv: one row per image, holding the class label followed by
# the pixels of the blurred, down-scaled grayscale image in row-major order.
IMG_DIR = 'MainData/Data'
IMG_SIZE = (32, 32)  # (width, height) handed to cv2.resize

# Header width is derived from IMG_SIZE so the two cannot drift apart.
header = ["label"] + [f"pxl_{i}" for i in range(IMG_SIZE[0] * IMG_SIZE[1])]

# Open the output once and stream every row into it, instead of re-opening the
# file in append mode for each image. UTF-8 matches pandas' default when the
# file is read back, which matters if folder names are non-ASCII.
with open("pixel_data.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)

    # Every entry of IMG_DIR is used as a class label. Sorting makes the row
    # order deterministic; os.listdir and glob return entries in arbitrary order.
    for label in sorted(os.listdir(IMG_DIR)):
        for img_path in sorted(glob.glob(f"{IMG_DIR}/{label}/*.png")):
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None
                # rather than raising; skip it instead of crashing in cvtColor.
                print(f"Skipping unreadable image: {img_path}")
                continue

            img_grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img_grey = cv2.GaussianBlur(img_grey, (15, 15), 0)
            roi = cv2.resize(img_grey, IMG_SIZE, interpolation=cv2.INTER_AREA)

            # ravel() walks the array row by row, i.e. the same order as the
            # nested row/column loop it replaces.
            writer.writerow([label, *roi.ravel().tolist()])


## READING THE pixel_data.csv FILE

In [None]:
# Load the exported pixel table; the first column is "label", the rest are pixels.
data = pd.read_csv(filepath_or_buffer="pixel_data.csv")

## TAKING 500 IMAGES FOR EACH CHARACTER

In [None]:
# Keep at most the first 500 rows of every label (in file order) so that no
# class dominates, then shuffle the rows.
grouped_data = data.groupby("label")
data = grouped_data.head(500)
# Seeded so that re-running the notebook yields the same row order.
data = shuffle(data, random_state=42)

## SEPARATING FEATURES AND LABELS FROM PIXEL DATA

In [None]:
# Target column on its own; every remaining column is a pixel feature.
labels = data["label"]
features = data.drop(columns=["label"])

## SPLITTING THE DATA INTO TEST AND TRAIN

In [None]:
# Hold out 20% of the rows for testing.
# stratify keeps the per-label proportions identical in both partitions, so no
# character ends up under-represented (or absent) in the test set.
# random_state pins the split for a given input row order.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

## ENCODING TEST AND TRAIN LABELS

In [None]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both partitions.
lb = LabelEncoder().fit(train_y)
train_y_encoded = lb.transform(train_y)
test_y_encoded = lb.transform(test_y)

#### Mapping the encoded values to their actual names

In [None]:
# Display glyph for each integer class code (0-9).
# NOTE(review): this assumes the encoder's class order matches the listing
# below — compare against lb.classes_ before trusting per-class reports.
# NOTE(review): index 5 is "ड"; the letter that follows "घ" in the alphabet is
# "ङ", so this may be a typo — confirm against the dataset folders.
classes = dict(
    enumerate(
        [
            "क्ष",
            "त्र",
            "ज्ञ",
            "ग",
            "घ",
            "ड",
            "च",
            "छ",
            "ज",
            "झ",
        ]
    )
)

## SCALING TRAIN AND TEST DATA

In [None]:
# Standardise every pixel column using statistics learned from the training
# partition only; the test partition is transformed with those same statistics.
scaler = StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
test_x_scaled = scaler.transform(test_x)

# Re-wrap the scaled arrays as DataFrames carrying the original pixel column names.
train_x_scaled_df = pd.DataFrame(data=train_x_scaled, columns=features.columns)
test_x_scaled_df = pd.DataFrame(data=test_x_scaled, columns=features.columns)

## USING PCA (PRINCIPAL COMPONENT ANALYSIS) FOR DIMENSIONALITY REDUCTION ON TRAIN AND TEST

In [None]:
# Project the scaled pixels onto n_comp principal components. The projection is
# fitted on the training partition and re-used unchanged for the test partition.
n_comp = 85
pca_model = decomposition.PCA(n_components=n_comp)
pca_data_train = pca_model.fit_transform(train_x_scaled_df)
pca_data_test = pca_model.transform(test_x_scaled_df)

## CREATING THE REQUIRED PANDAS DATAFRAMES FROM THE NUMPY ARRAYS

In [None]:
# Column names f1 .. f<n_comp>, one per principal component.
lst = [f"f{idx}" for idx in range(1, n_comp + 1)]
pca_df_train = pd.DataFrame(pca_data_train, columns=tuple(lst))
pca_df_test = pd.DataFrame(pca_data_test, columns=tuple(lst))

# TSNE AFTER PCA

In [None]:
import inspect

# Hyper Parameters
dimensions = 3      # size of the t-SNE embedding
perplexity = 5
n_iter = 5000       # optimisation iterations

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter`, and the old
# name was removed in a later release. Pick whichever keyword the installed
# version accepts so the cell runs on both old and new releases.
_tsne_iter_kwarg = (
    "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"
)

tsne_model = TSNE(
    n_components=dimensions,
    perplexity=perplexity,
    random_state=42,
    n_jobs=-1,
    **{_tsne_iter_kwarg: n_iter},
)

In [None]:
# t-SNE has no transform() for unseen points. Calling fit_transform separately
# on train and test produces two unrelated embeddings, so a model trained on one
# cannot be evaluated on the other. Embed both partitions in a single run and
# split the result afterwards. Labels are not used here, but the test points do
# take part in shaping the (unsupervised) embedding.
_n_train_rows = len(pca_df_train)
_tsne_data_all = tsne_model.fit_transform(
    pd.concat([pca_df_train, pca_df_test], ignore_index=True)
)
tsne_data_train = _tsne_data_all[:_n_train_rows]
tsne_data_test = _tsne_data_all[_n_train_rows:]

# Column names f1..fk follow the embedding width instead of being fixed at three.
_tsne_columns = [f"f{idx}" for idx in range(1, _tsne_data_all.shape[1] + 1)]
tsne_df_train = pd.DataFrame(data=tsne_data_train, columns=_tsne_columns)
tsne_df_test = pd.DataFrame(data=tsne_data_test, columns=_tsne_columns)

## CHECKING THE ACCURACY OF THE NN TRAINED MODEL ON TEST DATA

In [None]:
# Score the network on the t-SNE features and tabulate train vs. test results.
# NOTE(review): classifier_nn is not defined before this cell, and the only
# definition visible in this notebook is fitted on the PCA features rather than
# the t-SNE ones — confirm which model this cell is meant to evaluate.
h = ["loss", "accuracy"]
train_eval = classifier_nn.evaluate(tsne_df_train, train_y_encoded, verbose=0)
test_eval = classifier_nn.evaluate(tsne_df_test, test_y_encoded, verbose=0)
l = [train_eval, test_eval]

# One row per partition; the "Data" column is placed first to name the row.
eval_df = pd.DataFrame(data=l, columns=h)
eval_df.insert(0, "Data", ["Train", "Test"])

# Row 1 is the test partition.
acc_pca_tSNE_nn = round(eval_df.loc[1, "accuracy"], 3)
eval_df

## Training Model using pca

### Using SVC

In [None]:
# Linear-kernel support vector classifier fitted on the PCA features.
classifier = SVC(random_state=6, kernel="linear")
classifier.fit(X=pca_df_train, y=train_y_encoded)

In [None]:
# Checking the accuracy of SVC model
prediction = classifier.predict(pca_df_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped classification_report would show
# precision and recall exchanged and report support for the predictions.
accuracy = metrics.accuracy_score(test_y_encoded, prediction)
acc_pca_svc = accuracy

# target_names is passed as a plain list rather than a dict view.
print(
    metrics.classification_report(
        test_y_encoded,
        prediction,
        target_names=list(classes.values()),
    )
)
print(f"PCA-SVM accuracy: {accuracy}")

In [None]:
## Training model with neural network
# Two dense layers: a 128-unit ReLU hidden layer followed by a 10-way softmax.
classifier_nn = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
)

# Sparse categorical cross-entropy takes the integer-encoded labels directly.
classifier_nn.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Fit on the PCA features for 20 epochs, keeping the training history.
history = classifier_nn.fit(pca_df_train, train_y_encoded, epochs=20)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPooling2D, GlobalMaxPool1D, Embedding, Activation, Flatten,Input
from keras.layers import Conv2D

In [None]:
# Run the CNN model on the PCA-reduced test features.
# NOTE(review): CNN_model is not defined in any visible cell of this notebook —
# confirm where it is built and that it accepts the PCA feature array as input.
CNN_model.predict(pca_data_test)

In [None]:
# Score the network on the PCA features and tabulate train vs. test results.
# evaluate() yields one value per column in h, since the model is compiled with
# a loss and the single "accuracy" metric.
h = ["loss", "accuracy"]
train_eval = classifier_nn.evaluate(pca_df_train, train_y_encoded, verbose=0)
test_eval = classifier_nn.evaluate(pca_df_test, test_y_encoded, verbose=0)
l = [train_eval, test_eval]

# One row per partition; the "Data" column is placed first to name the row.
eval_df = pd.DataFrame(data=l, columns=h)
eval_df.insert(0, "Data", ["Train", "Test"])

# Row 1 is the test partition.
acc_pca_nn = round(eval_df.loc[1, "accuracy"], 3)
eval_df