In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  This is some starter code to encourage doing the TMNIST competition

Please feel free to extend this notebook.

# TMNIST competition

The TMNIST competition is part of the Cognitive Type project <a href='http://cognitivetype.org/'>http://cognitivetype.org/</a>. It is the first in a series of datasets we will release to build AI that can generate cognitive type.

The TMNIST competition is very similar to the MNIST competition, except that there are far more classes and we are detecting designed typefaces rather than handwritten digits. We encourage people to be as creative as they wish, but this notebook will describe a very standard approach to classifying images by using a CNN.

## Add TMNIST data and import some base libraries

One can search for the TMNIST alphabet data in the Kaggle add data tab. You can check under the data tab that it appears under your /kaggle/input directory. Load the .csv file from that location and check its shape and first few rows to see if it loads correctly.

We will also add some python libraries that we are going to use.  We can add more later should we need them.

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.applications import VGG16
import warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the 94-character TMNIST table from the Kaggle input mount and preview the first rows.
tmnist_csv_path = '/kaggle/input/tmnist-alphabet-94-characters/94_character_TMNIST.csv'
df = pd.read_csv(tmnist_csv_path)
df.head()

# EDA

We successfully loaded the TMNIST data set.

In [None]:
# Overall size of the table.
n_rows, n_cols = df.shape
print('Total number of rows in datafame', n_rows)
print('Total number of columns in datafame', n_cols)

# Number of distinct target classes, followed by the classes themselves.
label_column = df['labels']
num_classes = label_column.nunique()
print('Total number of classes are ', num_classes)
print(label_column.unique())

# Number of distinct font names in the data.
num_font = df['names'].nunique()
print('The total number of different type of fonts present are -', num_font)

In [None]:
# Target is the `labels` column; features are everything except `names` and `labels`.
y = df['labels']
X = df.drop(columns=['names', 'labels'])

# Show the first 15 samples on a 3x5 grid, each titled with its label.
# Every row is reshaped to 28x28 for display.
n_preview = 15
preview_pixels = X.values[:n_preview]
preview_labels = y.iloc[:n_preview]
plt.figure(figsize=(15, 6))
for idx in range(n_preview):
    plt.subplot(3, 5, idx + 1)
    plt.title(preview_labels.iloc[idx])
    plt.imshow(preview_pixels[idx].reshape(28, 28), cmap='gray')
plt.show()

In [None]:
# Hold out 25% of the rows for testing.
# Stratifying on y keeps the class proportions the same in both splits;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [None]:
# Class distribution of the training split.
# value_counts() is ordered by frequency while unique() is ordered by first
# appearance, so pairing the two attaches bar heights to the wrong labels.
# Take both the labels and the heights from the same Series instead.
train_class_counts = y_train.value_counts()

plt.figure(figsize=(30, 10))
plt.bar(train_class_counts.index, train_class_counts.values, color='#851D2D')
plt.title('Class Distribution in train Data')
plt.xlabel('Classes')
plt.ylabel('Count')
plt.show()

In [None]:
# Class distribution of the test split.
# value_counts() is ordered by frequency while unique() is ordered by first
# appearance, so pairing the two attaches bar heights to the wrong labels.
# Take both the labels and the heights from the same Series instead.
test_class_counts = y_test.value_counts()

plt.figure(figsize=(30, 10))
plt.bar(test_class_counts.index, test_class_counts.values, color='#851D2D')
plt.title('Class Distribution in test Data')
plt.xlabel('Classes')
plt.ylabel('Count')
plt.show()

In [None]:
# Pixel intensities range over 0-255; rescale them to 0-1 as float32 for the neural networks.
# NOTE: this rebinds X_train / X_test, so running the cell a second time divides by 255 again.
PIXEL_MAX = 255.0
X_train, X_test = (
    split.astype('float32') / PIXEL_MAX for split in (X_train, X_test)
)

In [None]:
# One-hot encode the class labels so they match the softmax / categorical
# cross-entropy setup used by the models.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and 1.4
# removed the old name. Try the current keyword first and fall back to the
# legacy one, which raises TypeError on versions that no longer accept it.
try:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit on the training labels only, then reuse the learned categories for the test labels.
y_train_encoded = enc.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = enc.transform(y_test.values.reshape(-1, 1))

In [None]:
# Turn each flat pixel row into a 28x28 image array: (n_samples, 28, 28).
IMAGE_SHAPE = (28, 28)
X_train_norm = X_train.to_numpy().reshape(len(X_train), *IMAGE_SHAPE)
X_test_norm = X_test.to_numpy().reshape(len(X_test), *IMAGE_SHAPE)

In [None]:
# Baseline: with pixels scaled to 0-1, start with a plain fully connected network.
# The 28x28 image is flattened, passed through three ReLU layers of decreasing
# width, and ends in a softmax over the classes.
dense_layers = [
    Flatten(input_shape=(28, 28)),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax'),
]
model = Sequential(dense_layers)

opt = Adam()
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

In [None]:
# Train the dense baseline for 20 epochs, evaluating on the held-out split after each epoch.
model_history = model.fit(
    X_train_norm,
    y_train_encoded,
    validation_data=(X_test_norm, y_test_encoded),
    epochs=20,
    batch_size=128,
    verbose=2,
)

In [None]:
# Training curves for the dense baseline: accuracy on the left, loss on the right.
dense_curves = model_history.history
# One x position per recorded epoch, so the plot stays valid if the epoch
# count passed to fit() changes (previously hard-coded to 20).
epochs = np.arange(len(dense_curves['accuracy']))

plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.title('Accuracy vs Epochs')
plt.plot(epochs, dense_curves['accuracy'], label='train', color='#851D2D')
plt.plot(epochs, dense_curves['val_accuracy'], label='test', color='#306844')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.title('Loss vs Epochs')
plt.plot(epochs, dense_curves['loss'], label='train', color='#851D2D')
plt.plot(epochs, dense_curves['val_loss'], label='test', color='#306844')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Convolutional model: two conv + max-pool stages feeding a small dense head
# with a softmax over the classes. Input is a single-channel 28x28 image.
cnn_layers = [
    Conv2D(128, (5, 5), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax'),
]
model_2 = Sequential(cnn_layers)

opt = Adam()
model_2.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model_2.summary()

In [None]:
# Add a trailing channel axis so the arrays match the (28, 28, 1) input declared for the Conv2D layer.
X_train_norm, X_test_norm = (
    images.reshape((len(images), 28, 28, 1))
    for images in (X_train_norm, X_test_norm)
)

In [None]:
# Train the convolutional model for 20 epochs, evaluating on the held-out split after each epoch.
model_history2 = model_2.fit(
    X_train_norm,
    y_train_encoded,
    validation_data=(X_test_norm, y_test_encoded),
    epochs=20,
    batch_size=128,
    verbose=2,
)

In [None]:
# Training curves for the convolutional model: accuracy on the left, loss on the right.
cnn_curves = model_history2.history
# Build the x axis from this model's own history instead of relying on an
# `epochs` variable left behind by another cell. This keeps the cell
# runnable on its own and correct if the epoch count passed to fit() changes.
epochs = np.arange(len(cnn_curves['accuracy']))

plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.title('Accuracy vs Epochs')
plt.plot(epochs, cnn_curves['accuracy'], label='train', color='#851D2D')
plt.plot(epochs, cnn_curves['val_accuracy'], label='test', color='#306844')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.title('Loss vs Epochs')
plt.plot(epochs, cnn_curves['loss'], label='train', color='#851D2D')
plt.plot(epochs, cnn_curves['val_loss'], label='test', color='#306844')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()