## Data Preprocessing - Includes reading training images

In [1]:
import os
import sys
import pickle
import numpy as np
import pandas as pd
from PIL import Image, ImageFilter
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss, confusion_matrix
import matplotlib.pyplot as plt

# Notebook-wide configuration.
SEED = 100          # fixed seed for NumPy's global random generator
LEVEL = 'level_1'   # dataset level; used below to build the input folder paths

np.random.seed(SEED)

In [2]:
def read_all(folder_path, key_prefix=""):
    '''
    Read every file in a folder as a flattened grayscale image array.

    Parameters
    ----------
    folder_path : str
        Directory to scan; every entry returned by os.listdir is opened
        as an image.
    key_prefix : str, optional
        Text prepended to each key of the returned dictionary.

    Returns
    -------
    dict
        Maps ``key_prefix + <file name without its extension>`` to a 1-D
        NumPy array holding the pixels of the image converted to mode "L".
    '''
    print("Reading:")
    images = {}
    files = os.listdir(folder_path)
    for file_name in tqdm_notebook(files, total=len(files)):
        file_path = os.path.join(folder_path, file_name)
        # splitext removes the actual extension whatever its length; slicing
        # off four characters mangled names such as "12.jpeg".
        image_index = key_prefix + os.path.splitext(file_name)[0]
        # The context manager closes the handle returned by Image.open even
        # when the conversion raises. np.array() already copies the pixels,
        # so no explicit image copy is needed.
        with Image.open(file_path) as image:
            images[image_index] = np.array(image.convert("L")).flatten()
    return images

In [3]:
languages = ['ta', 'hi', 'en']

# Every training sub-folder sits under the same root, so build it once.
train_root = "../input/level_1_train/"+LEVEL+"/" # change the path

# Start with the background folder, then merge in one folder per language.
# The key prefix records which folder each image was read from.
images_train = read_all(train_root+"background", key_prefix='bgr_')
for language in languages:
    images_train.update(read_all(train_root+language, key_prefix=language+"_"))
print(len(images_train))

test_folder = "../input/level_1_test/kaggle_"+LEVEL # change the path
images_test = read_all(test_folder, key_prefix='')
print(len(images_test))

Reading:


HBox(children=(IntProgress(value=0, max=450), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


900
Reading:


HBox(children=(IntProgress(value=0, max=300), HTML(value='')))


300


In [4]:
# Peek at the first five keys of the test dictionary.
list(images_test)[:5]

['171', '249', '82', '163', '16']

In [5]:
# Feature matrix: one flattened image per row, in dictionary order.
X_train = np.array(list(images_train.values()))
# Labels: keys starting with "bgr_" are class 0, every other key is class 1.
Y_train = np.array([0 if key[:4] == "bgr_" else 1 for key in images_train])

# Test keys are converted to integer ids; rows of X_test follow the same order.
ID_test = [int(key) for key in images_test]
X_test = np.array(list(images_test.values()))

print(X_train.shape, Y_train.shape)
print(X_test.shape)

(900, 256) (900,)
(300, 256)


## MP Neuron Model

In [6]:
class MPNeuron:
    """Threshold unit whose only parameter is the scalar threshold ``b``.

    A sample is classified as True when the sum of its features is less
    than or equal to ``b`` (note the ``<=``: small sums fire the neuron).
    """

    def __init__(self):
        # No threshold yet: fit() selects one, or the caller may assign it.
        self.b = None

    def model(self, x):
        """Classify one sample: True when the sum of ``x`` is <= ``self.b``."""
        return (np.sum(x) <= self.b)

    def predict(self, X):
        """Classify every row of ``X``.

        Returns a NumPy array holding one ``model()`` result per row.
        """
        return np.array([self.model(x) for x in X])

    def fit(self, X, Y):
        """Select the threshold with the best accuracy on ``(X, Y)``.

        Every integer threshold from 0 to ``X.shape[1]`` inclusive is tried
        (the full range of possible row sums when the features are 0/1).
        ``self.b`` is set to the first threshold that reaches the highest
        accuracy.

        Parameters
        ----------
        X : 2-D array, one sample per row.
        Y : array of labels, one per row of ``X``.

        Returns
        -------
        dict
            Maps each tried threshold to the accuracy it achieved.
        """
        # Row sums do not depend on the candidate threshold, so compute them
        # once instead of re-summing every sample for every value of b.
        row_sums = np.sum(X, axis=1)

        accuracy = {}
        for b in range(X.shape[1] + 1):
            # accuracy_score's documented argument order is (y_true, y_pred).
            accuracy[b] = accuracy_score(Y, row_sums <= b)

        # max() returns the first key with the highest accuracy.
        self.b = max(accuracy, key=accuracy.get)
        return accuracy

## Fit the MP Neuron Model

In [9]:
# Binarise the grayscale pixels: values at or above the threshold become 1,
# everything below becomes 0.
threshold = 255
# The builtin `int` replaces the `np.int` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (where `np.int` raises AttributeError).
X_train_binarised = (X_train >= threshold).astype(int)
X_test_binarised = (X_test >= threshold).astype(int)

mpneuron = MPNeuron()
# fit() returns the accuracy obtained for every threshold it tried and
# leaves the selected threshold in mpneuron.b.
accuracy = mpneuron.fit(X_train_binarised, Y_train)
print(mpneuron.b)
print('Accuracy on train set is: {}'.format(accuracy[mpneuron.b]))

222
Accuracy on train set is: 1.0


## Sample Submission

In [None]:
# Evaluate the model over the test data. predict() returns booleans, so
# convert them to the 0/1 integers written to the submission file.
Y_pred_test = mpneuron.predict(X_test_binarised)
Y_pred_binarised_test = (Y_pred_test >= 0.5).astype("int").ravel()

# Build the frame directly; the dict order gives the columns ImageId, Class.
submission = pd.DataFrame({'ImageId': ID_test, 'Class': Y_pred_binarised_test})
submission = submission.sort_values(['ImageId'])
print(submission.head())
# File name corrected: it was previously misspelled "submisision.csv".
submission.to_csv("submission.csv", index=False)