# About the Data

Each image is stored as pixels 0 through 783 (28x28 image) in a CSV file where each row is a separate image.\
The first column in each row is the label (0-9) and the rest are the pixels (0-255 where 0 is white and 255 is black).

The pixel indices map onto the 28x28 grid row by row, as follows:

000 001 002 003 ... 026 027\
028 029 030 031 ... 054 055\
056 057 058 059 ... 082 083\
... ... ... ... ... ... ...\
728 729 730 731 ... 754 755\
756 757 758 759 ... 782 783

# Load the Data

In [294]:
import pandas as pd
import plotly.express as px
import random

In [295]:
# Load the training set: each row is one image, with a `label` column
# followed by the pixel columns pixel0 ... pixel783.
digits = pd.read_csv('train.csv')

# Make Observations

In [296]:
# Preview the first rows to check the column layout (label, then pixels).
digits.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [297]:
# Tally how many images carry each label, ordered by digit, and plot the tally
counts = digits["label"].value_counts().sort_index()
count_fig = px.bar(
    counts,
    x=counts.index,
    y=counts.values,
    labels={"index": "Digit", "y": "Count"},
    title="Number of Samples per Digit",
)
count_fig.show()

In [298]:
def cols_to_coords():
    """Map every pixel column name to its 'row,col' position in the 28x28 grid.

    Pixels are numbered row by row, so ``pixelN`` sits at grid row ``N // 28``
    and grid column ``N % 28``.

    Returns:
        dict: 784 entries such as ``{'pixel0': '0,0', ..., 'pixel783': '27,27'}``,
        in ascending pixel order.
    """
    coords = {}
    for pixel_num in range(28 * 28):
        grid_row, grid_col = divmod(pixel_num, 28)
        coords[f'pixel{pixel_num}'] = f'{grid_row},{grid_col}'
    return coords

In [299]:
# show an example of a digit (we will use 8)
# Pick the second image (iloc[1]) among those labelled 8. The result is a single
# Series whose index holds the original column names.
one_digit = digits[digits["label"] == 8].iloc[1]

# Relabel each 'pixelN' index entry as its 'row,col' grid coordinate, then turn
# the Series into a two-column DataFrame (former index + values).
colnames = cols_to_coords()
one_digit = one_digit.rename(colnames).reset_index()

# The value column's name is not known in advance here, so it is looked up by
# position and renamed together with the former index column.
col = one_digit.columns[1]
one_digit = one_digit.rename(columns={"index": "pixel", col: "brightness"})

# Split 'row,col' into two columns: 'x' receives the grid row, 'y' the grid column.
# NOTE(review): non-pixel entries such as 'label' contain no comma, so they
# presumably get no 'y' value and fall out of the pivot below -- confirm.
one_digit[['x', 'y']] = one_digit['pixel'].str.split(',', expand=True).rename(columns={0: "x", 1: "y"})
one_digit = one_digit.drop(columns=["pixel"])

# Reshape the long table into a grid: one row per 'x', one column per 'y'.
# sort=False is passed because the labels are strings; presumably this keeps
# them in order of appearance instead of sorting '10' before '2' -- confirm.
one_digit = one_digit.pivot_table(index='x', columns='y', values='brightness', sort=False)

# 'gray_r' is the reversed gray scale; it should draw 0 as light and high values
# as dark, matching the data description at the top of the notebook.
one_digit_heatmap = px.imshow(one_digit, color_continuous_scale='gray_r', title="Heatmap of One Digit")
one_digit_heatmap.show()

In [300]:
# show heatmap of average brightness of 8s (can be any digit)
# Keep only the images labelled 8.
eight = digits[digits['label'] == 8]

# Rename the pixel columns to 'row,col' grid coordinates and discard the label
# so that it is not averaged along with the pixels.
colnames = cols_to_coords()
eight = eight.rename(columns=colnames).drop('label', axis=1)

# Average every column over all selected images, then convert the resulting
# Series into a long table with one (pixel, brightness) row per column.
eight = eight.mean(axis=0).reset_index().rename(columns={'index': 'pixel', 0: 'brightness'})
# Split 'row,col' into two columns: 'x' receives the grid row, 'y' the grid column.
# NOTE(review): any column whose name has no comma would presumably get no 'y'
# value and fall out of the pivot below -- confirm.
eight[["x","y"]] = eight["pixel"].str.split(",", expand=True).rename(columns={0: "x", 1: "y"})
eight = eight.drop('pixel', axis=1)

# Reshape the long table into a grid: one row per 'x', one column per 'y'.
# sort=False is passed because the labels are strings; presumably this keeps
# them in order of appearance instead of sorting them as text -- confirm.
eight = eight.pivot_table(index='x', columns='y', values='brightness', sort=False)

# 'gray_r' is the reversed gray scale; it should draw 0 as light and high values
# as dark, matching the data description at the top of the notebook.
heatmap = px.imshow(eight, color_continuous_scale='gray_r', title='Average Brightness of 8s')
heatmap.show()


In [301]:
# Percentage of each image's 784 pixels that are non-zero. Note that "black"
# here means any non-zero brightness, not only the value 255.
# Only the pixelN columns are counted: selecting them by name (instead of
# dropping just `label`) keeps derived "percent ..." columns out of the count
# when this cell is run again, and the column-wise comparison replaces a slow
# Python-level loop over every row.
digits["percent black pixels"] = (digits.filter(regex=r"^pixel\d+$") != 0).sum(axis=1) / 784 * 100

# Distribution of that percentage for each digit.
black = px.box(
    digits,
    x='label',
    y='percent black pixels',
    labels={"label": "Digit"},
    title='Percent of Black Pixels in Each Digit',
)
black.show()

In [302]:
# Percentage of each image's 784 pixels that are exactly zero (white).
# Only the pixelN columns are counted. Dropping just `label` would also feed
# any derived "percent ..." column into the comparison, so a derived value of 0
# would be counted as an extra white pixel and push the result above 100%.
digits["percent white pixels"] = (digits.filter(regex=r"^pixel\d+$") == 0).sum(axis=1) / 784 * 100

# Distribution of that percentage for each digit.
white = px.box(
    digits,
    x='label',
    y='percent white pixels',
    labels={"label": "Digit"},
    title='Percent of White Pixels in Each Digit',
)
white.show()

In [303]:
# symmetry functions
def horizontal_symmetry(row, tolerance=0):
    """Return the percentage of left/right mirrored pixel pairs that match.

    The 784 values in ``row`` are arranged as a 28x28 grid, and every pixel in
    the left half is compared with its mirror image in the right half of the
    same grid row. A pair is counted only when both pixels are non-zero.

    Args:
        row: pandas Series holding exactly 784 pixel values, row by row.
        tolerance: largest absolute brightness difference still treated as a
            match. The default of 0 requires the two pixels to be equal.

    Returns:
        float: matching pairs as a percentage of the 392 mirrored pairs.
    """
    # Work in float so the difference below cannot wrap around for unsigned data.
    pixels = row.to_numpy(dtype=float).reshape(28, 28)
    left = pixels[:, :14]
    # Columns 27, 26, ..., 14: column j of this view mirrors column j of `left`.
    mirrored = pixels[:, :13:-1]
    matches = (abs(left - mirrored) <= tolerance) & (left != 0) & (mirrored != 0)
    return int(matches.sum()) / (784 / 2) * 100
                
        
def vertical_symmetry(row, tolerance=0):
    """Return the percentage of top/bottom mirrored pixel pairs that match.

    The 784 values in ``row`` are arranged as a 28x28 grid, and every pixel in
    the top half is compared with its mirror image in the bottom half of the
    same grid column. A pair is counted only when both pixels are non-zero.

    Args:
        row: pandas Series holding exactly 784 pixel values, row by row.
        tolerance: largest absolute brightness difference still treated as a
            match. The default of 0 requires the two pixels to be equal.

    Returns:
        float: matching pairs as a percentage of the 392 mirrored pairs.
    """
    # Work in float so the difference below cannot wrap around for unsigned data.
    pixels = row.to_numpy(dtype=float).reshape(28, 28)
    top = pixels[:14, :]
    # Rows 27, 26, ..., 14: row i of this view mirrors row i of `top`.
    mirrored = pixels[:13:-1, :]
    matches = (abs(top - mirrored) <= tolerance) & (top != 0) & (mirrored != 0)
    return int(matches.sum()) / (784 / 2) * 100
        

In [304]:
# TODO: horizontal symmetry --> loosen up the exact match requirement
# Score every image's left/right mirror symmetry. horizontal_symmetry reshapes
# each row to 28x28 and so needs exactly the 784 pixel values; selecting the
# pixelN columns by name keeps this cell runnable no matter which derived
# "percent ..." columns already exist in `digits`.
digits["percent horizontal symmetry"] = digits.filter(regex=r"^pixel\d+$").apply(horizontal_symmetry, axis=1)

# The figure gets its own name: binding it to `horizontal_symmetry` would
# replace the scoring function and make this cell fail when run a second time.
horizontal_symmetry_fig = px.box(
    digits,
    x='label',
    y='percent horizontal symmetry',
    labels={"label": "Digit"},
    title='Percent of Horizontal Symmetry in Each Digit',
)
horizontal_symmetry_fig.show()

In [305]:
# TODO: vertical symmetry --> loosen up the exact match requirement
# Score every image's top/bottom mirror symmetry. vertical_symmetry reshapes
# each row to 28x28 and so needs exactly the 784 pixel values; selecting the
# pixelN columns by name keeps this cell runnable no matter which derived
# "percent ..." columns already exist in `digits`.
digits["percent vertical symmetry"] = digits.filter(regex=r"^pixel\d+$").apply(vertical_symmetry, axis=1)

# The figure gets its own name: binding it to `vertical_symmetry` would
# replace the scoring function and make this cell fail when run a second time.
vertical_symmetry_fig = px.box(
    digits,
    x='label',
    y='percent vertical symmetry',
    labels={"label": "Digit"},
    title='Percent of Vertical Symmetry in Each Digit',
)
vertical_symmetry_fig.show()

In [306]:
# Summary statistics for every column, including the derived percentage features.
digits.describe()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,percent black pixels,percent white pixels,percent horizontal symmetry,percent vertical symmetry
count,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,...,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0
mean,4.456643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017238,0.002857,0.0,0.0,0.0,0.0,19.152964,80.847036,1.991484,1.745347
std,2.88773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.894498,0.414264,0.0,0.0,0.0,0.0,5.298041,5.298041,2.149293,1.982542
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.336735,55.229592,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,15.561224,77.295918,0.510204,0.510204
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,19.132653,80.867347,1.27551,1.020408
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,22.704082,84.438776,2.806122,2.295918
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,62.0,0.0,0.0,0.0,0.0,44.770408,95.663265,26.27551,25.255102


In [307]:
def add_random_noise(val):
    """Return ``val`` shifted upward by a random jitter drawn from [0, 0.25).

    Used to spread out identical values so overlapping scatter points
    become distinguishable.
    """
    jitter = random.random() * 0.25
    return val + jitter

In [312]:
# TODO: VS vs HS for easily confused digits
# Keep only the 1s and 7s.
one_and_seven = digits[digits['label'].isin([1, 7])]

# Jitter the vertical-symmetry values so identical scores do not overlap.
# (Jittering the horizontal-symmetry axis as well was tried and is disabled.)
pct_vert_sym = one_and_seven['percent vertical symmetry'].apply(add_random_noise)
one_and_seven = one_and_seven.assign(**{"noisy percent vertical symmetry": pct_vert_sym})

one_seven_fig = px.scatter(
    one_and_seven,
    x='noisy percent vertical symmetry',
    y='percent horizontal symmetry',
    range_x=(0, 8),
    range_y=(0, 12),
    color='label',
    title='Vertical Symmetry vs Horizontal Symmetry for 1s and 7s',
)
one_seven_fig.show()

In [317]:
# TODO: VS vs Black Pixels for easily confused digits
# Keep only the 6s and 9s.
six_and_nine = digits[digits['label'].isin([6, 9])]

# Jitter the vertical-symmetry values so identical scores do not overlap.
# (Jittering the black-pixel axis as well was tried and is disabled.)
pct_vert_sym = six_and_nine['percent vertical symmetry'].apply(add_random_noise)
six_and_nine = six_and_nine.assign(**{"noisy percent vertical symmetry": pct_vert_sym})

six_nine_fig = px.scatter(
    six_and_nine,
    x='noisy percent vertical symmetry',
    y='percent black pixels',
    range_x=(0, 10),
    color='label',
    title='Vertical Symmetry vs Black Pixels for 6s and 9s',
)
six_nine_fig.show()

# Our Model

In [310]:
# TODO: do this after data exploration presentation