# About the Data

Each image is stored as 784 pixel values (pixels 0 through 783 of a 28x28 image) in a CSV file, where each row is a separate image.\
The first column in each row is the label (0-9) and the remaining columns are the pixel values (0-255, where 0 is white and 255 is black).

Pixels are numbered row by row, from left to right and top to bottom, so each image takes the following format:

000 001 002 003 ... 026 027\
028 029 030 031 ... 054 055\
056 057 058 059 ... 082 083\
... ... ... ... ... ... ...\
728 729 730 731 ... 754 755\
756 757 758 759 ... 782 783

# Load the Data

In [184]:
import pandas as pd
import plotly.express as px

In [185]:
# Read the training set into a DataFrame; the relative path is resolved
# against the notebook's working directory.
digits = pd.read_csv(filepath_or_buffer='train.csv')

# Make Observations

In [186]:
# Find the number of samples per digit
counts = digits["label"].value_counts().sort_index()

# Plot from a frame with explicitly named columns so the axis titles do not
# depend on how pandas names the value_counts() index/values or on plotly's
# fallback "x"/"y" column names.
count_df = counts.rename_axis("Digit").reset_index(name="Count")
count_fig = px.bar(count_df, x="Digit", y="Count")
count_fig.show()

In [187]:
def cols_to_coords(n_rows=28, n_cols=28):
    """Map flat pixel column names to "row,col" coordinate strings.

    Pixels are numbered in row-major order, so ``pixel{k}`` maps to
    ``"{k // n_cols},{k % n_cols}"``.

    Args:
        n_rows: Number of image rows (defaults to 28).
        n_cols: Number of image columns (defaults to 28).

    Returns:
        A dict with ``n_rows * n_cols`` entries, ordered from ``pixel0``
        upwards, e.g. ``{"pixel0": "0,0", "pixel1": "0,1", ...}``.
    """
    return {
        f'pixel{i * n_cols + j}': f'{i},{j}'
        for i in range(n_rows)
        for j in range(n_cols)
    }

In [188]:
# show an example of a digit (we will use 8)
colnames = cols_to_coords()
pixel_cols = list(colnames)  # "pixel0" ... "pixel783", in row-major order

# Take the second 8 in the dataset, restricted to the pixel columns so the
# label (and any feature columns added to `digits`) never reaches the image.
one_digit = digits.loc[digits["label"] == 8, pixel_cols].iloc[1]

# Pixels are stored row by row, so a row-major reshape rebuilds the 28x28
# image directly. The index is named 'x' (image row) and the columns 'y'
# (image column).
one_digit = pd.DataFrame(
    one_digit.to_numpy().reshape(28, 28),
    index=pd.RangeIndex(28, name='x'),
    columns=pd.RangeIndex(28, name='y'),
)

one_digit_heatmap = px.imshow(one_digit, color_continuous_scale='gray_r', title="Heatmap of One Digit")
one_digit_heatmap.show()

In [189]:
# show heatmap of average brightness of 8s (can be any digit)
colnames = cols_to_coords()
pixel_cols = list(colnames)  # "pixel0" ... "pixel783", in row-major order

# Keep only the pixel columns of the 8s so the label (and any feature columns
# added to `digits`) is never averaged into the image.
eight = digits.loc[digits['label'] == 8, pixel_cols]

# Average each pixel over all 8s, then rebuild the 28x28 image with a
# row-major reshape. The index is named 'x' (image row) and the columns 'y'
# (image column).
eight = pd.DataFrame(
    eight.mean(axis=0).to_numpy().reshape(28, 28),
    index=pd.RangeIndex(28, name='x'),
    columns=pd.RangeIndex(28, name='y'),
)

heatmap = px.imshow(eight, color_continuous_scale='gray_r', title='Average Brightness of 8s')
heatmap.show()


In [190]:
# Percentage of each image's pixels that hold any ink (a non-zero value).
# The pixel columns are selected by name, so re-running this cell after
# feature columns have been added to `digits` still counts image pixels only.
pixels = digits.filter(regex=r"^pixel\d+$")
digits["percent black pixels"] = (pixels != 0).sum(axis=1) / pixels.shape[1] * 100

black = px.box(digits, x='label', y='percent black pixels', title='Percent of Black Pixels in Each Digit')
black.show()

In [191]:
# symmetry functions
def horizontal_symmetry(row, tolerance=0):
    """Return the share of a 28x28 image made up of left/right mirrored pairs.

    The 784 values in ``row`` are reshaped row-major into a 28x28 matrix and
    every pixel in the left half is compared with its mirror image in the
    right half (column ``j`` against column ``27 - j``). A pair counts only
    when both pixels are non-zero and their values differ by at most
    ``tolerance``.

    Args:
        row: pandas Series holding exactly 784 pixel values.
        tolerance: Largest allowed absolute difference between two mirrored
            pixels (non-negative). The default of 0 requires an exact match.

    Returns:
        Number of matching pairs divided by 784, times 100. There are only
        392 pairs, so the result ranges from 0 to 50.
    """
    size = 28
    half = size // 2
    # Work in float so the subtraction below cannot wrap around if the
    # pixels arrive in an unsigned integer dtype.
    matrix = row.to_numpy().reshape(size, size).astype(float)

    left = matrix[:, :half]
    # Flip the columns so that position j holds original column 27 - j.
    mirrored_right = matrix[:, ::-1][:, :half]

    matches = (
        (abs(left - mirrored_right) <= tolerance)
        & (left != 0)
        & (mirrored_right != 0)
    )
    return int(matches.sum()) / (size * size) * 100
                
        
def vertical_symmetry(row, tolerance=0):
    """Return the share of a 28x28 image made up of top/bottom mirrored pairs.

    The 784 values in ``row`` are reshaped row-major into a 28x28 matrix and
    every pixel in the top half is compared with its mirror image in the
    bottom half (row ``i`` against row ``27 - i``). A pair counts only when
    both pixels are non-zero and their values differ by at most
    ``tolerance``.

    Args:
        row: pandas Series holding exactly 784 pixel values.
        tolerance: Largest allowed absolute difference between two mirrored
            pixels (non-negative). The default of 0 requires an exact match.

    Returns:
        Number of matching pairs divided by 784, times 100. There are only
        392 pairs, so the result ranges from 0 to 50.
    """
    size = 28
    half = size // 2
    # Work in float so the subtraction below cannot wrap around if the
    # pixels arrive in an unsigned integer dtype.
    matrix = row.to_numpy().reshape(size, size).astype(float)

    top = matrix[:half, :]
    # Flip the rows so that position i holds original row 27 - i.
    mirrored_bottom = matrix[::-1, :][:half, :]

    matches = (
        (abs(top - mirrored_bottom) <= tolerance)
        & (top != 0)
        & (mirrored_bottom != 0)
    )
    return int(matches.sum()) / (size * size) * 100
        

In [192]:
# TODO: horizontal symmetry --> loosen up the exact match requirement
# Select the pixel columns by name instead of dropping a hard-coded list of
# other columns: the function needs exactly 784 values per row, and the list
# goes stale as soon as another feature column is added to `digits`.
pixels = digits.filter(regex=r"^pixel\d+$")
digits["percent horizontal symmetry"] = pixels.apply(horizontal_symmetry, axis=1)

# Keep the figure in its own variable. Binding it to `horizontal_symmetry`
# would replace the function and make this cell fail when run a second time.
horizontal_symmetry_fig = px.box(digits, x='label', y='percent horizontal symmetry', title='Percent of Horizontal Symmetry in Each Digit')
horizontal_symmetry_fig.show()

In [193]:
# TODO: vertical symmetry --> loosen up the exact match requirement
# Select the pixel columns by name instead of dropping a hard-coded list of
# other columns: the function needs exactly 784 values per row, and the list
# goes stale as soon as another feature column is added to `digits`.
pixels = digits.filter(regex=r"^pixel\d+$")
digits["percent vertical symmetry"] = pixels.apply(vertical_symmetry, axis=1)

# Keep the figure in its own variable. Binding it to `vertical_symmetry`
# would replace the function and make this cell fail when run a second time.
vertical_symmetry_fig = px.box(digits, x='label', y='percent vertical symmetry', title='Percent of Vertical Symmetry in Each Digit')
vertical_symmetry_fig.show()

# Our Model

In [194]:
# TODO: do this after data exploration presentation