# **EDA and Pre Processing to simplify further Analysis and Simulations**

### Steps:

+ Split each equation into separate columns
+ Count the amount of unique elements per equation
+ Count how many times does an element appear in each equation
+ Count the minimum and maximum times an element can appear in all equations
+ Count how many times does each element appear in each position across all equations
+ Correlate elements
+ Correlate elements across positions


# TODO:
+ Commutative equations analysis

In [1]:
import plotly.graph_objects as go
import plotly.subplots as sp
from tqdm import tqdm
import pandas as pd
import numpy as np


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 10)

## Defining constants

In [2]:
NUMBER_OF_ELEMENTS = 8
ELEMENTS = list("1234568790+-*/=")

# Loading the data

*(And setting the index)*

In [3]:
df_raw = pd.read_csv("./data/0.raw/equations_nerdle.csv")
df_raw.index = df_raw["equation"]

df_raw

Unnamed: 0_level_0,equation
equation,Unnamed: 1_level_1
12*9=108,12*9=108
13*8=104,13*8=104
13*9=117,13*9=117
14*8=112,14*8=112
14*9=126,14*9=126
...,...
9/45*5=1,9/45*5=1
9/54*6=1,9/54*6=1
9/63*7=1,9/63*7=1
9/72*8=1,9/72*8=1


### Spliting the equations

In [4]:
df_positions = pd.DataFrame()

# Extract each position
for i in range(NUMBER_OF_ELEMENTS):
    df_positions[f"p{i}"] = df_raw["equation"].str[i].astype("category")


df_positions

Unnamed: 0_level_0,p0,p1,p2,p3,p4,p5,p6,p7
equation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12*9=108,1,2,*,9,=,1,0,8
13*8=104,1,3,*,8,=,1,0,4
13*9=117,1,3,*,9,=,1,1,7
14*8=112,1,4,*,8,=,1,1,2
14*9=126,1,4,*,9,=,1,2,6
...,...,...,...,...,...,...,...,...
9/45*5=1,9,/,4,5,*,5,=,1
9/54*6=1,9,/,5,4,*,6,=,1
9/63*7=1,9,/,6,3,*,7,=,1
9/72*8=1,9,/,7,2,*,8,=,1


### Counting the number of unique elements per equation

In [5]:
df_unique = pd.DataFrame()

# Compute the number of unique elements
df_unique["count"] = df_raw["equation"].apply(lambda x: len(set(x))).astype("uint8")


df_unique

Unnamed: 0_level_0,count
equation,Unnamed: 1_level_1
12*9=108,7
13*8=104,7
13*9=117,6
14*8=112,6
14*9=126,7
...,...
9/45*5=1,7
9/54*6=1,8
9/63*7=1,8
9/72*8=1,8


Checking the distribution of equations by number of unique elements

In [6]:
# Counting unique elements distribution
df_temp = df_unique["count"].value_counts()

# Creating figure
fig = go.Figure()

# Adding trace
fig.add_trace(
    go.Pie(
        labels=df_temp.index,
        values=df_temp,
    )
)

# Updating the layout
fig.update_layout(
    title="Unique elements distribution",
    height=600,
    width=600,
)

# Showing the figure
fig.show()

Checking the mean element *"uniqueness"*

In [7]:
df_unique["count"].mean()

6.815945381707386

### Counting the amount of each element in each equation

In [8]:
df_elements = pd.DataFrame()

# Compute the element count in each equation
for element in ELEMENTS:
    df_elements[element] = df_raw["equation"].apply(lambda x: x.count(element)).astype("uint8")


df_elements

Unnamed: 0_level_0,1,2,3,4,5,6,8,7,9,0,+,-,*,/,=
equation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
12*9=108,2,1,0,0,0,0,1,0,1,1,0,0,1,0,1
13*8=104,2,0,1,1,0,0,1,0,0,1,0,0,1,0,1
13*9=117,3,0,1,0,0,0,0,1,1,0,0,0,1,0,1
14*8=112,3,1,0,1,0,0,1,0,0,0,0,0,1,0,1
14*9=126,2,1,0,1,0,1,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9/45*5=1,1,0,0,1,2,0,0,0,1,0,0,0,1,1,1
9/54*6=1,1,0,0,1,1,1,0,0,1,0,0,0,1,1,1
9/63*7=1,1,0,1,0,0,1,0,1,1,0,0,0,1,1,1
9/72*8=1,1,1,0,0,0,0,1,1,1,0,0,0,1,1,1


## Joining all DataFrames

In [9]:
# Concatenating the DataFrames into a multi-indexed single one
df = pd.concat([df_positions, df_unique, df_elements], axis=1, keys=['positions', 'unique', 'elements'])

df.head()

Unnamed: 0_level_0,positions,positions,positions,positions,positions,positions,positions,positions,unique,elements,elements,elements,elements,elements,elements,elements,elements,elements,elements,elements,elements,elements,elements,elements
Unnamed: 0_level_1,p0,p1,p2,p3,p4,p5,p6,p7,count,1,2,3,4,5,6,8,7,9,0,+,-,*,/,=
equation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
12*9=108,1,2,*,9,=,1,0,8,7,2,1,0,0,0,0,1,0,1,1,0,0,1,0,1
13*8=104,1,3,*,8,=,1,0,4,7,2,0,1,1,0,0,1,0,0,1,0,0,1,0,1
13*9=117,1,3,*,9,=,1,1,7,6,3,0,1,0,0,0,0,1,1,0,0,0,1,0,1
14*8=112,1,4,*,8,=,1,1,2,6,3,1,0,1,0,0,1,0,0,0,0,0,1,0,1
14*9=126,1,4,*,9,=,1,2,6,7,2,1,0,1,0,1,0,0,1,0,0,0,1,0,1


# Counting element occurances

In [10]:
df_elements_occurances = pd.DataFrame()

# Counting the values of each element
for element in ELEMENTS:
    df_elements_occurances = pd.concat([df_elements_occurances, df["elements"][element].value_counts()], axis=1)

# Transposing the DataFrame
df_elements_occurances = df_elements_occurances.transpose()

# Fillin the missing values and converting "dtypes"
df_elements_occurances = df_elements_occurances.fillna(0)
df_elements_occurances = df_elements_occurances.astype("uint16")


df_elements_occurances

Unnamed: 0,1,0,2,3,4
1,7526,6955,2830,400,12
2,7069,8449,1922,275,8
3,6478,9498,1534,201,12
4,6522,9355,1593,239,14
5,5961,10045,1451,253,13
...,...,...,...,...,...
+,6194,10884,645,0,0
-,7120,9913,690,0,0
*,5200,12149,374,0,0
/,3650,13699,374,0,0


In [11]:
# Plotting bars

fig = go.Figure()

for i in range(min(*df_elements_occurances.columns), max(*df_elements_occurances.columns) + 1):
    fig.add_trace(go.Bar(x=df_elements_occurances.index, y=df_elements_occurances[i], name=i))

fig.update_layout(
    title="Elements occurances",
    barmode='stack',
    height=500,
    width=1000,
)

fig.show()

# Counting the 'min' and 'max' times an element can be found in an equation

In [12]:
df_elements_minmax = pd.DataFrame()

# Computing the minimum and maximum number of occurances of each element
for element in ELEMENTS:
    df_elements_minmax.loc["min", element]= df[("elements", element)].min()
    df_elements_minmax.loc["max", element]= df[("elements", element)].max()

# Converting 'dtypes'
df_elements_max = df_elements_minmax.astype("uint8")


df_elements_max.style.background_gradient(axis=1, cmap="plasma")

Unnamed: 0,1,2,3,4,5,6,8,7,9,0,+,-,*,/,=
min,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
max,4,4,4,4,4,4,4,4,4,3,2,2,2,2,1


# Counting element frequency

In [13]:
# Computing element frequency
df_freq_count = df["positions"].apply(pd.Series.value_counts)

# Computing the "sum"
df_freq_count["sum"] = df_freq_count.sum(axis=1)

# Inserting missing elements
for element in ELEMENTS:
    if element not in df_freq_count.index:
        df_freq_count.loc[element] = 0

# Computing the "total"
totals = []
for element in df_freq_count.index:
    totals.append(df["elements"][element].value_counts().drop(0, errors="ignore").sum())

df_freq_count["total"] = totals

# Filling the missing values, converting the "dtypes" and sorting
df_freq_count = df_freq_count.fillna(0)
df_freq_count = df_freq_count.astype("uint16")
df_freq_count = df_freq_count.sort_index()


df_freq_count

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,sum,total
*,0,2978,774,1256,940,0,0,0,5948,5574
+,0,2002,3405,1606,471,0,0,0,7484,6839
-,0,649,4829,1323,1699,0,0,0,8500,7810
/,0,414,1354,1527,1103,0,0,0,4398,4024
0,0,1480,258,252,720,229,214,1830,4983,4464
...,...,...,...,...,...,...,...,...,...,...
6,1733,1194,802,1140,1192,722,1055,1844,9682,7992
7,1683,1011,674,860,1159,655,1003,1657,8702,7274
8,1708,1159,844,743,1152,695,976,1881,9158,7588
9,1800,964,776,537,1185,646,1024,1718,8650,7198


In [14]:
# Ploting Heatmap

# Sorting data
df_freq_count = df_freq_count.sort_index()

# Creating figure
fig = sp.make_subplots(
    rows=1, 
    cols=2,
    column_widths=[0.85, 0.15],
)

# Creating "Positions" traces
fig.add_trace(
    go.Heatmap(
        x=[f"p{i}" for i in range(NUMBER_OF_ELEMENTS)],
        y=df_freq_count.index,
        z=df_freq_count / len(df),
        coloraxis=f"coloraxis1",
        name="Positions",
    ),
    row=1,
    col=1,
)

# Creating "Total Unique" trace
fig.add_trace(
    go.Heatmap(
        x=["Total"],
        y=df_freq_count.index,
        z=df_freq_count[["total"]] / len(df),
        coloraxis=f"coloraxis2",
        name="Total",
    ),
    row=1,
    col=2,
)

# Updating layout
fig.update_layout(
    title="Element Frequency",
    height=600,
    width=600,
    coloraxis1=dict(
        showscale=False,
        colorscale="Portland",
    ),
    coloraxis2=dict(
        showscale=False,
        colorscale="Portland",
    ),
)

# Showing figure
fig.show()

In [15]:
# Ploting Bars

# Creating figure
fig = sp.make_subplots(
    rows=NUMBER_OF_ELEMENTS + 1, 
    cols=1,
)

# Creating a trace for each position
for i, position in enumerate(df["positions"].columns):

    # Sorting the data
    df_temp = df_freq_count.sort_values(by=position, ascending=False)

    # Creating the trace
    fig.add_trace(
        go.Bar(
            x=df_temp.index,
            y=df_temp[position] / len(df),
            name=position,
        ),
        row=i+1,
        col=1,
    )

# Creating a trace for the "Total"
df_freq_count = df_freq_count.sort_values(by="total", ascending=False)

fig.add_trace(
    go.Bar(
        x=df_freq_count.index,
        y=df_freq_count["total"] / len(df),
        name="Total",
    ),
    row=NUMBER_OF_ELEMENTS + 1,
    col=1,
)

titles_positions = {f"yaxis{i + 1}_title": position for i, position in enumerate(df["positions"].columns)}
title_total = {f"yaxis{NUMBER_OF_ELEMENTS + 1}_title": "Total"}

# Updating the layout
fig.update_layout(
    title="Element frequency",
    height=1000,
    width=1000,
    showlegend=False,
    **titles_positions,
)

fig.update_layout(
    **title_total,
)

# Showing the figure
fig.show()

# Correlating elements

In [16]:
df_elements_corr = df["elements"].corr()
df_elements_corr

Unnamed: 0,1,2,3,4,5,6,8,7,9,0,+,-,*,/,=
1,1.000000,-0.195539,-0.148369,-0.171905,-0.134690,-0.133678,-0.129248,-0.107230,-0.118694,-0.096194,0.131378,0.087631,-0.090774,-0.041223,
2,-0.195539,1.000000,-0.144566,-0.078516,-0.123496,-0.081092,-0.092887,-0.080238,-0.134354,-0.076497,-0.039783,-0.063548,0.007000,0.036211,
3,-0.148369,-0.144566,1.000000,-0.123292,-0.055652,-0.077206,-0.119792,-0.081789,-0.086170,-0.088936,-0.021826,-0.032351,-0.019215,-0.048761,
4,-0.171905,-0.078516,-0.123292,1.000000,-0.139911,-0.092684,-0.059329,-0.111644,-0.117086,-0.063442,-0.056458,-0.055092,0.005434,0.017828,
5,-0.134690,-0.123496,-0.055652,-0.139911,1.000000,-0.158886,-0.138367,-0.089452,-0.087853,-0.001207,-0.029890,-0.030587,-0.021052,-0.029951,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
+,0.131378,-0.039783,-0.021826,-0.056458,-0.029890,-0.046002,-0.026780,0.000979,0.028822,-0.036919,1.000000,-0.444895,-0.235123,-0.282373,
-,0.087631,-0.063548,-0.032351,-0.055092,-0.030587,-0.044087,-0.032546,-0.002501,0.015119,0.004749,-0.444895,1.000000,-0.211522,-0.230330,
*,-0.090774,0.007000,-0.019215,0.005434,-0.021052,-0.001189,-0.003368,-0.034259,-0.056877,-0.090972,-0.235123,-0.211522,1.000000,-0.127370,
/,-0.041223,0.036211,-0.048761,0.017828,-0.029951,0.008409,0.014044,-0.062435,-0.067988,-0.006451,-0.282373,-0.230330,-0.127370,1.000000,


In [17]:
# Ploting Heatmap

# Masking the diagonal
mask = np.triu(np.ones_like(df_elements_corr, dtype=bool))
df_temp = df_elements_corr.mask(mask)

# Creating figure
fig = go.Figure()

# Creating trace
fig.add_trace(
    go.Heatmap(
        x=df_temp.columns,
        y=df_temp.columns,
        z=df_temp,
        colorscale="Portland",
    )
)

# Updating layout
fig.update_layout(
    title="Elements correlation",
    height=600,
    width=600,
)

# Showing figure
fig.show()

# Correlating positions

In [18]:
df_positions_corr = df_freq_count[df["positions"].columns].corr()
df_positions_corr

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7
p0,1.0,-0.079213,-0.349387,0.404656,0.494965,-0.172125,0.175656,0.792846
p1,-0.079213,1.0,0.028255,0.262095,-0.52066,-0.513316,-0.527016,-0.019435
p2,-0.349387,0.028255,1.0,0.330837,0.080952,-0.332576,-0.435097,-0.516797
p3,0.404656,0.262095,0.330837,1.0,0.246748,-0.518742,-0.338896,0.039885
p4,0.494965,-0.52066,0.080952,0.246748,1.0,0.187945,0.377011,0.160125
p5,-0.172125,-0.513316,-0.332576,-0.518742,0.187945,1.0,0.934452,-0.258712
p6,0.175656,-0.527016,-0.435097,-0.338896,0.377011,0.934452,1.0,-0.005554
p7,0.792846,-0.019435,-0.516797,0.039885,0.160125,-0.258712,-0.005554,1.0


In [19]:
# Plotting Heatmap

# Masking the diagonal
mask = np.triu(np.ones_like(df_positions_corr, dtype=bool))
df_temp = df_positions_corr.mask(mask)

# Creating figure
fig = go.Figure()

# Creating trace
fig.add_trace(
    go.Heatmap(
        x=df_temp.columns,
        y=df_temp.columns,
        z=df_temp,
        colorscale="Portland",
    )
)

# Updating layout
fig.update_layout(
    title="Positions correlation",
    height=600,
    width=600,
)

# Showing figure
fig.show()

# Correlating each element to another element based on their positions

*(This method is extremely slow, but it's an easy way to get all elements in the table (including the ones not present in a given position))*

In [22]:
def correlate_positions(df, position_0, position_1, pbar):
    """ Correlating the frequencies of elements in two given positions """

    # Initializing a temporary dataframe
    df_positions_corr = pd.DataFrame([[0 for _, _ in enumerate(ELEMENTS)] for _, _ in enumerate(ELEMENTS)], columns=ELEMENTS, index=ELEMENTS)

    # Iterating over the first element
    for i, _ in enumerate(ELEMENTS):

        # Iterating over the second element
        for j, _ in enumerate(ELEMENTS):
            counter = 0

            # Update the progress bar
            pbar.set_postfix_str(f"Positions: {position_0},{position_1} - Elements: {ELEMENTS[i]},{ELEMENTS[j]}")

            # Iterating over all the equations in the dataframe
            for row in df.itertuples():

                # If the element in the positions are the same, increment the counter
                if row[position_0 + 1] == ELEMENTS[i] and row[position_1 + 1] == ELEMENTS[j]:
                    counter += 1
                
                # Update the progress bar
                pbar.update(1)

            # Storing the counter value
            df_positions_corr[ELEMENTS[i]].loc[ELEMENTS[j]] = counter
    
    # Returning the dataframe
    return df_positions_corr


def correlate_elements(df):
    """ Correlating the frequencies of elements across all positions """

    # Creating the progress bar and an empty DataFrame
    pbar = tqdm(total=int(((NUMBER_OF_ELEMENTS ** 2) - NUMBER_OF_ELEMENTS) / 2) * len(df) * len(ELEMENTS) ** 2)
    df_elements_corr = pd.DataFrame()

    # Iterating over the first position
    for i in range(NUMBER_OF_ELEMENTS):
        df_i = pd.DataFrame()

        # Iterating over the second position
        for j in range(i + 1, NUMBER_OF_ELEMENTS):
            
            # Computing the correlation between the two positions
            df_j = correlate_positions(df, i, j, pbar)

            # Concatenating the dataframes
            df_j = pd.concat([df_j], keys=[f"p{j}"], names=[f'position', 'element'], axis=0)
            df_j = pd.concat([df_j], keys=[f"p{i}"], names=[f'position', 'element'], axis=1)
            df_i = pd.concat([df_i, df_j], axis=0)

        # Concatenating the dataframes again
        df_elements_corr = pd.concat([df_elements_corr, df_i], axis=1)

    # Converting dtypes and closing the progress bar
    df_elements_corr = df_elements_corr.convert_dtypes()
    pbar.close()

    # Returning the dataframe}
    return df_elements_corr


df_elements_pos_corr = correlate_elements(df["positions"])

100%|██████████| 111654900/111654900 [03:20<00:00, 558169.48it/s, Positions: 6,7 - Elements: =,=]


In [21]:
# Ploting Heatmap

# Creating figure
fig = sp.make_subplots(
    rows=NUMBER_OF_ELEMENTS - 1, 
    cols=NUMBER_OF_ELEMENTS - 1,
    shared_xaxes=True,
    shared_yaxes=True,
    vertical_spacing=0.01,
    horizontal_spacing=0.01,
)

# Creating traces
for i in range(NUMBER_OF_ELEMENTS):
    for j in range(i + 1, NUMBER_OF_ELEMENTS):

        df_temp = df_elements_pos_corr[f"p{i}"].loc[f"p{j}"]

        if df_temp is None:
            continue

        fig.add_trace(
            go.Heatmap(
                x=df_temp.columns,
                y=df_temp.index,
                z=df_temp / len(df),
                name=f"{i}-{j}",
                visible=True,
                coloraxis=f"coloraxis{i + j + 1}",
            ),
            row=j, 
            col=i + 1,
        )

        fig.add_trace(
            go.Heatmap(
                x=df_temp.columns,
                y=df_temp.index,
                z=df_temp / len(df),
                name=f"{i}-{j}",
                visible=False,
                coloraxis=f"coloraxis1",
            ),
            row=j, 
            col=i + 1,
        )

# Updating layout
coloraxis = dict(colorscale="Portland", showscale=False)
coloraxes = {f"coloraxis{i + 1}": coloraxis for i in range(int(((NUMBER_OF_ELEMENTS ** 2) - NUMBER_OF_ELEMENTS) / 2) + 1)}

fig.update_layout(
    title="Element correlation per position",
    height=1200,
    width=1200,
    **coloraxes,
)

fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

for i in range(NUMBER_OF_ELEMENTS - 1):
    fig.update_xaxes(title_text=f"p{i}", side="bottom", row=NUMBER_OF_ELEMENTS - 1, col=i + 1)
    fig.update_yaxes(title_text=f"p{NUMBER_OF_ELEMENTS - i - 1}", row=NUMBER_OF_ELEMENTS - i - 1, col=1)

# Adding dropdown
fig.update_layout(
    updatemenus=[
        dict(
            buttons=list([
                dict(
                    args=[{"visible": [True, False]}],
                    label="Independent Scales",
                ),
                dict(
                    args=[{"visible": [False, True]}],
                    label="Single Scale",
                ),
            ]),
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=1,
            xanchor="right",
            y=1.1,
            yanchor="top"
        ),
    ]
)

# Showing figure
fig.show()