# Numpy ndarrays

## The scenario

The question how to manipulate 3d arrays and aggregate values in the different dimentions (axis).

Assume the dimensions as:

- `0`: represents pages/urls
- `1`: represents issues/problems identified in the page
- `2`: pillar ... #todo: describe this concept


In [1]:
import numpy as np

In [2]:
# create some data
a = np.zeros(300).reshape(10, -1, 6)
a[0, 0, :] = np.array([1.02, 1.02, 0.0, 0.0, 1.02, 1.02])
a.shape

(10, 5, 6)

In [3]:
a[0, :, :].shape

(5, 6)

In [4]:
# 1st page, all issue types, all pillars
print(a[0, :, :])

print()
# 5th page, all issue types, all pillars
print(a[4, :, :])

[[1.02 1.02 0.   0.   1.02 1.02]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]]

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [5]:
# 1st issue type, all 10 pages, all 6 pillars
print(a[:, 0, :])
print(a[:, 0, :].shape)

[[1.02 1.02 0.   0.   1.02 1.02]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.  ]]
(10, 6)


In [6]:
# Pillar 0
print(a[:, :, 0])
print(a[:, :, 0].shape)

[[1.02 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]]
(10, 5)


In [7]:
# 1 page, all issue types, 1 pillar
print(a[0, :, 0])
print(a[0, :, 0].shape)

[1.02 0.   0.   0.   0.  ]
(5,)


In [8]:
# define a mas that to filter all zero elelements
mask = a != 0
print(mask)
print(mask.shape)

[[[ True  True False False  True  True]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]]

 [[False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]]

 [[False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]]

 [[False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]]

 [[False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False False]
  [False False False False False

In [9]:
mask[:, :, 0].shape

(10, 5)

In [10]:
# all pages, all issue types, of pillar 0
mask[:, :, 0]

array([[ True, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False, False]])

In [11]:
# overall mean of non-zero elements
np.mean(a, where=mask)

1.02

In [12]:
# aggregates action value over page axis,
# resulting in (issue type, pillar).
np.mean(a, axis=0, where=mask)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


array([[1.02, 1.02,  nan,  nan, 1.02, 1.02],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan]])

In [13]:
# aggregates action value over issue type axis,
# resulting in (pages, pillar).
np.mean(a, axis=1, where=mask)

array([[1.02, 1.02,  nan,  nan, 1.02, 1.02],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan,  nan]])

In [14]:
# aggregates action value over pillar axis,
# resulting in (pages, issue type).
np.mean(a, axis=2, where=mask)

array([[1.02,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan,  nan]])

In [15]:
# ------------------------
# mean action value per pillar
# ------------------------
np.mean(a, axis=(0, 1), where=mask)

array([1.02, 1.02,  nan,  nan, 1.02, 1.02])

In [16]:
# ------------------------
# mean action value per issue type
# ------------------------
np.mean(a, axis=(0, 2), where=mask)

array([1.02,  nan,  nan,  nan,  nan])

In [17]:
# ------------------------
# mean action value per page
# ------------------------
np.mean(a, axis=(1, 2), where=mask)

array([1.02,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan])

## Aggregate some data

Let's call it `action` the action of fixing an issue in a page.
Then, calculate and ranking based on the value of a action, `page` and `issue` combination. 


In [18]:
# aggregate values per pillar
action_values = np.zeros(300).reshape(10, -1, 6)
action_values[0, 0, :] = np.array([1.02, 1.02, 0.0, 0.0, 1.02, 1.02])
action_values[1, 0, :] = np.array([2.02, 1.02, 3.0, 1.4, 1.02, 1.02])

print(action_values[:, :, 0])
action_values.shape

[[1.02 0.   0.   0.   0.  ]
 [2.02 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]]


(10, 5, 6)

What we want is to aggregate the `mean` action value per `page` and `issue` type.
This way we can rank the action by value.

We can either use all elements in the action matrix, or only non-zero elements.

In [19]:
# masking ... using only non-zero elements
mask = action_values != 0
agg_action_values = np.mean(action_values, axis=2, where=mask)
print(agg_action_values)
print(agg_action_values.shape)

[[1.02  nan  nan  nan  nan]
 [1.58  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]]
(10, 5)


In [20]:
# Not masking ... use all elements
agg_action_values = np.mean(action_values, axis=2)
print(agg_action_values)
print(agg_action_values.shape)

[[0.68 0.   0.   0.   0.  ]
 [1.58 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.  ]]
(10, 5)


In [21]:
action_values = agg_action_values.ravel()
action_values

array([0.68, 0.  , 0.  , 0.  , 0.  , 1.58, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [22]:
indices = np.argsort(action_values)
indices

array([24, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
       43, 44, 45, 46, 47, 26, 25, 49, 23,  1,  2,  3,  4,  6,  7,  8,  9,
       10, 48, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 12,  0,  5])

In [23]:
# The element [0,0] is the biggest, so it is the last
unraveled = np.unravel_index(indices, agg_action_values.shape)
unraveled

(array([4, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9,
        5, 5, 9, 4, 0, 0, 0, 0, 1, 1, 1, 1, 2, 9, 2, 2, 2, 3, 3, 3, 3, 3,
        4, 4, 4, 2, 0, 1]),
 array([4, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2,
        1, 0, 4, 3, 1, 2, 3, 4, 1, 2, 3, 4, 0, 3, 1, 3, 4, 0, 1, 2, 3, 4,
        0, 1, 2, 2, 0, 0]))

In [24]:
# put the indexes next to each other
pairs = np.dstack(unraveled)
pairs

array([[[4, 4],
        [5, 2],
        [5, 3],
        [5, 4],
        [6, 0],
        [6, 1],
        [6, 2],
        [6, 3],
        [6, 4],
        [7, 0],
        [7, 1],
        [7, 2],
        [7, 3],
        [7, 4],
        [8, 0],
        [8, 1],
        [8, 2],
        [8, 3],
        [8, 4],
        [9, 0],
        [9, 1],
        [9, 2],
        [5, 1],
        [5, 0],
        [9, 4],
        [4, 3],
        [0, 1],
        [0, 2],
        [0, 3],
        [0, 4],
        [1, 1],
        [1, 2],
        [1, 3],
        [1, 4],
        [2, 0],
        [9, 3],
        [2, 1],
        [2, 3],
        [2, 4],
        [3, 0],
        [3, 1],
        [3, 2],
        [3, 3],
        [3, 4],
        [4, 0],
        [4, 1],
        [4, 2],
        [2, 2],
        [0, 0],
        [1, 0]]])

In [25]:
action_values[indices]

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.68, 1.58])

In [26]:
stacked = np.dstack((pairs, action_values[indices])).squeeze()
stacked

array([[4.  , 4.  , 0.  ],
       [5.  , 2.  , 0.  ],
       [5.  , 3.  , 0.  ],
       [5.  , 4.  , 0.  ],
       [6.  , 0.  , 0.  ],
       [6.  , 1.  , 0.  ],
       [6.  , 2.  , 0.  ],
       [6.  , 3.  , 0.  ],
       [6.  , 4.  , 0.  ],
       [7.  , 0.  , 0.  ],
       [7.  , 1.  , 0.  ],
       [7.  , 2.  , 0.  ],
       [7.  , 3.  , 0.  ],
       [7.  , 4.  , 0.  ],
       [8.  , 0.  , 0.  ],
       [8.  , 1.  , 0.  ],
       [8.  , 2.  , 0.  ],
       [8.  , 3.  , 0.  ],
       [8.  , 4.  , 0.  ],
       [9.  , 0.  , 0.  ],
       [9.  , 1.  , 0.  ],
       [9.  , 2.  , 0.  ],
       [5.  , 1.  , 0.  ],
       [5.  , 0.  , 0.  ],
       [9.  , 4.  , 0.  ],
       [4.  , 3.  , 0.  ],
       [0.  , 1.  , 0.  ],
       [0.  , 2.  , 0.  ],
       [0.  , 3.  , 0.  ],
       [0.  , 4.  , 0.  ],
       [1.  , 1.  , 0.  ],
       [1.  , 2.  , 0.  ],
       [1.  , 3.  , 0.  ],
       [1.  , 4.  , 0.  ],
       [2.  , 0.  , 0.  ],
       [9.  , 3.  , 0.  ],
       [2.  , 1.  , 0.  ],
 

In [27]:
stacked[stacked[:, 2] == 0] = np.nan
non_nan_mask = ~np.isnan(stacked[:, 2])
stacked[non_nan_mask]

array([[0.  , 0.  , 0.68],
       [1.  , 0.  , 1.58]])

## Putting together



In [28]:
def rank(value_matrix: np.ndarray, non_zero_only: bool) -> np.ndarray:
    if non_zero_only:
        mask = value_matrix != 0.0
        agg_values = np.mean(value_matrix, axis=2, where=mask)
    else:
        agg_values = np.mean(value_matrix, axis=2)

    raveled = agg_values.ravel()
    indices = np.argsort(raveled)
    stacked = np.dstack((np.dstack(np.unravel_index(indices, agg_values.shape)), raveled[indices])).squeeze()
    stacked[stacked[:, 2] == 0] = np.nan
    non_nan_mask = ~np.isnan(stacked[:, 2])
    return stacked[non_nan_mask]

In [29]:
# Let's test it out.
# First, create the dataset:
sample_action_values_matrix = np.zeros(300).reshape(10, -1, 6)
sample_action_values_matrix[0, 0, :] = np.array([1.02, 1.02, 0.0, 0.0, 1.02, 1.02])
sample_action_values_matrix[1, 0, :] = np.array([2.02, 1.02, 3.0, 0.0, 1.02, 1.02])
sample_action_values_matrix[2, 0, :] = np.array([1, 1, 1.0, 1.0, 1.0, 1.0])
sample_action_values_matrix[0, 1, :] = np.array([1, 4, 1.0, 5.0, 1.0, 1.0])

In [30]:
sorted_actions = rank(sample_action_values_matrix, False)
sorted_actions

array([[0.        , 0.        , 0.68      ],
       [2.        , 0.        , 1.        ],
       [1.        , 0.        , 1.34666667],
       [0.        , 1.        , 2.16666667]])

In [31]:
sorted_actions = rank(sample_action_values_matrix, True)
sorted_actions

array([[2.        , 0.        , 1.        ],
       [0.        , 0.        , 1.02      ],
       [1.        , 0.        , 1.616     ],
       [0.        , 1.        , 2.16666667]])

## Transfor to dataframe

In [32]:
import pandas as pd

In [33]:
rank_df = (
    pd.DataFrame(data=sorted_actions, columns=["page_idx", "issue_type_idx", "action_value"])
    .sort_values(by=["action_value"], ascending=False)
    .reset_index(drop=True)
)
rank_df

Unnamed: 0,page_idx,issue_type_idx,action_value
0,0.0,1.0,2.166667
1,1.0,0.0,1.616
2,0.0,0.0,1.02
3,2.0,0.0,1.0


In [34]:
# add the page url and the issue type name


# this is how the input is given:
page_value = pd.DataFrame(
    columns=["page", "value"],
    data=[
        ["page 1", 1.0],
        ["page 2", 1.0],
        ["page 3", 0.5],
        ["page 4", 0.5],
        ["page 5", 0.5],
        ["page 6", 0.25],
        ["page 7", 0.25],
        ["page 8", 0.25],
        ["page 9", 0.25],
        ["page 10", 0.25],
    ],
).set_index("page", drop=True)

severity = pd.DataFrame(
    columns=["issue_type", "severity"],
    data=[
        ["broken_image", 1.0],
        ["missing_alt_tag", 0.5],
        ["duplicated_content", 0.25],
        ["other_1", 0.5],
        ["other_2", 1.0],
    ],
).set_index("issue_type", drop=True)

In [35]:
# Final dataframe with action ranking information:
(
    rank_df.merge(page_value.reset_index(drop=False), how="left", left_on="page_idx", right_index=True).merge(
        severity.reset_index(drop=False), how="left", left_on="issue_type_idx", right_index=True
    )[["page", "issue_type", "severity", "action_value"]]
).reset_index()

Unnamed: 0,index,page,issue_type,severity,action_value
0,0,page 1,missing_alt_tag,0.5,2.166667
1,1,page 2,broken_image,1.0,1.616
2,2,page 1,broken_image,1.0,1.02
3,3,page 3,broken_image,1.0,1.0
