In [1]:
import dataclasses
from numpy.testing import assert_array_equal
import numpy as np
import pandas as pd
import numpy.typing as npt

## Construct array from index arrays

In [70]:
def construct_array(
        matrix: npt.NDArray[np.int_],
        row_indices: npt.NDArray[np.int_],
        col_indices: npt.NDArray[np.int_],
) -> npt.NDArray[np.int_]:
    
    return matrix[row_indices, col_indices]

In [71]:
np.random.seed(10)
A = np.random.randint(1, 100, size=(3, 4))
A

array([[10, 16, 65, 29],
       [90, 94, 30,  9],
       [74,  1, 41, 37]])

In [72]:
construct_array(A, [2, 0, 0], [1, 3, 2])

array([ 1, 29, 65])

In [73]:
construct_array(A, np.arange(3), np.arange(1, 4))

array([16, 30, 37])

In [74]:
# TESTING AREA

@dataclasses.dataclass
class ConstructArrayCase:
    matrix: npt.NDArray[np.int_]
    row_indices: list[int]
    col_indices: list[int]
    result: npt.NDArray[np.int_]


CONSTRUCT_ARRAY_TEST_CASES = [
    ConstructArrayCase(matrix=np.array(range(25)).reshape(5, 5),
                       row_indices=[0, 1, 2],
                       col_indices=[0, 1, 2],
                       result=np.array([0, 6, 12])),
    ConstructArrayCase(matrix=np.arange(-10, 10).reshape((5, 4)),
                       row_indices=[1, 2, 3, 3],
                       col_indices=[3, 2, 1, 2],
                       result=np.array([-3, 0, 3, 4])),
    ConstructArrayCase(matrix=np.arange(42).reshape((7, 6)),
                       row_indices=[],
                       col_indices=[],
                       result=np.array([])),
    ConstructArrayCase(matrix=np.arange(42).reshape((7, 6)),
                       row_indices=np.arange(4),
                       col_indices=np.arange(4),
                       result=np.array(np.arange(4)) * 7),
    ConstructArrayCase(matrix=np.arange(42).reshape((42, 1)),
                       row_indices=[0, 1, 41],
                       col_indices=[0, 0, 0],
                       result=np.array([0, 1, 41]))
]

for t in CONSTRUCT_ARRAY_TEST_CASES:
    assert_array_equal(construct_array(t.matrix, t.row_indices, t.col_indices), t.result)

## Detect identic arrays

In [145]:
def detect_identic(
        lhs_array: npt.ArrayLike,
        rhs_array: npt.ArrayLike
) -> bool:
    return np.array_equal(lhs_array, rhs_array)

In [146]:
detect_identic(5, 5.0)

True

In [147]:
detect_identic('xyz', "xyz")

True

In [148]:
detect_identic([1, 2, 45], [1, 2, 45])

True

In [149]:
detect_identic(np.zeros(10), np.ones(10) - 1)

True

In [150]:
detect_identic([[0, 1], [2, 3]], np.arange(4))

False

In [151]:
detect_identic([[0, 1], [2, 3]], np.arange(4).reshape((2, 2)))

True

In [152]:
# TESTING AREA

@dataclasses.dataclass
class DetectIdenticCase:
    lhs_array: npt.ArrayLike
    rhs_array: npt.ArrayLike
    result: bool


DETECT_IDENTIC_TEST_CASES = [
    DetectIdenticCase(
        lhs_array=np.array([1, 2]),
        rhs_array=np.array([1, 2]),
        result=True),
    DetectIdenticCase(
        lhs_array=np.array([1., 2]),
        rhs_array=np.array([1, 2.]),
        result=True),
    DetectIdenticCase(
        lhs_array=np.array([1, 2]),
        rhs_array=np.array([1.0001, 2]),
        result=False),
    DetectIdenticCase(
        lhs_array=np.array([1, 2]),
        rhs_array=np.array([[1, 2]]),
        result=False),
    DetectIdenticCase(
        lhs_array=np.array([[1, 2, 3]]),
        rhs_array=np.array([[1, 2]]),
        result=False),
    DetectIdenticCase(
        lhs_array=np.array([]),
        rhs_array=np.array([]),
        result=True),
    DetectIdenticCase(
        lhs_array=3,
        rhs_array=3,
        result=True),
    DetectIdenticCase(
        lhs_array=np.array(range(3)),
        rhs_array=np.array(range(3))[np.newaxis, :],
        result=False),
]

for t in DETECT_IDENTIC_TEST_CASES:
    assert_array_equal(detect_identic(t.lhs_array, t.rhs_array), t.result)

## Mean channel

In [84]:
def mean_channel(X: npt.NDArray[np.float_]) -> npt.NDArray[np.float_]:
    """
    Given color image (3-dimensional array of size (n, m, 3).
    Compute average value for all 3 channels
    :param X: color image
    :return: array of size 3 with average values
    """
    return np.mean(X, axis=(0, 1))

In [85]:
A = np.arange(60).reshape(4, 5, 3)

In [86]:
np.mean(A, axis=(0, 1))

array([28.5, 29.5, 30.5])

In [87]:
A = np.arange(48).reshape(4, 4, 3)
mean_channel(A)

array([22.5, 23.5, 24.5])

In [88]:
A = np.random.rand(1920, 1080, 3)
mean_channel(A)

array([0.50007022, 0.50073281, 0.49993379])

In [153]:
# TESTING AREA

@dataclasses.dataclass
class MeanChannelCase:
    X: npt.NDArray[np.float_]
    result: npt.NDArray[np.float_]


MEAN_CHANNEL_TEST_CASES = [
    MeanChannelCase(
        X=np.array(range(5 * 5 * 3)).reshape(5, 5, 3),
        result=np.array([36, 37, 38])),
    MeanChannelCase(
        X=np.dstack((
            (np.arange(320 * 240) % 64).reshape(320, 240),
            (np.arange(320 * 240) % 64).reshape(320, 240) * 2,
            (np.arange(320 * 240) % 64).reshape(320, 240) * 3)),
        result=np.array([31.5, 63., 94.5])),
    MeanChannelCase(
        X=np.array([]).reshape(0, 0, 3),
        result=np.array([np.nan, np.nan, np.nan]))
]

for t in MEAN_CHANNEL_TEST_CASES:
    assert_array_equal(mean_channel(t.X), t.result)

## Unique rows

In [90]:
def get_unique_rows(X: npt.NDArray[np.int_]) -> npt.NDArray[np.int_]:
    """
    Compute unique rows of matrix
    :param X: matrix
    :return: matrix of unique rows
    """
    return np.unique(X, axis=(0))

In [91]:
A = np.array([[1, 2], [2, 1], [1, 2]])
A

array([[1, 2],
       [2, 1],
       [1, 2]])

In [92]:
# TESTING AREA

@dataclasses.dataclass
class GetUniqueRowsCase:
    X: npt.NDArray[np.int_]
    result: npt.NDArray[np.int_]


GET_UNIQUE_ROWS_TEST_CASES = [
    GetUniqueRowsCase(
        X=np.array([[1, 2, 3]]),
        result=np.array([[1, 2, 3]])),
    GetUniqueRowsCase(
        X=np.array([[4, 5, 6], [0, 1, 2], [1, 2, 3], [0, 1, 2], [4, 5, 6], [1, 2, 3]]),
        result=np.array([[0, 1, 2], [1, 2, 3], [4, 5, 6]])),
]

for t in GET_UNIQUE_ROWS_TEST_CASES:
    assert_array_equal(get_unique_rows(t.X), t.result)

## Construct matrix

In [154]:
def construct_matrix(
        arr1: npt.NDArray[np.int_], arr2: npt.NDArray[np.int_]
) -> npt.NDArray[np.int_]:
    """
    Construct matrix from pair of arrays
    :param first_array: first array
    :param second_array: second array
    :return: constructed matrix
    """
    return np.vstack((arr1, arr2)).T

In [155]:
construct_matrix([-1, -2, -3], [10, 20, 30])

array([[-1, 10],
       [-2, 20],
       [-3, 30]])

In [156]:
@dataclasses.dataclass
class ConstructMatrixCase:
    first_array: npt.NDArray[np.int_]
    second_array: npt.NDArray[np.int_]
    result: npt.NDArray[np.int_]


CONSTRUCT_MATRIX_TEST_CASES = [
    ConstructMatrixCase(
        first_array=np.array([1, 2, 3]),
        second_array=np.array([4, 5, 6]),
        result=np.array([[1, 4], [2, 5], [3, 6]])),
    ConstructMatrixCase(
        first_array=np.array([]),
        second_array=np.array([]),
        result=np.array([]).reshape(0, 2)),
    ConstructMatrixCase(
        first_array=np.array([1]),
        second_array=np.array([2]),
        result=np.array([[1, 2]])),
    ConstructMatrixCase(
        first_array=np.arange(0, 100, 2),
        second_array=np.arange(1, 100, 2),
        result=np.arange(100).reshape(50, 2))
]

for t in CONSTRUCT_MATRIX_TEST_CASES:
    assert_array_equal(construct_matrix(t.first_array, t.second_array), t.result)

## Add zeros

In [178]:
import numpy as np
import numpy.typing as npt


def add_zeros(x: npt.NDArray[np.int_]) -> npt.NDArray[np.int_]:
    """
    Add zeros between values of given array
    E.g.: [-1, 3, 5] -> [-1, 0, 3, 0, 5]
    :param x: array,
    :return: array with zeros inserted
    """
    return np.insert(x, np.arange(1, len(x)), 0)

In [179]:
add_zeros([-1, 3, 5, 2])

array([-1,  0,  3,  0,  5,  0,  2])

In [180]:
add_zeros([1])

array([1])

In [181]:
add_zeros([])

array([], dtype=float64)

## Max element after 0

In [173]:
def max_element(arr: npt.NDArray[np.int_]) -> int | None:
    """
    Return max element after zero for input array.
    If appropriate elements are absent, then return None
    E.g., [6, 2, 0, 3, 0, 0, 5, 7, 0] -> 5
    :param array: array,r
    :return: max element value or None
    """
    return 

TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

In [171]:
max_element([6, 2, 0, 3, 0, 0, 5, 7, 0])

NameError: name 'max_element' is not defined

## Nearest value

In [219]:
def nearest_value(matrix: npt.NDArray[np.float_], value: float) -> float:
    """
    Find nearest value in matrix.
    If matrix is empty return None
    E.g., matrix = np.arange(0,10).reshape((2, 5)) and value = 3.6 give 4
    :param matrix: input matrix
    :param value: value to find
    :return: nearest value in matrix or None
    """
    matrix = matrix.flatten()
    if len(matrix) == 0:
        return 0
    diff = np.absolute(matrix-value)
    return matrix[diff.argmin()]

In [220]:
matrix = np.arange(0,10).reshape((2, 5))
matrix

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [221]:
nearest_value(matrix, 12.3)

9

## Nonzero value

In [237]:
def nonzero_product(matrix: npt.NDArray[np.int_]) -> int | None:
    """
    Compute product of nonzero diagonal elements of matrix
    If all diagonal elements are zeros, then return None
    :param matrix: array,
    :return: product value or None
    """
    diag = np.diag(matrix)
    diag = diag[np.nonzero(diag)]
    if diag.size == 0:
        return None
    return diag.product()

TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

In [238]:
A = np.random.normal(2, 4, (3, 4))
A

array([[  1.77764675,   6.83242204,   8.71165316,   1.35584914],
       [  7.14395059,   9.0446926 ,   2.22980118, -10.89837793],
       [ -3.92416257,  -1.51824286,  -1.5027401 ,   2.24884784]])

In [239]:
np.diag(A)

array([ 1.77764675,  9.0446926 , -1.5027401 ])

In [240]:
nonzero_product(A)

TypeError: 'int' object is not callable

In [241]:
B = np.array([[1, 2], [3, 0]])
B

array([[1, 2],
       [3, 0]])

In [123]:
nonzero_product(B)

1

In [120]:
C = np.array([[0, 2], [3, 0]])
C

array([[0, 2],
       [3, 0]])

In [128]:
print(nonzero_product(C))

None


## Titanic

In [247]:
df = pd.read_csv('titanic.csv')
df.shape

(891, 12)

In [248]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [249]:
df.columns[df.isna().any()]

Index(['Age', 'Cabin', 'Embarked'], dtype='object')

In [250]:
df.isna().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [251]:
df.head(11)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [252]:
counts = df.groupby('Pclass')['Pclass'].count()
counts

Pclass
1    216
2    184
3    491
Name: Pclass, dtype: int64

In [253]:
counts / df.shape[0]

Pclass
1    0.242424
2    0.206510
3    0.551066
Name: Pclass, dtype: float64

In [254]:
df['Family'] = df.apply(lambda row: row['Name'].split(",")[0], axis=1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen


In [255]:
(df['Family'].value_counts() > 0).sum()

667

In [256]:
s = "Braund, Mr. Owen Harris"
s.split(",")[0]

'Braund'

In [257]:
df[(df['Sex'] == 'male') & (df['Survived'] == 1) & (df['Embarked'] == 'S') & (df['Fare'] > 30)]["Age"].mean()

26.696000000000005

In [260]:
def male_age(df: pd.DataFrame) -> float:
    """
    Return mean age of survived men, embarked in Southampton with fare > 30
    :param df: dataframe
    :return: mean age
    """
    return df[(df['Sex'] == 'male') & (df['Survived'] == 1) & (df['Embarked'] == 'S') & 
           (df['Fare'] > 30)]["Age"].mean()

def nan_columns(df: pd.DataFrame):
    """
    Return list of columns containing nans
    :param df: dataframe
    :return: series of columns
    """
    return df.columns[df.isna().any()].tolist()
    

def class_distribution(df: pd.DataFrame) -> pd.Series:
    """
    Return Pclass distrubution
    :param df: dataframe
    :return: series with ratios
    """
    counts = df.groupby('Pclass')['Pclass'].count()
    counts.index.name = None
    return counts / df.shape[0]
    
def families_count(df: pd.DataFrame, k: int) -> int:
    """
    Compute number of families with more than k members
    :param df: dataframe,
    :param k: number of members,
    :return: number of families
    """
    df['Family'] = df.apply(lambda row: row['Name'].split(",")[0], axis=1)
    return (df['Family'].value_counts() > k).sum()

def mean_price(df: pd.DataFrame, tickets) -> float:
    """
    Return mean price for specific tickets list
    :param df: dataframe,
    :param tickets: list of tickets,
    :return: mean fare for this tickets
    """
    res = []
    d

In [261]:
male_age(df)

26.696000000000005

In [262]:
class_distribution(df)

1    0.242424
2    0.206510
3    0.551066
Name: Pclass, dtype: float64

In [275]:
df['Ticket'].value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

In [273]:
families_count(df, 4)

8