In [2]:
import numpy as np
import pandas as pd

## Немного ни о чем

Поговорим про разные представления для пропущенных значений

In [4]:
df_float = pd.DataFrame(
    {
        'column_none': [1., 2., 3., 4., 5., None],
        'column_nan': [1., 2., 3., 4., 5., np.nan],
    }
)

In [None]:
df_float['column_none'] == df_float['column_nan']

Unnamed: 0,0
0,True
1,True
2,True
3,True
4,True
5,False


In [None]:
df_float

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [None]:
df_float.iloc[-1]

Unnamed: 0,5
column_none,
column_nan,


In [None]:
df_float['column_none'].dtype, df_float['column_nan'].dtype

(dtype('float64'), dtype('float64'))

Видим, что None скастовался в np.nan и стал float. Что будет, если у нас изначально данные из целых чисел?

In [10]:
df_int = pd.DataFrame(
    {
        'column_none': [1, 2, 3, 4, 5, None],
        'column_nan': [1, 2, 3, 4, 5, np.nan],
    }
)

In [None]:
df_int['column_none'] == df_int['column_nan']

Unnamed: 0,0
0,True
1,True
2,True
3,True
4,True
5,False


In [None]:
df_int

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


Тоже получили конвертацию None в NaN

In [None]:
df_int.loc[5]

Unnamed: 0,5
column_none,
column_nan,


In [None]:
df_int['column_none'].dtype, df_int['column_nan'].dtype

(dtype('float64'), dtype('float64'))

Произошел каст, вероятно не очень желательный

In [None]:
df_int['column_nan'] = df_int['column_nan'].astype(np.int16)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

Просто так скастить в integer, не выбрасывая NaN нельзя

In [None]:
df_int['column_nan'] = df_int['column_nan'].astype("Int16")

О, что-то получилось?

In [None]:
df_int

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [None]:
df_int['column_none'].nbytes, df_int['column_nan'].nbytes

(48, 18)

In [None]:
df_int.column_none.loc[5]

nan

In [None]:
df_int.column_nan.loc[5]

<NA>

Получили еще одну версию для "ничего"?))0)

In [None]:
df_int.isna()

Unnamed: 0,column_none,column_nan
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,True,True


In [None]:
df_int['column_none'].loc[5].nbytes

8

In [None]:
df_int.memory_usage(index=False)

Unnamed: 0,0
column_none,48
column_nan,18


In [11]:
df_int.columns = ['column_1', 'column_2']

In [12]:
df_int

Unnamed: 0,column_1,column_2
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [13]:
df_int.loc[1, 'column_1'] = None
df_int.loc[1, 'column_2'] = None

In [8]:
df_int.loc[:,'column_2'] = None

In [14]:
df_int

Unnamed: 0,column_1,column_2
0,1.0,1.0
1,,
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


Произошел еще один type cast

In [None]:
nas = df_int.column_2[df_int.column_2.isna()]

In [None]:
nas.iloc[0] == nas.iloc[1]

<NA>

In [None]:
nas.iloc[0] is nas.iloc[1]

True

!!!

In [None]:
id(nas.iloc[0]), id(nas.iloc[1])

(139672941813136, 139672941813136)

Это один и тот же объект!

In [None]:
nans = df_int.column_1[df_int.column_1.isna()]

In [None]:
nans

1   NaN
5   NaN
Name: column_1, dtype: float64

In [None]:
a, b = nans.values

In [None]:
a, b

(nan, nan)

In [None]:
a == b

False

In [None]:
a is b

False

In [None]:
id(a), id(b)

(139672314563696, 139672314563920)

И напоследок

In [None]:
set([float('nan'), float('nan')])

{nan, nan}

In [None]:
set([np.float64('nan'), np.float64('nan')])

{nan, nan}

In [None]:
set([pd.NA, pd.NA])

{<NA>}

In [None]:
set([np.nan, np.nan])

{nan}

In [None]:
np.nan is np.nan is np.NaN is np.NAN

True

In [None]:
pd.NA is pd.NA

True

In [None]:
type(1 + pd.NA)

pandas._libs.missing.NAType

In [None]:
id(1 + pd.NA), id(pd.NA)

(139672941813136, 139672941813136)

In [None]:
np.nan is np.NaN is np.NAN

True

In [None]:
type(1 + np.nan)

float

In [None]:
id(np.nan), id(np.nan + 1)

(139673343209232, 139672315328240)

## Задачи

###  Given series A and series B

In [None]:
series_a = pd.Series([1, 2, 4, 3])
series_b = pd.Series([3, 4, 5, 6])

- Items is series A not present in series B

In [None]:
~series_a.isin(series_b)

0     True
1     True
2    False
3    False
dtype: bool

In [None]:
series_a[~series_a.isin(series_b)]

0    1
1    2
dtype: int64

- Intersection of series

In [None]:
series_a = pd.Series([1, 2, 4, 3])
series_b = pd.Series([3, 4, 5, 6])

In [None]:
%%time
np.intersect1d(series_a, series_b)

CPU times: user 67 µs, sys: 3 µs, total: 70 µs
Wall time: 62.7 µs


array([3, 4])

In [None]:
%%time
set(series_a) & set(series_b)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 9.54 µs


{3, 4}

- Items presented only in one of the series, not in both

In [None]:
np.setdiff1d(series_a, series_b), np.setdiff1d(series_b, series_a)

(array([1, 2]), array([5, 6]))

In [None]:
np.union1d(np.setdiff1d(series_a, series_b), np.setdiff1d(series_b, series_a))

array([1, 2, 5, 6])

или

In [None]:
np.setxor1d(series_a, series_b)

array([1, 2, 5, 6])

или

In [None]:
series_union = pd.Series(np.union1d(series_a, series_b))
series_intersect = pd.Series(np.intersect1d(series_a, series_b))

series_union, series_intersect

(0    1
 1    2
 2    3
 3    4
 4    5
 5    6
 dtype: int64,
 0    3
 1    4
 dtype: int64)

In [None]:
series_union[~series_union.isin(series_intersect)]

0    1
1    2
4    5
5    6
dtype: int64

### Merge by column pairs: fruit-pazham, weight-kilo

In [None]:
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})

In [None]:
df1.head()

Unnamed: 0,fruit,weight,price
0,apple,high,7
1,banana,medium,0
2,orange,low,12
3,apple,high,1
4,banana,medium,11


In [None]:
df2.head()

Unnamed: 0,pazham,kilo,price
0,apple,high,14
1,orange,low,4
2,pine,high,7
3,apple,low,13
4,orange,high,9


In [None]:
pd.merge(df1, df2, how='inner', left_on=['fruit', 'weight'], right_on = ['pazham', 'kilo'], suffixes=('_left', '_right'))

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,7,apple,high,14
1,apple,high,1,apple,high,14
2,apple,high,2,apple,high,14
3,orange,low,12,orange,low,4
4,orange,low,7,orange,low,4
5,orange,low,14,orange,low,4


In [None]:
df_merged = df1.merge(df2, how='inner', left_on=['fruit', 'weight'], right_on = ['pazham', 'kilo'], suffixes=('_left', '_right'))
df_merged

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,7,apple,high,14
1,apple,high,1,apple,high,14
2,apple,high,2,apple,high,14
3,orange,low,12,orange,low,4
4,orange,low,7,orange,low,4
5,orange,low,14,orange,low,4


Lets explore dropping duplicate rows

In [None]:
df_merged.drop_duplicates(keep='first')

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,7,apple,high,14
1,apple,high,1,apple,high,14
2,apple,high,2,apple,high,14
3,orange,low,12,orange,low,4
4,orange,low,7,orange,low,4
5,orange,low,14,orange,low,4


In [None]:
df_merged.drop_duplicates(keep='last', inplace=False, ignore_index=False, subset=['fruit', 'weight'])

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
2,apple,high,2,apple,high,14
5,orange,low,14,orange,low,4


- *keep* отвечает за то, какой индекс оставить для одинаковых элементов, первый встреченный или последний
- *ignore_index* -- делать ли реиндексацию или нет
- *subset* -- задать множество колонок, по которым будем считать дупликаты
- *inplace* -- (здесь и далее) делать ли изменения в исходном датафрейме (inplace=True) или вернуть копию

In [None]:
import numpy as np
import pandas as pd

N = 10_000_000
df1 = pd.DataFrame({
    "key": np.random.randint(0, 1_000_000, size=N),
    "val1": np.random.rand(N)
})
df2 = pd.DataFrame({
    "key": np.random.randint(0, 1_000_000, size=N),
    "val2": np.random.rand(N)
})

In [None]:
def merge_on_column():
    pd.merge(df1, df2, on="key", sort=False)
%timeit merge_on_column()

18.5 s ± 2.1 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
df1_indexed = df1.set_index("key")
df2_indexed = df2.set_index("key")

In [None]:
def merge_on_index():
    pd.merge(df1_indexed, df2_indexed, left_index=True, right_index=True)
%timeit merge_on_index()

18 s ± 2.43 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
df1_sorted = df1.sort_values("key")
df2_sorted = df2.sort_values("key")

In [None]:
def merge_on_sorted_column():
    return pd.merge(df1_sorted, df2_sorted, on="key", how="inner", sort=False)
%timeit merge_on_sorted_column()

6.52 s ± 266 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
df2_indexed = df2.set_index("key")
df2_indexed = df2.set_index("key")

def join_on_column():
    return df1.join(df2_indexed, on="key", how="inner")

%timeit join_on_column()

20.2 s ± 1.18 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


###  Reverse all rows (first row should become last etc.)

In [None]:
df = pd.DataFrame(np.arange(30).reshape(-1, 6), columns=map(lambda x: f'column_{x}', range(6)))

In [None]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29


In [None]:
df.iloc[::-1]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [None]:
df[::-1]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [None]:
df.iloc[::-1, :]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [None]:
df.reindex(index=df.index[::-1])

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [None]:
df.loc[df.index[::-1]]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


### Get column name with the highest number of row-wise maximum’s in dataframe

In [29]:
df = pd.DataFrame(np.random.randint(0, 100, 50).reshape(-1, 5), columns=map(lambda x: f'column_{x}', range(5)))

In [16]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,97,22,78,37,77
1,41,30,24,53,37
2,93,95,41,96,30
3,98,89,88,95,43
4,26,91,2,83,35
5,47,0,58,39,38
6,71,29,3,14,20
7,38,99,99,2,39
8,21,77,3,75,39
9,68,40,21,77,96


In [17]:
df.idxmax(axis=1)

Unnamed: 0,0
0,column_0
1,column_3
2,column_3
3,column_0
4,column_1
5,column_2
6,column_0
7,column_1
8,column_1
9,column_4


То же самое через df.apply

In [None]:
df.apply(np.argmax, axis=1)

Unnamed: 0,0
0,4
1,3
2,2
3,2
4,3
5,0
6,0
7,3
8,3
9,2


In [None]:
df.idxmax(axis=1).value_counts()

Unnamed: 0,count
column_3,4
column_2,3
column_0,2
column_4,1


In [None]:
df.idxmax(axis=1).value_counts().index

Index(['column_3', 'column_2', 'column_0', 'column_4'], dtype='object')

Наш ответ

In [None]:
df.idxmax(axis=1).value_counts().index[0]

'column_3'

### Find the positions of numbers that are multiples of N

In [None]:
N = 5

In [None]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,54,5,60,52,71
1,15,4,18,92,71
2,33,3,77,40,58
3,56,12,76,3,67
4,35,67,57,76,27
5,92,57,49,26,59
6,78,69,51,66,9
7,21,32,22,88,0
8,22,6,42,73,71
9,66,16,98,12,63


For each column Series separately

In [None]:
pos_map = df.apply(lambda x: x % N == 0)
print(pos_map)
for col in pos_map.columns:
    ser = pos_map.loc[:, col]
    print(f'{col}: {ser[ser].index.tolist()}')

   column_0  column_1  column_2  column_3  column_4
0     False      True      True     False     False
1      True     False     False     False     False
2     False     False     False      True     False
3     False     False     False     False     False
4      True     False     False     False     False
5     False     False     False     False     False
6     False     False     False     False     False
7     False     False     False     False      True
8     False     False     False     False     False
9     False     False     False     False     False
column_0: [1, 4]
column_1: [0]
column_2: [0]
column_3: [2]
column_4: [7]


```
column_0: [0, 2, 6, 7]
column_1: [0, 5]
column_2: [6, 9]
column_3: [7, 8]
column_4: [5, 8]
```

Now try to treat rows and columns as coordinates. Return list (or array) of pairs for such elements. (one-liner)

In [None]:
np.argwhere(df.values % N == 0).tolist()

[[0, 1], [0, 2], [1, 0], [2, 3], [4, 0], [7, 4]]

```[[0, 0],
 [0, 1],
 [2, 0],
 [5, 1],
 [5, 4],
 [6, 0],
 [6, 2],
 [7, 0],
 [7, 3],
 [8, 3],
 [8, 4],
 [9, 2]]
 ```

### Compute the minimum-by-maximum for every row of dataframe

In [None]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


In [20]:
df.min(axis=1)

Unnamed: 0,0
0,22
1,24
2,30
3,43
4,2
5,0
6,3
7,2
8,3
9,21


In [21]:
df.max(axis=1)

Unnamed: 0,0
0,97
1,53
2,96
3,98
4,91
5,58
6,71
7,99
8,77
9,96


In [22]:
df.apply(lambda row: (row.max(), row.min()), axis=1)

Unnamed: 0,0
0,"(97, 22)"
1,"(53, 24)"
2,"(96, 30)"
3,"(98, 43)"
4,"(91, 2)"
5,"(58, 0)"
6,"(71, 3)"
7,"(99, 2)"
8,"(77, 3)"
9,"(96, 21)"


### Normalize all columns of df by subtracting the column mean and divide by standard deviation.

In [None]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


In [54]:
# your code
# hint: use apply (for what axis?), np.mean and np.std (or similar pandas methods)
df = pd.DataFrame(np.random.randint(0, 100, 50).reshape(-1, 5), columns=map(lambda x: f'column_{x}', range(5)))
for column in df.columns:
  print((df[column] - np.mean(df[column])) / np.std(df[column]))

0   -0.563356
1   -1.523149
2    0.646817
3   -0.479896
4    1.564879
5    0.897197
6   -1.397959
7    0.229516
8   -0.479896
9    1.105848
Name: column_0, dtype: float64
0   -0.259376
1    0.365047
2    1.613893
3   -0.211343
4    0.413080
5    1.709958
6   -0.307408
7   -1.220026
8   -0.691669
9   -1.412157
Name: column_1, dtype: float64
0    1.207154
1    0.867907
2   -1.082764
3   -1.252387
4    0.245954
5    0.811366
6   -0.969681
7   -1.337199
8    0.274225
9    1.235425
Name: column_2, dtype: float64
0    1.447895
1    0.924109
2   -0.048637
3   -1.395517
4   -1.507757
5    0.811869
6    0.699629
7    0.437736
8   -1.208450
9   -0.160877
Name: column_3, dtype: float64
0   -0.965098
1   -1.043882
2    0.177263
3    1.162057
4    1.437799
5   -0.649964
6   -0.137871
7    0.216655
8   -1.516583
9    1.319624
Name: column_4, dtype: float64


In [37]:
ser = pd.Series(np.random.randint(0, 100, 5))-10
ser

Unnamed: 0,0
0,-2
1,87
2,78
3,5
4,-5


### Range all columns of df such that the minimum value in each column is 0 and max is 1

Найти минимальное и максимальное значение, и нормализовать к диапазону min - max эквивалентному 0 - 1. Таким образом min это 0, любое следующе до максимума это n / (max - min).

In [44]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,0,53,90,34,84
1,76,35,12,21,37
2,31,21,47,51,62
3,69,40,70,88,19
4,35,30,13,96,0
5,64,83,92,64,51
6,60,35,10,90,54
7,88,99,76,88,90
8,82,22,20,22,11
9,80,0,36,64,46


In [52]:
# your code
# same as prev task. google for the minmax normalization formula
for column in df.columns:
  df[column] = (df[column] - min(df[column])) / (max(df[column]) - min(df[column]))
df
# для строк можно череp apply axis 0...1

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,0.25,0.857143,0.907216,0.65625,0.0
1,0.0,0.077922,1.0,0.552083,1.0
2,0.684783,0.220779,0.381443,0.385417,0.425287
3,0.032609,1.0,0.453608,0.604167,0.597701
4,1.0,0.688312,0.989691,1.0,0.586207
5,0.728261,0.012987,0.072165,0.114583,0.885057
6,0.554348,0.0,0.082474,0.4375,0.91954
7,0.75,0.207792,0.010309,0.552083,0.632184
8,0.956522,0.61039,0.556701,0.854167,0.517241
9,0.48913,0.233766,0.0,0.0,0.298851


### Create a column that contains the second-largest value in each row?

Создайте колонку, в которой содержится второе самое большое значение строки

In [None]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,39,37,22,62,69
1,11,86,77,44,14
2,81,7,72,89,4
3,38,92,97,72,14
4,14,92,2,26,97
5,30,76,92,96,47
6,43,55,69,70,23
7,99,55,80,26,49
8,86,4,58,58,35
9,68,91,68,67,5


In [57]:
# your code
# hint: apply with sorting
df["2largest"] = df.apply(lambda row: sorted(row, reverse=True)[1], axis=1)
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,2largest
0,41,40,93,96,30,93
1,18,53,81,82,28,81
2,70,79,12,56,59,70
3,43,41,6,20,84,43
4,92,54,59,17,91,91
5,76,81,79,79,38,79
6,21,39,16,76,51,51
7,60,20,3,69,60,60
8,43,31,60,25,16,43
9,81,16,94,53,88,88


### Split a text column into two separate columns?

In [None]:
df_text = pd.DataFrame(
    {
        'row':
        [
            'id\t Name, Surname',
            '2\t Nadal, Raphael',
            '5\t Djokovic,  Novak',
            '1\t Federer, Roger'
        ]
    }
)

In [None]:
# your code
# hint: use str and split

---

Больше [упражнений на Pandas с решениями](https://www.machinelearningplus.com/python/101-pandas-exercises-python/)

## PS

1. Для тех, кто сидит на Windows, советую поставить WSL2 и работать из-под этой подсистемы:

- [гайд](https://learn.microsoft.com/ru-ru/windows/wsl/install#manual-installation-steps) от Microsoft
- [базовые команды терминала](https://ubuntu.com/tutorials/command-line-for-beginners#1-overview)
- [установите Anaconda](https://www.digitalocean.com/community/tutorials/how-to-install-the-anaconda-python-distribution-on-ubuntu-22-04)

Запускать ноутбучки локально с помощью команды `jupyter notebook --no-browser`. Также можно работать с подсистемой с помощью VSCode

Сможете легко склонировать наш (и не только) [репозиторий](https://github.com/sanityseeker/lspy-2023) с помощью git clone :)

2. Расширения для Jupyter Notebook -- [nbextensions](https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/install.html)