# Filtering and transforming
Page 212

In [11]:
import pandas as pd
import numpy as np

In [4]:
# Multiplying a list by an integer concatenates that many copies of it.
original_list = [1, 2, 3]
n = 3
repeated_list = n * original_list
print(repeated_list)

[1, 2, 3, 1, 2, 3, 1, 2, 3]


In [13]:
import numpy as np
# Seed NumPy's global generator so the random scores drawn later are reproducible.
np.random.seed(0)

# The three-year cycle repeated three times, followed by a single 2021 entry.
year = 3 * [2018, 2019, 2020] + [2021]
print(year)

[2018, 2019, 2020, 2018, 2019, 2020, 2018, 2019, 2020, 2021]


In [14]:
# One row per student: names A-J, the 2018-2020 cycle repeated three times plus
# one 2021 entry, and a random integer score drawn from [80, 100).
df = pd.DataFrame({
    'name': list('ABCDEFGHIJ'),
    'year': [2018, 2019, 2020] * 3 + [2021],
    'score': np.random.randint(low=80, high=100, size=10),
})

In [15]:
# Show the whole frame; with only ten rows there is no need for .head().
df

Unnamed: 0,name,year,score
0,A,2018,92
1,B,2019,95
2,C,2020,80
3,D,2018,83
4,E,2019,83
5,F,2020,87
6,G,2018,89
7,H,2019,99
8,I,2020,98
9,J,2021,84


In [17]:
# Mean of the 'score' column over every row of df (no grouping by year).
mean_score = df['score'].mean()
mean_score

89.0

In [19]:
# Boolean-mask row selection: keep only the rows whose score is strictly greater than 90.
students_who_scored_above_90 = df.loc[df['score'].gt(90)]
students_who_scored_above_90

Unnamed: 0,name,year,score
0,A,2018,92
1,B,2019,95
7,H,2019,99
8,I,2020,98


In [21]:
# Aggregate: one mean score per distinct year (the result is indexed by year).
mean_score_per_year = (
    df
    .groupby('year')['score']
    .mean()
)
mean_score_per_year

year
2018    88.000000
2019    92.333333
2020    88.333333
2021    84.000000
Name: score, dtype: float64

We want to determine which years in our school had an average score of at least 90,
and then see all the students who belong to those years.

In [23]:
def year_average_is_at_least_90(df: pd.DataFrame) -> bool:
    """Return True when the mean of the frame's 'score' column is at least 90.

    Intended as a predicate for ``groupby(...).filter``, which calls it once
    per group with that group's rows.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single group; must contain a numeric 'score' column.

    Returns
    -------
    bool
        True if the mean score is greater than or equal to 90, else False.
    """
    # ">=" rather than ">": "at least 90" must include an average of exactly 90.
    return bool(df['score'].mean() >= 90)

# filter() calls the predicate once per year-group and keeps all rows of the
# groups for which it returns True; rows of the other years are dropped.
(
    df
    .groupby('year')
    .filter(year_average_is_at_least_90)
)

Unnamed: 0,name,year,score
1,B,2019,95
4,E,2019,83
7,H,2019,99


In [24]:
# transform() returns a Series with the same index as its input; here every
# score is divided by 100.
df['score'].transform(lambda score: score / 100)

0    0.92
1    0.95
2    0.80
3    0.83
4    0.83
5    0.87
6    0.89
7    0.99
8    0.98
9    0.84
Name: score, dtype: float64

In [25]:
# Aggregate: the highest score within each year (one value per year).
(
    df
    .groupby('year')['score']
    .max()
)

year
2018    92
2019    99
2020    98
2021    84
Name: score, dtype: int64

In [26]:
# Broadcast each year's maximum score back onto every row of that year, so the
# result keeps df's index instead of collapsing to one value per year.
# The string alias 'max' selects pandas' built-in group reduction; passing the
# NumPy callable np.max makes recent pandas versions emit a FutureWarning.
df.groupby('year')['score'].transform('max')

  df.groupby('year')['score'].transform(np.max)


0    92
1    99
2    98
3    92
4    99
5    98
6    92
7    99
8    98
9    84
Name: score, dtype: int64

In [None]:
# Per-row maximum score of the row's year (same computation as the previous cell).
# Use the string alias 'max' rather than np.max to avoid pandas' FutureWarning
# about callables passed to transform().
df.groupby('year')['score'].transform('max')