# Pandas фильтрация

Загрузим тестовый набор данных

In [18]:
import pandas as pd

# Read the students' performance dataset from the local data directory.
students_csv_path = './data/StudentsPerformance.csv'
df = pd.read_csv(students_csv_path)

# Preview the first ten rows to check that the file was parsed as expected.
df.head(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


Студенты женского пола

In [19]:
# Boolean Series: True for every row whose gender is 'female'.
df['gender'].eq('female')

0       True
1       True
2       True
3      False
4      False
       ...  
995     True
996    False
997     True
998     True
999     True
Name: gender, Length: 1000, dtype: bool

Отфильтруем студентов женского пола и выведем поля gender и writing score

In [20]:
# Row mask for female students; .loc then keeps only the two columns of interest.
is_female = df['gender'].eq('female')
df.loc[is_female, ['gender', 'writing score']]

Unnamed: 0,gender,writing score
0,female,74
1,female,88
2,female,93
5,female,78
6,female,92
...,...,...
993,female,74
995,female,95
997,female,65
998,female,77


Выведем студентов с writing score выше среднего

In [21]:
# Average writing score over the whole dataset, used as the threshold below.
mean_writing_score = df['writing score'].mean()

# Strictly greater than the mean, so rows exactly at the mean are excluded.
above_average = df['writing score'].gt(mean_writing_score)
df.loc[above_average]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
...,...,...,...,...,...,...,...,...
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
995,female,group E,master's degree,standard,completed,88,99,95
998,female,group D,some college,standard,completed,68,78,77


Применим фильтрацию с комбинацией условий

In [22]:
# Each condition gets its own named mask; the masks are combined element-wise with &.
top_writing = df['writing score'].gt(99)
is_female = df['gender'].eq('female')
df.loc[top_writing & is_female]

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
106,female,group D,master's degree,standard,none,87,100,100
114,female,group E,bachelor's degree,standard,completed,99,100,100
165,female,group C,bachelor's degree,standard,completed,96,100,100
179,female,group D,some high school,standard,completed,97,100,100
377,female,group D,master's degree,free/reduced,completed,85,95,100
403,female,group D,high school,standard,completed,88,99,100
458,female,group E,bachelor's degree,standard,none,100,100,100
566,female,group E,bachelor's degree,free/reduced,completed,92,100,100
685,female,group E,master's degree,standard,completed,94,99,100
903,female,group D,bachelor's degree,free/reduced,completed,93,100,100


У какой доли студентов из датасета в колонке lunch указано free/reduced?

In [23]:
# Share of students with a free/reduced lunch: matching rows divided by all rows.
free_lunch_students = df[df['lunch'] == 'free/reduced']
len(free_lunch_students) / len(df)

0.355

Как различаются среднее и дисперсия оценок по предметам у групп студентов со стандартным и урезанным ланчем?

In [24]:
# Summary statistics of the three score columns for students with a standard lunch.
score_columns = ['math score', 'reading score', 'writing score']
has_standard_lunch = df['lunch'].eq('standard')
df.loc[has_standard_lunch, score_columns].describe()

Unnamed: 0,math score,reading score,writing score
count,645.0,645.0,645.0
mean,70.034109,71.654264,70.823256
std,13.653501,13.830602,14.339487
min,19.0,26.0,22.0
25%,61.0,63.0,62.0
50%,69.0,72.0,72.0
75%,80.0,82.0,81.0
max,100.0,100.0,100.0


In [25]:
# Summary statistics of the three score columns for students with a free/reduced lunch.
score_columns = ['math score', 'reading score', 'writing score']
has_free_lunch = df['lunch'].eq('free/reduced')
df.loc[has_free_lunch, score_columns].describe()

Unnamed: 0,math score,reading score,writing score
count,355.0,355.0,355.0
mean,58.921127,64.653521,63.022535
std,15.159956,14.895339,15.433823
min,0.0,17.0,10.0
25%,49.0,56.0,53.0
50%,60.0,65.0,64.0
75%,69.0,75.0,74.0
max,100.0,100.0,100.0


In [26]:
# Mean of each score per lunch type.
# The score columns are selected explicitly: df also holds string columns
# (gender, race/ethnicity, ...), and since pandas 2.0 a bare
# groupby(...).mean() no longer drops them silently but raises a TypeError.
score_columns = ['math score', 'reading score', 'writing score']
df.groupby('lunch')[score_columns].mean()

Unnamed: 0_level_0,math score,reading score,writing score
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free/reduced,58.921127,64.653521,63.022535
standard,70.034109,71.654264,70.823256


In [27]:
# Variance of each score per lunch type.
# The score columns are selected explicitly: df also holds string columns
# (gender, race/ethnicity, ...), and since pandas 2.0 a bare
# groupby(...).var() no longer drops them silently but raises a TypeError.
score_columns = ['math score', 'reading score', 'writing score']
df.groupby('lunch')[score_columns].var()

Unnamed: 0_level_0,math score,reading score,writing score
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free/reduced,229.82427,221.871139,238.202881
standard,186.418089,191.28556,205.620887


## Фильтрация методом query()

Переименуем колонки, заменив пробелы подчеркиванием

In [28]:
# Columns whose names contain spaces; every space is replaced with an
# underscore so the names can be written directly inside query() expressions.
columns_with_spaces = [
    'parental level of education',
    'test preparation course',
    'math score',
    'reading score',
    'writing score',
]
df = df.rename(columns={name: name.replace(' ', '_') for name in columns_with_spaces})

df.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Выведем студенток с writing_score > 74

In [29]:
# Threshold kept in a Python variable; the query string refers to it with the @ prefix.
writing_score_query = 74

# Female students whose writing score is strictly above the threshold.
female_above_threshold = 'gender == "female" & writing_score > @writing_score_query'
df.query(female_above_threshold)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
15,female,group C,some high school,standard,none,69,75,78
...,...,...,...,...,...,...,...,...
991,female,group B,some high school,standard,completed,65,82,78
992,female,group D,associate's degree,free/reduced,none,55,76,76
995,female,group E,master's degree,standard,completed,88,99,95
998,female,group D,some college,standard,completed,68,78,77


Выведем студентов, уровень образования родителей которых - бакалавр или магистр

In [30]:
# Keep rows whose parental education level is one of the listed degrees.
target_degrees = ["bachelor's degree", "master's degree"]
has_target_degree = df['parental_level_of_education'].isin(target_degrees)
df[has_target_degree]

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
2,female,group B,master's degree,standard,none,90,95,93
14,female,group A,master's degree,standard,none,50,53,58
18,male,group C,master's degree,free/reduced,completed,46,42,46
24,male,group D,bachelor's degree,free/reduced,completed,74,71,80
...,...,...,...,...,...,...,...,...
957,female,group D,master's degree,standard,none,92,100,100
969,female,group B,bachelor's degree,standard,none,75,84,80
970,female,group D,bachelor's degree,standard,none,89,100,100
993,female,group D,bachelor's degree,free/reduced,none,62,72,74


Выведем студентов, оценка за математику которых выше 90 и за чтение меньше либо равна 72

In [31]:
# Math score strictly above 90 combined with a reading score of at most 72.
strong_in_math = df['math_score'].gt(90)
weak_in_reading = df['reading_score'].le(72)
df[strong_in_math & weak_in_reading]

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score


Выведем студентов, не прошедших курс подготовки к тесту

In [32]:
# Rows where the preparation course value is anything other than 'completed'.
course_not_completed = df['test_preparation_course'].ne('completed')
df[course_not_completed]

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
...,...,...,...,...,...,...,...,...
992,female,group D,associate's degree,free/reduced,none,55,76,76
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
996,male,group C,high school,free/reduced,none,62,55,55


Отберём колонки, в названии которых присутствует 'score'

In [33]:
# Boolean mask over the column labels: True where the label contains the
# literal substring 'score' (regex=False turns off pattern interpretation).
is_score_column = df.columns.str.contains('score', regex=False)
df.loc[:, is_score_column]

Unnamed: 0,math_score,reading_score,writing_score
0,72,72,74
1,69,90,88
2,90,95,93
3,47,57,44
4,76,78,75
...,...,...,...
995,88,99,95
996,62,55,55
997,59,71,65
998,68,78,77


То же самое с помощью метода `df.filter()`

In [34]:
# Keep only the columns whose label contains 'score'. The axis is spelled out
# here, although columns are already the default axis for a DataFrame.
df.filter(like='score', axis='columns')

Unnamed: 0,math_score,reading_score,writing_score
0,72,72,74
1,69,90,88
2,90,95,93
3,47,57,44
4,76,78,75
...,...,...,...
995,88,99,95
996,62,55,55
997,59,71,65
998,68,78,77
