In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [100]:
# Read the student performance CSV into a DataFrame and preview its first five rows
D = pd.read_csv('StudentsPerformance.csv')
D.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group C,some high school,free/reduced,none,0,17,10
1,female,group B,high school,free/reduced,none,8,24,23
2,female,group B,some high school,free/reduced,none,18,32,28
3,female,group B,some college,standard,none,11,38,32
4,female,group C,some college,free/reduced,none,22,39,33


In [101]:
# List the column labels of the loaded dataset
D.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [102]:
# Count the missing values in each column
D.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

# 1. Finding the number of males and females who participated in the test

In [103]:
# Number of students of each gender, most frequent first
D.loc[:, 'gender'].value_counts()

female    518
male      482
Name: gender, dtype: int64

Number of females = 518

Number of males   = 482

# 2. Students' parental level of education

In [104]:
# Frequency of each parental education level, most frequent first
D.loc[:, 'parental level of education'].value_counts()

some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64

The most common parental education level is "some college" (226), closely followed by an associate's degree (222); a master's degree is the least common (59).

# 3. Who scores the most on average for math, reading and writing, based on:

# 3.1 Gender

#First method

In [105]:
# Keep only the gender label and the three score columns, then preview the subset
test = D.loc[:, ['gender', 'math score', 'reading score', 'writing score']]
test.head(5)

Unnamed: 0,gender,math score,reading score,writing score
0,female,0,17,10
1,female,8,24,23
2,female,18,32,28
3,female,11,38,32
4,female,22,39,33


In [106]:
# Boolean mask marking the rows whose gender is 'female', then show those rows
F = test['gender'].eq('female')
test.loc[F]

Unnamed: 0,gender,math score,reading score,writing score
0,female,0,17,10
1,female,8,24,23
2,female,18,32,28
3,female,11,38,32
4,female,22,39,33
...,...,...,...,...
513,female,99,100,100
514,female,99,93,90
515,female,100,92,97
516,female,100,100,100


In [107]:
# Boolean mask marking the rows whose gender is 'male', then show those rows
M = test['gender'].eq('male')
test.loc[M]

Unnamed: 0,gender,math score,reading score,writing score
518,male,27,34,36
519,male,28,23,19
520,male,30,24,15
521,male,30,26,22
522,male,31,32,34
...,...,...,...,...
995,male,99,87,81
996,male,100,96,86
997,male,100,97,99
998,male,100,100,93


In [108]:
# Mean math, reading and writing score for female students.
# `test` still contains the string-valued 'gender' column, so restrict the
# reduction to numeric columns; pandas >= 2.0 raises TypeError otherwise.
test[F].mean(numeric_only=True)

math score       63.376448
reading score    72.590734
writing score    72.467181
dtype: float64

In [109]:
# Mean math, reading and writing score for male students.
# `test` still contains the string-valued 'gender' column, so restrict the
# reduction to numeric columns; pandas >= 2.0 raises TypeError otherwise.
test[M].mean(numeric_only=True)

math score       68.821577
reading score    65.545643
writing score    63.446058
dtype: float64

In [110]:
#Second method

In [111]:
# Average math score within each gender group
D['math score'].groupby(D['gender']).mean()

gender
female    63.376448
male      68.821577
Name: math score, dtype: float64

In [112]:
# Average reading score within each gender group
D['reading score'].groupby(D['gender']).mean()

gender
female    72.590734
male      65.545643
Name: reading score, dtype: float64

In [113]:
# Average writing score within each gender group
D['writing score'].groupby(D['gender']).mean()

gender
female    72.467181
male      63.446058
Name: writing score, dtype: float64

Average math score of males (68.821577) is greater than that of females (63.376448)

Average reading score of females (72.590734) is greater than that of males (65.545643)

Average writing score of females (72.467181) is greater than that of males (63.446058)

# 3.2 Test preparation course

In [114]:
#first method

In [115]:
# Number of students per test-preparation status, most frequent first
D.loc[:, 'test preparation course'].value_counts()

none         642
completed    358
Name: test preparation course, dtype: int64

In [116]:
# Keep only the test-preparation label and the three score columns, then preview the subset
course = D.loc[:, ['test preparation course', 'math score', 'reading score', 'writing score']]
course.head(5)

Unnamed: 0,test preparation course,math score,reading score,writing score
0,none,0,17,10
1,none,8,24,23
2,none,18,32,28
3,none,11,38,32
4,none,22,39,33


In [117]:
# Boolean mask marking students whose test preparation course is 'none', then show those rows
N = course['test preparation course'].eq('none')
course.loc[N]

Unnamed: 0,test preparation course,math score,reading score,writing score
0,none,0,17,10
1,none,8,24,23
2,none,18,32,28
3,none,11,38,32
4,none,22,39,33
...,...,...,...,...
984,none,94,88,78
986,none,94,73,71
987,none,95,81,84
990,none,97,93,91


In [118]:
# Boolean mask marking students whose test preparation course is 'completed', then show those rows
C = course['test preparation course'].eq('completed')
course.loc[C]

Unnamed: 0,test preparation course,math score,reading score,writing score
5,completed,23,44,36
10,completed,29,40,44
12,completed,32,51,44
16,completed,34,48,41
21,completed,35,55,60
...,...,...,...,...
995,completed,99,87,81
996,completed,100,96,86
997,completed,100,97,99
998,completed,100,100,93


In [119]:
# Mean math, reading and writing score for students who completed the course.
# `course` still contains the string-valued 'test preparation course' column,
# so restrict the reduction to numeric columns; pandas >= 2.0 raises TypeError otherwise.
course[C].mean(numeric_only=True)

math score       69.966480
reading score    74.175978
writing score    74.684358
dtype: float64

In [120]:
# Mean math, reading and writing score for students who did not take the course.
# `course` still contains the string-valued 'test preparation course' column,
# so restrict the reduction to numeric columns; pandas >= 2.0 raises TypeError otherwise.
course[N].mean(numeric_only=True)

math score       63.789720
reading score    66.417445
writing score    64.457944
dtype: float64

In [121]:
#Second method

In [130]:
# Average math score for each test-preparation status
D['math score'].groupby(D['test preparation course']).mean()

test preparation course
completed    69.96648
none         63.78972
Name: math score, dtype: float64

In [131]:
# Average reading score for each test-preparation status
D['reading score'].groupby(D['test preparation course']).mean()

test preparation course
completed    74.175978
none         66.417445
Name: reading score, dtype: float64

In [124]:
# Average writing score for each test-preparation status
D['writing score'].groupby(D['test preparation course']).mean()

test preparation course
completed    74.684358
none         64.457944
Name: writing score, dtype: float64

Students who completed the test preparation course have higher average math, reading, and writing scores than those who did not.

# 4. Scoring variation for math, reading and writing, based on:


# 4.1 Gender

In [125]:
# Min, mean, median and max of each score column per gender.
# Select the numeric score columns explicitly: applying 'mean'/'median' to the
# remaining string columns raises TypeError in pandas >= 2.0.
score_columns = ['math score', 'reading score', 'writing score']
D.groupby('gender')[score_columns].agg(['min', 'mean', 'median', 'max'])

Unnamed: 0_level_0,math score,math score,math score,math score,reading score,reading score,reading score,reading score,writing score,writing score,writing score,writing score
Unnamed: 0_level_1,min,mean,median,max,min,mean,median,max,min,mean,median,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
female,0,63.376448,65,100,17,72.590734,73,100,10,72.467181,74,100
male,27,68.821577,69,100,23,65.545643,66,100,15,63.446058,64,100


# 4.2 Test preparation course

In [126]:
# Min, mean, median and max of each score column per test-preparation status.
# Select the numeric score columns explicitly: applying 'mean'/'median' to the
# remaining string columns raises TypeError in pandas >= 2.0.
score_columns = ['math score', 'reading score', 'writing score']
D.groupby('test preparation course')[score_columns].agg(['min', 'mean', 'median', 'max'])

Unnamed: 0_level_0,math score,math score,math score,math score,reading score,reading score,reading score,reading score,writing score,writing score,writing score,writing score
Unnamed: 0_level_1,min,mean,median,max,min,mean,median,max,min,mean,median,max
test preparation course,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
completed,23,69.96648,70,100,37,74.175978,75,100,36,74.684358,76,100
none,0,63.78972,64,100,17,66.417445,67,100,10,64.457944,65,100


# 5. Finding the top 25% of students based on their math score

In [127]:
# Order the students from highest to lowest math score
d = D.sort_values(by='math score', ascending=False)
d

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
999,male,group E,bachelor's degree,standard,completed,100,100,100
996,male,group A,some college,standard,completed,100,96,86
515,female,group E,some college,standard,none,100,92,97
517,female,group E,associate's degree,standard,none,100,100,100
516,female,group E,bachelor's degree,standard,none,100,100,100
...,...,...,...,...,...,...,...,...
2,female,group B,some high school,free/reduced,none,18,32,28
51,female,group C,some high school,free/reduced,none,13,41,51
3,female,group B,some college,standard,none,11,38,32
1,female,group B,high school,free/reduced,none,8,24,23


In [128]:
# Renumber the sorted rows 0..n-1 while keeping the original row labels in an 'index' column.
# The frame is rebuilt from D instead of reassigning d from itself, so the cell is
# idempotent: re-running `d = d.reset_index()` would add a 'level_0' column and then fail.
d = D.sort_values('math score', ascending=False).reset_index()
d

Unnamed: 0,index,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,999,male,group E,bachelor's degree,standard,completed,100,100,100
1,996,male,group A,some college,standard,completed,100,96,86
2,515,female,group E,some college,standard,none,100,92,97
3,517,female,group E,associate's degree,standard,none,100,100,100
4,516,female,group E,bachelor's degree,standard,none,100,100,100
...,...,...,...,...,...,...,...,...,...
995,2,female,group B,some high school,free/reduced,none,18,32,28
996,51,female,group C,some high school,free/reduced,none,13,41,51
997,3,female,group B,some college,standard,none,11,38,32
998,1,female,group B,high school,free/reduced,none,8,24,23


25 % of 1000 = 250

In [132]:
# Top 25% of students by math score: d is sorted in descending order of math score,
# so the first quarter of its rows are the highest scorers.
# The cutoff is derived from the row count instead of hard-coding 250.
top_quarter_count = len(d) // 4
d.head(top_quarter_count)


Unnamed: 0,index,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,999,male,group E,bachelor's degree,standard,completed,100,100,100
1,996,male,group A,some college,standard,completed,100,96,86
2,515,female,group E,some college,standard,none,100,92,97
3,517,female,group E,associate's degree,standard,none,100,100,100
4,516,female,group E,bachelor's degree,standard,none,100,100,100
...,...,...,...,...,...,...,...,...,...
245,856,male,group E,some high school,standard,completed,77,76,77
246,855,male,group E,associate's degree,free/reduced,completed,77,69,68
247,854,male,group D,some high school,standard,completed,77,68,69
248,853,male,group D,associate's degree,free/reduced,none,77,78,73
