# exploratory data analysis

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# NOTE(review): called without a category or module filter, so this hides every warning for the rest of the session
warnings.filterwarnings("ignore")

## dataset

In [3]:
# load the student performance csv and preview its first five rows
data = pd.read_csv(filepath_or_buffer="dataset/stud.csv")
data.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# dimensions of the loaded frame as (rows, columns)
data.shape

(1000, 8)

In [5]:
# column names as a plain python list
list(data.columns)

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course',
 'math_score',
 'reading_score',
 'writing_score']

In [6]:
# number of missing entries in each column
data.isna().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [7]:
# duplicates: number of rows that exactly repeat an earlier row
data.duplicated().sum()

np.int64(0)

In [8]:
# data info: per-column dtype and non-null count, plus memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [9]:
# statistical description of the numeric score columns (count, mean, std, min, quartiles, max)
data.describe()

Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


## exploratory data analysis (eda)

In [10]:
# preview the first five rows before looking at each column
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [11]:
# number of rows (students) per gender
data["gender"].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [12]:
# number of rows (students) per race/ethnicity group
data["race_ethnicity"].value_counts()

race_ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

In [13]:
# number of rows (students) per parental education level
data["parental_level_of_education"].value_counts()

parental_level_of_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

In [14]:
# number of rows (students) per test preparation course status
data["test_preparation_course"].value_counts()

test_preparation_course
none         642
completed    358
Name: count, dtype: int64

In [15]:
# number of rows (students) per lunch type
data["lunch"].value_counts()

lunch
standard        645
free/reduced    355
Name: count, dtype: int64

In [16]:
# NOTE(review): repeats the data.describe() summary already shown earlier; the output is identical
data.describe()

Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


## data info
### object dtypes
- gender: male/female
- race/ethnicity: group A to group E
- parental level of education: some high school, high school, some college, associate's degree, bachelor's degree, master's degree
- lunch: lunch before test (standard or free/reduced)
- test preparation course: whether the test preparation course was completed before the test (completed or none)
### int dtypes
- math scores
- reading scores
- writing scores

### more information
- no duplicates
- no missing values
- 1000 rows
- 8 features/columns


## feature engineering

In [17]:
# preview the rows again before deriving new columns
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [21]:
# dtype of the reading_score column
data.dtypes["reading_score"]

dtype('int64')

In [22]:
# total_score: row-wise sum of the math, reading and writing scores
data["total_score"] = (
    data["math_score"]
    .add(data["reading_score"])
    .add(data["writing_score"])
)
data["total_score"].head(n=5)

0    218
1    247
2    278
3    148
4    229
Name: total_score, dtype: int64

In [23]:
# preview the frame with the new total_score column
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218
1,female,group C,some college,standard,completed,69,90,88,247
2,female,group B,master's degree,standard,none,90,95,93,278
3,male,group A,associate's degree,free/reduced,none,47,57,44,148
4,male,group C,some college,standard,none,76,78,75,229


In [25]:
# average: total_score divided by the number of subjects (3)
data["average"] = data["total_score"].div(3)
data.tail(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average
995,female,group E,master's degree,standard,completed,88,99,95,282,94.0
996,male,group C,high school,free/reduced,none,62,55,55,172,57.333333
997,female,group C,high school,free/reduced,completed,59,71,65,195,65.0
998,female,group D,some college,standard,completed,68,78,77,223,74.333333
999,female,group D,some college,free/reduced,none,77,86,86,249,83.0


In [26]:
# re-check missing values per column, now including total_score and average
data.isna().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
total_score                    0
average                        0
dtype: int64

In [27]:
# re-check for rows that exactly repeat an earlier row
data.duplicated().sum()

np.int64(0)

In [28]:
# number of columns in the frame
data.shape[1]

10

In [30]:
# dimensions of the frame as (rows, columns)
data.shape

(1000, 10)

In [31]:
# count the students who scored the maximum mark in each subject.
# summing the boolean mask counts the matching rows directly, so the result
# does not depend on the unrelated "average" column existing or being non-null
FULL_MARK = 100

reading_full = (data["reading_score"] == FULL_MARK).sum()
writing_full = (data["writing_score"] == FULL_MARK).sum()
math_full = (data["math_score"] == FULL_MARK).sum()

print(f"number of students with full marks in reading: {reading_full} ")
print(f"number of students with full marks in writing: {writing_full} ")
print(f"number of students with full marks in mathematics: {math_full}")

number of students with full marks in reading: 17 
number of students with full marks in writing: 14 
number of students with full marks in mathematics: 7


In [32]:
# count the students who scored at or below the low-mark threshold in each subject
# (the comparison is inclusive, matching the "20 marks or less" wording below).
# summing the boolean mask counts the matching rows directly, so the result
# does not depend on the unrelated "average" column existing or being non-null
LOW_MARK = 20

reading_less_20 = (data["reading_score"] <= LOW_MARK).sum()
writing_less_20 = (data["writing_score"] <= LOW_MARK).sum()
math_less_20 = (data["math_score"] <= LOW_MARK).sum()

print(f"number of students with 20 marks or less in reading: {reading_less_20}")
print(f"number of students with 20 marks or less in writing: {writing_less_20}")
print(f"number of students with 20 marks or less in mathematics: {math_less_20}")

number of students with 20 marks or less in reading: 1
number of students with 20 marks or less in writing: 3
number of students with 20 marks or less in mathematics: 4
