### Importing Libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly.express as px

### Read Data

In [11]:
# Load the student performance records; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv("StudentsPerformance.csv")

### Examine the data set

In [12]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [13]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [14]:
# Compute the overall score as the sum of the three subject scores.
x, y, z = (
    df[subject]
    for subject in ('math score', 'reading score', 'writing score')
)

# Element-wise sum per student, stored as a new column on the frame.
total_marks = x + y + z
df['total marks'] = total_marks

### Analyzing value counts

In [15]:
# Profile the categorical columns: for each one, print how many distinct
# categories it has, followed by the frequency of every category.
df_1 = df[['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']]

# df_1 contains only the categorical columns, so the numeric score columns
# never reach this loop and no per-column filter is required.
for col in df_1.columns:
    # nunique() ignores missing values, matching value_counts() below.
    print('Count is:', df_1[col].nunique())
    print(df_1[col].value_counts())

Count is: 2
female    518
male      482
Name: gender, dtype: int64
Count is: 5
group C    319
group D    262
group B    190
group E    140
group A     89
Name: race/ethnicity, dtype: int64
Count is: 6
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64
Count is: 2
standard        645
free/reduced    355
Name: lunch, dtype: int64
Count is: 2
none         642
completed    358
Name: test preparation course, dtype: int64


In [16]:
# Re-check the schema now that the derived 'total marks' column has been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
 8   total marks                  1000 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 70.4+ KB


## Visualizing categorical distributions

### Categorical and nominal variables:
- Gender
- race/ethnicity
- parental level of education
- lunch
- test preparation course

# Bar Plots

In [17]:
# Bar chart of the students per gender, coloured by gender.
fig = px.bar(
    df,
    x='gender',
    color="gender",
    title="Total number students",
    labels={"gender": "Gender", "count": "Count"},
)
fig.show()

In [18]:
# Bar chart of the students per race/ethnicity group, coloured by group.
# The column holds race/ethnicity groups (group A ... group E), not
# nationalities, so the title and the axis label name it as such.
fig = px.bar(
    df,
    x='race/ethnicity',
    color="race/ethnicity",
    title="Distribution of race/ethnicity",
    labels={
        "race/ethnicity": "Race/ethnicity",
        "count": "Count",
    },
)
fig.show()

In [19]:
# Students per race/ethnicity group, with one bar per gender side by side.
# The column holds race/ethnicity groups (group A ... group E), not
# nationalities, so the title and the axis label name it as such.
fig = px.bar(
    df,
    x='race/ethnicity',
    color="gender",
    barmode='group',
    title="Gender-specific comparison of race/ethnicity",
    labels={
        "race/ethnicity": "Race/ethnicity",
        "gender": "Gender",
        "count": "Count",
    },
)
fig.show()

In [20]:
# Total marks per parental education level, with one bar group per gender.
axis_labels = {
    "total marks": "Total marks",
    "parental level of education": "Parental level of education",
    "count": "Count",
    "gender": "Gender",
}
fig = px.bar(
    df,
    x='parental level of education',
    y='total marks',
    barmode='group',
    color="gender",
    title="Gender-specific distribution of parental education level in relation to overall grades",
    labels=axis_labels,
)
fig.show()

In [21]:
# Students per test-preparation status (none / completed), coloured by status.
fig = px.bar(
    df,
    x='test preparation course',
    color="test preparation course",
    title="Participation of the students in the test preparation course",
    labels={
        # The label must be keyed by the plotted column to take effect.
        # "Teilnahme" is German for "participation".
        "test preparation course": "Teilnahme",
        "count": "Count",
    },
)
fig.show()

# Pie Plots

In [22]:
# Pie chart with one slice per gender.
fig = px.pie(
    df,
    names='gender',
    title="Total number of female and male students",
    labels={"gender": "Gender"},
)
fig.show()

In [23]:
# Pie chart of the total marks, split into one slice per gender.
fig = px.pie(
    df,
    values='total marks',
    names='gender',
    title="Gender-specific overall score",
)
fig.show()

# Histogram Plots

### Distribution of a continuous variable

In [24]:
# Draw the overall-score histogram twice: first titled with default binning,
# then untitled with nbins=15 for comparison.
for hist_options in (
    {"title": "Histogram for the overall score"},
    {"nbins": 15},
):
    fig = px.histogram(df, x="total marks", **hist_options)
    fig.show()

### Distribution of a discrete variable

In [25]:
# Histogram over the parental education categories.
fig = px.histogram(
    df,
    x="parental level of education",
)
fig.show()

# Boxplots

A boxplot is a statistical representation of the distribution of a variable by its quartiles. The ends of the box represent the lower and upper quartiles, while the median (second quartile) is marked by a line inside the box.

In [26]:
# Box plot of the total marks across all students.
fig = px.box(
    df,
    y="total marks",
    labels={"total marks": "Gesamtnote"},
    title="Distribution of the overall grade",
)
fig.show()

# Scatter Plots
### Distribution with several variables

In [27]:
# Scatter of every student's total marks, coloured by gender.
# No x column is passed to the call.
axis_labels = {"total marks": "Total marks", "gender": "Gender"}
fig = px.scatter(
    df,
    y="total marks",
    color="gender",
    labels=axis_labels,
    title="Gender-specific distribution of the overall grade",
)
fig.show()

### Distribution between variables

In [28]:
# Writing score against total marks, coloured by gender.
axis_labels = {
    "total marks": "Total marks",
    "writing score": "Writing score",
    "gender": "Gender",
}
fig = px.scatter(
    df,
    x="writing score",
    y="total marks",
    color="gender",
    labels=axis_labels,
    title="Gender-specific distribution of the overall grade vs. written grade",
)
fig.show()

In [29]:
# Math score against reading score, coloured by gender.
# The y column is the reading score so that the plotted data matches the
# title and the "reading score" axis label.
fig = px.scatter(
    df,
    x="math score",
    y="reading score",
    color="gender",
    labels={
        "math score": "Math score",
        "reading score": "Reading score",
    },
    title="Gender distribution of maths grade vs reading grade",
)
fig.show()

In [30]:
# Scatter of every student's math score, coloured by gender.
# No x column is passed to the call.
fig = px.scatter(
    df,
    y="math score",
    color="gender",
    labels={"math score": "Math score", "gender": "Gender"},
    title="Distribution of math grade in relation to gender",
)
fig.show()

###  Bubble Chart

In [31]:
# Bubble chart: writing score against total marks, coloured by gender, with
# the marker size taken from the reading score.
bubble_labels = {
    "total marks": "Gesamtnote",
    "writing score": "Schriftliche Note",
    "gender": "Geschlecht",
}
fig = px.scatter(
    df,
    x="writing score",
    y="total marks",
    color="gender",
    size='reading score',
    labels=bubble_labels,
    title="Gender-specific distribution of the overall grade vs. written grade",
)
fig.show()

# Scatter Plots with Linear Regression

In [32]:
# Requires the statsmodels package (pip install statsmodels).
# trendline="ols" draws an Ordinary Least Squares regression line for each
# discrete colour/symbol group.
fig = px.scatter(
    df,
    x="writing score",
    y="total marks",
    trendline="ols",
)
fig.show()