In [5]:
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats.mstats import winsorize
import warnings

# NOTE(review): this silences every warning category for the whole kernel session,
# including library deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [6]:
# Database connection settings.
# Each value can be overridden through an environment variable so that real
# credentials never have to be committed with the notebook; the fallbacks are
# the values this notebook originally hardcoded, so Restart & Run All still works.
import os

postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PASSWORD', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'studentsperformance')

In [8]:
# Build the SQLAlchemy engine from the connection settings defined above.
from urllib.parse import quote

# Percent-encode the user name and password so that characters which are
# special inside a URL (e.g. '@', '/', ':') cannot corrupt the connection string.
engine = create_engine(
    "postgresql://{user}:{pw}@{host}:{port}/{db}".format(
        user=quote(postgres_user, safe=""),
        pw=quote(postgres_pw, safe=""),
        host=postgres_host,
        port=postgres_port,
        db=postgres_db,
    )
)

In [9]:
# Load the full studentsperformance table into a DataFrame.
students_query = "select * from studentsperformance"
stu_performance = pd.read_sql(sql=students_query, con=engine)

In [10]:
# Preview the first five rows to sanity-check the load.
stu_performance.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [11]:
# Show column dtypes and non-null counts (quick check for missing values).
stu_performance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
gender                         1000 non-null object
race/ethnicity                 1000 non-null object
parental level of education    1000 non-null object
lunch                          1000 non-null object
test preparation course        1000 non-null object
math score                     1000 non-null int64
reading score                  1000 non-null int64
writing score                  1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


## 1. Are there any differences between the genders, ethnicities, and parental level of education with respect to their performances in exams?

In [12]:
# Mean exam scores per gender.
# Select the numeric score columns *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0 and is wasted work on older versions.
score_cols = ['math score', 'reading score', 'writing score']
stu_performance.groupby('gender')[score_cols].mean()

Unnamed: 0_level_0,math score,reading score,writing score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,63.633205,72.608108,72.467181
male,68.728216,65.473029,63.311203


In [14]:
# Independent two-sample t-tests (female vs. male), one test per score column.
exam_cols = ['math score', 'reading score', 'writing score']
female_scores = stu_performance.loc[stu_performance['gender'] == 'female', exam_cols]
male_scores = stu_performance.loc[stu_performance['gender'] == 'male', exam_cols]
stats.ttest_ind(female_scores, male_scores)

Ttest_indResult(statistic=array([-5.38324587,  7.95930801,  9.97955791]), pvalue=array([9.12018555e-08, 4.68053874e-15, 2.01987771e-22]))

## Females performed significantly better in reading and writing, while males performed significantly better in math (all p < 0.001)

In [15]:
# Mean exam scores per race/ethnicity group.
# Select the numeric score columns *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0 and is wasted work on older versions.
score_cols = ['math score', 'reading score', 'writing score']
stu_performance.groupby('race/ethnicity')[score_cols].mean()

Unnamed: 0_level_0,math score,reading score,writing score
race/ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
group A,61.629213,64.674157,62.674157
group B,63.452632,67.352632,65.6
group C,64.46395,69.103448,67.827586
group D,67.362595,70.030534,70.145038
group E,73.821429,73.028571,71.407143


In [16]:
# Distinct race/ethnicity labels, in order of first appearance in the data.
groups = pd.unique(stu_performance['race/ethnicity'])


In [18]:
# GroupBy handle keyed on race/ethnicity.
# NOTE(review): not referenced by the cells visible in this part of the notebook.
grouped_df = stu_performance.groupby(by='race/ethnicity')

In [20]:
# Pairwise independent t-tests between every pair of race/ethnicity groups,
# repeated for each exam score. Each unordered pair is tested exactly once.
separator = "----------------------------------------------------------------"
for var in ["math score", "reading score", "writing score"]:
    print(separator)
    print("Comparisons for variable: {}".format(var))
    print(separator)
    for pos, first_group in enumerate(groups):
        # Only pair with groups that come later, so (A, B) is not repeated as (B, A).
        for second_group in groups[pos + 1:]:
            print("t-test between groups {0} and {1}:".format(first_group, second_group))
            first_scores = stu_performance.loc[
                stu_performance['race/ethnicity'] == first_group, var]
            second_scores = stu_performance.loc[
                stu_performance['race/ethnicity'] == second_group, var]
            print(stats.ttest_ind(first_scores, second_scores))

----------------------------------------------------------------
Comparisons for variable: math score
----------------------------------------------------------------
t-test between groups group B and group C:
Ttest_indResult(statistic=-0.7315669893534263, pvalue=0.4647708939167453)
t-test between groups group B and group A:
Ttest_indResult(statistic=0.9355076279747488, pvalue=0.3503415961659957)
t-test between groups group B and group D:
Ttest_indResult(statistic=-2.82845539712675, pvalue=0.004886197137104194)
t-test between groups group B and group E:
Ttest_indResult(statistic=-6.007397050552227, pvalue=5.007946047497971e-09)
t-test between groups group C and group A:
Ttest_indResult(statistic=1.5997220303217299, pvalue=0.11043810745588042)
t-test between groups group C and group D:
Ttest_indResult(statistic=-2.41858624746011, pvalue=0.015888349556016285)
t-test between groups group C and group E:
Ttest_indResult(statistic=-6.127642520822135, pvalue=1.927238849552764e-09)
t-test betw

## The differences between groups are statistically significant, except for the comparisons among groups A, B, and C

## 2. Are there any differences between the lunch types with respect to their performances in exams? If there are, how do you explain this?

In [21]:
# Mean exam scores per lunch type.
# Select the numeric score columns *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0 and is wasted work on older versions.
score_cols = ['math score', 'reading score', 'writing score']
stu_performance.groupby('lunch')[score_cols].mean()

Unnamed: 0_level_0,math score,reading score,writing score
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
free/reduced,58.921127,64.653521,63.022535
standard,70.034109,71.654264,70.823256


In [29]:
# Independent two-sample t-tests (free/reduced vs. standard lunch), one test per score column.
exam_cols = ['math score', 'reading score', 'writing score']
reduced_lunch_scores = stu_performance.loc[stu_performance['lunch'] == 'free/reduced', exam_cols]
standard_lunch_scores = stu_performance.loc[stu_performance['lunch'] == 'standard', exam_cols]
stats.ttest_ind(reduced_lunch_scores, standard_lunch_scores)

Ttest_indResult(statistic=array([-11.83718047,  -7.45105647,  -8.0097842 ]), pvalue=array([2.41319560e-30, 2.00279665e-13, 3.18618958e-15]))

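## There is a significant difference between the two lunch groups: students with standard lunch scored higher than students with free/reduced lunch on all three exams

A plausible explanation (not tested here) is that free/reduced lunch eligibility is a proxy for lower household income, which may limit access to study resources and support outside school.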

## 3. Does the test preparation course seem to have an effect on the exam performances?

In [31]:
# List the distinct values of the test preparation course column.
pd.unique(stu_performance['test preparation course'])

array(['none', 'completed'], dtype=object)

In [32]:
# Mean exam scores per test-preparation status.
# Select the numeric score columns *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns, which raises a
# TypeError on pandas >= 2.0 and is wasted work on older versions.
score_cols = ['math score', 'reading score', 'writing score']
stu_performance.groupby('test preparation course')[score_cols].mean()

Unnamed: 0_level_0,math score,reading score,writing score
test preparation course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
completed,69.695531,73.893855,74.418994
none,64.077882,66.534268,64.504673


In [33]:
# Independent two-sample t-tests (completed vs. none), one test per score column.
exam_cols = ['math score', 'reading score', 'writing score']
prep_status = stu_performance['test preparation course']
completed_scores = stu_performance.loc[prep_status == 'completed', exam_cols]
no_prep_scores = stu_performance.loc[prep_status == 'none', exam_cols]
stats.ttest_ind(completed_scores, no_prep_scores)

Ttest_indResult(statistic=array([ 5.70461642,  7.87166354, 10.40917344]), pvalue=array([1.53591346e-08, 9.08178334e-15, 3.68529174e-24]))

## Students who completed the test preparation course performed significantly better on all three exams

## 4. Which 2 exam scores are most correlated with each other?

In [34]:
# Pairwise correlation matrix of the three exam scores.
# Restrict to the numeric score columns explicitly: DataFrame.corr() on a frame
# that still contains text columns raises on pandas >= 2.0 instead of silently
# dropping them.
score_cols = ['math score', 'reading score', 'writing score']
stu_performance[score_cols].corr()

Unnamed: 0,math score,reading score,writing score
math score,1.0,0.81758,0.802642
reading score,0.81758,1.0,0.954598
writing score,0.802642,0.954598,1.0


## Reading and writing scores most correlated wit