# Simpson's Paradox
Use `admission_data.csv` for this exercise.

In [2]:
# Read the admissions records and preview the top of the table
import pandas as pd
import numpy as np

# NOTE: the `Unnamed: 0` column in the preview is presumably a row index that
# was written into the CSV when it was saved -- confirm against the file.
data = pd.read_csv('admission_data.csv')
data.head()  # head() shows five rows by default

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
1,56105,male,Physics,True
2,31441,female,Chemistry,False
3,51765,male,Physics,True
4,53714,female,Physics,True


### Proportion and admission rate for each gender

In [30]:
# Fraction of the student body made up by each gender
# (normalize=True returns relative frequencies instead of raw counts)
gender_share = data['gender'].value_counts(normalize=True)
gender_share

female    0.514
male      0.486
Name: gender, dtype: float64

In [10]:
# Within each gender, the fraction of students rejected (False) and admitted (True)
admitted_by_gender = data.groupby('gender')['admitted']
admitted_by_gender.value_counts(normalize=True)

gender  admitted
female  False       0.712062
        True        0.287938
male    False       0.514403
        True        0.485597
Name: admitted, dtype: float64

### Proportion and admission rate for physics majors of each gender

In [16]:
# What proportion of female students are majoring in physics?
female_students = data[data['gender'] == 'female']

# The mean of a boolean mask is the fraction of rows where it is True.
# Numerator and denominator now come from the same set of rows, instead of
# dividing the non-null count of `admitted` by the non-null count of
# `student_id`, which would disagree if either column had missing values.
(female_students['major'] == 'Physics').mean()

0.12062256809338522

In [18]:
# What proportion of male students are majoring in physics?
male_students = data[data['gender'] == 'male']

# The mean of a boolean mask is the fraction of rows where it is True.
# Numerator and denominator now come from the same set of rows, instead of
# dividing the non-null count of `admitted` by the non-null count of
# `student_id`, which would disagree if either column had missing values.
(male_students['major'] == 'Physics').mean()

0.9259259259259259

In [21]:
# Admission rate for female physics majors
# `admitted` holds True/False values (see the data preview), so its mean over
# the physics rows is the admitted share. This avoids `.count()[0]`, which
# looks up a label-indexed Series by integer position (deprecated in recent
# pandas) and silently depends on which column happens to come first.
female_students.loc[female_students['major'] == 'Physics', 'admitted'].mean()

0.7419354838709677

In [22]:
# Admission rate for male physics majors
# `admitted` holds True/False values (see the data preview), so its mean over
# the physics rows is the admitted share. This avoids `.count()[0]`, which
# looks up a label-indexed Series by integer position (deprecated in recent
# pandas) and silently depends on which column happens to come first.
male_students.loc[male_students['major'] == 'Physics', 'admitted'].mean()

0.5155555555555555

### Proportion and admission rate for chemistry majors of each gender

In [23]:
# What proportion of female students are majoring in chemistry?
# Mean of the boolean mask = fraction of female students whose major is
# Chemistry. This replaces `.count()[0] / len(...)`, which mixed a non-null
# count of the first column with a plain row count and relied on deprecated
# integer-position lookup on a label-indexed Series.
(female_students['major'] == 'Chemistry').mean()

0.8793774319066148

In [24]:
# What proportion of male students are majoring in chemistry?
# Mean of the boolean mask = fraction of male students whose major is
# Chemistry. This replaces `.count()[0] / len(...)`, which mixed a non-null
# count of the first column with a plain row count and relied on deprecated
# integer-position lookup on a label-indexed Series.
(male_students['major'] == 'Chemistry').mean()

0.07407407407407407

In [25]:
# Admission rate for female chemistry majors
# `admitted` holds True/False values (see the data preview), so its mean over
# the chemistry rows is the admitted share. This avoids `.count()[0]`, which
# looks up a label-indexed Series by integer position (deprecated in recent
# pandas) and silently depends on which column happens to come first.
female_students.loc[female_students['major'] == 'Chemistry', 'admitted'].mean()

0.22566371681415928

In [26]:
# Admission rate for male chemistry majors
# `admitted` holds True/False values (see the data preview), so its mean over
# the chemistry rows is the admitted share. This avoids `.count()[0]`, which
# looks up a label-indexed Series by integer position (deprecated in recent
# pandas) and silently depends on which column happens to come first.
male_students.loc[male_students['major'] == 'Chemistry', 'admitted'].mean()

0.1111111111111111

### Admission rate for each major

In [27]:
# Share of all physics applicants (both genders) who were admitted
is_physics = data['major'] == "Physics"
data.loc[is_physics, 'admitted'].mean()

0.54296875

In [28]:
# Share of all chemistry applicants (both genders) who were admitted
is_chemistry = data['major'] == "Chemistry"
data.loc[is_chemistry, 'admitted'].mean()

0.21721311475409835

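Far more female students applied to chemistry (about 88% of women versus 7% of men), and chemistry had a much lower admission rate than physics (about 22% versus 54%). As a result, women had a lower overall admission rate than men (about 29% versus 49%), even though within each major women were admitted at a higher rate than men (physics: 74% versus 52%; chemistry: 23% versus 11%). This reversal of a trend when groups are combined is known as **Simpson's Paradox**.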