In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns 

import statsmodels
import statsmodels.api as sm
import scipy.stats as stats

##### Loading data - adopted from Leena_EDA

In [3]:
df1 = pd.read_csv('Leena_df1Clean.csv')
df1.head()

Unnamed: 0,Marital Status,Course,Previous qualification,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Gender,...,Inflation rate,GDP,Target,isDisplaced,hasSpecialNeeds,tuitionToDate,hasScholarship,isDebtor,AttendanceTime,Nationality
0,Single,Animation and Multimedia Design,Secondary,122.0,Basic,Basic,5,9,127.3,Male,...,1.4,1.74,Dropout,True,False,True,False,False,Day,Portuguese
1,Single,Tourism,Secondary,160.0,Secondary,Higher,3,3,142.5,Male,...,-0.3,0.79,Graduate/Enrolled,True,False,False,False,False,Day,Portuguese
2,Single,Communication Design,Secondary,122.0,Basic,Basic,9,9,124.8,Male,...,1.4,1.74,Dropout,True,False,False,False,False,Day,Portuguese
3,Single,Journalism and Communication,Secondary,122.0,Basic,Basic,5,3,119.6,Female,...,-0.8,-3.12,Graduate/Enrolled,True,False,True,False,False,Day,Portuguese
4,Married,Social Service (evening attendance),Secondary,100.0,Basic,Basic,9,9,141.5,Female,...,-0.3,0.79,Graduate/Enrolled,False,False,True,False,False,Evening,Portuguese


In [4]:
df1_grad = df1[df1.get('Target') == 'Graduate/Enrolled']
df1_drop = df1[df1.get('Target') == 'Dropout']

## Student Academic Path: Chi-Squared Test

> #### Are the enrolled courses for both graduate/enrolled and dropout students uniform?

In [7]:
# osberving relative frequencies of course for each target
course_relfreq = pd.crosstab(df1['Target'], df1['Course'])
course_relfreq.loc['Dropout'] = (course_relfreq.loc['Dropout'] / df1_drop.shape[0])
course_relfreq.loc['Graduate/Enrolled'] = (course_relfreq.loc['Graduate/Enrolled'] / df1_grad.shape[0])
course_relfreq

Course,Advertising and Marketing Management,Agronomy,Animation and Multimedia Design,Basic Education,Biofuel Production Technologies,Communication Design,Equinculture,Informatics Engineering,Journalism and Communication,Management,Management (evening attendance),Nursing,Oral Hygiene,Social Service,Social Service (evening attendance),Tourism,Veterinary Nursing
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Dropout,0.066854,0.060521,0.057706,0.059817,0.00563,0.03589,0.054891,0.064743,0.071077,0.0943,0.095707,0.08304,0.023223,0.045742,0.049965,0.067558,0.063336
Graduate/Enrolled,0.057609,0.041292,0.044289,0.035631,0.001332,0.058275,0.020979,0.025974,0.07659,0.081918,0.043956,0.215784,0.017649,0.09657,0.047952,0.051948,0.082251


> $H_0$: The frequencies for each course are uniform, and any variance is due to chance.
>
> $H_a$: A uniform proportion for each course is **not** a good fit, suggesting a skewness of a particular course the students are enrolled in.

In [24]:
# graduate/enrolled
print('Graduate/Enrolled Students')
print('P-value: ' + str(stats.chisquare(df1_grad['Course'].value_counts())[1]))

# dropout
print('\nDropout')
print('P-value: ' + str(stats.chisquare(df1_drop['Course'].value_counts())[1]))

Graduate/Enrolled Students
P-value: 0.0

Dropout
P-value: 1.2302569950781588e-34


>Both tests resulted in a p-val of 0 (or close to 0), so we reject the null for both graduate/enrolled and dropout students. Thus, it could be said that there is some skewness of the distribution of Courses that each student is enrolled in. 

## Test for Homogeneity

#### Mother's Qualification

In [44]:
pivot_table = pd.crosstab(df1['Target'], df1['Mother\'s qualification'])
pivot_table

Mother's qualification,Basic,Higher,Secondary,Unknown
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dropout,834,191,300,96
Graduate/Enrolled,1763,437,769,34


In [50]:
sqtab = sm.stats.SquareTable(pivot_table)
print(sqtab.homogeneity())

df          5
pvalue      0.0
statistic   4412.042205636382
