# School Project Analysis 

This analysis showed income and poverty level can influence parents decision on what type of school they will send their children. The results on education category isn't show significant difference for parents on school choice. 

In [34]:
# import dependencies
import pandas as pd
import os
from matplotlib import pyplot as plt
import numpy as np
import scipy.stats as stats

In [35]:
# Read and coonvert csv files to data frame
file1 = os.path.join("Resources", "AllStudentsGrade5_edit.csv")
file2 = os.path.join("Resources", "census_demo.csv")
file1_df = pd.read_csv(file1, encoding="ISO-8859-1")
file2_df = pd.read_csv(file2, encoding="ISO-8859-1")

# Check the length of the files to knowing the number of rows on each file
len(file1_df)
len(file2_df)

33120

In [36]:
# Combine those 2 files by merging file1 at the left side of file2
merge_table = pd.merge(file1_df, file2_df, on="Zipcode", how="left")
len(merge_table)

4326

In [37]:
# Display the combined dataframe and check the table headers
merge_table.head()

Unnamed: 0,CAMPUS,DNAME,CNAME,GRADE,District Type,School Site Street Address,School Site City,Zipcode,Below Grade Level,Approach Grade Level,Meet Grade Level,Master Grade Level,Income Below 75K,Income Over 100K,High School Diploma,Master Degree,Poverty Family,Non Poverty Family
0,1902103,CAYUGA ISD,CAYUGA ELEM.,5,INDEPENDENT,17750 N US HWY 287,TENNESSEE COLONY,75861.0,4.0,44.0,41.0,29.0,0.0,0.0,1132.0,45.0,0.0,192.0
1,1903102,ELKHART ISD,ELKHART INTERME,5,INDEPENDENT,301 E PARKER ST,ELKHART,75839.0,3.0,80.0,65.0,43.0,12.0,0.0,1123.0,125.0,48.0,601.0
2,1904102,FRANKSTON ISD,FRANKSTON ELEM.,5,INDEPENDENT,100 PERRY ST,FRANKSTON,75763.0,7.0,53.0,40.0,26.0,0.0,0.0,1341.0,278.0,131.0,582.0
3,1906102,NECHES ISD,NECHES ELEM.,5,INDEPENDENT,3055 FM 2574,PALESTINE,75803.0,1.0,23.0,15.0,10.0,53.0,0.0,4501.0,501.0,136.0,2279.0
4,1907110,PALESTINE ISD,STORY INTERMEDI,5,INDEPENDENT,5300 S LOOP 256,PALESTINE,75801.0,58.0,185.0,106.0,51.0,0.0,0.0,2441.0,545.0,43.0,1334.0


In [38]:
# Export file as a CSV, without the Pandas index, but with the header
output_file = os.path.join("Output", "Grade5_TXdemo.csv")
merge_table.to_csv(output_file, index=False, header=True)

In [39]:
# List out the columns header for reorganizing the file
merge_table.columns

Index(['CAMPUS', 'DNAME', 'CNAME', 'GRADE', 'District Type',
       'School Site Street Address', 'School Site City', 'Zipcode',
       'Below Grade Level', 'Approach Grade Level', 'Meet Grade Level',
       'Master Grade Level', 'Income Below 75K', 'Income Over 100K',
       'High School Diploma', 'Master Degree', 'Poverty Family',
       'Non Poverty Family'],
      dtype='object')

In [40]:
# Add 2 new columns for number of students that below and passed STAAR test 
merge_table['Below Grade'] = (merge_table['Below Grade Level'] + merge_table['Approach Grade Level'])
merge_table['Pass Grade'] = (merge_table['Meet Grade Level'] + merge_table['Master Grade Level'])

In [41]:
# Reorganize the df by filtering the columns that won't used in the analysis
analysis_df = merge_table[['District Type',
       'Below Grade', 'Pass Grade', 'Income Below 75K',
       'Income Over 100K', 'High School Diploma',
       'Master Degree', 'Poverty Family', 'Non Poverty Family']]
analysis_df.head()

Unnamed: 0,District Type,Below Grade,Pass Grade,Income Below 75K,Income Over 100K,High School Diploma,Master Degree,Poverty Family,Non Poverty Family
0,INDEPENDENT,48.0,70.0,0.0,0.0,1132.0,45.0,0.0,192.0
1,INDEPENDENT,83.0,108.0,12.0,0.0,1123.0,125.0,48.0,601.0
2,INDEPENDENT,60.0,66.0,0.0,0.0,1341.0,278.0,131.0,582.0
3,INDEPENDENT,24.0,25.0,53.0,0.0,4501.0,501.0,136.0,2279.0
4,INDEPENDENT,243.0,157.0,0.0,0.0,2441.0,545.0,43.0,1334.0


In [42]:
# analysis_df = analysis_df.replace(0.0,np.NaN)

In [43]:
# analysis_df = analysis_df.fillna("")
# analysis_df.head()

In [44]:
# Replace none value with 0
# analysis_df = analysis_df.fillna(0)

In [45]:
# Separate two types of schools 
independ = analysis_df.loc[analysis_df['District Type'] == 'INDEPENDENT', :]
charter = analysis_df.loc[analysis_df['District Type'] == 'CHARTER', :]

In [55]:
# Using t-test to evaluate difference between ISD and Charter schools on those demo that related to school choice. Used 'omit' on nan_policy to handle "NaN" value. 
stats.ttest_ind(independ['Income Below 75K'], charter['Income Below 75K'], equal_var=False, nan_policy='omit')
stats.ttest_ind(independ['Income Over 100K'], charter['Income Over 100K'], equal_var=False, nan_policy='omit')
stats.ttest_ind(independ['High School Diploma'], charter['High School Diploma'], equal_var=False, nan_policy='omit')
stats.ttest_ind(independ['Master Degree'], charter['Master Degree'], equal_var=False, nan_policy='omit')
stats.ttest_ind(independ['Poverty Family'], charter['Poverty Family'], equal_var=False, nan_policy='omit')
stats.ttest_ind(independ['Non Poverty Family'], charter['Non Poverty Family'], equal_var=False, nan_policy='omit')

Ttest_indResult(statistic=2.3500222377212143, pvalue=0.019169723907682373)

In [56]:
# Create a date frame with the t-test score
school_df = pd.DataFrame({'ISD vs Charter':['t-test score'],
                         'Income < 75K':['0.0002'],
                        'Income > 100K':['0.0023'], 
                        'High School Diploma':['1.3486'], 
                        'Master Degree':['0.3379'], 
                        'Poverty Family':['0.0015'], 
                        'Non Poverty Family':['0.0192']})
school_df.head()

Unnamed: 0,ISD vs Charter,Income < 75K,Income > 100K,High School Diploma,Master Degree,Poverty Family,Non Poverty Family
0,t-test score,0.0002,0.0023,1.3486,0.3379,0.0015,0.0192


In [57]:
# Converting the data frame to a csv file
output_file = os.path.join("Output", "ISD_vs_Charter_demo_ttest.csv")

school_df.to_csv(output_file, index=False, header=True)