In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Source CSV files, one per data set (bare file names, so they resolve against the working directory)
Demographics_file = "Demographics.csv"
Achievement_file = "Achievement_Levels.csv"
EWSdata_file = "EWSdata.csv"
ELAdata_file = "ELA_DATA.csv"
MATHdata_file = "MATH_DATA.csv"

# Read each file into its own DataFrame, in the same order as the paths above
Demographics_df, Achievement_df, EWSdata_df, ELAdata_df, MATHdata_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        Demographics_file,
        Achievement_file,
        EWSdata_file,
        ELAdata_file,
        MATHdata_file,
    )
]

In [3]:
# Preview one randomly chosen row of the Demographics.csv data
Demographics_df.sample(n=1)

Unnamed: 0,Student ID,Gender,Ethnicity
945,3515041508,F,Multi-Racial


In [4]:
# Preview one randomly chosen row of the Achievement_Levels.csv data
Achievement_df.sample(n=1)

Unnamed: 0,Student ID,FSA_ELA,FSA_MATH
184,3571301431,2,2


In [5]:
# Preview one randomly chosen row of the EWSdata.csv data
EWSdata_df.sample(n=1)

Unnamed: 0,Student ID,Retained,Attendance,Behavior
0,3507951708,no,no,no


In [6]:
# Preview one randomly chosen row of the ELA_DATA.csv data
ELAdata_df.sample(n=1)

Unnamed: 0,Student ID,Fall_Diag,Winter_Diag
592,3576162000.0,534.0,551.0


In [7]:
# Preview one randomly chosen row of the MATH_DATA.csv data
MATHdata_df.sample(n=1)

Unnamed: 0,Student ID,Fall_Diag,Winter_Diag
482,3517251749,388,424


In [8]:
# Combine the five tables into one wide frame, joining each on 'Student ID'.
# No `how` is passed, so pandas' default (inner) join applies at every step.
# ELA and MATH both carry Fall_Diag / Winter_Diag, so the last merge renames them:
# the *_x columns come from the ELA file and the *_y columns from the MATH file.
all_data = (
    Demographics_df
    .merge(Achievement_df, on='Student ID')
    .merge(EWSdata_df, on='Student ID')
    .merge(ELAdata_df, on='Student ID')
    .merge(MATHdata_df, on='Student ID')
)

In [9]:
# Preview one randomly chosen row of the fully merged table
all_data.sample(n=1)

Unnamed: 0,Student ID,Gender,Ethnicity,FSA_ELA,FSA_MATH,Retained,Attendance,Behavior,Fall_Diag_x,Winter_Diag_x,Fall_Diag_y,Winter_Diag_y
3,3509881649,M,Caucasian,5,5,NO,NO,NO,530.0,556.0,476,496


In [11]:
# Build the ELA-only table: demographics + achievement levels + EWS flags + ELA diagnostics,
# joined on 'Student ID', with the math achievement level removed.
# NOTE(review): sampled rows in this notebook show the Retained/Attendance/Behavior flags
# in mixed case ('NO' vs 'no', 'YES' vs 'yes'); normalize them before grouping on these columns.
ela_data = (
    Demographics_df
    .merge(Achievement_df, on='Student ID')
    .merge(EWSdata_df, on='Student ID')
    .merge(ELAdata_df, on='Student ID')
    .drop(columns=['FSA_MATH'])
)
ela_data.sample(n=1)

Unnamed: 0,Student ID,Gender,Ethnicity,FSA_ELA,Retained,Attendance,Behavior,Fall_Diag,Winter_Diag
97,3548611608,F,Caucasian,3,NO,NO,NO,509.0,553.0


In [12]:
# Build the MATH-only table: demographics + achievement levels + EWS flags + MATH diagnostics,
# joined on 'Student ID', with the ELA achievement level removed.
# NOTE(review): sampled rows in this notebook show the Retained/Attendance/Behavior flags
# in mixed case ('NO' vs 'no', 'YES' vs 'yes'); normalize them before grouping on these columns.
math_data = (
    Demographics_df
    .merge(Achievement_df, on='Student ID')
    .merge(EWSdata_df, on='Student ID')
    .merge(MATHdata_df, on='Student ID')
    .drop(columns=['FSA_ELA'])
)
math_data.sample(n=1)

Unnamed: 0,Student ID,Gender,Ethnicity,FSA_MATH,Retained,Attendance,Behavior,Fall_Diag,Winter_Diag
105,3555201608,F,African American,5,NO,NO,NO,468,493


In [13]:
# Count the missing values in each column of the merged table
all_data.isnull().sum(axis=0)

Student ID       0
Gender           0
Ethnicity        0
FSA_ELA          0
FSA_MATH         0
Retained         0
Attendance       0
Behavior         0
Fall_Diag_x      0
Winter_Diag_x    0
Fall_Diag_y      0
Winter_Diag_y    0
dtype: int64

In [14]:
# Count missing values per column using isna().
# isna() performs the same check as the isnull() call in the previous cell,
# which is why both outputs are identical.
all_data.isna().sum()

Student ID       0
Gender           0
Ethnicity        0
FSA_ELA          0
FSA_MATH         0
Retained         0
Attendance       0
Behavior         0
Fall_Diag_x      0
Winter_Diag_x    0
Fall_Diag_y      0
Winter_Diag_y    0
dtype: int64

In [15]:
# Export all_data
# all_data.to_csv('all_data.csv',index=False)

In [16]:
# Export math and ela data
# math_data.to_csv('math_data_joined.csv',index=False)
# ela_data.to_csv('ela_data_joined.csv',index=False)

In [17]:
# Descriptive statistics for the numeric columns of the ELA table.
# 'Student ID' is numeric, so it is summarized too; it serves as the join key in this
# notebook, so its mean/std/quantiles are not meaningful and can be ignored.
ela_data.describe()

Unnamed: 0,Student ID,FSA_ELA,Fall_Diag,Winter_Diag
count,213.0,213.0,213.0,213.0
mean,3591254000.0,3.173709,515.093897,536.680751
std,350778700.0,1.078461,41.332415,36.078419
min,1300604000.0,1.0,372.0,410.0
25%,3530192000.0,2.0,491.0,517.0
50%,3555562000.0,3.0,518.0,539.0
75%,3577092000.0,4.0,543.0,562.0
max,5926007000.0,5.0,612.0,620.0


In [18]:
# Descriptive statistics for the numeric columns of the MATH table.
# 'Student ID' is numeric, so it is summarized too; it serves as the join key in this
# notebook, so its mean/std/quantiles are not meaningful and can be ignored.
math_data.describe()

Unnamed: 0,Student ID,FSA_MATH,Fall_Diag,Winter_Diag
count,213.0,213.0,213.0,213.0
mean,3591254000.0,3.41784,436.305164,450.516432
std,350778700.0,1.177245,20.392512,19.566191
min,1300604000.0,1.0,365.0,380.0
25%,3530192000.0,3.0,426.0,442.0
50%,3555562000.0,3.0,438.0,453.0
75%,3577092000.0,4.0,450.0,462.0
max,5926007000.0,5.0,486.0,496.0


In [114]:
# diag_bins = [0, 416, 467, 518, 569, 620]
# group_names = ["365-416", "417-467", "468-518", "519-569", "570-620"]

In [19]:
# Labels (1-5) for the five ELA Winter diagnostic score bands, followed by the band edges.
# Six edges delimit five bands; the upper edge of each band is inclusive.
group_names = list(range(1, 6))
diag_bins = [0, 422, 472, 521, 571, 620]

In [23]:
# ELA Winter diagnostic score of every student, indexed by Student ID
# (this Series is the input that gets binned further down)
bins_per_student_ela = (
    ela_data
    .set_index('Student ID')
    .loc[:, 'Winter_Diag']
)

In [24]:
# Display the per-student ELA Winter scores (Series indexed by Student ID)
bins_per_student_ela

Student ID
3505181649    564.0
3507461649    584.0
3508041649    528.0
3509881649    556.0
3510171549    534.0
              ...  
3531461508    593.0
3571921508    522.0
3582191508    554.0
3592081508    534.0
3535761308    519.0
Name: Winter_Diag, Length: 213, dtype: float64

In [25]:
# Sanity check: the selection is one-dimensional (a Series), as shown by ndim == 1
bins_per_student_ela.ndim

1

In [26]:
# Number of students whose ELA Winter score falls in each (lower, upper] interval of diag_bins
bins_per_student_ela.groupby(
    by=pd.cut(x=bins_per_student_ela, bins=diag_bins)
).count()

Winter_Diag
(0, 422]        3
(422, 472]      8
(472, 521]     51
(521, 571]    120
(571, 620]     31
Name: Winter_Diag, dtype: int64

In [31]:
# Replace each student's ELA Winter score with the label (group_names) of the
# diag_bins interval it falls into
bins_winter_ela = pd.cut(
    x=bins_per_student_ela,
    bins=diag_bins,
    labels=group_names,
)


In [32]:
# Summary statistics of the ELA Winter scores; min and max show the range the bin edges must cover
bins_per_student_ela.describe()

count    213.000000
mean     536.680751
std       36.078419
min      410.000000
25%      517.000000
50%      539.000000
75%      562.000000
max      620.000000
Name: Winter_Diag, dtype: float64

In [33]:
# Turn the ELA bin labels into a one-column DataFrame (still indexed by Student ID)
# and name that column 'Bins_Winter' so it does not clash with the raw score column
bins_winter_ela = (
    pd.DataFrame(bins_winter_ela)
    .rename(columns={'Winter_Diag': 'Bins_Winter'})
)
bins_winter_ela


Unnamed: 0_level_0,Bins_Winter
Student ID,Unnamed: 1_level_1
3505181649,4
3507461649,5
3508041649,4
3509881649,4
3510171549,4
...,...
3531461508,5
3571921508,4
3582191508,4
3592081508,4


In [34]:
# Attach each student's ELA winter bin to the ELA table, matching on 'Student ID'
# (a column of ela_data and the index name of bins_winter_ela)
bins_ela = pd.merge(ela_data, bins_winter_ela, on='Student ID')

In [35]:
# Spot-check ten random rows of the ELA table with its new Bins_Winter column
bins_ela.sample(n=10)

Unnamed: 0,Student ID,Gender,Ethnicity,FSA_ELA,Retained,Attendance,Behavior,Fall_Diag,Winter_Diag,Bins_Winter
55,3576151649,M,Hispanic,2,NO,NO,NO,491.0,526.0,4
153,3570291408,M,African American,3,yes,yes,no,491.0,543.0,4
82,3515081508,M,Caucasian,3,YES,NO,NO,413.0,473.0,3
49,3569341204,F,Asian,1,YES,NO,NO,372.0,450.0,2
64,3581511613,M,Caucasian,4,NO,NO,NO,528.0,549.0,4
56,3576991649,F,Hispanic,3,NO,NO,NO,536.0,527.0,4
86,3527541608,F,Caucasian,5,NO,NO,NO,542.0,547.0,4
152,3538141535,M,Multi-Racial,4,no,yes,no,552.0,556.0,4
119,3576991160,F,Hispanic,1,YES,NO,NO,473.0,493.0,3
145,3512991408,M,Caucasian,3,no,no,no,546.0,544.0,4


In [42]:
# Establish bins and group names.
# diag_bins = [0, 422, 472, 521, 571, 620]
# group_names = [1, 2, 3, 4, 5]
# ela_data["Bins_Winter"] = pd.cut(bins_per_student_ela, diag_bins, labels=group_names)

In [43]:
# ela_data.sample(15)

In [53]:
# Labels (1-5) for the five MATH Winter diagnostic score bands, followed by the band edges.
# Six edges delimit five bands; the upper edge of each band is inclusive.
group_namesMATH = list(range(1, 6))
diag_binsMATH = [0, 392, 418, 444, 470, 500]

In [54]:
# MATH Winter diagnostic score of every student, indexed by Student ID
# (this Series is the input that gets binned further down)
bins_per_student_math = (
    math_data
    .set_index('Student ID')
    .loc[:, 'Winter_Diag']
)

In [55]:
# Display the per-student MATH Winter scores (Series indexed by Student ID)
bins_per_student_math

Student ID
3505181649    446
3507461649    460
3508041649    467
3509881649    496
3510171549    443
             ... 
3531461508    460
3571921508    442
3582191508    463
3592081508    442
3535761308    449
Name: Winter_Diag, Length: 213, dtype: int64

In [56]:
# Number of students whose MATH Winter score falls in each (lower, upper] interval of diag_binsMATH
bins_per_student_math.groupby(
    by=pd.cut(x=bins_per_student_math, bins=diag_binsMATH)
).count()

Winter_Diag
(0, 392]        3
(392, 418]     11
(418, 444]     53
(444, 470]    124
(470, 500]     22
Name: Winter_Diag, dtype: int64

In [57]:
# Summary statistics of the MATH Winter scores; min and max show the range the bin edges must cover
bins_per_student_math.describe()

count    213.000000
mean     450.516432
std       19.566191
min      380.000000
25%      442.000000
50%      453.000000
75%      462.000000
max      496.000000
Name: Winter_Diag, dtype: float64

In [58]:
# Replace each student's MATH Winter score with the label (group_namesMATH) of the
# diag_binsMATH interval it falls into
bins_winter_math = pd.cut(
    x=bins_per_student_math,
    bins=diag_binsMATH,
    labels=group_namesMATH,
)

In [59]:
# Turn the MATH bin labels into a one-column DataFrame (still indexed by Student ID)
# and name that column 'Bins_Winter' so it does not clash with the raw score column
bins_winter_math = (
    pd.DataFrame(bins_winter_math)
    .rename(columns={'Winter_Diag': 'Bins_Winter'})
)
bins_winter_math

Unnamed: 0_level_0,Bins_Winter
Student ID,Unnamed: 1_level_1
3505181649,4
3507461649,4
3508041649,4
3509881649,5
3510171549,3
...,...
3531461508,4
3571921508,3
3582191508,4
3592081508,3


In [60]:
# Attach each student's MATH winter bin to the MATH table, matching on 'Student ID'
# (a column of math_data and the index name of bins_winter_math)
bins_math = pd.merge(math_data, bins_winter_math, on='Student ID')

In [61]:
# Spot-check ten random rows of the MATH table with its new Bins_Winter column
bins_math.sample(n=10)

Unnamed: 0,Student ID,Gender,Ethnicity,FSA_MATH,Retained,Attendance,Behavior,Fall_Diag,Winter_Diag,Bins_Winter
28,3532761649,F,Caucasian,5,NO,NO,NO,463,465,4
154,3577091588,F,Caucasian,5,no,yes,no,450,461,4
12,3513031423,M,Caucasian,3,YES,NO,NO,448,446,4
185,3572921508,M,Caucasian,5,no,no,no,478,475,5
19,3517711849,M,Caucasian,4,NO,NO,NO,439,464,4
146,4804266071,F,Hispanic,4,no,yes,no,449,452,4
114,3569911060,M,African American,4,NO,YES,NO,443,478,5
27,3530741649,F,Caucasian,5,NO,NO,NO,450,470,4
167,3554491536,F,Caucasian,4,no,no,no,447,437,3
181,3560431408,F,Caucasian,5,yes,no,no,437,459,4


In [62]:
# # Export bins_ela
# bins_ela.to_csv('bins_ela.csv',index=False)
# # Export bins_math
# bins_math.to_csv('bins_math.csv',index=False)

In [63]:
# Summary statistics of the MATH table after the bins were attached.
# The output lists only the numeric columns and shows the same row count (213)
# as math_data.describe(), i.e. the merge kept every row.
bins_math.describe()

Unnamed: 0,Student ID,FSA_MATH,Fall_Diag,Winter_Diag
count,213.0,213.0,213.0,213.0
mean,3591254000.0,3.41784,436.305164,450.516432
std,350778700.0,1.177245,20.392512,19.566191
min,1300604000.0,1.0,365.0,380.0
25%,3530192000.0,3.0,426.0,442.0
50%,3555562000.0,3.0,438.0,453.0
75%,3577092000.0,4.0,450.0,462.0
max,5926007000.0,5.0,486.0,496.0


In [65]:
# Mean ELA Winter diagnostic score within each winter bin.
# 'Winter_Diag' is selected *before* aggregating: averaging the whole frame also
# attempts the text columns (Gender, Ethnicity, Retained, ...), which raises
# TypeError on pandas >= 2.0 and is wasted work on older versions.
# observed=False keeps every bin label in the result, matching the behavior the
# categorical 'Bins_Winter' grouper had under the previous default.
bins_ela_avrg = bins_ela.groupby("Bins_Winter", observed=False)["Winter_Diag"].mean()

In [66]:
# Display the average ELA Winter score per bin (Series indexed by Bins_Winter)
bins_ela_avrg

Bins_Winter
1    415.000000
2    461.375000
3    503.549020
4    545.625000
5    587.774194
Name: Winter_Diag, dtype: float64