In [1]:
import pandas as pd
import os, sys
import matplotlib.pyplot as plt

new_dir = '/Users/juanbello/Desktop/Kaggle/group3'

sys.path.append(os.path.join(new_dir))


In [2]:
ed = pd.read_csv('ed_train_g3.csv') # education
ed.head(3)

Unnamed: 0,uid,literate,attended_school,highest_school_lvl,preschool,now_enrolled,now_attending,past_enrolled,past_attending,now_not_attend_reason,...,past_not_attend_reason,past_not_enroll_reason,finish_school_age,younger_19,public_private_school,school_transportation_time,school_transportation_vehicle,school_transportation_cost,school_expenses,poverty_score
0,441_2_3,True,1,2.0,0.0,1.0,1.0,1.0,1.0,,...,,,,,1.0,40.0,1.0,,140000.0,4.0
1,647_7_1,True,1,2.0,0.0,1.0,1.0,1.0,1.0,,...,,,,,1.0,15.0,1.0,,75000.0,4.0
2,756_4_1,True,1,2.0,0.0,1.0,1.0,1.0,1.0,,...,,,,,1.0,6.0,1.0,,100000.0,6.0


# Preliminary Variable Selection


In [4]:
ed.columns

Index(['uid', 'literate', 'attended_school', 'highest_school_lvl', 'preschool',
       'now_enrolled', 'now_attending', 'past_enrolled', 'past_attending',
       'now_not_attend_reason', 'now_not_enroll_reason',
       'past_not_attend_reason', 'past_not_enroll_reason', 'finish_school_age',
       'younger_19', 'public_private_school', 'school_transportation_time',
       'school_transportation_vehicle', 'school_transportation_cost',
       'school_expenses', 'poverty_score'],
      dtype='object')

In [5]:
# All students are literate
# All students attended school
ed = ed.drop(columns=['literate', 'attended_school'])


In [6]:
ed[['now_attending', 'past_attending', 'now_enrolled', 'past_enrolled']].head(3)

Unnamed: 0,now_attending,past_attending,now_enrolled,past_enrolled
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0


In [7]:
# enrollment and attendance past and present:

ed['now_attending'] = ed['now_attending'].fillna(ed['now_enrolled'])
ed['past_attending'] = ed['past_attending'].fillna(ed['past_enrolled'])

# We only care if they missed at least one year in the past 2.
missed_year = ed['now_attending'].eq(2) | ed['past_attending'].eq(2) 
ed.loc[missed_year, 'Missed_year'] = True
ed.loc[~missed_year, 'Missed_year'] = False

ed =  ed.drop(columns=['now_enrolled', 'past_enrolled', 'past_attending', 'now_attending'])


# absence reasons:

In [9]:

ed.loc[ed['now_not_attend_reason'] == 13, 'now_not_attend_reason'] = 14
ed.loc[ed['past_not_attend_reason'] == 13, 'past_not_attend_reason'] = 14

# We have situations where someone enrolls but does not attend.
# We don't have situations when someone who did not enroll attends. 
# So use attend as base truth and fill some of the NaNs with enrolled.
ed['now_not_attend_reason'] = ed['now_not_attend_reason'].fillna(ed['now_not_enroll_reason'])
ed['past_not_attend_reason'] = ed['past_not_attend_reason'].fillna(ed['past_not_enroll_reason'])

ed = ed.drop(columns=['now_not_enroll_reason', 'past_not_enroll_reason'])

In [10]:
ed['now_not_attend_reason'] = ed['now_not_attend_reason'].fillna(0)
ed['past_not_attend_reason'] = ed['past_not_attend_reason'].fillna(0)

In [11]:
# cols = ['now_not_attend_reason', 'past_not_attend_reason']
# data = ed[cols]
# data.loc[data['now_not_attend_reason'].ne(data['past_not_attend_reason']), cols]

In [12]:
# Now apply transform reasons to the financial scale
ABSENCE_REASON_TO_MONEY_MAPPING = [0, 3,1,3,3,1,1,1,2,2,1,1,2,1,1]

ed['now_not_attend_reason'] =  ed['now_not_attend_reason'].apply(lambda indx : ABSENCE_REASON_TO_MONEY_MAPPING[int(indx)])
ed['past_not_attend_reason'] =  ed['past_not_attend_reason'].apply(lambda indx : ABSENCE_REASON_TO_MONEY_MAPPING[int(indx)])

In [13]:
# Now the merge between now_not_attend_reason and past_not_attend_reason becomes obvious for our purposes of predicting wealth: pick the maximum. 
ed['not_attend_reason'] =  ed.apply(lambda row: max(row['past_not_attend_reason'], row['now_not_attend_reason']), axis=1)


In [14]:
ed = ed.drop(columns=['now_not_attend_reason', 'past_not_attend_reason'])

In [15]:
# Now we need to merge Missed_year? and not_attend_reason into one variate

## CLAIM: Missed_year==False <-> not_attend_reason=0
claim = ed['Missed_year'].eq(False) & ed['not_attend_reason'].ne(0)
assert(len(ed[claim])==0)

## DONE: So they are already dependent!! No need to do anything other than delete missed_year?

# The meaning of this not_attend_reason variable in the FINAL TRANSFORM ED_T4 is: 
# It represents what happened in the last couple of years
# 0 if the kid attended both years to school
# 1 if the kid missed at least one year due to something, in the worse case scenario, (NOT) related to money
# 2 if the kid missed at least one year due to something, in the worse case scenario, (SOMEWHAT) related to money
# 3 if the kid missed at least one year due to something, in the worse case scenario, (DIRECTLY) related to money


In [16]:
ed = ed.drop(columns=['Missed_year'])

# Highest School lvl

In [18]:
ed['highest_school_lvl'].describe()

count    1601.000000
mean        1.931293
std         1.498007
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        10.000000
Name: highest_school_lvl, dtype: float64

In [19]:
Q4_education_mapping = [0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 7, 7] # just gives the same values to same degrees (no matter where they were studied)
ed['highest_school_lvl'] =  ed['highest_school_lvl'].apply(lambda indx : indx if pd.isna(indx) else Q4_education_mapping[int(indx)])

## fill out NaN values with standarized mean
mean = int(ed['highest_school_lvl'].mean())
ed['highest_school_lvl'] = ed['highest_school_lvl'].fillna(mean)


In [20]:
ed['highest_school_lvl'].describe()

count    1601.000000
mean        1.814491
std         1.200496
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         7.000000
Name: highest_school_lvl, dtype: float64

# Preschool?

In [22]:
cond = ed['preschool'].isna()
ed[cond]

Unnamed: 0,uid,highest_school_lvl,preschool,finish_school_age,younger_19,public_private_school,school_transportation_time,school_transportation_vehicle,school_transportation_cost,school_expenses,poverty_score,not_attend_reason


# finish_school_age, younger_19
 don't make sense. we will have the age from hh

In [24]:
ed = ed.drop(columns=['finish_school_age', 'younger_19'])

# public_private_school
Q23: Is the school that [Name] attend in public or private?

* PUBLIC	= 1
* PRIVATE RELIGIOUS	= 2
* PRIVATE NON-RELIGIOUS	= 3

I like this ordering because, apparently, "Non-religious private schools are more expensive than religious private schools on average"

NaN replacement: 
AVERAGE because average assigns a more conservative value to NaN. A zero on this new scale is still the 25th quantile so it is biased towards poverty ya know.

But indicator variables have the potential to model the relatinsip better than by increases in 1... 
**everytime you want to make up a scale think of this: can I do it bettern than a machine?**

In [26]:
ed['public_private_school'] = ed['public_private_school'].fillna(0) # no data avaliable
dummies = pd.get_dummies(ed['public_private_school'], prefix='school_type', drop_first=True)
ed = pd.concat([ed, dummies], axis=1)


In [27]:
ed = ed.drop(columns=['public_private_school'])

# Transportation !!!!

Just transform into indicator variables!!!!! see what happens 

In [29]:
dummies = pd.get_dummies(ed['school_transportation_vehicle'], prefix='transport', drop_first=True)
ed = pd.concat([ed, dummies], axis=1)


In [30]:
ed = ed.drop(columns=['school_transportation_vehicle'])

In [31]:
# transportation_mapping = [float('nan'), 2, 1, float('nan'), 4, 3, float('nan'), float('nan')]

# ed['school_transportation_vehicle'] = ed['school_transportation_vehicle'].apply(lambda indx: indx if pd.isna(indx) else transportation_mapping[int(indx)] )
# av = int(ed['school_transportation_vehicle'].mean())
# ed['school_transportation_vehicle'] = ed['school_transportation_vehicle'].fillna(av)



In [32]:
av = int(ed['school_transportation_time'].mean())
ed['school_transportation_time'] = ed['school_transportation_time'].fillna(av)

# because they are kids so they probably feel like the cost is 0
ed['school_transportation_cost'] = ed['school_transportation_cost'].fillna(0)

# school_expenses

In [34]:
av = int(ed['school_expenses'].mean())
ed['school_expenses'] = ed['school_expenses'].fillna(av)


#### DONE!!! with ed

# Household data

In [37]:
hh = pd.read_csv('hh_train_g3.csv') # household


In [38]:
# leave sex as is
nans = hh['sex'].isna()
hh[nans]

Unnamed: 0,uid,poverty_score,sex,family_role,age,marital_status,spouse_lives,time_away,lives_with_mom,moms_education,mom_alive,lives_with_dad,dad_education,dad_alive


# family role

In [40]:
# hh['frequency'] = hh.groupby(['family_role', 'poverty_score'])['family_role'].transform('count')

# # Normalize the frequencies for color mapping
# max_freq = hh['frequency'].max()
# hh['normalized_frequency'] = hh['frequency'] / max_freq

# # Scatterplot
# plt.scatter(
#     hh['family_role'], 
#     hh['poverty_score'], 
#     c=hh['normalized_frequency'],  # Use normalized frequency for color
#     cmap='Blues',                  # Choose a colormap
#     edgecolor='black', 
#     s=100, 
#     alpha=0.8
# )



# marital_status 
just make it into 5 indicator variables. 



In [42]:
# dummies = pd.get_dummies(hh['marital_status'], prefix='marital', drop_first=True)
# ed = pd.concat([ed, dummies], axis=1)


In [43]:
# hh['frequency'] = hh.groupby(['marital_status', 'poverty_score'])['marital_status'].transform('count')

# # Normalize the frequencies for color mapping
# max_freq = hh['frequency'].max()
# hh['normalized_frequency'] = hh['frequency'] / max_freq

# # Scatterplot
# plt.scatter(
#     hh['marital_status'], 
#     hh['poverty_score'], 
#     c=hh['normalized_frequency'],  # Use normalized frequency for color
#     cmap='Blues',                  # Choose a colormap
#     edgecolor='black', 
#     s=100, 
#     alpha=0.8
# )

# # get rid of it 
# hh = hh.drop(columns=['frequency', 'marital_status', 'normalized_frequency'])

In [44]:
# get rid of both
hh = hh.drop(columns=['marital_status', 'family_role'])

In [45]:
hh['lives_with_partner'] = hh['spouse_lives'].eq(1)
hh = hh.drop(columns=['spouse_lives'])

In [46]:
# hh['frequency'] = hh.groupby(['lives_with_partner', 'poverty_score'])['lives_with_partner'].transform('count')

# # Normalize the frequencies for color mapping
# max_freq = hh['frequency'].max()
# hh['normalized_frequency'] = hh['frequency'] / max_freq

# # Scatterplot
# plt.scatter(
#     hh['lives_with_partner'], 
#     hh['poverty_score'], 
#     c=hh['normalized_frequency'],  # Use normalized frequency for color
#     cmap='Blues',                  # Choose a colormap
#     edgecolor='black', 
#     s=100, 
#     alpha=0.8
# )


# time away is good

In [48]:
hh['lives_with_parent'] = hh['lives_with_mom'].eq(1) | hh['lives_with_dad'].eq(1)
hh = hh.drop(columns=['lives_with_mom', 'lives_with_dad'])

In [49]:
hh['parents_education'] = (hh['moms_education'] + hh['dad_education']) / 2
ave = hh['parents_education'].mean()
hh['parents_education'] = hh['parents_education'].fillna(ave)

hh = hh.drop(columns=['moms_education', 'dad_education'])

In [50]:
# parents alive 
hh['mom_alive'] = hh['mom_alive'].eq(1)
hh['dad_alive'] = hh['dad_alive'].eq(1)
hh['mom_alive'] = hh['mom_alive'].apply(lambda boo: 1 if boo else 0)
hh['dad_alive'] = hh['dad_alive'].apply(lambda boo: 1 if boo else 0)

hh['num_parents_alive'] = hh['mom_alive'] + hh['dad_alive']
hh = hh.drop(columns=['mom_alive', 'dad_alive'])

# DONE

In [52]:
df = pd.merge(ed, hh, on='uid', how='inner')  # Use 'inner' join by default
# y = pd.get_dummies(df['poverty_score_y'], prefix='poverty', drop_first=False, dtype=int)
y = df['poverty_score_y']

In [53]:
df = df.drop(columns=['poverty_score_x', 'poverty_score_y'])

In [54]:
df.to_csv('X.csv', index=False)
y.to_csv('y.csv', index=False)

In [55]:
df

Unnamed: 0,uid,highest_school_lvl,preschool,school_transportation_time,school_transportation_cost,school_expenses,not_attend_reason,school_type_1.0,school_type_3.0,transport_2.0,transport_4.0,transport_5.0,sex,age,time_away,lives_with_partner,lives_with_parent,parents_education,num_parents_alive
0,441_2_3,2,0.0,40.0,0.0,140000.0,0,True,False,False,False,False,1,19,0,False,True,2.145105,0
1,647_7_1,2,0.0,15.0,0.0,75000.0,0,True,False,False,False,False,1,19,0,False,True,2.145105,0
2,756_4_1,2,0.0,6.0,0.0,100000.0,0,True,False,False,False,False,2,19,0,False,False,3.000000,2
3,25_4_3,5,2.0,20.0,0.0,730000.0,0,False,True,False,False,False,1,20,0,False,True,2.145105,0
4,132_6_3,2,0.0,30.0,0.0,170000.0,0,True,False,True,False,False,1,20,0,False,True,2.145105,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1596,799_3_5,1,3.0,40.0,0.0,732169.0,2,False,False,False,False,False,2,23,0,True,False,4.000000,2
1597,215_4_1,1,0.0,40.0,0.0,732169.0,2,False,False,False,False,False,2,23,0,False,False,3.000000,2
1598,785_2_2,1,0.0,40.0,0.0,732169.0,2,False,False,False,False,False,2,24,0,True,False,2.500000,2
1599,377_7_2,1,0.0,40.0,0.0,732169.0,2,False,False,False,False,False,2,24,0,True,False,3.000000,1


In [103]:
y

0       4.0
1       4.0
2       6.0
3       4.0
4       3.0
       ... 
1596    4.0
1597    4.0
1598    4.0
1599    2.0
1600    7.0
Name: poverty_score_y, Length: 1601, dtype: float64