In [2]:
import pandas as pd
import numpy as np


# Dealing with Missing Values

In [3]:
df = pd.read_csv("data/train.csv")
df.head()

#impute v2a1 (Monthly rent payment) with mean
df["v2a1"] = df["v2a1"].fillna(df["v2a1"].mean())

#impute v18q1 (number of tablets household owns) with 0
df["v18q1"] = df["v18q1"].fillna(0)

#remove rez_esc column (Years behind in school)
df = df.drop(columns="rez_esc", index = 1)

#remove 5 rows with NA 
df = df.dropna()

# Remove ID Columns

In [4]:
#remove ID
df = df.drop(["Id", "idhogar"], axis = 1)

# Dealing with Inconsistent Data

In [5]:

#replace yes with 1, replace no with 0
df[["edjefe", "edjefa"]] = df[["edjefe", "edjefa"]].apply(lambda x: x.replace("no", 0).replace("yes", 1))

#convert string to numeric
df["edjefe"] = pd.to_numeric(df["edjefe"])
df["edjefa"] = pd.to_numeric(df["edjefa"])

#create a new column called head_edu (head of household years of education), remove the two columns
df["head_edu"] = df[["edjefe", "edjefa"]].max(axis = 1) 
df = df.drop(["edjefe", "edjefa"], axis = 1)

#Add dependency_rate column
df["dependency_rate"] = (df["hogar_mayor"] + df["hogar_nin"]) / df["hogar_total"] 
#Remove dependency column
df = df.drop("dependency", axis = 1)

# Data Transformation
### Binning - age, v2a1 (monthly rental)

**Gender - Reduce to 1 column** (avoid multicollinearity)



In [6]:
df["gender"] = df["male"] #if male then 1, else 0
df = df.drop(["male", "female"], axis = 1)


**Predominant material on the outside wall** (avoid multicollinearity by removing 1 column)

In [7]:
wall_columns = list(df.filter(regex='^pared').columns)
print(wall_columns)
if (df[wall_columns].sum(axis = 1) == 1).all(): #check if sum of the 8 columns is 1 for all rows
    df = df.drop(["paredother"], axis = 1) #if yes, then remove 1 column. In this case, remove 'paredother'
    print("Removed 'paredother'.")


['paredblolad', 'paredzocalo', 'paredpreb', 'pareddes', 'paredmad', 'paredzinc', 'paredfibras', 'paredother']
Removed 'paredother'.


**Predominant material on the floor** (avoid multicollinearity by removing 1 column)

In [8]:
floor_columns = list(df.filter(regex='^piso').columns)
print(floor_columns)
if (df[floor_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["pisoother"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'pisoother'.")


['pisomoscer', 'pisocemento', 'pisoother', 'pisonatur', 'pisonotiene', 'pisomadera']
Removed 'pisoother'.


**Predominant material on the roof** (avoid multicollinearity by removing 1 column)

In [9]:
roof_columns = list(df.filter(regex='^techo').columns) 
print(roof_columns)
print(df[df[roof_columns].sum(axis = 1) == 0][roof_columns + ["cielorazo"]].describe()) 
print(df[df[roof_columns].sum(axis = 1) == 1][roof_columns + ["cielorazo"]].describe()) 
#This shows that if the 4 columns (techo*) are all 0, then 'cielorazo'is 0.
#However, there are cases where any of the 4 columns with 1, does not imply 'cielorazo' is 1. 
#(e.g. There are cases where the roof is zink, but the house has no ceiling)
#Therefore, we cannot remove any of the 4 columns and 'cielorazo'

['techozinc', 'techoentrepiso', 'techocane', 'techootro']
       techozinc  techoentrepiso  techocane  techootro  cielorazo
count       66.0            66.0       66.0       66.0       66.0
mean         0.0             0.0        0.0        0.0        0.0
std          0.0             0.0        0.0        0.0        0.0
min          0.0             0.0        0.0        0.0        0.0
25%          0.0             0.0        0.0        0.0        0.0
50%          0.0             0.0        0.0        0.0        0.0
75%          0.0             0.0        0.0        0.0        0.0
max          0.0             0.0        0.0        0.0        0.0
         techozinc  techoentrepiso    techocane    techootro    cielorazo
count  9485.000000     9485.000000  9485.000000  9485.000000  9485.000000
mean      0.976805        0.017818     0.003163     0.002214     0.681919
std       0.150529        0.132295     0.056154     0.047004     0.465756
min       0.000000        0.000000     0.000000     

**Water provision** (avoid multicollinearity by removing 1 column)

In [10]:
water_columns = list(df.filter(regex='^abastagua').columns)
print(water_columns)
if (df[water_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["abastaguano"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'abastaguano'.")


['abastaguadentro', 'abastaguafuera', 'abastaguano']
Removed 'abastaguano'.


**Electricity** (avoid multicollinearity by removing 1 column)

In [11]:
electricity_columns = ["public", "planpri", "noelec", "coopele"]
print(electricity_columns)
(df[electricity_columns].sum(axis = 1) == 1).all() #check if sum of the relevant columns is 1 for all rows
# Does not require removing any column

['public', 'planpri', 'noelec', 'coopele']


False

**Toilet** (avoid multicollinearity by removing 1 column)

In [12]:
toilet_columns = list(df.filter(regex='^sanitario').columns)
print(toilet_columns)
if (df[toilet_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["sanitario6"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'sanitario6'.")


['sanitario1', 'sanitario2', 'sanitario3', 'sanitario5', 'sanitario6']
Removed 'sanitario6'.


**Main source of energy** (avoid multicollinearity by removing 1 column)

In [13]:
energy_columns = list(df.filter(regex='^energcocinar').columns)
print(energy_columns)
if (df[energy_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["energcocinar1"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'energcocinar1'.")


['energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4']
Removed 'energcocinar1'.


**Rubbish disposal** (avoid multicollinearity by removing 1 column)

In [14]:
rubbish_columns = list(df.filter(regex='^elimbasu').columns)
print(rubbish_columns)
if (df[rubbish_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["elimbasu6"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'elimbasu6'.")


['elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 'elimbasu5', 'elimbasu6']
Removed 'elimbasu6'.


**Walls condition** (avoid multicollinearity by removing 1 column)

In [15]:
wallc_columns = list(df.filter(regex='^epared').columns)
print(wallc_columns)
if (df[wallc_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["epared2"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'epared2'.")


['epared1', 'epared2', 'epared3']
Removed 'epared2'.


**Roof condition** (avoid multicollinearity by removing 1 column)

In [16]:
roofc_columns = list(df.filter(regex='^etecho').columns)
print(roofc_columns)
if (df[roofc_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["etecho2"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'etecho2'.")


['etecho1', 'etecho2', 'etecho3']
Removed 'etecho2'.


**Floor condition** (avoid multicollinearity by removing 1 column)

In [17]:
floorc_columns = list(df.filter(regex='^eviv').columns)
print(floorc_columns)
if (df[floorc_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["eviv2"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'eviv2'.")


['eviv1', 'eviv2', 'eviv3']
Removed 'eviv2'.


**Civil Status** (avoid multicollinearity by removing 1 column)

In [18]:
civil_columns = list(df.filter(regex='^estadocivil').columns)
print(civil_columns)
if (df[civil_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["estadocivil7"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'estadocivil7'.")


['estadocivil1', 'estadocivil2', 'estadocivil3', 'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7']
Removed 'estadocivil7'.


**Household Status** (avoid multicollinearity by removing 1 column)

In [19]:
household_columns = list(df.filter(regex='^parentesco').columns)
print(household_columns)
if (df[household_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["parentesco12"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'parentesco12'.")


['parentesco1', 'parentesco2', 'parentesco3', 'parentesco4', 'parentesco5', 'parentesco6', 'parentesco7', 'parentesco8', 'parentesco9', 'parentesco10', 'parentesco11', 'parentesco12']
Removed 'parentesco12'.


**Education Level** (avoid multicollinearity by removing 1 column)

In [20]:
edulevel_columns = list(df.filter(regex='^instlevel').columns)
print(edulevel_columns)
if (df[edulevel_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["instlevel9"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'instlevel9'.")
else:
    print(df[edulevel_columns].sum(axis = 1).describe())

['instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9']
count    9551.000000
mean        0.999686
std         0.017721
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
dtype: float64


**House ownership** (avoid multicollinearity by removing 1 column)

In [21]:
houseown_columns = list(df.filter(regex='^tipovivi').columns)
print(houseown_columns)
if (df[houseown_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["tipovivi5"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'tipovivi5'.")


['tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5']
Removed 'tipovivi5'.


**Region** (avoid multicollinearity by removing 1 column)

In [22]:
region_columns = list(df.filter(regex='^lugar').columns)
print(region_columns)
if (df[region_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["lugar6"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'lugar6'.")


['lugar1', 'lugar2', 'lugar3', 'lugar4', 'lugar5', 'lugar6']
Removed 'lugar6'.


**Settlement** (avoid multicollinearity by removing 1 column)

In [23]:
settlement_columns = list(df.filter(regex='^area').columns)
print(settlement_columns)
if (df[settlement_columns].sum(axis = 1) == 1).all(): #check if sum of the relevant columns is 1 for all rows
    df = df.drop(["area2"], axis = 1) #if yes, then remove 1 column. 
    print("Removed 'area2'.")


['area1', 'area2']
Removed 'area2'.


# Data Reduction

In [24]:
# hhsize, r4h1, r4h2, r4h3, r4m1, r4m2, r4m3, r4t1, r4t2, r4t3, tamhog, tamviv
# include: r4h1, r4h2, r4m1, r4m2, tamhog
# remove: hhsize, r4h3, r4m3, r4t1, r4t2, r4t3, tamviv
df = df.drop(["hhsize", "r4h3", "r4m3", "r4t1", "r4t2", "r4t3", "tamviv"], axis = 1)

In [25]:
## dependency_rate were calculated previously, include only hogar_nin
## remove hogar_adul, hogar_mayor, hogar_total
df = df.drop(["hogar_adul", "hogar_mayor", "hogar_total"], axis = 1)


In [26]:
#remove mobilephone (implied by qmobilephone)
df = df.drop(["mobilephone"], axis = 1)

## Remove square columns


In [27]:
# Remove columns that are square of another column
square_columns = list(df.filter(like='SQ').columns) + ["agesq"]
df = df.drop(square_columns, axis = 1)


# Remove Outliers

# Check missing values

In [28]:
missing = pd.isna(df).sum()
missing[missing > 0]

Series([], dtype: int64)

# Normalize


In [47]:
scaler = preprocessing.StandardScaler().fit(df[["v2a1"]].iloc[0:int(0.6*df.shape[0])])
df["v2a1"] = scaler.transform(df[["v2a1"]])   


In [149]:
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import xgboost
from xgboost import XGBClassifier

In [150]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

In [61]:
X = df.drop(["Target"], axis = 1).values
y = df["Target"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
y_test.shape

(3152,)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [70]:
y_pred

array([4, 4, 2, ..., 4, 4, 4], dtype=int64)

In [None]:
df2 = pd.read_csv("data/test.csv")
df2b = clean_test(df2)
X_TEST = df2b.values

In [162]:
y_TEST = model.predict(X_TEST)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = y_TEST
df_submission.to_csv("xgboost_01.csv", index=False)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

[0.00765941 0.00111498 0.001395   0.00044321 0.         0.
 0.04585591 0.02903457 0.00597071 0.00114061 0.01327236 0.00102483
 0.00146452 0.02719572 0.03833558 0.         0.         0.
 0.00359541 0.         0.         0.05952775 0.01789043 0.
 0.         0.         0.         0.         0.         0.
 0.05573803 0.00025185 0.         0.         0.         0.
 0.         0.         0.         0.         0.00082631 0.
 0.         0.         0.00105058 0.         0.00073098 0.
 0.         0.01061954 0.0692902  0.         0.01268923 0.01688073
 0.02955059 0.         0.00127465 0.         0.00221753 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.10572265 0.17154526 0.00189987 0.         0.
 0.         0.         0.         0.         0.02450224 0.00128632
 0.         0.03666014 0.         0.00133819 0.         0.
 0.01141441 0.00173121 0.01082786 0.00434574 0.         0.
 0.         0.         0.       

# MODELS

## Random Forest 01

In [182]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)
print(clf.feature_importances_)
y_TEST_rf = clf.predict(X_TEST)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = y_TEST_rf
df_submission.to_csv("rf_01.csv", index=False)
unique, counts = np.unique(y_TEST_rf, return_counts=True)
unique, counts


[0.00765941 0.00111498 0.001395   0.00044321 0.         0.
 0.04585591 0.02903457 0.00597071 0.00114061 0.01327236 0.00102483
 0.00146452 0.02719572 0.03833558 0.         0.         0.
 0.00359541 0.         0.         0.05952775 0.01789043 0.
 0.         0.         0.         0.         0.         0.
 0.05573803 0.00025185 0.         0.         0.         0.
 0.         0.         0.         0.         0.00082631 0.
 0.         0.         0.00105058 0.         0.00073098 0.
 0.         0.01061954 0.0692902  0.         0.01268923 0.01688073
 0.02955059 0.         0.00127465 0.         0.00221753 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.10572265 0.17154526 0.00189987 0.         0.
 0.         0.         0.         0.         0.02450224 0.00128632
 0.         0.03666014 0.         0.00133819 0.         0.
 0.01141441 0.00173121 0.01082786 0.00434574 0.         0.
 0.         0.         0.       

(array([2, 4], dtype=int64), array([   85, 23771], dtype=int64))

## Random Forest 02

In [194]:
clf = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=0)
clf.fit(X_train, y_train)
# print(clf.feature_importances_)
y_TEST_rf = clf.predict(X_TEST)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = y_TEST_rf
df_submission.to_csv("rf_02.csv", index=False)
unique, counts = np.unique(y_TEST_rf, return_counts=True)
unique, counts

(array([1, 2, 3, 4], dtype=int64),
 array([  143,  2698,    51, 20964], dtype=int64))

## Random Forest 03

In [198]:
clf = RandomForestClassifier(n_estimators=250, max_depth=25, random_state=0)
clf.fit(X_train, y_train)
# print(clf.feature_importances_)
y_TEST_rf = clf.predict(X_TEST)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = y_TEST_rf
df_submission.to_csv("rf_03.csv", index=False)
unique, counts = np.unique(y_TEST_rf, return_counts=True)
unique, counts

(array([1, 2, 3, 4], dtype=int64),
 array([  435,  3303,   358, 19760], dtype=int64))

## XGB 01

In [215]:
model = XGBClassifier()
model.fit(X_train, y_train)
Y_TEST_XGB = model.predict(X_TEST)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = Y_TEST_XGB
df_submission.to_csv("xgb_01.csv", index=False)
unique, counts = np.unique(Y_TEST_XGB, return_counts=True)
unique, counts

(array([1, 2, 3, 4], dtype=int64),
 array([  796,  3395,   278, 19387], dtype=int64))

# Use feature importance + Use full X Train

# XGB 02

In [216]:
X2 = X[:,model.feature_importances_ > 0.01]
Y2 = y
XTEST2 = X_TEST[:,model.feature_importances_ > 0.01]

In [218]:
xgb = XGBClassifier()
xgb.fit(X2, Y2)
Y_TEST_XGB = xgb.predict(XTEST2)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = Y_TEST_XGB
df_submission.to_csv("xgb_02.csv", index=False)
unique, counts = np.unique(Y_TEST_XGB, return_counts=True)
unique, counts

(array([1, 2, 3, 4], dtype=int64),
 array([  594,  3689,   379, 19194], dtype=int64))

# XGB 03

In [225]:
X2 = X[:,model.feature_importances_ > 0.012]
Y2 = y
XTEST2 = X_TEST[:,model.feature_importances_ > 0.012]
XTEST2.shape

(23856, 23)

In [226]:
xgb = XGBClassifier()
xgb.fit(X2, Y2)
Y_TEST_XGB = xgb.predict(XTEST2)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = Y_TEST_XGB
df_submission.to_csv("xgb_03.csv", index=False)
unique, counts = np.unique(Y_TEST_XGB, return_counts=True)
unique, counts

(array([1, 2, 3, 4], dtype=int64),
 array([  698,  3668,   340, 19150], dtype=int64))

# Use more features (don't remove multicollinearity)

## XGB 04

In [227]:
df = pd.read_csv("data/train.csv")
df2 = pd.read_csv("data/test.csv")

In [229]:
cdf = clean_test_2(df)
cdf2 = clean_test_2(df2)

In [234]:
X = cdf.drop(["Target"], axis = 1).values
y = cdf["Target"].values
X_TEST = cdf2.values

In [235]:
model = XGBClassifier()
model.fit(X, y)
Y_TEST_XGB = model.predict(X_TEST)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = Y_TEST_XGB
df_submission.to_csv("xgb_04.csv", index=False)
unique, counts = np.unique(Y_TEST_XGB, return_counts=True)
unique, counts

(array([1, 2, 3, 4], dtype=int64),
 array([  614,  3616,   328, 19298], dtype=int64))

## XGB 05

In [237]:
X2 = X[:,model.feature_importances_ > 0.010]
Y2 = y
XTEST2 = X_TEST[:,model.feature_importances_ > 0.010]
XTEST2.shape

(23856, 35)

In [238]:
xgb = XGBClassifier()
xgb.fit(X2, Y2)
Y_TEST_XGB = xgb.predict(XTEST2)
df_submission = pd.read_csv("data/sample_submission.csv")
df_submission["Target"] = Y_TEST_XGB
df_submission.to_csv("xgb_05.csv", index=False)
unique, counts = np.unique(Y_TEST_XGB, return_counts=True)
unique, counts

(array([1, 2, 3, 4], dtype=int64),
 array([  613,  3740,   340, 19163], dtype=int64))