### Import Data and Preprocessing Libraries

In [None]:
# Data Processing/Exploration Libraries
#kaggle https://www.kaggle.com/c/costa-rican-household-poverty-prediction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
# Render floats as whole numbers in every DataFrame displayed below.
pd.set_option('display.float_format', '{:.0f}'.format)

# Load the sample submission to see the format the competition expects.
sample = pd.read_csv('../input/sample_submission.csv')

sample.head()

In [None]:
# Data Import & First Look
train = pd.read_csv('../input/train.csv')
print("Train dataset has {} rows(samples) with {} columns(features) each.".format(*train.shape))

test = pd.read_csv('../input/test.csv')
print("Test dataset has {} rows(samples) with {} columns(features) each.".format(*test.shape))

# Stack train and test so that preprocessing is applied to both at once; the
# two parts are separated again later using the null values in the Target column.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# ignore_index=True gives every row a unique label (train and test would
# otherwise both contribute labels 0..n-1).
data = pd.concat([train, test], ignore_index=True, sort=False)
display(data.describe())
display(data.head())


In [None]:
def check_missing(data):
    """Display the percentage of null values for every column that has any.

    Columns are ordered from the most to the least nulls; columns without
    nulls are omitted. Nothing is returned, the result is only displayed.
    """
    null_counts = data.isnull().sum().sort_values(ascending=False)
    columns_with_nulls = null_counts[null_counts > 0]
    display(columns_with_nulls * 100 / data.shape[0])
# Report the percentage of nulls per column for the combined train + test frame.
check_missing(data)

### Removing Columns with Null values

Out of 142 features, 3 columns have roughly 70–80% null values and 2 columns have about 0.05% null values.

We can delete the observations with null meaneduc and SQBmeaned, as they make up less than 0.05% of the total population.

rez_esc: Years behind in school:

Although this is a very important, high-priority feature, I decided to remove it because 80+% of its data is missing and we have other features that give an estimate of the education status in the household.

v18q1, number of tablets household owns:
Number of tablets has 76% null values and can be ignored, as we already have v18q, which gives some insight into the tablets owned by a person.

v2a1: Monthly rent payment is a very important feature and can be estimated from other house-related features. There are three ways we could estimate the missing values:
- impute null values with median/mean after grouping by rent related features
- train a separate model to predict a missing value
- use some generative model, that can fill missing values by itself, one possibility is a Restricted Boltzmann Machine

Due to time constraints, I will first try imputing the missing values.

In [None]:
# NOTE: the triple-quoted text below is an unassigned string literal used as a
# cell-level note; it is evaluated and discarded and does not touch the data.
'''For feature v2a1, mean is higher than median which implies the presence of outliers. In this case, median imputation will be more appropriate. We will also consider median by segments based on following rent related features:
rooms,  number of all rooms in the house
v14a, =1 has bathroom in the household
pisonotiene, =1 if no floor at the household
cielorazo, =1 if the house has ceiling
abastaguano, =1 if no water provision
sanitario1, =1 no toilet in the dwelling
energcocinar1, =1 no main source of energy used for cooking (no kitchen)
'''

data['v2a1'].describe()
# Per the summary displayed above, the mean of v2a1 is higher than its median,
# which points to outliers. The next cell therefore imputes with the median,
# computed within groups defined by the rent-related features listed in the note.

In [None]:
# Impute missing monthly rent (v2a1) with the median rent of the rows that share
# the same rent-related characteristics. A group whose rents are all missing
# has a NaN median, so its rows stay null and are removed by dropna() below.
rent_group_cols = ['rooms', 'v14a', 'pisonotiene', 'cielorazo',
                   'abastaguano', 'sanitario1', 'energcocinar1']
data['v2a1'] = data.groupby(rent_group_cols)['v2a1'].transform(lambda x: x.fillna(x.median()))

# Drop the two columns that are mostly null.
data.drop(['rez_esc', 'v18q1'], axis=1, inplace=True)
# Drop the nine columns that are squares of existing features; they are
# redundant for our model.
data.drop(['SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin',
           'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq'], axis=1, inplace=True)

# Record the rows that still have a null meaneduc or v2a1 BEFORE removing them.
# Taken after dropna(), this frame would always be empty.
data_dropped_rows = data[data['meaneduc'].isnull() | data['v2a1'].isnull()]
# NOTE(review): `data` holds train and test rows, so this also removes test
# rows; those households will have no prediction at submission time.
data.dropna(axis=0, inplace=True, subset=['meaneduc', 'v2a1'])

check_missing(data)


The dataset given to us is at the individual level, while we have to predict the target at the household level. Let's first consolidate the data at the household level.
For individual-level features I will aggregate the values across the members of each household, and for household-level features I will take the values from the head of household.

In [None]:
# Number of heads (parentesco1 == 1) recorded per household.
has_household_head = data.groupby('idhogar')['parentesco1'].sum()
# Show how many households do not have exactly one head. A bare expression in
# the middle of a cell is not rendered, so display() it explicitly.
# Households with no head need special handling when the household-level
# features are extracted later.
display(has_household_head[has_household_head != 1].count())

# Absolute pairwise correlations. Only numeric columns are used: DataFrame.corr
# raises on object columns (ids, mixed-type fields) in pandas >= 2.0.
corr = data.select_dtypes(include=[np.number]).corr().abs()

# Keep every highly correlated pair once: x < y selects the upper triangle,
# which also excludes the diagonal (a column paired with itself).
row_pos, col_pos = np.where(corr.values > 0.90)
indices = [(corr.index[x], corr.columns[y]) for x, y in zip(row_pos, col_pos) if x < y]
indices

In [None]:
'''Duplicate Columns:

hhsize, household size
hogar_total, # of total individuals in the household
tamhog, size of the household
r4t3, Total persons in the household
tamviv, number of persons living in the household

abastaguadentro, =1 if water provision inside the dwelling
abastaguafuera, =1 if water provision outside the dwelling

public, "=1 electricity from CNFL,  ICE,  ESPH/JASEC"
coopele, =1 electricity from cooperative

male, =1 if male
female, =1 if female

sanitario2, =1 toilet connected to sewer or cesspool
sanitario3, =1 toilet connected to  septic tank

energcocinar2, =1 main source of energy used for cooking electricity
energcocinar3, =1 main source of energy used for cooking gas

area1, =1 zona urbana
area2, =2 zona rural

Columns that can be dropped are:
hhsize, household size
tamhog, size of the household
r4t3, Total persons in the household
tamviv, number of persons living in the household
abastaguadentro, =1 if water provision inside the dwelling
male, =1 if male
area1, =1 zona urbana

'escolari', 'age' can also be dropped since it is not effective on a household level and we have 
other source to find the number of people under certain age

'dependency', 'edjefe', 'edjefa', These have mixed data and to avoidmisinterpreting, we will remove them as well.
'''

# Remove the redundant / mixed-type columns described in the note above.
# NOTE(review): the note lists `male` as droppable, but it is not dropped here
# and is still listed in `ind_features` further down.
data.drop(
    columns=[
        'hhsize', 'tamhog', 'r4t3', 'tamviv',
        'abastaguadentro', 'area1',
        'escolari', 'age',
        'dependency', 'edjefe', 'edjefa',
    ],
    inplace=True,
)


In [None]:
# Shape of the reduced frame, followed by the distinct dtypes it contains.
display(data.shape, data.dtypes.unique())
bool_features = []
# Group the column names by the dtype they are stored with, keyed by dtype name.
g = data.columns.to_series().groupby(data.dtypes).groups
dict((dtype.name, columns) for dtype, columns in g.items())


Now we will look through each remaining column to see which are household-related and which are individual-level.

In [None]:
# Identifier and label columns.
id_ = ['Id', 'idhogar', 'Target']

# Columns recorded per person; the numbered families are generated from their
# prefix to keep the list short (order is identical to spelling them out).
ind_features = (
    ['v18q', 'dis', 'male', 'female']
    + ['estadocivil{}'.format(n) for n in range(1, 8)]
    + ['parentesco{}'.format(n) for n in range(1, 13)]
    + ['instlevel{}'.format(n) for n in range(1, 10)]
    + ['mobilephone']
)

# Columns recorded per household, plus the household key and the label.
hh_features = [
    'idhogar', 'Target', 'hacdor', 'hacapo', 'v14a', 'refrig',
    'paredblolad', 'paredzocalo', 'paredpreb', 'pisocemento', 'pareddes', 'paredmad',
    'paredzinc', 'paredfibras', 'paredother',
    'pisomoscer', 'pisoother', 'pisonatur', 'pisonotiene', 'pisomadera',
    'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo',
    'abastaguafuera', 'abastaguano',
    'public', 'planpri', 'noelec', 'coopele',
    'sanitario1', 'sanitario2', 'sanitario3', 'sanitario5', 'sanitario6',
    'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4',
    'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 'elimbasu5', 'elimbasu6',
    'epared1', 'epared2', 'epared3',
    'etecho1', 'etecho2', 'etecho3',
    'eviv1', 'eviv2', 'eviv3',
    'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5',
    'computer', 'television',
    'lugar1', 'lugar2', 'lugar3', 'lugar4', 'lugar5', 'lugar6',
    'area2', 'rooms',
    'r4h1', 'r4h2', 'r4m1', 'r4m2', 'r4m3', 'r4t1', 'r4t2',
    'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total',
    'bedrooms', 'qmobilephone', 'overcrowding',
]

In [None]:
# Peek at `dependency` in the original training frame; the column was dropped
# from the combined `data` frame in an earlier cell.
train['dependency'].head()

In [None]:
# Sum every individual-level column over the members of each household (idhogar).
ind_features_aggregate = data.groupby('idhogar')[ind_features].agg('sum')
ind_features_aggregate.head()


In [None]:
# Household-level features: build exactly one row per household (idhogar).
# 'idhogar' becomes the index, so it is excluded from the value columns.
hh_value_cols = [col for col in hh_features if col != 'idhogar']

# Households with a head: take the values recorded on the head's row.
# drop_duplicates keeps a single row should a household list several heads.
hh_features_head = data[data['parentesco1'] == 1]
hh_from_head = (
    hh_features_head
    .drop_duplicates(subset='idhogar')
    .set_index('idhogar')[hh_value_cols]
)

# Households with no head at all: fall back to the per-column maximum over
# their members. Only head-less households are selected here; using every
# non-head member would add a second row for households already covered above.
hh_features_nohead = data[~data['idhogar'].isin(hh_features_head['idhogar'])]
hh_features_nohead_max = hh_features_nohead.groupby('idhogar')[hh_value_cols].max()

# Combine both parts. The previous DataFrame.append call returned a new frame
# that was discarded (and append no longer exists in pandas 2.0), so the
# head-less households were silently lost.
hh_feature_list = pd.concat([hh_from_head, hh_features_nohead_max])
hh_feature_list.head()

In [None]:
# Combine household-level columns with the per-household sums of the individual
# columns; the inner join keeps only households present in both frames.
final = hh_feature_list.join(other=ind_features_aggregate, how='inner')
final.head()


In [None]:
# Rows with a Target value are the labelled households; the rest are to be predicted.
has_target = final['Target'].notnull()

# Labels for training, as a uint8 array.
train_labels = np.array(final.loc[has_target, 'Target'].astype(np.uint8).values)

# Feature matrices for the labelled and unlabelled households (label column removed).
train_set = final.loc[has_target].drop(columns=['Target'])
test_set = final.loc[~has_target].drop(columns=['Target'])

In [None]:
# Submission base: a copy of the household id (idhogar) of every row in the
# test file, intended as the starting point for competition submissions.
submission_base = test['idhogar'].copy()
submission_base.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the labelled households for evaluation; the seed is fixed so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    train_set,
    train_labels,
    test_size=0.25,
    random_state=0,
)


In [None]:
#Baseline Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Fit a logistic regression with default parameters as the baseline classifier.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

In [None]:
# Baseline predictions for the hold-out split.
predictions = logreg.predict(x_test)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
# Confusion matrix of the baseline predictions on the hold-out split.
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

# Weighted F1 of the baseline. The result gets its own name: assigning it to
# `f1_score` would replace the imported function with a float, so re-running
# this cell would fail.
baseline_f1 = f1_score(y_test, predictions, average='weighted')
print(baseline_f1)

plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'F1 Score: {0}'.format(baseline_f1)
plt.title(all_sample_title, size = 15);



In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import make_scorer
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV now lives
# in sklearn.model_selection (already used above for train_test_split).
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score,accuracy_score,f1_score

# NOTE(review): recent xgboost releases expect class labels coded 0..n-1; if
# Target is coded from 1, fit() may reject the labels. Confirm against the
# installed xgboost version.
clf = XGBClassifier()

# Hyper-parameter grid explored by the search.
parameters = {'learning_rate':[.05,.1,.2,.3], 'max_depth':[6,7,8,9], 'min_child_weight':[1,2],
              'gamma':[0], 'subsample':[1], 'colsample_bytree':[.3,.4,.5], 'n_estimators':[100,150]
             , 'reg_lambda':[1]}

# Score candidates with the weighted F-beta (beta = 0.5) metric.
scorer = make_scorer(fbeta_score, beta=.5,average='weighted')

# `scoring` is keyword-only in sklearn.model_selection.GridSearchCV.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# Fit the grid search on the training split to find the best parameters.
grid_fit = grid_obj.fit(x_train,y_train)

# Best estimator and the parameters that produced it.
best_clf = grid_fit.best_estimator_
best_params=grid_fit.best_params_

# Predictions of the default-parameter model and of the tuned model.
predictions = (clf.fit(x_train, y_train)).predict(x_test)
best_predictions = best_clf.predict(x_test)

# Report the before-and-after scores.
print ("Unoptimized model\n------")
print ("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print ("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5,average='weighted')))
print ("\nOptimized Model\n------")
print ("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print ("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5,average='weighted')))
print ("Best estimator parameters")
print (best_params)

In [None]:
# Confusion matrix of the tuned model. It must be built from
# `best_predictions`, the same predictions the F1 score in the title is
# computed from; `predictions` holds the untuned model's output.
cm = metrics.confusion_matrix(y_test, best_predictions)
print(cm)

# Weighted F1 of the tuned model on the hold-out split.
f1score = f1_score(y_test,best_predictions, average='weighted')

plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'F1 Score: {0}'.format(f1score)
plt.title(all_sample_title, size = 15);

In [None]:
# Preview the feature rows that will be scored for the submission.
test_set.head()

In [None]:
## submission
# Predict one label per test household; test_set is indexed by household id.
# Separate names are used so `best_predictions` (hold-out predictions) and
# `final` (feature frame) from earlier cells are not overwritten, which keeps
# those cells re-runnable.
household_predictions = best_clf.predict(test_set)
subs = pd.DataFrame({'Target': household_predictions.astype(np.int64)}, index=test_set.index)
# Guard against repeated household ids so the join below cannot multiply rows.
subs = subs[~subs.index.duplicated(keep='first')]

# Give every individual in the test file the prediction of their household.
# The row identifier is the file's own 'Id' column; building it as
# 'ID_' + idhogar produced ids that do not exist in the test file.
submission = test[['Id', 'idhogar']].join(subs, on='idhogar', how='left')

# Households removed during preprocessing have no prediction. Fall back to the
# most frequently predicted class so every row carries an integer label.
fallback_class = subs['Target'].mode().iloc[0]
submission['Target'] = submission['Target'].fillna(fallback_class).astype(np.int64)

submission = submission[['Id', 'Target']]
submission.to_csv('sample_submission.csv', index=False)
display(submission.shape)
submission.head()

