In [2]:
###Hide
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import grid_search
from sklearn.decomposition import PCA
from sklearn import feature_selection as fs
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

## Data Scrubbing Process

#### Summary of the data

The National Child Development Study (NCDS) was a perinatal mortality survey which examined the social factors associated with stillbirth and infant mortality among over 17,000 babies born in the UK during 1958. Since then the cohort has been surveyed on six other occasions in order to monitor their health. These surveys were carried out in 1965 (age 7), 1969 (age 11), 1974 (age 16), 1981 (age 23), 1991 (age 33) and 1999/2000 (age 41/42). As part of the 1991 survey, a special study was also undertaken of the children of one third of the cohort members, including assessments of the behaviour and cognitive development of approximately 5000 children. There have also been surveys of sub-samples of the cohort, the most recent occurring in 1996 (age 37) when information was collected on the basic skills of a 10% sample of cohort members. [[reference]](../references/ncds_and_bcs70_response.pdf)

File ncds0123 :

File ncds_pms_additionals :

File ncds_response :


In [3]:
# Load and inspect the main NCDS data file (tab-separated text).
# low_memory=False asks pandas to read the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on this very wide table.
ncds_data = pd.read_csv('datasets/ncds0123.txt', delimiter='\t', low_memory=False)
# Report (rows, columns). A single-argument print() emits the same text on
# Python 2 and Python 3, unlike the Python-2-only print statement.
print("Shape of data: {}".format(ncds_data.shape))
ncds_data.head()

Shape of data: (18558, 1765)


Unnamed: 0,ncdsid,n622,n0region,n1region,n2region,n3region,n553,n545,n520,n490,...,n1849,dvht07,dvht11,dvht16,dvrwt07,dvrwt11,dvrwt16,dvwt07,dvwt11,dvwt16
0,N10001N,2,9,9,9,9,23,4,2,12,...,-1,1.21899986267032,1.47299957275415,1.59999942779607,110.347991943347,98.1929931640604,105.055999755876,25.8549957275385,37.6489868164152,56.0199890136717
1,N10002P,1,9,8,8,8,34,4,5,1,...,-1,1.34599971771224,-1.0,-1.0,90.865997314449,-1.0,-1.0,26.3089904785155,-1.0,-1.0
2,N10003Q,1,4,4,4,4,34,4,10,1,...,-1,1.32099914550831,1.49899959564243,1.87999916076665,87.9599914550983,96.4049987792867,89.382995605487,24.4939880371087,38.1019897460905,66.6799926757659
3,N10004R,2,1,1,1,1,26,4,11,1,...,-1,1.29499912262003,1.51099967956562,1.62999916076665,105.16198730471,111.588989257796,132.054992675766,28.122985839843,45.8139953613169,72.7999877929584
4,N10005S,2,10,10,10,10,25,4,1,3,...,-1,1.34599971771224,-1.0,-1.0,129.382995605487,-1.0,-1.0,37.6489868164152,-1.0,-1.0


In [4]:
# Load and inspect the PMS additions data file (tab-separated text).
# low_memory=False asks pandas to read the file in one pass instead of in
# chunks, which avoids mixed-dtype inference across chunks.
ncds_pms_data = pd.read_csv('datasets/ncds_pms_additionals.txt', delimiter='\t', low_memory=False)
# Report (rows, columns). A single-argument print() emits the same text on
# Python 2 and Python 3, unlike the Python-2-only print statement.
print("Shape of data: {}".format(ncds_pms_data.shape))
ncds_pms_data.head()

Shape of data: (16990, 54)


Unnamed: 0,NCDSID,N622,BSTATUS,POD,BOOKING,PLANC,DIASTOL,MAXDBP,ALBECL,XRAY,...,DTB8,DTB9,DTB10,ILLNESS,MOD,TOD,AAD,SBNND,PLCWGT,TABLE62
0,N10001N,2,0,8,8,2,1,1,0,0,...,0,0,0,0,0,-1,-1,-1,-2,-1
1,N10002P,1,0,2,0,4,4,3,0,0,...,0,0,0,0,0,-1,-1,-1,-2,-1
2,N10003Q,1,0,8,8,2,1,3,0,0,...,0,0,1,3,0,-1,-1,-1,-2,-1
3,N10004R,2,0,8,8,2,1,-8,-8,1,...,1,0,0,-1,0,-1,-1,-1,-2,-1
4,N10005S,2,0,8,8,2,1,3,0,1,...,0,0,0,0,0,-1,-1,-1,-2,-1


In [5]:
# Load and inspect the response data file (tab-separated text).
# low_memory=False asks pandas to read the file in one pass instead of in
# chunks, which avoids mixed-dtype inference across chunks.
ncds_response_data = pd.read_csv('datasets/ncds_response.txt', delimiter='\t', low_memory=False)
# Report (rows, columns). A single-argument print() emits the same text on
# Python 2 and Python 3, unlike the Python-2-only print statement.
print("Shape of data: {}".format(ncds_response_data.shape))
ncds_response_data.head()

Shape of data: (18558, 18)


Unnamed: 0,NCDSID,N622,BSTATUS,COBIRTH,MULTIPNO,MULTCODE,ETHNICID,OUTCME00,OUTCME01,OUTCME02,OUTCME03,OUTCME04,OUTCME05,OUTCME06,OUTCMEBM,OUTCME07,OUTCME08,OUTCME09
0,N10001N,2,0,1,-1,-1,1,1,1,1,1,1,1,1,1,1,1,1
1,N10002P,1,0,1,-1,-1,1,1,1,1,1,1,1,1,1,1,1,1
2,N10003Q,1,0,1,-1,-1,1,1,1,1,1,7,7,7,6,7,7,7
3,N10004R,2,0,1,-1,-1,1,1,1,1,1,1,1,1,1,2,2,3
4,N10005S,2,0,2,-1,-1,5,1,1,1,1,2,2,2,6,6,6,6


Let us first join all three datasets, so that we can work on all the data together.

In [6]:
# Join datasets: keep every row of the main NCDS table and attach the PMS
# additions and the response table by cohort-member id. The id column is
# lower-case in the main table ('ncdsid') and upper-case in the other two
# ('NCDSID'), hence the explicit left_on / right_on.
ncds_with_pms = pd.merge(left=ncds_data, right=ncds_pms_data, how='left',
                         left_on='ncdsid', right_on='NCDSID')
ncds_merged_data = pd.merge(left=ncds_with_pms, right=ncds_response_data, how='left',
                            left_on='ncdsid', right_on='NCDSID')

# A left join only adds rows when a key is duplicated on the right-hand side;
# fail loudly here rather than silently inflating the cohort.
assert ncds_merged_data.shape[0] == ncds_data.shape[0], \
    "left joins changed the number of rows; check for duplicate NCDSID values"

# Single-argument print() emits the same text on Python 2 and Python 3.
print("Shape of data: {}".format(ncds_merged_data.shape))

Shape of data: (18558, 1837)


In [7]:
#Hide

# Utility methods for data exploring and cleaning
def evaluate_data(df, max_categories=15):
    """Print a one-line summary of the values held by every column of ``df``.

    A column with fewer than ``max_categories`` distinct values is reported as
    categorical, together with those values; any other column is reported as
    continuous, together with its minimum and maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Columns are visited by position, so duplicate
        column names are handled.
    max_categories : int, optional
        Exclusive upper bound on the number of distinct values for a column
        to be labelled categorical. Defaults to 15.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # NOTE(review): np.unique is applied to the raw column, so how missing
    # values are counted depends on the installed NumPy version -- confirm
    # whether NaNs should be dropped before counting.
    for position, name in enumerate(df.columns):
        series = df.iloc[:, position]
        vals = np.unique(series)
        # Single-argument print() emits the same text on Python 2 and 3.
        if len(vals) < max_categories:
            print('(Categorical) {} unique values - {}: {}'.format(len(vals), name, vals))
        else:
            print('(Continuous) range of values -  {} : {} to {}'.format(
                name, series.min(), series.max()))

def evaluate_epil_columns(df, columns=None, max_categories=15):
    """Print a one-line summary of the values held by selected columns of ``df``.

    A column with fewer than ``max_categories`` distinct values is reported as
    categorical, together with those values; any other column is reported as
    continuous, together with its minimum and maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to summarise.
    columns : iterable of column labels, optional
        Columns to summarise. When omitted, the module-level ``epil_columns``
        is used; it must be defined before the call (it is defined outside
        this section of the notebook).
    max_categories : int, optional
        Exclusive upper bound on the number of distinct values for a column
        to be labelled categorical. Defaults to 15.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    if columns is None:
        # Original behaviour: fall back to the notebook-level column list.
        columns = epil_columns
    for column in columns:
        series = df[column]
        vals = np.unique(series)
        # Single-argument print() emits the same text on Python 2 and 3.
        if len(vals) < max_categories:
            print('(Categorical) {} unique values - {}: {}'.format(len(vals), column, vals))
        else:
            print('(Continuous) range of values -  {} : {} to {}'.format(
                column, series.min(), series.max()))


def columns_with_null(df):
    """Report every column of ``df`` that contains null values.

    For each column with at least one null entry a line with the column name
    and the number of null entries is printed, followed by one final line
    with the total number of such columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # The counter must live outside the loop: resetting it per column made the
    # reported total reflect only the last column inspected (always 0 or 1).
    count = 0
    for column in df.columns:
        # Count nulls directly instead of materialising a filtered frame.
        n_missing = int(df[column].isnull().sum())
        if n_missing > 0:
            # Single-argument print() emits the same text on Python 2 and 3.
            print("Predictor  {}  contain null values / Count =  {}".format(column, n_missing))
            count += 1
    print("Total number of columns with null: {}".format(count))

#### Understand the data

Let us explore the data to understand the predictors and what they mean in real life. We will also try to understand the values that each predictor can take.