In [2]:
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import tensorflow as tf
import math
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [3]:
data_path = 'data/'

In [4]:
################### PART - 2 : Prepare COMB Data for the analysis ###################
#####################################################################################

print('PART - 2 : Preparing Data for the analysis')

def one_hot_df(data_frame, one_hot_colnames=list()) :
    if len(one_hot_colnames) != 0:
        colnames = list(data_frame)
        hot_col = list()

        for hot in one_hot_colnames :
            if hot in colnames :
                hot_col.append(hot)
    else:
        hot_col = list(data_frame)
        
    if 'district' in hot_col :
        hot_col.remove('district')
    if 'state' in hot_col :
        hot_col.remove('state')
    if 'age' in hot_col:
        hot_col.remove('age')
            
    data_frame = pd.get_dummies(data_frame, columns=hot_col, sparse=True)
    return (data_frame)

PART - 2 : Preparing Data for the analysis


In [15]:
# These are the columns which I think are irrelevant in the analysis
# Feel free to add or remove entries 
col_to_be_removed = [
    'state',
    'Unnamed: 0',
    'psu_id',
    'house_no',
    'house_hold_no',
    'member_identity',
    'father_serial_no',
    'mother_serial_no',
    'date_of_birth',
    'month_of_birth',
    'year_of_birth',
    'date_of_marriage',
    'month_of_marriage',
    'year_of_marriage',
    'building_no',
    'no_of_dwelling_rooms',
    'rural_1',
    'rural_2',
    'stratum_code',
    'relation_to_head',
    'member_identity',
    'father_serial_no',
    'mother_serial_no',
    'date_of_birth',
    'month_of_birth',
    'year_of_birth',
    'date_of_marriage',
    'month_of_marriage',
    'year_of_marriage',
    'isheadchanged',
    'year'
]

In [5]:
pd.__version__

u'0.19.2'

In [6]:
dist = pd.read_csv(data_path + '22_AHS_COMB_Clean.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4155187 entries, 0 to 4155186
Data columns (total 78 columns):
Unnamed: 0                         int64
state                              int64
district                           int64
rural                              int64
stratum_code                       int64
psu_id                             int64
house_no                           int64
house_hold_no                      int64
sex                                int64
usual_residance                    float64
relation_to_head                   float64
member_identity                    float64
father_serial_no                   float64
mother_serial_no                   float64
date_of_birth                      float64
month_of_birth                     float64
year_of_birth                      float64
age                                float64
religion                           float64
social_group_code                  float64
marital_status                     float64
da

In [16]:
print('  ')
print('Removing not-so-useful columns')
# Dropping the above columns
dist = dist.drop(col_to_be_removed,axis=1,errors='ignore')

# As we need to calculate for variable 'diagnosed_for'
# So, we drop the rows where 'diagnosed_for' == NaN
dist_p = dist[np.isfinite(dist['diagnosed_for'])]

  
Removing not-so-useful columns


In [17]:
dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4155187 entries, 0 to 4155186
Data columns (total 58 columns):
district                           int64
rural                              int64
sex                                int64
usual_residance                    float64
age                                float64
religion                           float64
social_group_code                  float64
marital_status                     float64
currently_attending_school         float64
reason_for_not_attending_school    float64
highest_qualification              float64
occupation_status                  float64
disability_status                  float64
injury_treatment_type              float64
illness_type                       float64
treatment_source                   float64
symptoms_pertaining_illness        float64
sought_medical_care                float64
diagnosed_for                      float64
diagnosis_source                   float64
regular_treatment                

In [9]:
dist.head()

Unnamed: 0.1,Unnamed: 0,district,rural,psu_id,house_no,sex,usual_residance,age,religion,social_group_code,...,is_water_pump,cart,land_possessed,residancial_status,iscoveredbyhealthscheme,healthscheme_1,healthscheme_2,housestatus,householdstatus,as_binned
0,0,3,1,157280582,633,1,1.0,30.0,1.0,3.0,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
1,1,3,1,157278213,634,1,1.0,49.0,1.0,3.0,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
2,2,3,1,157278379,634,2,1.0,47.0,1.0,3.0,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
3,3,3,1,157281242,634,1,1.0,25.0,1.0,3.0,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
4,4,3,1,157280945,634,1,1.0,27.0,1.0,3.0,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,


In [19]:
del dist
# dist_p = dist_p.reset_index(drop=True)

# Removing rows with 'diagnosed_for' = 0.0
dist_p = dist_p[dist_p['diagnosed_for'] != 0.0]
# dist_p = dist_p.reset_index(drop=True)

In [20]:
dist_p.head()

Unnamed: 0,district,rural,sex,usual_residance,age,religion,social_group_code,marital_status,currently_attending_school,reason_for_not_attending_school,...,is_water_pump,cart,land_possessed,residancial_status,iscoveredbyhealthscheme,healthscheme_1,healthscheme_2,housestatus,householdstatus,as_binned
98,3,1,1,1.0,42.0,1.0,2.0,4.0,,,...,2.0,4.0,3.0,1.0,2.0,,,1.0,1.0,
108,3,1,1,1.0,67.0,1.0,2.0,3.0,,,...,2.0,4.0,3.0,1.0,2.0,,,1.0,1.0,
139,3,1,1,1.0,67.0,1.0,1.0,3.0,,,...,2.0,4.0,2.0,1.0,2.0,,,1.0,1.0,
162,3,1,2,1.0,43.0,1.0,2.0,3.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
206,3,1,1,1.0,22.0,1.0,3.0,1.0,,,...,2.0,4.0,1.0,1.0,1.0,7.0,,1.0,1.0,


In [21]:
dist_p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158036 entries, 98 to 4155184
Data columns (total 58 columns):
district                           158036 non-null int64
rural                              158036 non-null int64
sex                                158036 non-null int64
usual_residance                    149826 non-null float64
age                                158036 non-null float64
religion                           158036 non-null float64
social_group_code                  158036 non-null float64
marital_status                     158036 non-null float64
currently_attending_school         8262 non-null float64
reason_for_not_attending_school    1218 non-null float64
highest_qualification              154501 non-null float64
occupation_status                  155340 non-null float64
disability_status                  158036 non-null float64
injury_treatment_type              158036 non-null float64
illness_type                       158036 non-null float64
treatment_so

In [11]:
%whos

Variable            Type         Data/Info
------------------------------------------
Activation          type         <class 'keras.layers.core.Activation'>
Dense               type         <class 'keras.layers.core.Dense'>
Dropout             type         <class 'keras.layers.core.Dropout'>
SGD                 type         <class 'keras.optimizers.SGD'>
Sequential          type         <class 'keras.models.Sequential'>
col_to_be_removed   list         n=29
data_path           str          data/
dist_p              DataFrame             Unnamed: 0  dist<...>158036 rows x 61 columns]
h5py                module       <module 'h5py' from '//an<...>kages/h5py/__init__.pyc'>
math                module       <module 'math' from '//an<...>2.7/lib-dynload/math.so'>
np                  module       <module 'numpy' from '//a<...>ages/numpy/__init__.pyc'>
one_hot_df          function     <function one_hot_df at 0x11c3c9a28>
pd                  module       <module 'pandas' from '//<...>ges/panda

In [22]:
# Shuffling the dataset and reset index
dist_p = dist_p.iloc[np.random.permutation(len(dist_p))]
dist_p = dist_p.reset_index(drop=True)

In [24]:
print('Splitting to-predict column')
# Seperating 'diagnosed_for' variable for prediction
diagnosed_col = dist_p[['diagnosed_for']]
diagnosed_data = dist_p.drop(['diagnosed_for'], inplace=False, axis=1, errors='ignore')
del dist_p

Splitting to-predict column


In [25]:
diagnosed_data.head()

Unnamed: 0,district,rural,sex,usual_residance,age,religion,social_group_code,marital_status,currently_attending_school,reason_for_not_attending_school,...,is_water_pump,cart,land_possessed,residancial_status,iscoveredbyhealthscheme,healthscheme_1,healthscheme_2,housestatus,householdstatus,as_binned
0,5,1,2,1.0,19.0,1.0,2.0,1.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
1,4,1,1,1.0,51.0,1.0,1.0,3.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
2,11,2,2,1.0,31.0,1.0,3.0,3.0,,,...,2.0,4.0,6.0,,,,,,,5.0
3,6,1,1,1.0,65.0,1.0,3.0,3.0,,,...,2.0,4.0,1.0,,,,,,,2.0
4,6,1,2,1.0,23.0,1.0,2.0,3.0,,,...,2.0,4.0,2.0,1.0,1.0,7.0,,1.0,1.0,


In [26]:
diagnosed_col.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158036 entries, 0 to 158035
Data columns (total 1 columns):
diagnosed_for    158036 non-null float64
dtypes: float64(1)
memory usage: 1.2 MB


In [27]:
print('  ')
print('Saving diagnosed_for column to data/22_COMB_diag_col.csv')
diagnosed_col.to_csv(data_path + '22_COMB_diag_col.csv')
del diagnosed_col

  
Saving diagnosed_for column to data/22_COMB_diag_col.csv


In [29]:
diagnosed_data.isnull().sum().sum()

1494606

In [40]:
#  df.fillna(df.mean())
# ff = diagnosed_data.fillna(diagnosed_data.mean())
ff = diagnosed_data.fillna(0)

In [41]:
diagnosed_data.shape

(158036, 57)

In [42]:
ff.shape

(158036, 57)

In [43]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
h = enc.fit(ff)
k = enc.transform(ff).toarray()

In [44]:
k.shape

(158036, 457)

In [39]:
k[1:5]

array([[ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.]])

In [11]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
a = enc.fit(diagnosed_col)

In [13]:
a = enc.transform(diagnosed_col).toarray()

In [17]:
diagnosed_data.shape

(158036, 60)

In [53]:
np.array_split(diagnosed_data, 5)[1].shape

(31607, 57)

In [54]:
one = pd.DataFrame(np.array_split(diagnosed_data, 5)[0])

In [55]:
one.head()

Unnamed: 0,district,rural,sex,usual_residance,age,religion,social_group_code,marital_status,currently_attending_school,reason_for_not_attending_school,...,is_water_pump,cart,land_possessed,residancial_status,iscoveredbyhealthscheme,healthscheme_1,healthscheme_2,housestatus,householdstatus,as_binned
0,5,1,2,1.0,19.0,1.0,2.0,1.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
1,4,1,1,1.0,51.0,1.0,1.0,3.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
2,11,2,2,1.0,31.0,1.0,3.0,3.0,,,...,2.0,4.0,6.0,,,,,,,5.0
3,6,1,1,1.0,65.0,1.0,3.0,3.0,,,...,2.0,4.0,1.0,,,,,,,2.0
4,6,1,2,1.0,23.0,1.0,2.0,3.0,,,...,2.0,4.0,2.0,1.0,1.0,7.0,,1.0,1.0,


In [56]:
one_hot = one_hot_df(one)

In [57]:
one_hot.shape

(31608, 301)

In [58]:
two = pd.DataFrame(np.array_split(diagnosed_data, 5)[1])
three = pd.DataFrame(np.array_split(diagnosed_data, 5)[2])
four = pd.DataFrame(np.array_split(diagnosed_data, 5)[3])
five = pd.DataFrame(np.array_split(diagnosed_data, 5)[4])

In [60]:
two_hot = one_hot_df(two)
three_hot = one_hot_df(three)
four_hot = one_hot_df(four)
five_hot = one_hot_df(five)

In [61]:
two_hot.shape

(31607, 301)

In [75]:
# set(three_hot).intersection(two_hot)
set(list(three_hot)) - set(list(four_hot))

{'healthscheme_2_4.0', 'housestatus_3.0'}

In [62]:
three_hot.shape

(31607, 302)

In [63]:
four_hot.shape

(31607, 300)

In [64]:
five_hot.shape

(31607, 302)

In [78]:
total = list()
total.append(one_hot)
total.append(two_hot)
total.append(three_hot)
total.append(four_hot)
total.append(five_hot)
result = pd.concat(total)

In [79]:
result.shape

(158036, 302)

In [76]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158036 entries, 0 to 158035
Columns: 302 entries, age to water_filteration_8.0
dtypes: float64(3), int64(1), uint8(298)
memory usage: 11.4 MB


In [68]:
result = pd.DataFrame(result)

In [70]:
result.head()

Unnamed: 0,age,alcohol_1.0,alcohol_2.0,alcohol_3.0,alcohol_4.0,as_binned_1.0,as_binned_2.0,as_binned_3.0,as_binned_4.0,as_binned_5.0,...,treatment_source_99.0,usual_residance_1.0,water_filteration_1.0,water_filteration_2.0,water_filteration_3.0,water_filteration_4.0,water_filteration_5.0,water_filteration_6.0,water_filteration_7.0,water_filteration_8.0
0,19.0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,51.0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,31.0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
3,65.0,0,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,23.0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [69]:
result.isnull().sum().sum()

126429

In [19]:
bb = diagnosed_data.fillna(0)

In [24]:
bb.head()

Unnamed: 0.1,Unnamed: 0,district,rural,psu_id,house_no,sex,usual_residance,age,religion,social_group_code,...,is_water_pump,cart,land_possessed,residancial_status,iscoveredbyhealthscheme,healthscheme_1,healthscheme_2,housestatus,householdstatus,as_binned
0,2214686,4,2,157502641,53,1,1.0,14.0,1.0,3.0,...,2.0,4.0,6.0,1.0,1.0,2.0,0.0,1.0,1.0,0.0
1,2790098,5,1,157951458,33,1,1.0,33.0,1.0,2.0,...,2.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,1644600,7,1,158410905,264,2,1.0,55.0,1.0,3.0,...,2.0,4.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,0.0
3,2933732,9,2,158788315,38,1,1.0,60.0,2.0,3.0,...,2.0,4.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,2254437,6,2,158275760,99,2,1.0,32.0,1.0,2.0,...,2.0,4.0,6.0,1.0,1.0,7.0,0.0,1.0,1.0,0.0


In [None]:
cc = one_hot_df(bb)

In [None]:
cc.info()

In [22]:
enc.fit(bb)
b = enc.transform(bb).toarray()

In [23]:
b.shape

(158036, 317435)

In [None]:
print('One Hot Encoding Data')
# One - Hot encoding for the data
diagnosed_data = one_hot_df(diagnosed_data)

One Hot Encoding Data


In [None]:
diagnosed_data.info()

In [None]:
print('Saving One-Hot Columns to data/22_COMB_diag_hotData.csv')
diagnosed_data.to_csv(data_path + '22_COMB_diag_hotData.csv')
del diagnosed_data

print('------------------------------------------------------')
print('    ')

In [None]:
################### PART - 3 : Apply Machine Learning on the data ###################
#####################################################################################

print('PART - 3 : Applying Machine Learning on the data')

diagnosed_data = pd.read_csv(data_path + '22_COMB_diag_hotData.csv', low_memory=False)
diagnosed_col = pd.read_csv(data_path + '22_COMB_diag_col.csv', low_memory=False)

assert (diagnosed_data.shape[0] == diagnosed_col.shape[0])
split_index = int(diagnosed_data.shape[0] * 0.85)

print('   ')
print('Splitting train and test data in ratio 85:15')
train_data = np.array(diagnosed_data.astype(float))[:split_index]
train_label = np.array(diagnosed_col.astype(float))[:split_index][:,0]

test_data = np.array(diagnosed_data.astype(float))[split_index:]
test_label = np.array(diagnosed_col.astype(float))[split_index:][:,0]

# Replace Label No 99 by 32
# Label No 99 causes 'to_categorical' to make 100 one-hot values
# Replacing it by 33 leads to only 33 values
def replace_99_labes(label_data):
	label_list = list()
	for i in range(len(label_data)):
		if label_data[i] == 99.0 :
			label_data[i] = 32.0

	return label_data

train_label = to_categorical(replace_99_labes(train_label).astype('int32'), nb_classes=None)
test_label = to_categorical(replace_99_labes(test_label).astype('int32'), nb_classes=None)

# Fully-Connected Neural network with 4 Hidden layers
model = Sequential()
# Input Layer
model.add(Dense(1000, input_dim=1208, init='uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Hidden Layer - 1
model.add(Dense(750, init='uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Hidden Layer - 2
model.add(Dense(500, init='uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Hidden Layer - 3
model.add(Dense(250, init='uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Hidden Layer - 4
model.add(Dense(100, init='uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Output Layer
model.add(Dense(33, init='uniform'))
model.add(Activation('softmax'))

sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])


# model.fit(train_data, categorical_labels,
#           nb_epoch=2000,
#           batch_size=128)

model.evaluate(test_data, to_categorical(test_label.astype('int32'), nb_classes=None))

In [80]:
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import tensorflow as tf
import math

def get_sheet_field_names(excel_workbook, sheet_name) :
    # Start from row 3, as initial 2 rows contain no info
    sheet = excel_workbook.parse(sheet_name, skiprows=2, na_values=['NA'])
    # Find index of 'NOTES:' in 1st cloumn and delete all rows below it
    notes_index = sheet.loc[sheet['Field Order'] == "NOTES:"].index.tolist()[0]
    sheet = sheet.ix[1 : notes_index - 1]
    
    # select column 2,3 and 4 (Filed name, Description and Codes used)
    sheet = sheet[[1,2,3]]
    # Remove <NaN> from Field Names
    sheet = sheet.dropna(subset=[list(sheet)[0]])
    
    # Selecting Non-Yellow field names
    # Dropping <NaN> from Field Descriptions and Codes Used
    sheet_non_yellow = sheet.dropna(subset=[list(sheet)[1], list(sheet)[2]])
    
    # Selecting 'None' and Non-'None' Codes used
    sheet_code_not_none = sheet_non_yellow[sheet_non_yellow['Codes Used'] != "None"]
    sheet_code_none = sheet_non_yellow[sheet_non_yellow['Codes Used'] == "None"]
    
    # Convert all 'Field Names' to list()
    sheet_all = sheet['Field Name'].tolist()
    sheet_non_yellow = sheet_non_yellow['Field Name'].tolist()
    sheet_yellow = list(set(sheet_all) - set(sheet_non_yellow))
    sheet_code_not_none = sheet_code_not_none['Field Name'].tolist()
    sheet_code_none = sheet_code_none['Field Name'].tolist()
    
    # Output in form of list() of lists()
    output = list()
    output.append(sheet_yellow)
    output.append(sheet_non_yellow)
    output.append(sheet_all)
    output.append(sheet_code_none)
    output.append(sheet_code_not_none)
    # output = list[sheet_yellow, sheet_non_yellow, sheet_all, sheet_code_none, sheet_code_not_none]
    
    return(output)

def lowercase_32Char(list_):
    list_1 = [x.lower() for x in list_]
    list_2 = [x[0:32] for x in list_1]
    return (list_2)

def lowercase_32Char_list(field_list) :
    # Field names in CSV files are max upto 32 characters
    # and all small letters
    l = len(field_list)
    sol = list()
    
    for field in field_list:
        sol.append(lowercase_32Char(field))
    
    return(sol)

# Remove yellow fields from the data frame
def remove_yellow_fields(data_frame, yellow_field_list) :
    df_col_names = list(data_frame)
    
    drop_col = list()
    for yellow in yellow_field_list :
        if yellow in df_col_names :
            drop_col.append(yellow)
            
    df = data_frame.drop(drop_col, axis=1)
    
    if 'id' in list(df) :
        df = df.drop(['id'], axis=1)
    
    return df

def sort_dataset_state_dist_house(data_frame) :
    return (data_frame.sort(['state', 'district', 'house_no', 'house_hold_no'])).reset_index(drop=True)


# AHS_struct_workbook = pd.ExcelFile("Data_structure_AHS.xlsx")
# AHS_struct_sheets_names = AHS_struct_workbook.sheet_names

# comb_field_list = lowercase_32Char_list(get_sheet_field_names(AHS_struct_workbook, "COMB"))

# data_clean = remove_yellow_fields(data, comb_field_list[0])
# data_clean_sorted = sort_dataset_state_dist_house(data_clean)

# data_clean_sorted.to_csv('22_AHS_COMB_Clean_Sorted.csv')

In [81]:
AHS_struct_workbook = pd.ExcelFile(data_path + "Data_structure_AHS.xlsx")
AHS_struct_sheets_names = AHS_struct_workbook.sheet_names

In [82]:
comb_field_list = lowercase_32Char_list(get_sheet_field_names(AHS_struct_workbook, "COMB"))
AHS_comb = pd.read_csv(data_path + "22_AHS_COMB.csv", sep="|")

  interactivity=interactivity, compiler=compiler, result=result)


In [83]:
AHS_comb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4155187 entries, 0 to 4155186
Data columns (total 99 columns):
hh_id                              float64
client_hh_id                       float64
hl_id                              float64
state                              int64
district                           int64
rural                              int64
stratum_code                       int64
psu_id                             int64
house_no                           int64
house_hold_no                      int64
currently_dead_or_out_migrated     float64
hh_serial_no                       int64
sex                                int64
usual_residance                    float64
relation_to_head                   float64
member_identity                    float64
father_serial_no                   float64
mother_serial_no                   float64
date_of_birth                      float64
month_of_birth                     float64
year_of_birth                      float64
ag

In [None]:
size_threshold = 30000
no_of_df = int(diagnosed_data.shape[0]/size_threshold)
# List of splitted datasets
splitted_dataset = np.array_split(diagnosed_data, no_of_df)
df_list = list()
for df in splitted_dataset:
	df_list.append(pd.DataFrame(df))

hot_df_list = list()
for df in df_list:
	hot_df_list.append(one_hot_df(df))

diagnosed_data = pd.concat(hot_df_list)
diagnosed_data = diagnosed_data.fillna(0)

In [84]:
AHS_comb.shape

(4155187, 99)

In [85]:
splitted_dataset = np.array_split(AHS_comb, 5)
df_list = list()
for df in splitted_dataset:
    df_list.append(pd.DataFrame(df))

In [86]:
comb = pd.concat(df_list)

In [87]:
comb.shape

(4155187, 99)

In [88]:
comb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4155187 entries, 0 to 4155186
Data columns (total 99 columns):
hh_id                              float64
client_hh_id                       float64
hl_id                              float64
state                              int64
district                           int64
rural                              int64
stratum_code                       int64
psu_id                             int64
house_no                           int64
house_hold_no                      int64
currently_dead_or_out_migrated     float64
hh_serial_no                       int64
sex                                int64
usual_residance                    float64
relation_to_head                   float64
member_identity                    float64
father_serial_no                   float64
mother_serial_no                   float64
date_of_birth                      float64
month_of_birth                     float64
year_of_birth                      float64
ag

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets

In [8]:
train_data = pd.read_csv('/Users/himanshubabal/Downloads/train_data.csv')
train_label = pd.read_csv('/Users/himanshubabal/Downloads/train_label.csv')

test_data = pd.read_csv('/Users/himanshubabal/Downloads/test_data.csv')

In [13]:
np.copy(train_label).flatten().shape

(206,)

In [20]:
X = train_data[:175] 
Y = np.copy(train_label).flatten()[:175]

h = .02  # step size in the mesh

logreg = linear_model.LogisticRegression(C=1e2,penalty='l2')

logreg.fit(X, Y)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
logreg.predict(train_data[175:])

array([2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1])

In [32]:
def acc(l1, l2):
    sc = 0
    for i in range(len(l1)):
        if l1[i] == l2[i]:
            sc += 1
    s = float(sc/len(l1))
    return s
#     return (100.0 * np.sum(np.argmax(predictions, 2).T == labels) / predictions.shape[1] / predictions.shape[0])

In [28]:
list(train_label[175:].values.flatten())

[2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1]

In [33]:
acc(logreg.predict(train_data[175:]), list(train_label[175:].values.flatten()))

0.8064516129032258

In [18]:
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:].min() - .5, X[:].max() + .5
y_min, y_max = X[:].min() - .5, X[:].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

TypeError: cannot convert the series to <class 'float'>