In [None]:
import pandas as pd
import numpy as np
import h5py
import math
import os

In [None]:
# Directory holding the AHS input/output files. Set the AHS_DATA_PATH
# environment variable to use another location; when it is unset or empty the
# original location is used. os.path.join(..., '') guarantees the trailing
# separator that the `data_path + filename` concatenations rely on.
data_path = os.path.join(
    os.environ.get('AHS_DATA_PATH') or '/home/physics/btech/ph1140797/AHS-ML-Project/data/',
    ''
)

In [None]:
from python_helper.sort_clean_data import lowercase_32Char_list
from python_helper.sort_clean_data import get_sheet_field_names
from python_helper.sort_clean_data import remove_yellow_fields
from python_helper.sort_clean_data import sort_dataset_state_dist_house
from python_helper.sort_clean_data import create_balanced_classes

In [None]:
def remove_yellow_df():
	"""Clean the COMB dataset with ``remove_yellow_fields`` and save it as CSV.

	Reads the "COMB" sheet field names from ``Data_structure_AHS.xlsx`` and the
	pipe-separated ``22_AHS_COMB.csv`` (both located under ``data_path``),
	passes the data together with the first entry of the processed field list
	to ``remove_yellow_fields`` and writes the returned frame to
	``22_AHS_COMB_Clean.csv`` in the same directory.

	Returns:
		None. The function is called for its side effects: progress messages
		on stdout and the CSV file written to disk.
	"""
	# The context manager closes the workbook handle as soon as the field
	# names have been extracted, even if one of the helpers raises.
	with pd.ExcelFile(data_path + "Data_structure_AHS.xlsx") as AHS_struct_workbook:
		# ---- Uncomment these lines for processing other datasets as well ----
		# mort_field_list = lowercase_32Char_list(get_sheet_field_names(AHS_struct_workbook, "MORT"))
		# wps_field_list = lowercase_32Char_list(get_sheet_field_names(AHS_struct_workbook, "WPS"))
		# women_field_list = lowercase_32Char_list(get_sheet_field_names(AHS_struct_workbook, "WOMAN"))
		comb_field_list = lowercase_32Char_list(get_sheet_field_names(AHS_struct_workbook, "COMB"))

	# ---- Uncomment these lines for processing other datasets as well ----
	# AHS_mort = pd.read_csv(data_path + "22_AHS_MORT.csv", sep="|")
	# AHS_wps = pd.read_csv(data_path + "22_AHS_WPS.csv", sep="|")
	# AHS_women = pd.read_csv(data_path + "22_AHS_WOMEN.csv", sep="|")
	#
	# mort_clean = remove_yellow_fields(AHS_mort, mort_field_list[0])
	# wps_clean = remove_yellow_fields(AHS_wps, wps_field_list[0])
	# women_clean = remove_yellow_fields(AHS_women, women_field_list[0])
	#
	# mort_clean.to_csv(data_path + '22_AHS_MORT_Clean.csv')
	# wps_clean.to_csv(data_path + '22_AHS_WPS_Clean.csv')
	# women_clean.to_csv(data_path + '22_AHS_WOMEN_Clean.csv')

	AHS_comb = pd.read_csv(data_path + "22_AHS_COMB.csv", sep="|")

	print('    ')
	print('Removing Yellow Fields from COMB Data')
	data_clean = remove_yellow_fields(AHS_comb, comb_field_list[0])

	print('Saving Clean Data to data/22_AHS_COMB_Clean.csv')
	# The index is written too (to_csv default), so the file gains an unnamed
	# leading column that pandas labels 'Unnamed: 0' when the CSV is read back.
	data_clean.to_csv(data_path + '22_AHS_COMB_Clean.csv')
	# No explicit `del` needed: the locals are released when the function returns.

In [None]:
def check_if_exists(force=False):
	"""Run ``remove_yellow_df`` unless the cleaned CSV is already on disk.

	Args:
		force: when truthy, ``remove_yellow_df`` is run even if
			``22_AHS_COMB_Clean.csv`` already exists under ``data_path``.

	Returns:
		None. Either the cleaning step is run or a notice is printed.
	"""
	clean_csv = data_path + '22_AHS_COMB_Clean.csv'
	already_cleaned = os.path.exists(clean_csv)

	# Guard clause: nothing to do when the output exists and no redo was requested.
	if already_cleaned and not force:
		print('Yellow Fields already removed. Proceeding further')
		return

	remove_yellow_df()

# Pass force=True to regenerate the cleaned CSV even if it already exists.
check_if_exists()

In [None]:
# Progress banner marking the start of the data-preparation stage.
print('PART - 2 : Preparing Data for the analysis')

def one_hot_df(data_frame, one_hot_colnames=None, exclude=('district', 'state', 'age')):
    """One-hot encode columns of ``data_frame`` with ``pd.get_dummies``.

    Args:
        data_frame: pandas DataFrame to encode. It is not modified; the
            encoded frame is returned.
        one_hot_colnames: optional list of column names to encode. Names that
            are not columns of ``data_frame`` are ignored and repeated names
            are used once. When omitted or empty, every column is a candidate.
        exclude: iterable of column names that are never encoded, even when
            they are requested. Defaults to ``('district', 'state', 'age')``.

    Returns:
        The DataFrame produced by ``pd.get_dummies(data_frame, columns=...)``
        for the selected columns.
    """
    available = list(data_frame)
    if one_hot_colnames is not None and len(one_hot_colnames) != 0:
        requested = [name for name in one_hot_colnames if name in available]
    else:
        requested = available

    # Filter instead of list.remove(): remove() drops only the first match, so
    # a protected column listed twice would otherwise still be encoded.
    protected = set(exclude)
    seen = set()
    hot_col = []
    for name in requested:
        if name in protected or name in seen:
            continue
        seen.add(name)
        hot_col.append(name)

    return pd.get_dummies(data_frame, columns=hot_col)


# These are the columns which I think are irrelevant in the analysis
# Feel free to add or remove entries
# Each name is listed exactly once, so the list is also safe for code that
# removes the columns one at a time.
col_to_be_removed = [
    'state',
    'Unnamed: 0',
    'psu_id',
    'house_no',
    'house_hold_no',
    'member_identity',
    'father_serial_no',
    'mother_serial_no',
    'date_of_birth',
    'month_of_birth',
    'year_of_birth',
    'date_of_marriage',
    'month_of_marriage',
    'year_of_marriage',
    'building_no',
    'no_of_dwelling_rooms',
    'rural_1',
    'rural_2',
    'stratum_code',
    'relation_to_head',
    'isheadchanged',
    'year'
]

In [None]:
# Load the cleaned COMB dataset (the file written by remove_yellow_df).
dist = pd.read_csv(data_path + '22_AHS_COMB_Clean.csv')

In [None]:
# (rows, columns) of the loaded dataset, shown as the cell output.
dist.shape

In [None]:
# Keep only the rows whose 'diagnosed_for' value is finite (np.isfinite is
# False for NaN and +/-inf), i.e. rows where the target is actually present.
dist_p = dist[np.isfinite(dist['diagnosed_for'])]

In [None]:
# (rows, columns) after the filter; compare with dist.shape to see how many rows were dropped.
dist_p.shape

In [None]:
# Target: selected with a list so it stays a one-column DataFrame, not a Series.
diagnosed_col = dist_p.loc[:, ['diagnosed_for']]
# Features: the same rows with the target column left out (returns a new frame).
diagnosed_data = dist_p.drop(['diagnosed_for'], axis=1, errors='ignore')

In [None]:
size_threshold = 30000
# At least one chunk: with fewer than `size_threshold` rows the quotient is 0,
# and np.array_split raises ValueError when asked for 0 sections.
no_of_df = max(1, diagnosed_data.shape[0] // size_threshold)
# List of splitted datasets
splitted_dataset = np.array_split(diagnosed_data, no_of_df)
df_list = [pd.DataFrame(chunk) for chunk in splitted_dataset]

# One-hot encode chunk by chunk, then stack the encoded chunks back together.
hot_df_list = list()
for df in df_list:
    hot_df_list.append(one_hot_df(df))

# NOTE: this overwrites `diagnosed_data`, which is also this cell's input, so
# the cell is not idempotent: re-run the cell that builds `diagnosed_data`
# before running this one again.
diagnosed_data = pd.concat(hot_df_list)

In [None]:
# Replace every missing value with 0. Presumably this mainly targets the NaNs
# left by concatenating chunks whose one-hot columns differ — confirm that no
# genuine missing values are meant to be kept.
diagnosed_data = diagnosed_data.fillna(0)