## Missing data methods for our dataset 
The impact of missing data on quantitative research can be serious, leading to biased estimates of parameters, loss of information, decreased statistical power, increased standard errors, and weakened generalizability of findings. Firstly, understand that there is NO good way to deal with missing data. We have come across different solutions for data imputation depending on the kind of problem —Analysis, ML, Regression, etc. and it is difficult to provide a general solution. Here, we attempting to summarize the most commonly used methods and trying to find a structural solution.

### read data 

In [1]:
import numpy

In [2]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import tensorflow as tf
data_0 = pd.read_csv('final2.csv')
data_0 = data_0.iloc[: , 1:]
data_0=data_0.drop(['BIRTH','SCREEN','HALLUC','ANYOPIATEDAYS', 'METHADONE','OTHERDRUGS'],axis=1)
data_0=data_0.sample(frac=1)
print (data_0.columns.str.strip())
data_0.head()


2021-08-12 11:50:51.010422: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:
2021-08-12 11:50:51.010438: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Index(['DAST', 'SEX', 'HISPANIC', 'RACE', 'VET', 'ACTIVE', 'DEPLOY', 'AUDIT',
       'COSCREEN', 'BI', 'BT', 'RT', 'ANYALC', 'BINGEDAYS', 'DRUGDAYS',
       'ALCDRUGS', 'DAYSCOCAINE', 'MARYJDAYS', 'METHDAYS', 'INJECT', 'AGE',
       'TOBMONTH'],
      dtype='object')


Unnamed: 0,DAST,SEX,HISPANIC,RACE,VET,ACTIVE,DEPLOY,AUDIT,COSCREEN,BI,...,ANYALC,BINGEDAYS,DRUGDAYS,ALCDRUGS,DAYSCOCAINE,MARYJDAYS,METHDAYS,INJECT,AGE,TOBMONTH
2059,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,...,,,0.0,,0.0,0.0,0.0,0.0,55.0,0.0
6175,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0
4517,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,0.0
6680,0.0,0.0,0.0,2.0,,,,0.0,0.0,0.0,...,,,,,,,,,73.0,
3376,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0


In [None]:
import datawig

In [None]:
audit=np.where( (data_0['AUDIT']==0) & (data_0['ANYALC'].isnull()))[0]
len(audit)
for i in audit: 
    data_0.iloc[i,12]=0
data_0.info(verbose=True)

#### The msno.matrix nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.

In [None]:
import missingno as msno
%matplotlib inline
msno.matrix(data_0)

#### The missingno correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another:

In [None]:
msno.heatmap(data_0)

### The dendrogram allows you to more fully correlate variable completion, revealing trends deeper than the pairwise ones visible in the correlation heatmap:

In [None]:
msno.dendrogram(data_0)

In [None]:
data_0.isnull().sum()

In [None]:
data_0.notnull().sum()

 ### 1) Dropping the rows that have null Values.
Listwise deletion (complete-case analysis) removes all data for an observation that has one or more missing values.
#### Advantages:
    Quick Process and Simpler

#### Disadvantages:
    Significant loss of data (~40 %)

In [None]:
df = data_0.copy()

In [None]:
df=df.dropna()

In [None]:
df.notnull().sum()

### 2)  Replace Missing Values with Mean, Median & Mode & Standard Deviation & Minimum & Maximum


In [None]:
df['DAST'] = df['DAST'].astype("category")
df['SEX'] = df['SEX'].astype("category")
df['HISPANIC'] = df['HISPANIC'].astype("category")
df['RACE'] = df['RACE'].astype("category")
df['VET'] = df['VET'].astype(bool)
df['ACTIVE'] = df['ACTIVE'].astype(bool)
df['DEPLOY'] = df['DEPLOY'].astype("category")
df['AUDIT'] = df['AUDIT'].astype("category")
df['COSCREEN'] = df['COSCREEN'].astype(bool)
df['RT'] = df['RT'].astype("category")
df['BI'] = df['BI'].astype(bool)
df['BT'] = df['BT'].astype(bool)
df['INJECT'] = df['INJECT'].astype(bool)
df['TOBMONTH'] = df['TOBMONTH'].astype(bool)
###############################################################
df['ANYALC'] = df['ANYALC'].astype(float)
df['BINGEDAYS'] = df['BINGEDAYS'].astype(float)
df['DRUGDAYS'] = df['DRUGDAYS'].astype(float)
df['ALCDRUGS'] = df['ALCDRUGS'].astype(float)
df['DAYSCOCAINE'] = df['DAYSCOCAINE'].astype(float)
df['MARYJDAYS'] = df['MARYJDAYS'].astype(float)
df['ANYOPIATEDAYS'] = df['ANYOPIATEDAYS'].astype(float)
df['METHADONE'] = df['METHADONE'].astype(float)
df['HALLUC'] = df['HALLUC'].astype(float)
df['METHDAYS'] = df['METHDAYS'].astype(float)
df['OTHERDRUGS'] =df['OTHERDRUGS'].astype(float)
df['AGE'] = df['AGE'].astype(float)


### For Example: 

In [None]:
# replacing missing values mod (catogrey)
df['DAST'] = df['DAST'].fillna(df['DAST'].mode())
df['SEX'] = df['SEX'].fillna(df['SEX'].mode())
df['HISPANIC'] = df['HISPANIC'].fillna(df['HISPANIC'].mode())
df['RACE'] = df['RACE'].fillna(df['RACE'].mode())
df['VET'] = df['VET'].fillna(df['VET'].mode())
df['ACTIVE'] = df['ACTIVE'].fillna(df['ACTIVE'].mode())
df['DEPLOY'] = df['DEPLOY'].fillna(df['DEPLOY'].mode())
df['AUDIT'] = df['AUDIT'].fillna(df['AUDIT'].mode())
df['COSCREEN'] = df['COSCREEN'].fillna(df['COSCREEN'].mode())
df['RT'] = df['RT'].fillna(df['RT'].mode())
df['BI'] = df['BI'].fillna(df['BI'].mode())
df['BT'] = df['BT'].fillna(df['BT'].mode())
df['INJECT'] = df['INJECT'].fillna(df['INJECT'].mode())
df['TOBMONTH'] = df['TOBMONTH'].fillna(df['TOBMONTH'].mode())

###############################################################

# replacing missing values in quantity (float) median 
df['ANYALC'] = df['ANYALC'].fillna(df['ANYALC'].median())
df['BINGEDAYS'] = df['BINGEDAYS'].fillna(df['BINGEDAYS'].median())
df['DRUGDAYS'] = df['DRUGDAYS'].fillna(df['DRUGDAYS'].median())
df['ALCDRUGS'] = df['ALCDRUGS'].fillna(df['ALCDRUGS'].median())
df['DAYSCOCAINE'] = df['DAYSCOCAINE'].fillna(df['DAYSCOCAINE'].median())
df['MARYJDAYS'] = df['MARYJDAYS'].fillna(df['MARYJDAYS'].median())
df['ANYOPIATEDAYS'] = df['ANYOPIATEDAYS'].fillna(df['ANYOPIATEDAYS'].median())
df['METHADONE'] = df['METHADONE'].fillna(df['METHADONE'].median())
df['HALLUC'] = df['HALLUC'].fillna(df['HALLUC'].median())
df['METHDAYS'] = df['METHDAYS'].fillna(df['METHDAYS'].median())
df['OTHERDRUGS'] = df['OTHERDRUGS'].fillna(df['OTHERDRUGS'].median())
df['AGE'] = df['AGE'].fillna(df['AGE'].median())

### 3) DataWig 
DataWig learns Machine Learning models to impute missing values in tables (paper: DataWig Missing Value Imputation for Tables
)

### case (1):

In [None]:
df2 = data_0.copy()

In [None]:
df2.isnull().sum()

In [None]:
df1 = df2.copy()
df1=df1.dropna()

In [None]:
df2['DAST'] = df2['DAST'].astype(str)
df2['SEX'] = df2['SEX'].astype(str)
df2['HISPANIC'] = df2['HISPANIC'].astype(str)
df2['RACE'] = df2['RACE'].astype(str)
df2['VET'] = df2['VET'].astype(str)
df2['ACTIVE'] = df2['ACTIVE'].astype(str)
df2['DEPLOY'] = df2['DEPLOY'].astype(str)
df2['AUDIT'] = df2['AUDIT'].astype(str)
df2['COSCREEN'] = df2['COSCREEN'].astype(str)
df2['RT'] = df2['RT'].astype(str)
df2['BI'] = df2['BI'].astype(str)
df2['BT'] = df2['BT'].astype(str)
df2['INJECT'] = df2['INJECT'].astype(str)
df2['TOBMONTH'] = df2['TOBMONTH'].astype(str)
###############################################################
df2['ANYALC'] = df2['ANYALC'].astype(float)
df2['BINGEDAYS'] = df2['BINGEDAYS'].astype(float)
df2['DRUGDAYS'] = df2['DRUGDAYS'].astype(float)

df2['ALCDRUGS'] = df2['ALCDRUGS'].astype(float)
df2['DAYSCOCAINE'] = df2['DAYSCOCAINE'].astype(float)
df2['MARYJDAYS'] = df2['MARYJDAYS'].astype(float)

df2['ANYOPIATEDAYS'] = df2['ANYOPIATEDAYS'].astype(float)
df2['METHADONE'] = df2['METHADONE'].astype(float)
df2['HALLUC'] = df2['HALLUC'].astype(float)

df2['METHDAYS'] = df2['METHDAYS'].astype(float)
df2['OTHERDRUGS'] =df2['OTHERDRUGS'].astype(float)
df2['AGE'] = df2['AGE'].astype(float)

In [None]:
df1['DAST'] = df1['DAST'].astype(str)
df1['SEX'] = df1['SEX'].astype(str)
df1['HISPANIC'] = df1['HISPANIC'].astype(str)
df1['RACE'] = df1['RACE'].astype(str)
df1['VET'] = df1['VET'].astype(str)
df1['ACTIVE'] = df1['ACTIVE'].astype(str)
df1['DEPLOY'] = df1['DEPLOY'].astype(str)
df1['AUDIT'] = df1['AUDIT'].astype(str)
df1['COSCREEN'] = df1['COSCREEN'].astype(str)
df1['RT'] = df1['RT'].astype(str)
df1['BI'] = df1['BI'].astype(str)
df1['BT'] = df1['BT'].astype(str)
df1['INJECT'] = df1['INJECT'].astype(str)
df1['TOBMONTH'] = df1['TOBMONTH'].astype(str)
###############################################################
df1['ANYALC'] = df1['ANYALC'].astype(float)
df1['BINGEDAYS'] = df1['BINGEDAYS'].astype(float)
df1['DRUGDAYS'] = df1['DRUGDAYS'].astype(float)

df1['ALCDRUGS'] = df1['ALCDRUGS'].astype(float)
df1['DAYSCOCAINE'] = df1['DAYSCOCAINE'].astype(float)
df1['MARYJDAYS'] = df1['MARYJDAYS'].astype(float)

df1['ANYOPIATEDAYS'] = df1['ANYOPIATEDAYS'].astype(float)
df1['METHADONE'] = df1['METHADONE'].astype(float)
df1['HALLUC'] = df1['HALLUC'].astype(float)

df1['METHDAYS'] = df1['METHDAYS'].astype(float)
df1['OTHERDRUGS'] =df1['OTHERDRUGS'].astype(float)
df1['AGE'] = df1['AGE'].astype(float)

###  Imputation of categorical columns

In [None]:
!pip install datawig
import datawig
df_train, df_test = datawig.utils.random_split(df1)

### DAST

In [None]:


imputer = datawig.SimpleImputer(
    input_columns=['RT','BI','DRUGDAYS','COSCREEN','MARYJDAYS','BT','ALCDRUGS','AUDIT','TOBMONTH','ANYALC'], # column(s) containing information about the column we want to impute
    output_column='DAST', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train)

In [None]:
predictions  = imputer.predict(df_test)
predictions 

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score,accuracy_score
print("Precision Score : ",precision_score(predictions['DAST'], predictions['DAST_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions['DAST'], predictions['DAST_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("accuracy Score : ",accuracy_score(predictions['DAST'], predictions['DAST_imputed']))

In [None]:
predictions  = imputer.predict(df2)
predictions

### SEX

In [None]:
import datawig
imputer1 = datawig.SimpleImputer(
    input_columns=['VET','AGE','DEPLOY','RACE','HISPANIC','ALCDRUGS','AUDIT','ANYALC'], # column(s) containing information about the column we want to impute
    output_column='SEX', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer1.fit(train_df=df_train)

In [None]:
predictions1  = imputer1.predict(df_test)
predictions1 

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
print("Precision Score : ",precision_score(predictions1['SEX'], predictions1['SEX_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions1['SEX'], predictions1['SEX_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("accuracy Score : ",accuracy_score(predictions1['SEX'], predictions1['SEX_imputed']))

In [None]:
predictions1  = imputer1.predict(df2)
predictions1

### HISPANIC

In [None]:
imputer2 = datawig.SimpleImputer(
    input_columns=['VET','AGE','DEPLOY','RACE','SEX','DRUGDAYS','TOBMONTH','DAST'], # column(s) containing information about the column we want to impute
    output_column='HISPANIC', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer2.fit(train_df=df_train)

In [None]:
predictions2  = imputer2.predict(df_test)
predictions2 

In [None]:
print("Precision Score : ",precision_score(predictions2['HISPANIC'], predictions2['HISPANIC_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions2['HISPANIC'], predictions2['HISPANIC_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("accuracy Score : ",accuracy_score(predictions2['HISPANIC'], predictions2['HISPANIC_imputed']))

In [None]:
predictions2  = imputer2.predict(df2)
predictions2

### RACE

In [None]:
imputer3 = datawig.SimpleImputer(
    input_columns=['HISPANIC','SEX','VET','AGE','TOBMONTH','DEPLOY','RT','DAST','MARYJDAYS'], # column(s) containing information about the column we want to impute
    output_column='RACE', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer3.fit(train_df=df_train)

In [None]:
predictions3  = imputer3.predict(df_test)
predictions3

In [None]:
print("Precision Score : ",precision_score(predictions3['RACE'], predictions3['RACE_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions3['RACE'], predictions3['RACE_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions3  = imputer3.predict(df2)
predictions3

### VET

In [None]:
imputer4 = datawig.SimpleImputer(
    input_columns=['DEPLOY','SEX','AGE','RACE','ACTIVE','HISPANIC','DAST'], # column(s) containing information about the column we want to impute
    output_column='VET', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer4.fit(train_df=df_train)

In [None]:
predictions4  = imputer4.predict(df_test)
predictions4

In [None]:
print("Precision Score : ",precision_score(predictions4['VET'], predictions4['VET_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions4['VET'], predictions4['VET_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions4  = imputer4.predict(df2)
predictions4

### ACTIVE

In [None]:
imputer5 = datawig.SimpleImputer(
    input_columns=['VET','DEPLOY','AGE','BT','DAST','BINGEDAYS','SEX','DRUGDAYS','ANYALC','ALCDRUGS'], # column(s) containing information about the column we want to impute
    output_column='ACTIVE', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer5.fit(train_df=df_train)

In [None]:
predictions5  = imputer5.predict(df_test)
predictions5

In [None]:
print("Precision Score : ",precision_score(predictions5['ACTIVE'], predictions5['ACTIVE_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions5['ACTIVE'], predictions5['ACTIVE_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions5  = imputer5.predict(df2)
predictions5

### DEPLOY

In [None]:
imputer6 = datawig.SimpleImputer(
    input_columns=['VET','SEX','ACTIVE','AGE','RACE','ANYALC','HISPANIC','AUDIT','ALCDRUGS'], # column(s) containing information about the column we want to impute
    output_column='DEPLOY', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer6.fit(train_df=df_train)

In [None]:
predictions6  = imputer6.predict(df_test)
predictions6

In [None]:
print("Precision Score : ",precision_score(predictions6['DEPLOY'], predictions6['DEPLOY_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions6['DEPLOY'], predictions6['DEPLOY_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions6  = imputer6.predict(df2)
predictions6

### AUDIT

In [None]:
imputer7 = datawig.SimpleImputer(
    input_columns=['ANYALC','BINGEDAYS','BI','ALCDRUGS','BT','RT','COSCREEN','DAST','DRUGDAYS','DAYSCOCAINE'], # column(s) containing information about the column we want to impute
    output_column='AUDIT', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer7.fit(train_df=df_train)

In [None]:
predictions7  = imputer7.predict(df_test)
predictions7

In [None]:
print("Precision Score : ",precision_score(predictions7['AUDIT'], predictions7['AUDIT_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions7['AUDIT'], predictions7['AUDIT_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions7  = imputer7.predict(df2)
predictions7

### COSCREEN

In [None]:
imputer8 = datawig.SimpleImputer(
    input_columns=['RT','DAST','BI','AUDIT','DRUGDAYS','MARYJDAYS','BT','ANYALC'], # column(s) containing information about the column we want to impute
    output_column='COSCREEN', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer8.fit(train_df=df_train)

In [None]:
predictions8  = imputer8.predict(df_test)
predictions8

In [None]:
print("Precision Score : ",precision_score(predictions8['COSCREEN'], predictions8['COSCREEN_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions8['COSCREEN'], predictions8['COSCREEN_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions8  = imputer8.predict(df2)
predictions8


### RT

In [None]:
imputer9 = datawig.SimpleImputer(
    input_columns=['DAST','DAYSCOCAINE','COSCREEN','BINGEDAYS','AUDIT','ALCDRUGS','OTHERDRUGS','HALLUC'], # column(s) containing information about the column we want to impute
    output_column='RT', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer9.fit(train_df=df_train)

In [None]:
predictions9  = imputer9.predict(df_test)
predictions9

In [None]:
print("Precision Score : ",precision_score(predictions9['RT'], predictions9['RT_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions9['RT'], predictions9['RT_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions9  = imputer9.predict(df2)
predictions9


### BI

In [None]:
imputer10 = datawig.SimpleImputer(
    input_columns=['DAST','ANYALC','AUDIT','DRUGDAYS','MARYJDAYS','ALCDRUGS','TOBMONTH','COSCREEN','AGE'], # column(s) containing information about the column we want to impute
    output_column='BI', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer10.fit(train_df=df_train)

In [None]:
predictions10  = imputer10.predict(df_test)
predictions10

In [None]:
print("Precision Score : ",precision_score(predictions10['BI'], predictions10['BI_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions10['BI'], predictions10['BI_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions10  = imputer10.predict(df2)
predictions10

### BT

In [None]:
imputer11 = datawig.SimpleImputer(
    input_columns=['DAST','ANYALC','AUDIT','BINGEDAYS','DRUGDAYS','COSCREEN','METHDAYS','ALCDRUGS','MARYJDAYS'], # column(s) containing information about the column we want to impute
    output_column='BT', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer11.fit(train_df=df_train)

In [None]:
predictions11  = imputer11.predict(df_test)
predictions11

In [None]:
print("Precision Score : ",precision_score(predictions11['BT'], predictions11['BT_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions11['BT'], predictions11['BT_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions11  = imputer11.predict(df2)
predictions11

### INJECT

In [None]:
imputer12 = datawig.SimpleImputer(
    input_columns=['ANYOPIATEDAYS','DAYSCOCAINE','DAST','ALCDRUGS','BINGEDAYS','DRUGDAYS','RT','BT','COSCREEN'], # column(s) containing information about the column we want to impute
    output_column='INJECT', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer12.fit(train_df=df_train)

In [None]:
predictions12  = imputer12.predict(df_test)
predictions12

In [None]:
print("Precision Score : ",precision_score(predictions12['INJECT'], predictions12['INJECT_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions12['INJECT'], predictions12['INJECT_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions12  = imputer12.predict(df2)
predictions12

### TOBMONTH

In [None]:
imputer13 = datawig.SimpleImputer(
    input_columns=['DAST','BI','DAST','DRUGDAYS','MARYJDAYS','AUDIT','RACE','ANYALC','BT'], # column(s) containing information about the column we want to impute
    output_column='TOBMONTH', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer13.fit(train_df=df_train)

In [None]:
predictions13  = imputer13.predict(df_test)
predictions13

In [None]:
print("Precision Score : ",precision_score(predictions13['TOBMONTH'], predictions13['TOBMONTH_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions13['TOBMONTH'], predictions13['TOBMONTH_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions13  = imputer13.predict(df2)
predictions13

### Imputation of numerical columns

### ANYALC

In [None]:
imputer14 = datawig.SimpleImputer(
    input_columns=['AUDIT','BINGEDAYS','ALCDRUGS','BI','DAST','DRUGDAYS','RACE','MARYJDAYS','BT'], # column(s) containing information about the column we want to impute
    output_column='ANYALC', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer14.fit(train_df=df_train)

In [None]:
predictions14  = imputer14.predict(df_test)
predictions14

In [None]:
from sklearn.metrics import mean_squared_error
rms = mean_squared_error(predictions14['ANYALC'], predictions14['ANYALC_imputed'], squared=False)
rms

In [None]:
predictions14  = imputer14.predict(df2)
predictions14

### BINGEDAYS

In [None]:
imputer15 = datawig.SimpleImputer(
    input_columns=['AUDIT','ANYALC','ALCDRUGS','DAYSCOCAINE','BT','RT','DRUGDAYS','MARYJDAYS','DAST'], # column(s) containing information about the column we want to impute
    output_column='BINGEDAYS', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer15.fit(train_df=df_train)

In [None]:
predictions15  = imputer15.predict(df_test)
predictions15

In [None]:
rms = mean_squared_error(predictions15['BINGEDAYS'], predictions15['BINGEDAYS_imputed'], squared=False)
rms

In [None]:
predictions15  = imputer15.predict(df2)
predictions15

### DRUGDAYS

In [None]:
imputer16 = datawig.SimpleImputer(
    input_columns=['AUDIT','BINGEDAYS','ALCDRUGS','BI','DAST','DRUGDAYS','RACE','MARYJDAYS','BT'], # column(s) containing information about the column we want to impute
    output_column='DRUGDAYS', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer16.fit(train_df=df_train)

In [None]:
predictions16  = imputer16.predict(df_test)
predictions16

In [None]:
rms = mean_squared_error(predictions16['DRUGDAYS'], predictions16['DRUGDAYS_imputed'], squared=False)
rms

In [None]:
predictions16  = imputer16.predict(df2)
predictions16

### ALCDRUGS 

In [None]:
imputer17 = datawig.SimpleImputer(
    input_columns=['DRUGDAYS','MARYJDAYS','ANYALC','DAYSCOCAINE','BINGEDAYS','AUDIT','DAST','RT'], # column(s) containing information about the column we want to impute
    output_column='ALCDRUGS', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer17.fit(train_df=df_train)

In [None]:
predictions17  = imputer17.predict(df_test)
predictions17

In [None]:
rms = mean_squared_error(predictions17['ALCDRUGS'], predictions17['ALCDRUGS_imputed'], squared=False)
rms

In [None]:
predictions17  = imputer17.predict(df2)
predictions17

### DAYSCOCAINE

In [None]:
imputer18 = datawig.SimpleImputer(
    input_columns=['ALCDRUGS','BINGEDAYS','INJECT','RT','AUDIT','ANYALC','DAST','DRUGDAYS','COSCREEN','DAST'], # column(s) containing information about the column we want to impute
    output_column='DAYSCOCAINE', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer18.fit(train_df=df_train)

In [None]:
predictions18  = imputer18.predict(df_test)
predictions18

In [None]:
rms = mean_squared_error(predictions18['DAYSCOCAINE'], predictions18['DAYSCOCAINE_imputed'], squared=False)
rms

In [None]:
predictions18  = imputer18.predict(df2)
predictions18

### MARYJDAYS

In [None]:
imputer19 = datawig.SimpleImputer(
    input_columns=['DRUGDAYS','ALCDRUGS','DAST','BI','ANYALC','COSCREEN','BINGEDAYS','AUDIT'], # column(s) containing information about the column we want to impute
    output_column='MARYJDAYS', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer19.fit(train_df=df_train)

In [None]:
predictions19  = imputer19.predict(df_test)
predictions19

In [None]:
rms = mean_squared_error(predictions19['MARYJDAYS'], predictions19['MARYJDAYS_imputed'], squared=False)
rms

In [None]:
predictions19  = imputer19.predict(df2)
predictions19

### ANYOPIATEDAYS

In [None]:
imputer20 = datawig.SimpleImputer(
    input_columns=['INJECT','DAST','BT','DRUGDAYS','DAYSCOCAINE','ALCDRUGS','BINGEDAYS','RT'], # column(s) containing information about the column we want to impute
    output_column='ANYOPIATEDAYS', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer20.fit(train_df=df_train)

In [None]:
predictions20  = imputer20.predict(df_test)
predictions20

In [None]:
rms = mean_squared_error(predictions20['ANYOPIATEDAYS'], predictions20['ANYOPIATEDAYS_imputed'], squared=False)
rms

In [None]:
predictions20  = imputer20.predict(df2)
predictions20

### METHADONE

In [None]:
imputer21 = datawig.SimpleImputer(
    input_columns=['AUDIT','BINGEDAYS','ALCDRUGS','BI','DAST','DRUGDAYS','RACE','MARYJDAYS','BT'], # column(s) containing information about the column we want to impute
    output_column='METHADONE', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer21.fit(train_df=df_train)

In [None]:
predictions21  = imputer21.predict(df_test)
predictions21

In [None]:
rms = mean_squared_error(predictions21['METHADONE'], predictions21['METHADONE_imputed'], squared=False)
rms

In [None]:
predictions21  = imputer14.predict(df2)
predictions21

### HALLUC

In [None]:
imputer22 = datawig.SimpleImputer(
    input_columns=['AUDIT','BINGEDAYS','ALCDRUGS','BI','DAST','DRUGDAYS','RACE','MARYJDAYS','BT'], # column(s) containing information about the column we want to impute
    output_column='HALLUC', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer22.fit(train_df=df_train)

In [None]:
predictions22  = imputer22.predict(df_test)
predictions22

In [None]:
rms = mean_squared_error(predictions22['HALLUC'], predictions22['HALLUC_imputed'], squared=False)
rms

In [None]:
predictions22  = imputer22.predict(df2)
predictions22

### METHDAYS

In [None]:
imputer23 = datawig.SimpleImputer(
    input_columns=['BT','DRUGDAYS','DAST','RT','COSCREEN','RACE','AGE','VET','DEPLOY','TOBMONTH'], # column(s) containing information about the column we want to impute
    output_column='METHDAYS', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer23.fit(train_df=df_train)

In [None]:
predictions23  = imputer23.predict(df_test)
predictions23

In [None]:
rms = mean_squared_error(predictions23['METHDAYS'], predictions23['METHDAYS_imputed'], squared=False)
rms

In [None]:
predictions23  = imputer23.predict(df2)
predictions23

### OTHERDRUGS

In [None]:
imputer24 = datawig.SimpleImputer(
    input_columns=['AUDIT','BINGEDAYS','ALCDRUGS','BI','DAST','DRUGDAYS','RACE','MARYJDAYS','BT'], # column(s) containing information about the column we want to impute
    output_column='OTHERDRUGS', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer24.fit(train_df=df_train)

In [None]:
predictions24  = imputer24.predict(df_test)
predictions24

In [None]:
rms = mean_squared_error(predictions24['OTHERDRUGS'], predictions24['OTHERDRUGS_imputed'], squared=False)
rms

In [None]:
predictions24  = imputer24.predict(df2)
predictions24

### AGE

In [None]:
imputer25 = datawig.SimpleImputer(
    input_columns=['VET','SEX','BI','RACE','DRUGDAYS','ACTIVE','MARYJDAYS','DEPLOY','DAST','HISPANIC'], # column(s) containing information about the column we want to impute
    output_column='AGE', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer25.fit(train_df=df_train)

In [None]:
predictions25 = imputer25.predict(df_test)
predictions25

In [None]:
rms = mean_squared_error(predictions25['AGE'], predictions25['AGE_imputed'], squared=False)
rms

In [None]:
predictions25  = imputer25.predict(df2)
predictions25

### Case(2):

In [None]:
df4 = data_0.copy()


In [None]:
bins= [0,21,30,45,60,100]
labels = [1,2,3,4,5]
df4['AGE'] = pd.cut(df4['AGE'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['BINGEDAYS'] = pd.cut(df4['BINGEDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['DRUGDAYS'] = pd.cut(df4['DRUGDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['ALCDRUGS'] = pd.cut(df4['ALCDRUGS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['DAYSCOCAINE'] = pd.cut(df4['DAYSCOCAINE'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['MARYJDAYS'] = pd.cut(df4['MARYJDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['ANYOPIATEDAYS'] = pd.cut(df4['ANYOPIATEDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['METHADONE'] = pd.cut(df4['METHADONE'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['HALLUC'] = pd.cut(df4['HALLUC'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['METHDAYS'] = pd.cut(df4['METHDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['OTHERDRUGS'] = pd.cut(df4['OTHERDRUGS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df4['ANYALC'] = pd.cut(df4['ANYALC'], bins=bins, labels=labels, right=False)
df4

In [None]:
df3 = df4.copy()
df3=df3.dropna()
df3.dropna()

In [None]:
df4['DAST'] = df4['DAST'].astype(str)
df4['SEX'] = df4['SEX'].astype(str)
df4['HISPANIC'] = df4['HISPANIC'].astype(str)
df4['RACE'] = df4['RACE'].astype(str)
df4['VET'] = df4['VET'].astype(str)
df4['ACTIVE'] = df4['ACTIVE'].astype(str)
df4['DEPLOY'] = df4['DEPLOY'].astype(str)
df4['AUDIT'] = df4['AUDIT'].astype(str)
df4['COSCREEN'] = df4['COSCREEN'].astype(str)
df4['RT'] = df4['RT'].astype(str)
df4['BI'] = df4['BI'].astype(str)
df4['BT'] = df4['BT'].astype(str)
df4['INJECT'] = df4['INJECT'].astype(str)
df4['TOBMONTH'] = df4['TOBMONTH'].astype(str)
###############################################################
df4['ANYALC'] = df4['ANYALC'].astype(str)
df4['BINGEDAYS'] = df4['BINGEDAYS'].astype(str)
df4['DRUGDAYS'] = df4['DRUGDAYS'].astype(str)

df4['ALCDRUGS'] = df4['ALCDRUGS'].astype(str)
df4['DAYSCOCAINE'] = df4['DAYSCOCAINE'].astype(str)
df4['MARYJDAYS'] = df4['MARYJDAYS'].astype(str)

df4['ANYOPIATEDAYS'] = df4['ANYOPIATEDAYS'].astype(str)
df4['METHADONE'] = df4['METHADONE'].astype(str)
df4['HALLUC'] = df4['HALLUC'].astype(str)

df4['METHDAYS'] = df4['METHDAYS'].astype(str)
df4['OTHERDRUGS'] =df4['OTHERDRUGS'].astype(str)
df4['AGE'] = df4['AGE'].astype(str)
df4

In [None]:
df3['DAST'] = df3['DAST'].astype(str)
df3['SEX'] = df3['SEX'].astype(str)
df3['HISPANIC'] = df3['HISPANIC'].astype(str)
df3['RACE'] = df3['RACE'].astype(str)
df3['VET'] = df3['VET'].astype(str)
df3['ACTIVE'] = df3['ACTIVE'].astype(str)
df3['DEPLOY'] = df3['DEPLOY'].astype(str)
df3['AUDIT'] = df3['AUDIT'].astype(str)
df3['COSCREEN'] = df3['COSCREEN'].astype(str)
df3['RT'] = df3['RT'].astype(str)
df3['BI'] = df3['BI'].astype(str)
df3['BT'] = df3['BT'].astype(str)
df3['INJECT'] = df3['INJECT'].astype(str)
df3['TOBMONTH'] = df3['TOBMONTH'].astype(str)
###############################################################
df3['ANYALC'] = df3['ANYALC'].astype(str)
df3['BINGEDAYS'] = df3['BINGEDAYS'].astype(str)
df3['DRUGDAYS'] = df3['DRUGDAYS'].astype(str)

df3['ALCDRUGS'] = df3['ALCDRUGS'].astype(str)
df3['DAYSCOCAINE'] = df3['DAYSCOCAINE'].astype(str)
df3['MARYJDAYS'] = df3['MARYJDAYS'].astype(str)

df3['ANYOPIATEDAYS'] = df3['ANYOPIATEDAYS'].astype(str)
df3['METHADONE'] = df3['METHADONE'].astype(str)
df3['HALLUC'] = df3['HALLUC'].astype(str)

df3['METHDAYS'] = df3['METHDAYS'].astype(str)
df3['OTHERDRUGS'] =df3['OTHERDRUGS'].astype(str)
df3['AGE'] = df3['AGE'].astype(str)

### Imputation of categorical columns

### SEX

In [None]:
import datawig
df_train, df_test = datawig.utils.random_split(df3)

In [None]:
import datawig
imputer26 = datawig.SimpleImputer(
    input_columns=['VET','AGE','DEPLOY','RACE','HISPANIC','ALCDRUGS','AUDIT','ANYALC'], # column(s) containing information about the column we want to impute
    output_column='SEX', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer26.fit(train_df=df_train)

In [None]:
predictions26  = imputer26.predict(df_test)
predictions26 

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print("Precision Score : ",precision_score(predictions26['SEX'], predictions26['SEX_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions26['SEX'], predictions26['SEX_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions26  = imputer26.predict(df4)
predictions26 

### RACE

In [None]:
imputer27 = datawig.SimpleImputer(
    input_columns=['HISPANIC','SEX','VET','AGE','TOBMONTH','DEPLOY','RT','DAST','MARYJDAYS'], # column(s) containing information about the column we want to impute
    output_column='RACE', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer27.fit(train_df=df_train)

In [None]:
predictions27  = imputer27.predict(df_test)
predictions27 

In [None]:
print("Precision Score : ",precision_score(predictions27['RACE'], predictions27['RACE_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions27['RACE'], predictions27['RACE_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions27  = imputer27.predict(df4)
predictions27 

### TOBMONTH

In [None]:
imputer28 = datawig.SimpleImputer(
    input_columns=['DAST','BI','DAST','DRUGDAYS','MARYJDAYS','AUDIT','RACE','ANYALC','BT'], # column(s) containing information about the column we want to impute
    output_column='TOBMONTH', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer28.fit(train_df=df_train)

In [None]:
predictions28  = imputer28.predict(df_test)
predictions28 

In [None]:
print("Precision Score : ",precision_score(predictions28['TOBMONTH'], predictions28['TOBMONTH_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions28['TOBMONTH'], predictions28['TOBMONTH_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:
predictions28  = imputer28.predict(df4)
predictions28 

### ANYALC

In [None]:
imputer29 = datawig.SimpleImputer(
    input_columns=['AUDIT','BINGEDAYS','ALCDRUGS','BI','DAST','DRUGDAYS','RACE','MARYJDAYS','BT'], # column(s) containing information about the column we want to impute
    output_column='ANYALC', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer29.fit(train_df=df_train)

In [None]:
predictions29  = imputer29.predict(df_test)
predictions29 

In [None]:
print("Precision Score : ",precision_score(predictions29['ANYALC'], predictions29['ANYALC_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions29['ANYALC'], predictions29['ANYALC_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:

rms = mean_squared_error(predictions29['ANYALC'], predictions29['ANYALC_imputed'], squared=False)
rms 

In [None]:
predictions29  = imputer29.predict(df4)
predictions29 

### AGE

In [None]:
imputer30 = datawig.SimpleImputer(
    input_columns=['VET','SEX','BI','RACE','DRUGDAYS','ACTIVE','MARYJDAYS','DEPLOY','DAST','HISPANIC'], # column(s) containing information about the column we want to impute
    output_column='AGE', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer30.fit(train_df=df_train)

In [None]:
predictions30  = imputer30.predict(df_test)
predictions30 

In [None]:
print("Precision Score : ",precision_score(predictions30['AGE'], predictions30['AGE_imputed'], 
                                           pos_label='positive',
                                           average='micro'))
print("Recall Score : ",recall_score(predictions30['AGE'], predictions30['AGE_imputed'], 
                                           pos_label='positive',
                                           average='micro'))

In [None]:

rms = mean_squared_error(predictions30['AGE'], predictions30['AGE_imputed'], squared=False)
rms

In [None]:
predictions30  = imputer30.predict(df4)
predictions30 

### 4) KNN Imputer (K-Nearest Neighbor)

#### * Select K nearest or similar data points using allthe non-missing features
#### *Take average ofthe selected data points to ll in the missing feature

In [56]:
df5 = data_0.copy()
df5=df5.sample(frac=1)
cols=['SEX' ,'RACE','VET','BI','RT','DEPLOY','AGE'
      ,'DRUGDAYS','ALCDRUGS','TOBMONTH']
df5=df5[cols]
# df5 =pd.read_csv('/home/sameerahtalafha/new_project/new/tables/ALL-original.csv')
df5

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,AGE,DRUGDAYS,ALCDRUGS,TOBMONTH
6553,1.0,2.0,0.0,1.0,0.0,0.0,29.0,,,
2111,1.0,2.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0
1161,1.0,1.0,0.0,0.0,0.0,0.0,64.0,0.0,0.0,0.0
3796,1.0,2.0,1.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0
5039,0.0,2.0,0.0,0.0,0.0,0.0,62.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
3128,1.0,1.0,0.0,0.0,0.0,0.0,51.0,0.0,0.0,1.0
5602,1.0,1.0,0.0,0.0,0.0,0.0,78.0,0.0,0.0,0.0
3929,1.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0
138,1.0,2.0,0.0,1.0,0.0,0.0,57.0,27.0,0.0,0.0


In [57]:
df6 = df5.copy()
df6=df6.dropna()
df6

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,AGE,DRUGDAYS,ALCDRUGS,TOBMONTH
2111,1.0,2.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0
1161,1.0,1.0,0.0,0.0,0.0,0.0,64.0,0.0,0.0,0.0
3796,1.0,2.0,1.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0
5039,0.0,2.0,0.0,0.0,0.0,0.0,62.0,0.0,0.0,0.0
1411,1.0,2.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2186,0.0,2.0,0.0,0.0,0.0,0.0,45.0,0.0,0.0,0.0
3128,1.0,1.0,0.0,0.0,0.0,0.0,51.0,0.0,0.0,1.0
5602,1.0,1.0,0.0,0.0,0.0,0.0,78.0,0.0,0.0,0.0
3929,1.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0


In [58]:
# bins= [15,21,35,45,60,100]
# labels = [0,1,2,3,4]
# df6['AGE'] = pd.cut(df6['AGE'], bins=bins, labels=labels, right=False)

# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df6['BINGEDAYS'] = pd.cut(df6['BINGEDAYS'], bins=bins, labels=labels, right=False)

# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df6['DRUGDAYS'] = pd.cut(df6['DRUGDAYS'], bins=bins, labels=labels, right=False)

# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df6['ALCDRUGS'] = pd.cut(df6['ALCDRUGS'], bins=bins, labels=labels, right=False)

# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df6['DAYSCOCAINE'] = pd.cut(df6['DAYSCOCAINE'], bins=bins, labels=labels, right=False)

# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df6['MARYJDAYS'] = pd.cut(df6['MARYJDAYS'], bins=bins, labels=labels, right=False)



# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df6['METHDAYS'] = pd.cut(df6['METHDAYS'], bins=bins, labels=labels, right=False)



# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df6['ANYALC'] = pd.cut(df6['ANYALC'], bins=bins, labels=labels, right=False)

df6.isnull().sum()

SEX         0
RACE        0
VET         0
BI          0
RT          0
DEPLOY      0
AGE         0
DRUGDAYS    0
ALCDRUGS    0
TOBMONTH    0
dtype: int64

In [59]:
# df6['DAST'] = df6['DAST'].astype(float).astype(int)
# df6['SEX'] = df6['SEX'].astype(float).astype(int)
# df6['HISPANIC'] = df6['HISPANIC'].astype(float).astype(int)
# df6['RACE'] = df6['RACE'].astype(float).astype(int)
# df6['VET'] = df6['VET'].astype(float).astype(int)
# df6['ACTIVE'] = df6['ACTIVE'].astype(float).astype(int)
# df6['DEPLOY'] = df6['DEPLOY'].astype(float).astype(int)
# df6['AUDIT'] = df6['AUDIT'].astype(float).astype(int)
# df6['COSCREEN'] = df6['COSCREEN'].astype(float).astype(int)
# df6['RT'] = df6['RT'].astype(float).astype(int)
# df6['BI'] = df6['BI'].astype(float).astype(int)
# df6['BT'] = df6['BT'].astype(float).astype(int)
# df6['INJECT'] = df6['INJECT'].astype(float).astype(int)
# df6['TOBMONTH'] = df6['TOBMONTH'].astype(float).astype(int)
# ###############################################################
# df6['ANYALC'] = df6['ANYALC'].astype(float).astype(int)
# df6['BINGEDAYS'] = df6['BINGEDAYS'].astype(float).astype(int)
# df6['DRUGDAYS'] = df6['DRUGDAYS'].astype(float).astype(int)

# df6['ALCDRUGS'] = df6['ALCDRUGS'].astype(float).astype(int)
# df6['DAYSCOCAINE'] = df6['DAYSCOCAINE'].astype(float).astype(int)
# df6['MARYJDAYS'] = df6['MARYJDAYS'].astype(float).astype(int)



# df6['METHDAYS'] = df6['METHDAYS'].astype(float).astype(int)

# df6['AGE'] = df6['AGE'].astype(float).astype(int)

In [60]:
import collections
import random
df7 = df6.copy()
replaced = collections.defaultdict(set)
ix = [(row, col) for row in range(df7.shape[0]) for col in range(df7.shape[1])]
random.shuffle(ix)
to_replace = int(round(.2*len(ix)))
for row, col in ix:
    if len(replaced[row]) < df7.shape[1] - 1:
        df7.iloc[row, col] = np.nan
        to_replace -= 1
        replaced[row].add(col)
        if to_replace == 0:
            break

In [61]:
df7.isnull().sum()

SEX         798
RACE        726
VET         740
BI          771
RT          780
DEPLOY      755
AGE         791
DRUGDAYS    722
ALCDRUGS    718
TOBMONTH    805
dtype: int64

In [62]:
#x_0=np.where(df7['DAST'].isnull())[0]
x_0=np.where(df7['SEX'].isnull())[0]
#x_2=np.where(df7['HISPANIC'].isnull())[0]
x_1=np.where(df7['RACE'].isnull())[0]
x_2=np.where(df7['VET'].isnull())[0]
#x_5=np.where(df7['ACTIVE'].isnull())[0]
x_3=np.where(df7['DEPLOY'].isnull())[0]
#x_7=np.where(df7['AUDIT'].isnull())[0]
#x_8=np.where(df7['COSCREEN'].isnull())[0]
x_4=np.where(df7['BI'].isnull())[0]
#x_10=np.where(df7['BT'].isnull())[0]
x_5=np.where(df7['RT'].isnull())[0]
#x_12=np.where(df7['ANYALC'].isnull())[0]
#x_13=np.where(df7['BINGEDAYS'].isnull())[0]
x_7=np.where(df7['DRUGDAYS'].isnull())[0]
x_8=np.where(df7['ALCDRUGS'].isnull())[0]
#x_16=np.where(df7['DAYSCOCAINE'].isnull())[0]
#x_17=np.where(df7['MARYJDAYS'].isnull())[0]

#x_18=np.where(df7['METHDAYS'].isnull())[0]

#x_19=np.where(df7['INJECT'].isnull())[0]
x_6=np.where(df7['AGE'].isnull())[0]
x_9=np.where(df7['TOBMONTH'].isnull())[0]


In [63]:
# instatiate both packages to use 
from fancyimpute import KNN
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
imputer = KNN(3)
#create a list of categorical columns to iterate over 
cat_cols =['SEX',  'RACE','VET',  'DEPLOY',  'BI', 'RT',  'DRUGDAYS',
       'ALCDRUGS', 'AGE', 'TOBMONTH']

def encode(data):
    '''function to encode non-null data and replace it in the original data '''
    #retain only non-null values 
    notnulls =np.array(data.dropna())
    #reshape the data for encoding
    impute_reshape = notnulls.reshape(-1,1)
    #encode date
    print(impute_reshape.shape)
    impute_ordinal = encoder.fit_transform(impute_reshape)
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

#create a for loop to iterate through each column in the data 
for columns in cat_cols:
    encode(df7[columns])

(3005, 1)
(3077, 1)
(3063, 1)
(3048, 1)
(3032, 1)
(3023, 1)
(3081, 1)
(3085, 1)
(3012, 1)
(2998, 1)


In [64]:
encode_data = pd.DataFrame(np.round(imputer.fit_transform(df7)),columns = df7.columns) 

Imputing row 1/3803 with 2 missing, elapsed time: 2.038
Imputing row 101/3803 with 1 missing, elapsed time: 2.045
Imputing row 201/3803 with 3 missing, elapsed time: 2.050
Imputing row 301/3803 with 3 missing, elapsed time: 2.056
Imputing row 401/3803 with 3 missing, elapsed time: 2.062
Imputing row 501/3803 with 4 missing, elapsed time: 2.067
Imputing row 601/3803 with 1 missing, elapsed time: 2.073
Imputing row 701/3803 with 3 missing, elapsed time: 2.080
Imputing row 801/3803 with 3 missing, elapsed time: 2.086
Imputing row 901/3803 with 2 missing, elapsed time: 2.092
Imputing row 1001/3803 with 3 missing, elapsed time: 2.098
Imputing row 1101/3803 with 2 missing, elapsed time: 2.103
Imputing row 1201/3803 with 4 missing, elapsed time: 2.109
Imputing row 1301/3803 with 4 missing, elapsed time: 2.115
Imputing row 1401/3803 with 3 missing, elapsed time: 2.121
Imputing row 1501/3803 with 1 missing, elapsed time: 2.127
Imputing row 1601/3803 with 1 missing, elapsed time: 2.132
Imputing 

In [65]:
encode_data

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,AGE,DRUGDAYS,ALCDRUGS,TOBMONTH
0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,45.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,43.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
3798,0.0,1.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0
3799,1.0,0.0,0.0,0.0,0.0,0.0,32.0,0.0,0.0,1.0
3800,1.0,0.0,0.0,0.0,0.0,0.0,59.0,0.0,0.0,0.0
3801,1.0,1.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0


In [66]:
encode_data.isnull().sum()

SEX         0
RACE        0
VET         0
BI          0
RT          0
DEPLOY      0
AGE         0
DRUGDAYS    0
ALCDRUGS    0
TOBMONTH    0
dtype: int64

In [67]:
df6.iloc[:,0] = df6.iloc[:,0].astype(float).astype(int)
df6.iloc[:,1] = df6.iloc[:,1].astype(float).astype(int)
df6.iloc[:,2] = df6.iloc[:,2].astype(float).astype(int)
df6.iloc[:,3] = df6.iloc[:,3].astype(float).astype(int)
df6.iloc[:,4] = df6.iloc[:,4].astype(float).astype(int)
df6.iloc[:,5] = df6.iloc[:,5].astype(float).astype(int)
df6.iloc[:,6] = df6.iloc[:,6].astype(float).astype(int)
df6.iloc[:,7]= df6.iloc[:,7].astype(float).astype(int)
df6.iloc[:,8] = df6.iloc[:,8].astype(float).astype(int)
df6.iloc[:,9] = df6.iloc[:,9].astype(float).astype(int)
# df6.iloc[:,10] = df6.iloc[:,10].astype(float).astype(int)
# df6.iloc[:,11]= df6.iloc[:,11].astype(float).astype(int)
# df6.iloc[:,12]= df6.iloc[:,12].astype(float).astype(int)
# df6.iloc[:,13] = df6.iloc[:,13].astype(float).astype(int)
# ###############################################################
# df6.iloc[:,14] = df6.iloc[:,14].astype(float).astype(int)
# df6.iloc[:,15] = df6.iloc[:,15].astype(float).astype(int)
# df6.iloc[:,16] = df6.iloc[:,16].astype(float).astype(int)

# df6.iloc[:,17] = df6.iloc[:,17].astype(float).astype(int)
# df6.iloc[:,18]= df6.iloc[:,18].astype(float).astype(int)
# df6.iloc[:,19] = df6.iloc[:,19].astype(float).astype(int)

# df6.iloc[:,20] = df6.iloc[:,20].astype(float).astype(int)
# df6.iloc[:,21]= df6.iloc[:,21].astype(float).astype(int)


#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
encode_data.iloc[:,0] = encode_data.iloc[:,0].astype(float).astype(int)
encode_data.iloc[:,1] = encode_data.iloc[:,1].astype(float).astype(int)
encode_data.iloc[:,2] = encode_data.iloc[:,2].astype(float).astype(int)
encode_data.iloc[:,3] = encode_data.iloc[:,3].astype(float).astype(int)
encode_data.iloc[:,4] = encode_data.iloc[:,4].astype(float).astype(int)
encode_data.iloc[:,5] = encode_data.iloc[:,5].astype(float).astype(int)
encode_data.iloc[:,6] = encode_data.iloc[:,6].astype(float).astype(int)
encode_data.iloc[:,7]= encode_data.iloc[:,7].astype(float).astype(int)
encode_data.iloc[:,8] = encode_data.iloc[:,8].astype(float).astype(int)
encode_data.iloc[:,9] = encode_data.iloc[:,9].astype(float).astype(int)
# encode_data.iloc[:,10] = encode_data.iloc[:,10].astype(float).astype(int)
# encode_data.iloc[:,11]= encode_data.iloc[:,11].astype(float).astype(int)
# encode_data.iloc[:,12]= encode_data.iloc[:,12].astype(float).astype(int)
# encode_data.iloc[:,13] = encode_data.iloc[:,13].astype(float).astype(int)
# ###############################################################
# encode_data.iloc[:,14] = encode_data.iloc[:,14].astype(float).astype(int)
# encode_data.iloc[:,15] = encode_data.iloc[:,15].astype(float).astype(int)
# encode_data.iloc[:,16] = encode_data.iloc[:,16].astype(float).astype(int)

# encode_data.iloc[:,17] = encode_data.iloc[:,17].astype(float).astype(int)
# encode_data.iloc[:,18]= encode_data.iloc[:,18].astype(float).astype(int)
# encode_data.iloc[:,19] = encode_data.iloc[:,19].astype(float).astype(int)

# encode_data.iloc[:,20] = encode_data.iloc[:,20].astype(float).astype(int)
# encode_data.iloc[:,21]= encode_data.iloc[:,21].astype(float).astype(int)


In [68]:
true_0=[]
for i in x_0:
        if(df6.iloc[i,0]==encode_data.iloc[i,0]):
            true_0.append(i)
true_1=[]
for i in x_1:
        if(df6.iloc[i,1]==encode_data.iloc[i,1]):
            true_1.append(i)
true_2=[]
for i in x_2:
        if(df6.iloc[i,2]==encode_data.iloc[i,2]):
            true_2.append(i)
true_3=[]
for i in x_3:
        if(df6.iloc[i,3]==encode_data.iloc[i,3]):
            true_3.append(i)

true_4=[]
for i in x_4:
        if(df6.iloc[i,4]==encode_data.iloc[i,4]):
            true_4.append(i)
   
            
true_5=[]
for i in x_5:
        if(df6.iloc[i,5]==encode_data.iloc[i,5]):
            true_5.append(i)
    
true_6=[]
for i in x_6:
        if(df6.iloc[i,6]==encode_data.iloc[i,6]):
            true_6.append(i)

true_7=[]
for i in x_7:
        if(df6.iloc[i,7]==encode_data.iloc[i,7]):
            true_7.append(i)
true_8=[]
for i in x_8:
        if(df6.iloc[i,8]==encode_data.iloc[i,8]):
            true_8.append(i)
true_9=[]
for i in x_9:
        if(df6.iloc[i,9]==encode_data.iloc[i,9]):
            true_9.append(i)
# true_10=[]
# for i in x_10:
#         if(df6.iloc[i,10]==encode_data.iloc[i,10]):
#             true_10.append(i)
            
# true_11=[]
# for i in x_11:
#         if(df6.iloc[i,11]==encode_data.iloc[i,11]):
#             true_11.append(i)
            
            
# true_12=[]
# for i in x_12:
#         if(df6.iloc[i,12]==encode_data.iloc[i,12]):
#             true_12.append(i)
            
# true_13=[]
# for i in x_13:
#         if(df6.iloc[i,13]==encode_data.iloc[i,13]):
#             true_13.append(i)
            
            
# true_14=[]
# for i in x_14:
#         if(df6.iloc[i,14]==encode_data.iloc[i,14]):
#             true_14.append(i)
            
# true_15=[]
# for i in x_15:
#         if(df6.iloc[i,15]==encode_data.iloc[i,15]):
#             true_15.append(i)    
            
# true_16=[]
# for i in x_16:
#         if(df6.iloc[i,16]==encode_data.iloc[i,16]):
#             true_16.append(i)
            
# true_17=[]
# for i in x_17:
#         if(df6.iloc[i,17]==encode_data.iloc[i,17]):
#             true_17.append(i)
            
# true_18=[]
# for i in x_18:
#         if(df6.iloc[i,18]==encode_data.iloc[i,18]):
#             true_18.append(i)      
            
            
# true_19=[]
# for i in x_19:
#         if(df6.iloc[i,19]==encode_data.iloc[i,19]):
#             true_19.append(i)   
            
            
# true_20=[]
# for i in x_20:
#         if(df6.iloc[i,20]==encode_data.iloc[i,20]):
#             true_20.append(i)   
            
            
# true_21=[]
# for i in x_21:
#         if(df6.iloc[i,21]==encode_data.iloc[i,21]):
#             true_21.append(i)   
            
              
encode_data.iloc[:,8]
            

0       0
1       0
2       0
3       0
4       0
       ..
3798    0
3799    0
3800    0
3801    0
3802    0
Name: ALCDRUGS, Length: 3803, dtype: int64

In [69]:
df6['AGE'].head()

2111    21
1161    64
3796    26
5039    62
1411    61
Name: AGE, dtype: int64

In [47]:
## Accuracy in each column 
print(1-((len(x_0)-len(true_0))/len(x_0)))
print(1-((len(x_1)-len(true_1))/len(x_1)))
print(1-((len(x_2)-len(true_2))/len(x_2)))
print(1-((len(x_3)-len(true_3))/len(x_3)))
print(1-((len(x_4)-len(true_4))/len(x_4)))
print(1-((len(x_5)-len(true_5))/len(x_5)))
print(1-((len(x_6)-len(true_6))/len(x_6)))
print(1-((len(x_7)-len(true_7))/len(x_7)))
print(1-((len(x_8)-len(true_8))/len(x_8)))
print(1-((len(x_9)-len(true_9))/len(x_9)))
# print(1-((len(x_10)-len(true_10))/len(x_10)))
# print(1-((len(x_11)-len(true_11))/len(x_11)))
# print(1-((len(x_12)-len(true_12))/len(x_12)))
# print(1-((len(x_13)-len(true_13))/len(x_13)))
# print(1-((len(x_14)-len(true_14))/len(x_14)))
# print(1-((len(x_15)-len(true_15))/len(x_15)))
# print(1-((len(x_16)-len(true_16))/len(x_16)))
# print(1-((len(x_17)-len(true_17))/len(x_17)))
# print(1-((len(x_18)-len(true_18))/len(x_18)))
# print(1-((len(x_19)-len(true_19))/len(x_19)))
# print(1-((len(x_20)-len(true_20))/len(x_20)))
# print(1-((len(x_21)-len(true_21))/len(x_21)))


0.6935064935064935
0.2276315789473684
0.7745358090185677
0.982256020278834
1.0
0.9459459459459459
0.003973509933774877
0.9585062240663901
0.9887920298879203
0.7013333333333334


### Multiple Imputations by Chained Equations (MICE)

#### *Perform multiple regressions over random sample ofthe data
#### *Take average ofthe multiple regression values
#### *Impute the missing feature value for the data point

In [None]:
#df8 = data_0.copy()
df8 =pd.read_csv('/home/sameerahtalafha/new_project/new/tables/ALL1-original.csv')

In [None]:
df9 = df8.copy()
df9=df9.dropna()
df9.isnull().sum()

In [None]:
bins= [15,21,35,45,60,100]
labels = [0,1,2,3,4]
df9['AGE'] = pd.cut(df9['AGE'], bins=bins, labels=labels, right=False)

bins= [0,1,10,20,31]
labels = [0,1,2,3]
df9['BINGEDAYS'] = pd.cut(df9['BINGEDAYS'], bins=bins, labels=labels, right=False)

bins= [0,1,10,20,31]
labels = [0,1,2,3]
df9['DRUGDAYS'] = pd.cut(df9['DRUGDAYS'], bins=bins, labels=labels, right=False)

bins= [0,1,10,20,31]
labels = [0,1,2,3]
df9['ALCDRUGS'] = pd.cut(df9['ALCDRUGS'], bins=bins, labels=labels, right=False)

bins= [0,1,10,20,31]
labels = [0,1,2,3]
df9['DAYSCOCAINE'] = pd.cut(df9['DAYSCOCAINE'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df9['MARYJDAYS'] = pd.cut(df9['MARYJDAYS'], bins=bins, labels=labels, right=False)

bins= [0,1,10,20,31]
labels = [0,1,2,3]
df9['METHDAYS'] = pd.cut(df9['METHDAYS'], bins=bins, labels=labels, right=False)

bins= [0,1,10,20,31]
labels = [0,1,2,3]
df9['ANYALC'] = pd.cut(df9['ANYALC'], bins=bins, labels=labels, right=False)

df9.isnull().sum()

In [None]:
df9['DAST'] = df9['DAST'].astype(float).astype(int)
df9['SEX'] = df9['SEX'].astype(float).astype(int)
df9['HISPANIC'] = df9['HISPANIC'].astype(float).astype(int)
df9['RACE'] = df9['RACE'].astype(float).astype(int)
df9['VET'] = df9['VET'].astype(float).astype(int)
df9['ACTIVE'] = df9['ACTIVE'].astype(float).astype(int)
df9['DEPLOY'] = df9['DEPLOY'].astype(float).astype(int)
df9['AUDIT'] = df9['AUDIT'].astype(float).astype(object)
df9['COSCREEN'] = df9['COSCREEN'].astype(float).astype(int)
df9['RT'] = df9['RT'].astype(float).astype(int)
df9['BI'] = df9['BI'].astype(float).astype(int)
df9['BT'] = df9['BT'].astype(float).astype(int)
df9['INJECT'] = df9['INJECT'].astype(float).astype(int)
df9['TOBMONTH'] = df9['TOBMONTH'].astype(float).astype(int)
###############################################################
df9['ANYALC'] = df9['ANYALC'].astype(float).astype(int)
df9['BINGEDAYS'] = df9['BINGEDAYS'].astype(float).astype(int)
df9['DRUGDAYS'] = df9['DRUGDAYS'].astype(float).astype(int)

df9['ALCDRUGS'] = df9['ALCDRUGS'].astype(float).astype(int)
df9['DAYSCOCAINE'] = df9['DAYSCOCAINE'].astype(float).astype(int)
df9['MARYJDAYS'] = df9['MARYJDAYS'].astype(float).astype(int)



df9['METHDAYS'] = df9['METHDAYS'].astype(float).astype(int)

df9['AGE'] = df9['AGE'].astype(float).astype(int)

In [None]:
import collections
import random
df10 = df9.copy()
replaced = collections.defaultdict(set)
ix = [(row, col) for row in range(df10.shape[0]) for col in range(df10.shape[1])]
random.shuffle(ix)
to_replace = int(round(.2*len(ix)))
for row, col in ix:
    if len(replaced[row]) < df10.shape[1] - 1:
        df10.iloc[row, col] = np.nan
        to_replace -= 1
        replaced[row].add(col)
        if to_replace == 0:
            break

In [None]:
df10.isnull().sum()

In [None]:
x_0=np.where(df10['DAST'].isnull())[0]
x_1=np.where(df10['SEX'].isnull())[0]
x_2=np.where(df10['HISPANIC'].isnull())[0]
x_3=np.where(df10['RACE'].isnull())[0]
x_4=np.where(df10['VET'].isnull())[0]
x_5=np.where(df10['ACTIVE'].isnull())[0]
x_6=np.where(df10['DEPLOY'].isnull())[0]
x_7=np.where(df10['AUDIT'].isnull())[0]
x_8=np.where(df10['COSCREEN'].isnull())[0]
x_9=np.where(df10['BI'].isnull())[0]
x_10=np.where(df10['BT'].isnull())[0]
x_11=np.where(df10['RT'].isnull())[0]
x_12=np.where(df10['ANYALC'].isnull())[0]
x_13=np.where(df10['BINGEDAYS'].isnull())[0]
x_14=np.where(df10['DRUGDAYS'].isnull())[0]
x_15=np.where(df10['ALCDRUGS'].isnull())[0]
x_16=np.where(df10['DAYSCOCAINE'].isnull())[0]
x_17=np.where(df10['MARYJDAYS'].isnull())[0]

x_18=np.where(df10['METHDAYS'].isnull())[0]

x_19=np.where(df10['INJECT'].isnull())[0]
x_20=np.where(df10['AGE'].isnull())[0]
x_21=np.where(df10['TOBMONTH'].isnull())[0]

In [None]:
# instatiate both packages to use 
from sklearn.preprocessing import OrdinalEncoder
from fancyimpute import IterativeImputer
encoder = OrdinalEncoder()
imputer = IterativeImputer()
#create a list of categorical columns to iterate over 
cat_cols =['DAST', 'SEX', 'HISPANIC', 'RACE','VET',  'ACTIVE', 'DEPLOY', 'AUDIT',
       'COSCREEN', 'BI', 'BT', 'RT', 'ANYALC', 'BINGEDAYS', 'DRUGDAYS',
       'ALCDRUGS', 'DAYSCOCAINE', 'MARYJDAYS', 'METHDAYS', 'INJECT', 'AGE', 'TOBMONTH']

def encode(data):
    '''function to encode non-null data and replace it in the original data '''
    #retain only non-null values 
    notnulls =np.array(data.dropna())
    #reshape the data for encoding
    impute_reshape = notnulls.reshape(-1,1)
    #encode date
    print(impute_reshape.shape)
    impute_ordinal = encoder.fit_transform(impute_reshape)
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

#create a for loop to iterate through each column in the data 
for columns in cat_cols:
    encode(df10[columns])

In [None]:
encode_data1 = pd.DataFrame(np.round(imputer.fit_transform(df10)),columns = df10.columns) 

In [None]:
encode_data1

In [None]:
encode_data1.isnull().sum()

In [None]:
df9.iloc[:,0] = df9.iloc[:,0].astype(float).astype(int)
df9.iloc[:,1] = df9.iloc[:,1].astype(float).astype(int)
df9.iloc[:,2] = df9.iloc[:,2].astype(float).astype(int)
df9.iloc[:,3] = df9.iloc[:,3].astype(float).astype(int)
df9.iloc[:,4] = df9.iloc[:,4].astype(float).astype(int)
df9.iloc[:,5] = df9.iloc[:,5].astype(float).astype(int)
df9.iloc[:,6] = df9.iloc[:,6].astype(float).astype(int)
df9.iloc[:,7]= df9.iloc[:,7].astype(float).astype(int)
df9.iloc[:,8] = df9.iloc[:,8].astype(float).astype(int)
df9.iloc[:,9] = df9.iloc[:,9].astype(float).astype(int)
df9.iloc[:,10] = df9.iloc[:,10].astype(float).astype(int)
df9.iloc[:,11]= df9.iloc[:,11].astype(float).astype(int)
df9.iloc[:,12]= df9.iloc[:,12].astype(float).astype(int)
df9.iloc[:,13] = df9.iloc[:,13].astype(float).astype(int)
###############################################################
df9.iloc[:,14] = df9.iloc[:,14].astype(float).astype(int)
df9.iloc[:,15] = df9.iloc[:,15].astype(float).astype(int)
df9.iloc[:,16] = df9.iloc[:,16].astype(float).astype(int)

df9.iloc[:,17] = df9.iloc[:,17].astype(float).astype(int)
df9.iloc[:,18]= df9.iloc[:,18].astype(float).astype(int)
df9.iloc[:,19] = df9.iloc[:,19].astype(float).astype(int)

df9.iloc[:,20] = df9.iloc[:,20].astype(float).astype(int)
df9.iloc[:,21]= df9.iloc[:,21].astype(float).astype(int)


#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
encode_data1.iloc[:,0] = encode_data1.iloc[:,0].astype(float).astype(int)
encode_data1.iloc[:,1] = encode_data1.iloc[:,1].astype(float).astype(int)
encode_data1.iloc[:,2] = encode_data1.iloc[:,2].astype(float).astype(int)
encode_data1.iloc[:,3] = encode_data1.iloc[:,3].astype(float).astype(int)
encode_data1.iloc[:,4] = encode_data1.iloc[:,4].astype(float).astype(int)
encode_data1.iloc[:,5] = encode_data1.iloc[:,5].astype(float).astype(int)
encode_data1.iloc[:,6] = encode_data1.iloc[:,6].astype(float).astype(int)
encode_data1.iloc[:,7]= encode_data1.iloc[:,7].astype(float).astype(int)
encode_data1.iloc[:,8] = encode_data1.iloc[:,8].astype(float).astype(int)
encode_data1.iloc[:,9] = encode_data1.iloc[:,9].astype(float).astype(int)
encode_data1.iloc[:,10] = encode_data1.iloc[:,10].astype(float).astype(int)
encode_data1.iloc[:,11]= encode_data1.iloc[:,11].astype(float).astype(int)
encode_data1.iloc[:,12]= encode_data1.iloc[:,12].astype(float).astype(int)
encode_data1.iloc[:,13] = encode_data1.iloc[:,13].astype(float).astype(int)
###############################################################
encode_data1.iloc[:,14] = encode_data1.iloc[:,14].astype(float).astype(int)
encode_data1.iloc[:,15] = encode_data1.iloc[:,15].astype(float).astype(int)
encode_data1.iloc[:,16] = encode_data1.iloc[:,16].astype(float).astype(int)

encode_data1.iloc[:,17] = encode_data1.iloc[:,17].astype(float).astype(int)
encode_data1.iloc[:,18]= encode_data1.iloc[:,18].astype(float).astype(int)
encode_data1.iloc[:,19] = encode_data1.iloc[:,19].astype(float).astype(int)

encode_data1.iloc[:,20] = encode_data1.iloc[:,20].astype(float).astype(int)
encode_data1.iloc[:,21]= encode_data1.iloc[:,21].astype(float).astype(int)


In [None]:
true_0=[]
for i in x_0:
        if(df9.iloc[i,0]==encode_data1.iloc[i,0]):
            true_0.append(i)
true_1=[]
for i in x_1:
        if(df9.iloc[i,1]==encode_data1.iloc[i,1]):
            true_1.append(i)
true_2=[]
for i in x_2:
        if(df9.iloc[i,2]==encode_data1.iloc[i,2]):
            true_2.append(i)
true_3=[]
for i in x_3:
        if(df9.iloc[i,3]==encode_data1.iloc[i,3]):
            true_3.append(i)

true_4=[]
for i in x_4:
        if(df9.iloc[i,4]==encode_data1.iloc[i,4]):
            true_4.append(i)
   
            
true_5=[]
for i in x_5:
        if(df9.iloc[i,5]==encode_data1.iloc[i,5]):
            true_5.append(i)
    
true_6=[]
for i in x_6:
        if(df9.iloc[i,6]==encode_data1.iloc[i,6]):
            true_6.append(i)

true_7=[]
for i in x_7:
        if(df9.iloc[i,7]==encode_data1.iloc[i,7]):
            true_7.append(i)
true_8=[]
for i in x_8:
        if(df9.iloc[i,8]==encode_data1.iloc[i,8]):
            true_8.append(i)
true_9=[]
for i in x_9:
        if(df9.iloc[i,9]==encode_data1.iloc[i,9]):
            true_9.append(i)
true_10=[]
for i in x_10:
        if(df9.iloc[i,10]==encode_data1.iloc[i,10]):
            true_10.append(i)
            
true_11=[]
for i in x_11:
        if(df9.iloc[i,11]==encode_data1.iloc[i,11]):
            true_11.append(i)
            
            
true_12=[]
for i in x_12:
        if(df9.iloc[i,12]==encode_data1.iloc[i,12]):
            true_12.append(i)
            
true_13=[]
for i in x_13:
        if(df9.iloc[i,13]==encode_data1.iloc[i,13]):
            true_13.append(i)
            
            
true_14=[]
for i in x_14:
        if(df9.iloc[i,14]==encode_data1.iloc[i,14]):
            true_14.append(i)
            
true_15=[]
for i in x_15:
        if(df9.iloc[i,15]==encode_data1.iloc[i,15]):
            true_15.append(i)    
            
true_16=[]
for i in x_16:
        if(df9.iloc[i,16]==encode_data1.iloc[i,16]):
            true_16.append(i)
            
true_17=[]
for i in x_17:
        if(df9.iloc[i,17]==encode_data1.iloc[i,17]):
            true_17.append(i)
            
true_18=[]
for i in x_18:
        if(df9.iloc[i,18]==encode_data1.iloc[i,18]):
            true_18.append(i)      
            
            
true_19=[]
for i in x_19:
        if(df9.iloc[i,19]==encode_data1.iloc[i,19]):
            true_19.append(i)   
            
            
true_20=[]
for i in x_20:
        if(df9.iloc[i,20]==encode_data1.iloc[i,20]):
            true_20.append(i)   
            
            
true_21=[]
for i in x_21:
        if(df9.iloc[i,21]==encode_data1.iloc[i,21]):
            true_21.append(i)   
            


In [None]:
## Accuracy in each column 
print(1-((len(x_0)-len(true_0))/len(x_0)))
print(1-((len(x_1)-len(true_1))/len(x_1)))
print(1-((len(x_2)-len(true_2))/len(x_2)))
print(1-((len(x_3)-len(true_3))/len(x_3)))
print(1-((len(x_4)-len(true_4))/len(x_4)))
print(1-((len(x_5)-len(true_5))/len(x_5)))
print(1-((len(x_6)-len(true_6))/len(x_6)))
print(1-((len(x_7)-len(true_7))/len(x_7)))
print(1-((len(x_8)-len(true_8))/len(x_8)))
print(1-((len(x_9)-len(true_9))/len(x_9)))
print(1-((len(x_10)-len(true_10))/len(x_10)))
print(1-((len(x_11)-len(true_11))/len(x_11)))
print(1-((len(x_12)-len(true_12))/len(x_12)))
print(1-((len(x_13)-len(true_13))/len(x_13)))
print(1-((len(x_14)-len(true_14))/len(x_14)))
print(1-((len(x_15)-len(true_15))/len(x_15)))
print(1-((len(x_16)-len(true_16))/len(x_16)))
print(1-((len(x_17)-len(true_17))/len(x_17)))
print(1-((len(x_18)-len(true_18))/len(x_18)))
print(1-((len(x_19)-len(true_19))/len(x_19)))
print(1-((len(x_20)-len(true_20))/len(x_20)))
print(1-((len(x_21)-len(true_21))/len(x_21)))


### The MIDAS : Missing-Data Imputation with Deep Learning

In [91]:
#df11 = data_0.copy()
df11=pd.read_csv('/home/sameerahtalafha/new_project/new/tables/ALL1-original.csv')
cols=['SEX' ,'RACE','VET','BI','RT','DEPLOY'
      ,'DRUGDAYS','ALCDRUGS','AGE','TOBMONTH']
df11=df11[cols]
df11

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,DRUGDAYS,ALCDRUGS,AGE,TOBMONTH
0,0,1,0,0,1,0,11,10,39,1
1,1,2,0,0,1,0,4,4,68,0
2,0,1,0,0,1,0,0,0,29,1
3,1,2,0,1,0,0,1,0,28,0
4,0,2,0,0,0,0,20,1,45,0
...,...,...,...,...,...,...,...,...,...,...
16905,1,2,0,0,0,0,0,0,82,0
16906,1,1,0,0,1,0,0,0,36,1
16907,0,1,0,0,1,0,0,0,50,1
16908,0,1,0,0,1,0,0,0,49,0


In [92]:
df12 = df11.copy()
df12=df11.dropna()
df12

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,DRUGDAYS,ALCDRUGS,AGE,TOBMONTH
0,0,1,0,0,1,0,11,10,39,1
1,1,2,0,0,1,0,4,4,68,0
2,0,1,0,0,1,0,0,0,29,1
3,1,2,0,1,0,0,1,0,28,0
4,0,2,0,0,0,0,20,1,45,0
...,...,...,...,...,...,...,...,...,...,...
16905,1,2,0,0,0,0,0,0,82,0
16906,1,1,0,0,1,0,0,0,36,1
16907,0,1,0,0,1,0,0,0,50,1
16908,0,1,0,0,1,0,0,0,49,0


In [93]:
bins= [15,21,35,45,60,100]
labels = [0,1,2,3,4]
df12['AGE'] = pd.cut(df12['AGE'], bins=bins, labels=labels, right=False)

# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df12['BINGEDAYS'] = pd.cut(df12['BINGEDAYS'], bins=bins, labels=labels, right=False)

bins= [0,1,10,20,31]
labels = [0,1,2,3]
df12['DRUGDAYS'] = pd.cut(df12['DRUGDAYS'], bins=bins, labels=labels, right=False)

bins= [0,1,10,20,31]
labels = [0,1,2,3]
df12['ALCDRUGS'] = pd.cut(df12['ALCDRUGS'], bins=bins, labels=labels, right=False)

# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df12['DAYSCOCAINE'] = pd.cut(df12['DAYSCOCAINE'], bins=bins, labels=labels, right=False)

# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df12['MARYJDAYS'] = pd.cut(df12['MARYJDAYS'], bins=bins, labels=labels, right=False)





# bins= [0,1,10,20,31]
# labels = [0,1,2,3]
# df12['METHDAYS'] = pd.cut(df12['METHDAYS'], bins=bins, labels=labels, right=False)



# bins= [0,10,20,30,40]
# labels = [1,2,3,4]
# df12['ANYALC'] = pd.cut(df12['ANYALC'], bins=bins, labels=labels, right=False)
df12

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,DRUGDAYS,ALCDRUGS,AGE,TOBMONTH
0,0,1,0,0,1,0,2,2,2,1
1,1,2,0,0,1,0,1,1,4,0
2,0,1,0,0,1,0,0,0,1,1
3,1,2,0,1,0,0,1,0,1,0
4,0,2,0,0,0,0,3,1,3,0
...,...,...,...,...,...,...,...,...,...,...
16905,1,2,0,0,0,0,0,0,4,0
16906,1,1,0,0,1,0,0,0,2,1
16907,0,1,0,0,1,0,0,0,3,1
16908,0,1,0,0,1,0,0,0,3,0


In [94]:
#df12['DAST'] = df12['DAST'].astype(float).astype(int)
df12['SEX'] = df12['SEX'].astype(float).astype(int)
#df12['HISPANIC'] = df12['HISPANIC'].astype(float).astype(int)
df12['RACE'] = df12['RACE'].astype(float).astype(int)
df12['VET'] = df12['VET'].astype(float).astype(int)
#df12['ACTIVE'] = df12['ACTIVE'].astype(float).astype(int)
df12['DEPLOY'] = df12['DEPLOY'].astype(float).astype(int)
#df12['AUDIT'] = df12['AUDIT'].astype(float).astype(int)
#df12['COSCREEN'] = df12['COSCREEN'].astype(float).astype(int)
df12['RT'] = df12['RT'].astype(float).astype(int)
df12['BI'] = df12['BI'].astype(float).astype(int)
#df12['BT'] = df12['BT'].astype(float).astype(int)
#df12['INJECT'] = df12['INJECT'].astype(float).astype(int)
df12['TOBMONTH'] = df12['TOBMONTH'].astype(float).astype(int)
###############################################################
#df12['ANYALC'] = df12['ANYALC'].astype(float).astype(int)
#df12['BINGEDAYS'] = df12['BINGEDAYS'].astype(float).astype(int)
df12['DRUGDAYS'] = df12['DRUGDAYS'].astype(float).astype(int)

df12['ALCDRUGS'] = df12['ALCDRUGS'].astype(float).astype(int)
#df12['DAYSCOCAINE'] = df12['DAYSCOCAINE'].astype(float).astype(int)
#df12['MARYJDAYS'] = df12['MARYJDAYS'].astype(float).astype(int)



#df12['METHDAYS'] = df12['METHDAYS'].astype(float).astype(int)

df12['AGE'] = df12['AGE'].astype(float).astype(int)


In [95]:
df12.isnull().sum()
df12

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,DRUGDAYS,ALCDRUGS,AGE,TOBMONTH
0,0,1,0,0,1,0,2,2,2,1
1,1,2,0,0,1,0,1,1,4,0
2,0,1,0,0,1,0,0,0,1,1
3,1,2,0,1,0,0,1,0,1,0
4,0,2,0,0,0,0,3,1,3,0
...,...,...,...,...,...,...,...,...,...,...
16905,1,2,0,0,0,0,0,0,4,0
16906,1,1,0,0,1,0,0,0,2,1
16907,0,1,0,0,1,0,0,0,3,1
16908,0,1,0,0,1,0,0,0,3,0


In [96]:
df12.to_csv('MIDAS/out-full.csv', index=False)

In [97]:
import collections
import random
df13 = df12.copy()
replaced = collections.defaultdict(set)
ix = [(row, col) for row in range(df13.shape[0]) for col in range(df13.shape[1])]
random.shuffle(ix)
to_replace = int(round(.2*len(ix)))
for row, col in ix:
    if len(replaced[row]) < df13.shape[1] - 1:
        df13.iloc[row, col] = np.nan
        to_replace -= 1
        replaced[row].add(col)
        if to_replace == 0:
            break

In [98]:
df13.to_csv('MIDAS/out-miss.csv', index=False)

In [99]:
df13.isnull().sum()

SEX         3455
RACE        3353
VET         3344
BI          3457
RT          3299
DEPLOY      3392
DRUGDAYS    3413
ALCDRUGS    3380
AGE         3371
TOBMONTH    3356
dtype: int64

In [100]:
df13=pd.read_csv('MIDAS/out-miss.csv')
df13

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,DRUGDAYS,ALCDRUGS,AGE,TOBMONTH
0,0.0,1.0,0.0,0.0,1.0,,,2.0,2.0,1.0
1,1.0,2.0,0.0,0.0,,0.0,1.0,1.0,4.0,
2,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,1.0,2.0,0.0,1.0,0.0,,1.0,0.0,1.0,0.0
4,,2.0,0.0,0.0,0.0,0.0,3.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...
16905,1.0,2.0,,,0.0,0.0,0.0,0.0,4.0,0.0
16906,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,
16907,0.0,,0.0,0.0,,0.0,,0.0,3.0,1.0
16908,0.0,1.0,0.0,,1.0,0.0,0.0,,3.0,0.0


In [101]:
import MIDASpy as md
np.random.seed(441)
categorical = ['SEX',  'RACE', 'VET', 'DEPLOY', 
        'BI',  'RT',  'DRUGDAYS',
       'ALCDRUGS', 'AGE', 'TOBMONTH' ]
data_cat, cat_cols_list = md.cat_conv(df13[categorical])

df13=df13.drop(categorical, axis = 1, inplace = True)
constructor_list = [df13]
constructor_list.append(data_cat)
data_in = pd.concat(constructor_list, axis=1)

na_loc = data_in.isnull()
data_in[na_loc] = np.nan

In [102]:
print("hey")

hey


In [103]:
imputer = md.Midas(layer_structure = [300,300], vae_layer = False, seed = 89, input_drop = 0.2)
imputer.build_model(data_in, softmax_columns = cat_cols_list)
imputer.train_model(training_epochs = 200)

Size index: [3, 3, 2, 6, 2, 2, 4, 4, 5, 2]

Computation graph constructed

Model initialised



2021-08-12 12:32:21.723634: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-08-12 12:32:21.723656: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      


Epoch: 0 , loss: 5.579150409635269
Epoch: 1 , loss: 4.71179188562162
Epoch: 2 , loss: 4.56687775186517
Epoch: 3 , loss: 4.523302167879812
Epoch: 4 , loss: 4.495336133196498
Epoch: 5 , loss: 4.47294744194457
Epoch: 6 , loss: 4.45871858185891
Epoch: 7 , loss: 4.460717489547802
Epoch: 8 , loss: 4.4406908016764755
Epoch: 9 , loss: 4.4342966790903695
Epoch: 10 , loss: 4.439323556694117
Epoch: 11 , loss: 4.4125282299337965
Epoch: 12 , loss: 4.408360169918248
Epoch: 13 , loss: 4.41234044979016
Epoch: 14 , loss: 4.392377692641634
Epoch: 15 , loss: 4.411127327292254
Epoch: 16 , loss: 4.410724212951733
Epoch: 17 , loss: 4.3920331446058825
Epoch: 18 , loss: 4.394383528693155
Epoch: 19 , loss: 4.404772814476129
Epoch: 20 , loss: 4.4117400185628375
Epoch: 21 , loss: 4.389751402943423
Epoch: 22 , loss: 4.414091655476526
Epoch: 23 , loss: 4.4073565839366475
Epoch: 24 , loss: 4.394938779599739
Epoch: 25 , loss: 4.410721169050896
Epoch: 26 , loss: 4.384146653567299
Epoch: 27 , loss: 4.3848283609206025


<MIDASpy.midas_base.Midas at 0x7ff950956f70>

In [104]:
imputations = imputer.generate_samples(m=10).output_list 
n=1
for i in imputations:
    file_out = "MIDAS/midas_imp_" + str(n) + ".csv"
    i.to_csv(file_out, index=False)
    n += 1

INFO:tensorflow:Restoring parameters from tmp/MIDAS
Model restored.


2021-08-12 12:37:24.232056: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-08-12 12:37:24.232077: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      


In [105]:
encode_data2 =  pd.read_csv('MIDAS/midas_imp_7.csv') 
encode_data2.isnull().sum()


SEX_0.0         0
SEX_1.0         0
SEX_2.0         0
RACE_1.0        0
RACE_2.0        0
RACE_3.0        0
VET_0.0         0
VET_1.0         0
DEPLOY_0.0      0
DEPLOY_1.0      0
DEPLOY_2.0      0
DEPLOY_3.0      0
DEPLOY_4.0      0
DEPLOY_5.0      0
BI_0.0          0
BI_1.0          0
RT_0.0          0
RT_1.0          0
DRUGDAYS_0.0    0
DRUGDAYS_1.0    0
DRUGDAYS_2.0    0
DRUGDAYS_3.0    0
ALCDRUGS_0.0    0
ALCDRUGS_1.0    0
ALCDRUGS_2.0    0
ALCDRUGS_3.0    0
AGE_0.0         0
AGE_1.0         0
AGE_2.0         0
AGE_3.0         0
AGE_4.0         0
TOBMONTH_0.0    0
TOBMONTH_1.0    0
dtype: int64

In [106]:
encode_data2

Unnamed: 0,SEX_0.0,SEX_1.0,SEX_2.0,RACE_1.0,RACE_2.0,RACE_3.0,VET_0.0,VET_1.0,DEPLOY_0.0,DEPLOY_1.0,...,ALCDRUGS_1.0,ALCDRUGS_2.0,ALCDRUGS_3.0,AGE_0.0,AGE_1.0,AGE_2.0,AGE_3.0,AGE_4.0,TOBMONTH_0.0,TOBMONTH_1.0
0,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,0.831383,0.003598,...,0.000000,1.000000,0.000000,0.000000,0.00000,1.000000,0.00000,0.000000,0.000000,1.000000
1,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,1.000000,0.731069,0.268931
2,0.776864,0.223045,0.000091,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.00000,0.000000,0.00000,0.000000,0.000000,1.000000
3,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.957236,0.005277,...,0.000000,0.000000,0.000000,0.000000,1.00000,0.000000,0.00000,0.000000,1.000000,0.000000
4,0.762074,0.237756,0.000170,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,0.000000,0.019084,0.44331,0.257709,0.21635,0.063548,0.645944,0.354056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16905,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,0.876935,0.123065,1.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,1.000000,1.000000,0.000000
16906,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,1.000000,0.00000,0.000000,0.567055,0.432945
16907,1.000000,0.000000,0.000000,0.825747,0.171682,0.002571,1.000000,0.000000,1.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,1.00000,0.000000,0.000000,1.000000
16908,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,...,0.055446,0.054841,0.046434,0.000000,0.00000,0.000000,1.00000,0.000000,1.000000,0.000000


In [107]:
def undummify(df, prefix_sep="_"):
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            undummified = (
                df.filter(like=col)
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [108]:
return_df13 = undummify(encode_data2)

In [109]:
#return_df13['DAST'] = return_df13['DAST'].astype(float).astype(int)
return_df13['SEX'] = return_df13['SEX'].astype(float).astype(int)
#return_df13['HISPANIC'] = return_df13['HISPANIC'].astype(float).astype(int)
return_df13['RACE'] = return_df13['RACE'].astype(float).astype(int)
return_df13['VET'] = return_df13['VET'].astype(float).astype(int)
#return_df13['ACTIVE'] = return_df13['ACTIVE'].astype(float).astype(int)
return_df13['DEPLOY'] = return_df13['DEPLOY'].astype(float).astype(int)
#return_df13['AUDIT'] = return_df13['AUDIT'].astype(float).astype(int)
#return_df13['COSCREEN'] = return_df13['COSCREEN'].astype(float).astype(int)
return_df13['RT'] = return_df13['RT'].astype(float).astype(int)
return_df13['BI'] = return_df13['BI'].astype(float).astype(int)
#return_df13['BT'] = return_df13['BT'].astype(float).astype(int)
#return_df13['INJECT'] = return_df13['INJECT'].astype(float).astype(int)
return_df13['TOBMONTH'] = return_df13['TOBMONTH'].astype(float).astype(int)
###############################################################
#return_df13['ANYALC'] = return_df13['ANYALC'].astype(float).astype(int)
#return_df13['BINGEDAYS'] = return_df13['BINGEDAYS'].astype(float).astype(int)
return_df13['DRUGDAYS'] = return_df13['DRUGDAYS'].astype(float).astype(int)

return_df13['ALCDRUGS'] = return_df13['ALCDRUGS'].astype(float).astype(int)
#return_df13['DAYSCOCAINE'] = return_df13['DAYSCOCAINE'].astype(float).astype(int)
#return_df13['MARYJDAYS'] = return_df13['MARYJDAYS'].astype(float).astype(int)



#return_df13['METHDAYS'] = return_df13['METHDAYS'].astype(float).astype(int)

return_df13['AGE'] = return_df13['AGE'].astype(float).astype(int)
return_df13


Unnamed: 0,SEX,RACE,VET,DEPLOY,BI,RT,DRUGDAYS,ALCDRUGS,AGE,TOBMONTH
0,0,1,0,0,0,1,2,2,2,1
1,1,2,0,0,0,0,1,1,4,0
2,0,1,0,0,0,1,0,0,1,1
3,1,2,0,0,1,0,1,0,1,0
4,0,2,0,0,0,0,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
16905,1,2,0,0,0,0,0,0,4,0
16906,1,1,0,0,0,1,0,0,2,0
16907,0,1,0,0,0,1,0,0,3,1
16908,0,1,0,0,0,1,0,0,3,0


In [111]:
df12=pd.read_csv('MIDAS/out-full.csv')
#df12['DAST'] = df12['DAST'].astype(float).astype(int)
df12['SEX'] = df12['SEX'].astype(float).astype(int)
#df12['HISPANIC'] = df12['HISPANIC'].astype(float).astype(int)
df12['RACE'] = df12['RACE'].astype(float).astype(int)
df12['VET'] = df12['VET'].astype(float).astype(int)
#df12['ACTIVE'] = df12['ACTIVE'].astype(float).astype(int)
df12['DEPLOY'] = df12['DEPLOY'].astype(float).astype(int)
#df12['AUDIT'] = df12['AUDIT'].astype(float).astype(int)
#df12['COSCREEN'] = df12['COSCREEN'].astype(float).astype(int)
df12['RT'] = df12['RT'].astype(float).astype(int)
df12['BI'] = df12['BI'].astype(float).astype(int)
#df12['BT'] = df12['BT'].astype(float).astype(int)
#df12['INJECT'] = df12['INJECT'].astype(float).astype(int)
df12['TOBMONTH'] = df12['TOBMONTH'].astype(float).astype(int)
###############################################################
#df12['ANYALC'] = df12['ANYALC'].astype(float).astype(int)
#df12['BINGEDAYS'] = df12['BINGEDAYS'].astype(float).astype(int)
df12['DRUGDAYS'] = df12['DRUGDAYS'].astype(float).astype(int)

df12['ALCDRUGS'] = df12['ALCDRUGS'].astype(float).astype(int)
#df12['DAYSCOCAINE'] = df12['DAYSCOCAINE'].astype(float).astype(int)
#df12['MARYJDAYS'] = df12['MARYJDAYS'].astype(float).astype(int)



#df12['METHDAYS'] = df12['METHDAYS'].astype(float).astype(int)

df12['AGE'] = df12['AGE'].astype(float).astype(int)
df12

Unnamed: 0,SEX,RACE,VET,BI,RT,DEPLOY,DRUGDAYS,ALCDRUGS,AGE,TOBMONTH
0,0,1,0,0,1,0,2,2,2,1
1,1,2,0,0,1,0,1,1,4,0
2,0,1,0,0,1,0,0,0,1,1
3,1,2,0,1,0,0,1,0,1,0
4,0,2,0,0,0,0,3,1,3,0
...,...,...,...,...,...,...,...,...,...,...
16905,1,2,0,0,0,0,0,0,4,0
16906,1,1,0,0,1,0,0,0,2,1
16907,0,1,0,0,1,0,0,0,3,1
16908,0,1,0,0,1,0,0,0,3,0


In [112]:
df13=pd.read_csv('MIDAS/out-miss.csv')


In [113]:
#x_0=np.where(df13['DAST'].isnull())[0]
x_0=np.where(df13['SEX'].isnull())[0]
#x_2=np.where(df13['HISPANIC'].isnull())[0]
x_1=np.where(df13['RACE'].isnull())[0]
x_2=np.where(df13['VET'].isnull())[0]
#x_5=np.where(df13['ACTIVE'].isnull())[0]
x_3=np.where(df13['DEPLOY'].isnull())[0]
#x_7=np.where(df13['AUDIT'].isnull())[0]
#x_8=np.where(df13['COSCREEN'].isnull())[0]
x_4=np.where(df13['BI'].isnull())[0]
#x_10=np.where(df13['BT'].isnull())[0]
x_5=np.where(df13['RT'].isnull())[0]
#x_12=np.where(df13['ANYALC'].isnull())[0]
#x_13=np.where(df13['BINGEDAYS'].isnull())[0]
x_6=np.where(df13['DRUGDAYS'].isnull())[0]
x_7=np.where(df13['ALCDRUGS'].isnull())[0]
#x_16=np.where(df13['DAYSCOCAINE'].isnull())[0]
#x_17=np.where(df13['MARYJDAYS'].isnull())[0]

#x_18=np.where(df13['METHDAYS'].isnull())[0]

#x_19=np.where(df13['INJECT'].isnull())[0]
x_8=np.where(df13['AGE'].isnull())[0]
x_9=np.where(df13['TOBMONTH'].isnull())[0]

In [114]:
true_0=[]
for i in x_0:
        if(df12.iloc[i,0]==return_df13.iloc[i,0]):
            true_0.append(i)
true_1=[]
for i in x_1:
        if(df12.iloc[i,1]==return_df13.iloc[i,1]):
            true_1.append(i)
true_2=[]
for i in x_2:
        if(df12.iloc[i,2]==return_df13.iloc[i,2]):
            true_2.append(i)
true_3=[]
for i in x_3:
        if(df12.iloc[i,3]==return_df13.iloc[i,3]):
            true_3.append(i)

true_4=[]
for i in x_4:
        if(df12.iloc[i,4]==return_df13.iloc[i,4]):
            true_4.append(i)
   
            
true_5=[]
for i in x_5:
        if(df12.iloc[i,5]==return_df13.iloc[i,5]):
            true_5.append(i)
    
true_6=[]
for i in x_6:
        if(df12.iloc[i,6]==return_df13.iloc[i,6]):
            true_6.append(i)

true_7=[]
for i in x_7:
        if(df12.iloc[i,7]==return_df13.iloc[i,7]):
            true_7.append(i)
true_8=[]
for i in x_8:
        if(df12.iloc[i,8]==return_df13.iloc[i,8]):
            true_8.append(i)
true_9=[]
for i in x_9:
        if(df12.iloc[i,9]==return_df13.iloc[i,9]):
            true_9.append(i)
# true_10=[]
# for i in x_10:
#         if(df12.iloc[i,10]==return_df13.iloc[i,10]):
#             true_10.append(i)
            
# true_11=[]
# for i in x_11:
#         if(df12.iloc[i,11]==return_df13.iloc[i,11]):
#             true_11.append(i)
            
            
# true_12=[]
# for i in x_12:
#         if(df12.iloc[i,12]==return_df13.iloc[i,12]):
#             true_12.append(i)
            
# true_13=[]
# for i in x_13:
#         if(df12.iloc[i,13]==return_df13.iloc[i,13]):
#             true_13.append(i)
            
            
# true_14=[]
# for i in x_14:
#         if(df12.iloc[i,14]==return_df13.iloc[i,14]):
#             true_14.append(i)
            
# true_15=[]
# for i in x_15:
#         if(df12.iloc[i,15]==return_df13.iloc[i,15]):
#             true_15.append(i)    
            
# true_16=[]
# for i in x_16:
#         if(df12.iloc[i,16]==return_df13.iloc[i,16]):
#             true_16.append(i)
            
# true_17=[]
# for i in x_17:
#         if(df12.iloc[i,17]==return_df13.iloc[i,17]):
#             true_17.append(i)
            
# true_18=[]
# for i in x_18:
#         if(df12.iloc[i,18]==return_df13.iloc[i,18]):
#             true_18.append(i)      
            
            
# true_19=[]
# for i in x_19:
#         if(df12.iloc[i,19]==return_df13.iloc[i,19]):
#             true_19.append(i)   
            
            
# true_20=[]
# for i in x_20:
#         if(df12.iloc[i,20]==return_df13.iloc[i,20]):
#             true_20.append(i)   
            
            
# true_21=[]
# for i in x_21:
#         if(df12.iloc[i,21]==return_df13.iloc[i,21]):
#             true_21.append(i)   
            
            


In [115]:
## Accuracy in each column 
print(1-((len(x_0)-len(true_0))/len(x_0)))
print(1-((len(x_1)-len(true_1))/len(x_1)))
print(1-((len(x_2)-len(true_2))/len(x_2)))
print(1-((len(x_3)-len(true_3))/len(x_3)))
print(1-((len(x_4)-len(true_4))/len(x_4)))
print(1-((len(x_5)-len(true_5))/len(x_5)))
print(1-((len(x_6)-len(true_6))/len(x_6)))
print(1-((len(x_7)-len(true_7))/len(x_7)))
print(1-((len(x_8)-len(true_8))/len(x_8)))
print(1-((len(x_9)-len(true_9))/len(x_9)))
# print(1-((len(x_10)-len(true_10))/len(x_10)))
# print(1-((len(x_11)-len(true_11))/len(x_11)))
# print(1-((len(x_12)-len(true_12))/len(x_12)))
# print(1-((len(x_13)-len(true_13))/len(x_13)))
# print(1-((len(x_14)-len(true_14))/len(x_14)))
# print(1-((len(x_15)-len(true_15))/len(x_15)))
# print(1-((len(x_16)-len(true_16))/len(x_16)))
# print(1-((len(x_17)-len(true_17))/len(x_17)))
# print(1-((len(x_18)-len(true_18))/len(x_18)))
# print(1-((len(x_19)-len(true_19))/len(x_19)))
# print(1-((len(x_20)-len(true_20))/len(x_20)))
# print(1-((len(x_21)-len(true_21))/len(x_21)))


0.7157742402315485
0.6033402922755742
0.9046052631578947
0.7841981132075472
0.5903962973676598
0.6771749014852986
0.6100205098154117
0.7973372781065089
0.40759418570157224
0.5899880810488677


### Iterative SVD

In [None]:
df12.columns

In [None]:
df14 = data_0.copy()

In [None]:
df15 = df14.copy()
df15=df15.dropna()
df15.isnull().sum()

In [None]:
bins= [0,21,30,45,60,100]
labels = [1,2,3,4,5]
df15['AGE'] = pd.cut(df15['AGE'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['BINGEDAYS'] = pd.cut(df15['BINGEDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df9['DRUGDAYS'] = pd.cut(df15['DRUGDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['ALCDRUGS'] = pd.cut(df15['ALCDRUGS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['DAYSCOCAINE'] = pd.cut(df15['DAYSCOCAINE'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['MARYJDAYS'] = pd.cut(df15['MARYJDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['ANYOPIATEDAYS'] = pd.cut(df15['ANYOPIATEDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['METHADONE'] = pd.cut(df15['METHADONE'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['HALLUC'] = pd.cut(df15['HALLUC'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['METHDAYS'] = pd.cut(df15['METHDAYS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['OTHERDRUGS'] = pd.cut(df15['OTHERDRUGS'], bins=bins, labels=labels, right=False)

bins= [0,10,20,30,40]
labels = [1,2,3,4]
df15['ANYALC'] = pd.cut(df15['ANYALC'], bins=bins, labels=labels, right=False)

df15.isnull().sum()

In [None]:
df15['DAST'] = df15['DAST'].astype(float).astype(object)
df15['SEX'] = df15['SEX'].astype(float).astype(object)
df15['HISPANIC'] = df15['HISPANIC'].astype(float).astype(object)
df15['RACE'] = df15['RACE'].astype(float).astype(object)
df15['VET'] = df15['VET'].astype(float).astype(object)
df15['ACTIVE'] = df15['ACTIVE'].astype(float).astype(object)
df15['DEPLOY'] = df15['DEPLOY'].astype(float).astype(object)
df15['AUDIT'] = df15['AUDIT'].astype(float).astype(object)
df15['COSCREEN'] = df15['COSCREEN'].astype(float).astype(object)
df15['RT'] = df15['RT'].astype(float).astype(object)
df15['BI'] = df15['BI'].astype(float).astype(object)
df15['BT'] = df15['BT'].astype(float).astype(object)
df15['INJECT'] = df15['INJECT'].astype(float).astype(object)
df15['TOBMONTH'] = df15['TOBMONTH'].astype(float).astype(object)
###############################################################
df15['ANYALC'] = df15['ANYALC'].astype(float).astype(object)
df15['BINGEDAYS'] = df15['BINGEDAYS'].astype(float).astype(object)
df15['DRUGDAYS'] = df15['DRUGDAYS'].astype(float).astype(object)

df15['ALCDRUGS'] = df15['ALCDRUGS'].astype(float).astype(object)
df15['DAYSCOCAINE'] = df15['DAYSCOCAINE'].astype(float).astype(object)
df15['MARYJDAYS'] = df15['MARYJDAYS'].astype(float).astype(object)

df15['ANYOPIATEDAYS'] = df15['ANYOPIATEDAYS'].astype(float).astype(object)
df15['METHADONE'] = df15['METHADONE'].astype(float).astype(object)
df15['HALLUC'] = df15['HALLUC'].astype(float).astype(object)

df15['METHDAYS'] = df15['METHDAYS'].astype(float).astype(object)
df15['OTHERDRUGS'] =df15['OTHERDRUGS'].astype(float).astype(object)
df15['AGE'] = df15['AGE'].astype(float).astype(object)

In [None]:
import collections
import random
df16 = df15.copy()
replaced = collections.defaultdict(set)
ix = [(row, col) for row in range(df16.shape[0]) for col in range(df16.shape[1])]
random.shuffle(ix)
to_replace = int(round(.1*len(ix)))
for row, col in ix:
    if len(replaced[row]) < df16.shape[1] - 1:
        df16.iloc[row, col] = np.nan
        to_replace -= 1
        replaced[row].add(col)
        if to_replace == 0:
            break

In [None]:
df16.isnull().sum()

In [None]:
x_0=np.where(df16['DAST'].isnull())[0]
x_1=np.where(df16['SEX'].isnull())[0]
x_2=np.where(df16['HISPANIC'].isnull())[0]
x_3=np.where(df16['RACE'].isnull())[0]
x_4=np.where(df16['VET'].isnull())[0]
x_5=np.where(df16['ACTIVE'].isnull())[0]
x_6=np.where(df16['DEPLOY'].isnull())[0]
x_7=np.where(df16['AUDIT'].isnull())[0]
x_8=np.where(df16['COSCREEN'].isnull())[0]
x_9=np.where(df16['BI'].isnull())[0]
x_10=np.where(df16['BT'].isnull())[0]
x_11=np.where(df16['RT'].isnull())[0]
x_12=np.where(df16['ANYALC'].isnull())[0]
x_13=np.where(df16['BINGEDAYS'].isnull())[0]
x_14=np.where(df16['DRUGDAYS'].isnull())[0]
x_15=np.where(df16['ALCDRUGS'].isnull())[0]
x_16=np.where(df16['DAYSCOCAINE'].isnull())[0]
x_17=np.where(df16['MARYJDAYS'].isnull())[0]
x_18=np.where(df16['ANYOPIATEDAYS'].isnull())[0]
x_19=np.where(df16['METHADONE'].isnull())[0]
x_20=np.where(df16['HALLUC'].isnull())[0]
x_21=np.where(df16['METHDAYS'].isnull())[0]
x_22=np.where(df16['OTHERDRUGS'].isnull())[0]
x_23=np.where(df16['INJECT'].isnull())[0]
x_24=np.where(df16['AGE'].isnull())[0]
x_25=np.where(df16['TOBMONTH'].isnull())[0]

In [None]:
# instatiate both packages to use 
from sklearn.preprocessing import OrdinalEncoder
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute
from fancyimpute import IterativeSVD
encoder = OrdinalEncoder()
imputer = IterativeSVD()
#create a list of categorical columns to iterate over 
cat_cols =['DAST', 'SEX', 'HISPANIC', 'RACE','VET',  'ACTIVE', 'DEPLOY', 'AUDIT',
       'COSCREEN', 'BI', 'BT', 'RT', 'ANYALC', 'BINGEDAYS', 'DRUGDAYS',
       'ALCDRUGS', 'DAYSCOCAINE', 'MARYJDAYS', 'ANYOPIATEDAYS', 'METHADONE',
       'HALLUC', 'METHDAYS', 'OTHERDRUGS', 'INJECT', 'AGE', 'TOBMONTH']

def encode(data):
    '''function to encode non-null data and replace it in the original data '''
    #retain only non-null values 
    notnulls =np.array(data.dropna())
    #reshape the data for encoding
    impute_reshape = notnulls.reshape(-1,1)
    #encode date
    print(impute_reshape.shape)
    impute_ordinal = encoder.fit_transform(impute_reshape)
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

#create a for loop to iterate through each column in the data 
for columns in cat_cols:
    encode(df16[columns])

In [None]:
encode_data3 = pd.DataFrame(np.round(imputer.fit_transform(df16)),columns = df16.columns) 

In [None]:
encode_data3

In [None]:
encode_data3.isnull().sum()

In [None]:
df15.iloc[:,0] = df15.iloc[:,0].astype(float).astype(int)
df15.iloc[:,1] = df15.iloc[:,1].astype(float).astype(int)
df15.iloc[:,2] = df15.iloc[:,2].astype(float).astype(int)
df15.iloc[:,3] = df15.iloc[:,3].astype(float).astype(int)
df15.iloc[:,4] = df15.iloc[:,4].astype(float).astype(int)
df15.iloc[:,5] = df15.iloc[:,5].astype(float).astype(int)
df15.iloc[:,6] = df15.iloc[:,6].astype(float).astype(int)
df15.iloc[:,7]= df15.iloc[:,7].astype(float).astype(int)
df15.iloc[:,8] = df15.iloc[:,8].astype(float).astype(int)
df15.iloc[:,9] = df15.iloc[:,9].astype(float).astype(int)
df15.iloc[:,10] = df15.iloc[:,10].astype(float).astype(int)
df15.iloc[:,11]= df15.iloc[:,11].astype(float).astype(int)
df15.iloc[:,12]= df15.iloc[:,12].astype(float).astype(int)
df15.iloc[:,13] = df15.iloc[:,13].astype(float).astype(int)
###############################################################
df15.iloc[:,14] = df15.iloc[:,14].astype(float).astype(int)
df15.iloc[:,15] = df15.iloc[:,15].astype(float).astype(int)
df15.iloc[:,16] = df15.iloc[:,16].astype(float).astype(int)

df15.iloc[:,17] = df15.iloc[:,17].astype(float).astype(int)
df15.iloc[:,18]= df15.iloc[:,18].astype(float).astype(int)
df15.iloc[:,19] = df15.iloc[:,19].astype(float).astype(int)

df15.iloc[:,20] = df15.iloc[:,20].astype(float).astype(int)
df15.iloc[:,21]= df15.iloc[:,21].astype(float).astype(int)
df15.iloc[:,22]= df15.iloc[:,22].astype(float).astype(int)

df15.iloc[:,23]= df15.iloc[:,23].astype(float).astype(int)
df15.iloc[:,24]=df15.iloc[:,24].astype(float).astype(int)
df15.iloc[:,25] = df15.iloc[:,25].astype(float).astype(int)

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
encode_data3.iloc[:,0] = encode_data3.iloc[:,0].astype(float).astype(int)
encode_data3.iloc[:,1] = encode_data3.iloc[:,1].astype(float).astype(int)
encode_data3.iloc[:,2] = encode_data3.iloc[:,2].astype(float).astype(int)
encode_data3.iloc[:,3] = encode_data3.iloc[:,3].astype(float).astype(int)
encode_data3.iloc[:,4] = encode_data3.iloc[:,4].astype(float).astype(int)
encode_data3.iloc[:,5] = encode_data3.iloc[:,5].astype(float).astype(int)
encode_data3.iloc[:,6] = encode_data3.iloc[:,6].astype(float).astype(int)
encode_data3.iloc[:,7]= encode_data3.iloc[:,7].astype(float).astype(int)
encode_data3.iloc[:,8] = encode_data3.iloc[:,8].astype(float).astype(int)
encode_data3.iloc[:,9] = encode_data3.iloc[:,9].astype(float).astype(int)
encode_data3.iloc[:,10] = encode_data3.iloc[:,10].astype(float).astype(int)
encode_data3.iloc[:,11]= encode_data3.iloc[:,11].astype(float).astype(int)
encode_data3.iloc[:,12]= encode_data3.iloc[:,12].astype(float).astype(int)
encode_data3.iloc[:,13] = encode_data3.iloc[:,13].astype(float).astype(int)
###############################################################
encode_data3.iloc[:,14] = encode_data3.iloc[:,14].astype(float).astype(int)
encode_data3.iloc[:,15] = encode_data3.iloc[:,15].astype(float).astype(int)
encode_data3.iloc[:,16] = encode_data3.iloc[:,16].astype(float).astype(int)

encode_data3.iloc[:,17] = encode_data3.iloc[:,17].astype(float).astype(int)
encode_data3.iloc[:,18]= encode_data3.iloc[:,18].astype(float).astype(int)
encode_data3.iloc[:,19] = encode_data3.iloc[:,19].astype(float).astype(int)

encode_data3.iloc[:,20] = encode_data3.iloc[:,20].astype(float).astype(int)
encode_data3.iloc[:,21]= encode_data3.iloc[:,21].astype(float).astype(int)
encode_data3.iloc[:,22]= encode_data3.iloc[:,22].astype(float).astype(int)

encode_data3.iloc[:,23]= encode_data3.iloc[:,23].astype(float).astype(int)
encode_data3.iloc[:,24]=encode_data3.iloc[:,24].astype(float).astype(int)
encode_data3.iloc[:,25] = encode_data3.iloc[:,25].astype(float).astype(int)

In [None]:
true_0=[]
for i in x_0:
        if(df15.iloc[i,0]==encode_data3.iloc[i,0]):
            true_0.append(i)
true_1=[]
for i in x_1:
        if(df15.iloc[i,1]==encode_data3.iloc[i,1]):
            true_1.append(i)
true_2=[]
for i in x_2:
        if(df15.iloc[i,2]==encode_data3.iloc[i,2]):
            true_2.append(i)
true_3=[]
for i in x_3:
        if(df15.iloc[i,3]==encode_data3.iloc[i,3]):
            true_3.append(i)

true_4=[]
for i in x_4:
        if(df15.iloc[i,4]==encode_data3.iloc[i,4]):
            true_4.append(i)
   
            
true_5=[]
for i in x_5:
        if(df15.iloc[i,5]==encode_data3.iloc[i,5]):
            true_5.append(i)
    
true_6=[]
for i in x_6:
        if(df15.iloc[i,6]==encode_data3.iloc[i,6]):
            true_6.append(i)

true_7=[]
for i in x_7:
        if(df15.iloc[i,7]==encode_data3.iloc[i,7]):
            true_7.append(i)
true_8=[]
for i in x_8:
        if(df15.iloc[i,8]==encode_data3.iloc[i,8]):
            true_8.append(i)
true_9=[]
for i in x_9:
        if(df15.iloc[i,9]==encode_data3.iloc[i,9]):
            true_9.append(i)
true_10=[]
for i in x_10:
        if(df15.iloc[i,10]==encode_data3.iloc[i,10]):
            true_10.append(i)
            
true_11=[]
for i in x_11:
        if(df15.iloc[i,11]==encode_data3.iloc[i,11]):
            true_11.append(i)
            
            
true_12=[]
for i in x_12:
        if(df15.iloc[i,12]==encode_data3.iloc[i,12]):
            true_12.append(i)
            
true_13=[]
for i in x_13:
        if(df15.iloc[i,13]==encode_data3.iloc[i,13]):
            true_13.append(i)
            
            
true_14=[]
for i in x_14:
        if(df15.iloc[i,14]==encode_data3.iloc[i,14]):
            true_14.append(i)
            
true_15=[]
for i in x_15:
        if(df15.iloc[i,15]==encode_data3.iloc[i,15]):
            true_15.append(i)    
            
true_16=[]
for i in x_16:
        if(df15.iloc[i,16]==encode_data3.iloc[i,16]):
            true_16.append(i)
            
true_17=[]
for i in x_17:
        if(df15.iloc[i,17]==encode_data3.iloc[i,17]):
            true_17.append(i)
            
true_18=[]
for i in x_18:
        if(df15.iloc[i,18]==encode_data3.iloc[i,18]):
            true_18.append(i)      
            
            
true_19=[]
for i in x_19:
        if(df15.iloc[i,19]==encode_data3.iloc[i,19]):
            true_19.append(i)   
            
            
true_20=[]
for i in x_20:
        if(df15.iloc[i,20]==encode_data3.iloc[i,20]):
            true_20.append(i)   
            
            
true_21=[]
for i in x_21:
        if(df15.iloc[i,21]==encode_data3.iloc[i,21]):
            true_21.append(i)   
            
            
true_22=[]
for i in x_22:
        if(df15.iloc[i,22]==encode_data3.iloc[i,22]):
            true_22.append(i)   
            
            
true_23=[]
for i in x_23:
        if(df15.iloc[i,23]==encode_data3.iloc[i,23]):
            true_23.append(i)       
            
true_24=[]
for i in x_24:
        if(df15.iloc[i,24]==encode_data3.iloc[i,24]):
            true_24.append(i)   
            
true_25=[]
for i in x_25:
        if(df15.iloc[i,25]==encode_data3.iloc[i,25]):
            true_25.append(i)               
          

In [None]:
## Accuracy in each column 
print(1-((len(x_0)-len(true_0))/len(x_0)))
print(1-((len(x_1)-len(true_1))/len(x_1)))
print(1-((len(x_2)-len(true_2))/len(x_2)))
print(1-((len(x_3)-len(true_3))/len(x_3)))
print(1-((len(x_4)-len(true_4))/len(x_4)))
print(1-((len(x_5)-len(true_5))/len(x_5)))
print(1-((len(x_6)-len(true_6))/len(x_6)))
print(1-((len(x_7)-len(true_7))/len(x_7)))
print(1-((len(x_8)-len(true_8))/len(x_8)))
print(1-((len(x_9)-len(true_9))/len(x_9)))
print(1-((len(x_10)-len(true_10))/len(x_10)))
print(1-((len(x_11)-len(true_11))/len(x_11)))
print(1-((len(x_12)-len(true_12))/len(x_12)))
print(1-((len(x_13)-len(true_13))/len(x_13)))
print(1-((len(x_14)-len(true_14))/len(x_14)))
print(1-((len(x_15)-len(true_15))/len(x_15)))
print(1-((len(x_16)-len(true_16))/len(x_16)))
print(1-((len(x_17)-len(true_17))/len(x_17)))
print(1-((len(x_18)-len(true_18))/len(x_18)))
print(1-((len(x_19)-len(true_19))/len(x_19)))
print(1-((len(x_20)-len(true_20))/len(x_20)))
print(1-((len(x_21)-len(true_21))/len(x_21)))
print(1-((len(x_22)-len(true_22))/len(x_22)))
print(1-((len(x_23)-len(true_23))/len(x_23)))
print(1-((len(x_24)-len(true_24))/len(x_24)))
print(1-((len(x_25)-len(true_25))/len(x_25)))

### Beta VAE Imputation

In [None]:
df16=pd.read_csv('/home/sameerahtalafha/new_project/new/tables/ALL1-original.csv')

In [None]:
df16.iloc[:,0] = df16.iloc[:,0].astype(float).astype(int)
df16.iloc[:,1] = df16.iloc[:,1].astype(float).astype(int)
df16.iloc[:,2] = df16.iloc[:,2].astype(float).astype(int)
df16.iloc[:,3] = df16.iloc[:,3].astype(float).astype(int)
df16.iloc[:,4] = df16.iloc[:,4].astype(float).astype(int)
df16.iloc[:,5] = df16.iloc[:,5].astype(float).astype(int)
df16.iloc[:,6] = df16.iloc[:,6].astype(float).astype(int)
df16.iloc[:,7]= df16.iloc[:,7].astype(float).astype(int)
df16.iloc[:,8] = df16.iloc[:,8].astype(float).astype(int)
df16.iloc[:,9] = df16.iloc[:,9].astype(float).astype(int)
df16.iloc[:,10] = df16.iloc[:,10].astype(float).astype(int)
df16.iloc[:,11]= df16.iloc[:,11].astype(float).astype(int)
df16.iloc[:,12]= df16.iloc[:,12].astype(float).astype(int)
df16.iloc[:,13] = df16.iloc[:,13].astype(float).astype(int)
###############################################################
df16.iloc[:,14] = df16.iloc[:,14].astype(float).astype(int)
df16.iloc[:,15] = df16.iloc[:,15].astype(float).astype(int)
df16.iloc[:,16] = df16.iloc[:,16].astype(float).astype(int)

df16.iloc[:,17] = df16.iloc[:,17].astype(float).astype(int)
df16.iloc[:,18]= df16.iloc[:,18].astype(float).astype(int)
df16.iloc[:,19] = df16.iloc[:,19].astype(float).astype(int)

df16.iloc[:,20] = df16.iloc[:,20].astype(float).astype(int)
df16.iloc[:,21]= df16.iloc[:,21].astype(float).astype(int)

df16

In [None]:
df17=pd.read_csv('/home/sameerahtalafha/new_project/new/tables/ALL1-Missing.csv')
df17.isnull().sum()

In [None]:
x_0=np.where(df17['DAST'].isnull())[0]
x_1=np.where(df17['SEX'].isnull())[0]
x_2=np.where(df17['HISPANIC'].isnull())[0]
x_3=np.where(df17['RACE'].isnull())[0]
x_4=np.where(df17['VET'].isnull())[0]
x_5=np.where(df17['ACTIVE'].isnull())[0]
x_6=np.where(df17['DEPLOY'].isnull())[0]
x_7=np.where(df17['AUDIT'].isnull())[0]
x_8=np.where(df17['COSCREEN'].isnull())[0]
x_9=np.where(df17['BI'].isnull())[0]
x_10=np.where(df17['BT'].isnull())[0]
x_11=np.where(df17['RT'].isnull())[0]
x_12=np.where(df17['ANYALC'].isnull())[0]
x_13=np.where(df17['BINGEDAYS'].isnull())[0]
x_14=np.where(df17['DRUGDAYS'].isnull())[0]
x_15=np.where(df17['ALCDRUGS'].isnull())[0]
x_16=np.where(df17['DAYSCOCAINE'].isnull())[0]
x_17=np.where(df17['MARYJDAYS'].isnull())[0]

x_18=np.where(df17['METHDAYS'].isnull())[0]

x_19=np.where(df17['INJECT'].isnull())[0]
x_20=np.where(df17['AGE'].isnull())[0]
x_21=np.where(df17['TOBMONTH'].isnull())[0]

In [None]:
df18=pd.read_csv('/home/sameerahtalafha/new_project/Missing-Experments/Beta-VAE-Imputation/imputed_data_trial_1_VAE.csv')

df18

In [None]:
DAST=[]
for i in range(16910):
    x=df18.iloc[i,0]
#     print(x)
#     print(i)
#     if(x[:1]=='+'):
#         print(x)
#         y=x[5:8]
#         print(y)
#     else:
    y=x[:3]
    
        
    DAST.append(float(y))


In [None]:
SEX=[]
for i in range(16910):
    x=df18.iloc[i,1]
    y=x[:3]
    
        
    SEX.append(float(y))
SEX

In [None]:
RACE=[]
for i in range(16910):
    x=df18.iloc[i,3]
    y=x[:3]
    
        
    RACE.append(float(y))
RACE

In [None]:
VET=[]
for i in range(16910):
    x=df18.iloc[i,4]
    y=x[:3]
    
        
    VET.append(float(y))
VET

In [None]:
AUDIT=[]
for i in range(16910):
    x=df18.iloc[i,7]
    y=x[:3]
    
        
    AUDIT.append(float(y))
AUDIT

In [None]:
COSCREEN=[]
for i in range(16910):
    x=df18.iloc[i,8]
    y=x[:3]
    
        
    COSCREEN.append(float(y))
COSCREEN

In [None]:
ANYALC=[]
for i in range(16910):
    x=df18.iloc[i,12]
    y=x[:3]
    
        
    ANYALC.append(float(y))
ANYALC

In [None]:
COSCREEN=[]
for i in range(16910):
    x=df18.iloc[i,13]
    y=x[:3]
    
        
    COSCREEN.append(float(y))
COSCREEN

In [None]:
COSCREEN=[]
for i in range(16910):
    x=df18.iloc[i,14]
    y=x[:3]
    
        
    COSCREEN.append(float(y))
COSCREEN

In [None]:
df18.iloc[:,0] = np.array(DAST).astype(float).astype(int)
df18.iloc[:,1] = np.array(SEX).astype(float).astype(int)
# df18.iloc[:,2] = df18.iloc[:,2].astype(float).astype(int)
df18.iloc[:,3] = np.array(RACE).astype(float).astype(int)
df18.iloc[:,4] = np.array(VET).astype(float).astype(int)
# df18.iloc[:,5] = df18.iloc[:,5].astype(float).astype(int)
# df18.iloc[:,6] = df18.iloc[:,6].astype(float).astype(int)
df18.iloc[:,7]= np.array(AUDIT).astype(float).astype(int)
df18.iloc[:,8] = np.array(COSCREEN).astype(float).astype(int)
# df18.iloc[:,9] = df18.iloc[:,9].astype(float).astype(int)
# df18.iloc[:,10] = df18.iloc[:,10].astype(float).astype(int)
# df18.iloc[:,11]= df18.iloc[:,11].astype(float).astype(int)
# df18.iloc[:,12]= np.array(ANYALC).astype(float).astype(int)
# df18.iloc[:,13] = np.array(BINGEDAYS).astype(float).astype(int)
# # ###############################################################
# df18.iloc[:,14] = np.array(DRUGDAYS).astype(float).astype(int)
# df18.iloc[:,15] = df18.iloc[:,15].astype(float).astype(int)
# df18.iloc[:,16] = df18.iloc[:,16].astype(float).astype(int)

# df18.iloc[:,17] = df18.iloc[:,17].astype(float).astype(int)
# df18.iloc[:,18]= df18.iloc[:,18].astype(float).astype(int)
# df18.iloc[:,19] = df18.iloc[:,19].astype(float).astype(int)

# df18.iloc[:,20] = df18.iloc[:,20].astype(float).astype(int)
# df18.iloc[:,21]= df18.iloc[:,21].astype(float).astype(int)




In [None]:
true_0=[]
for i in x_0:
        if(df16.iloc[i,0]==df18.iloc[i,0]):
            true_0.append(i)
true_1=[]
for i in x_1:
        if(df16.iloc[i,1]==df18.iloc[i,1]):
            true_1.append(i)
# true_2=[]
# for i in x_2:
#         if(df16.iloc[i,2]==df18.iloc[i,2]):
            true_2.append(i)
true_3=[]
for i in x_3:
        if(df16.iloc[i,3]==df18.iloc[i,3]):
            true_3.append(i)

# true_4=[]
# for i in x_4:
#         if(df16.iloc[i,4]==df18.iloc[i,4]):
#             true_4.append(i)
   
            
# true_5=[]
# for i in x_5:
#         if(df16.iloc[i,5]==df18.iloc[i,5]):
#             true_5.append(i)
    
# true_6=[]
# for i in x_6:
#         if(df16.iloc[i,6]==df18.iloc[i,6]):
#             true_6.append(i)

true_7=[]
for i in x_7:
        if(df16.iloc[i,7]==df18.iloc[i,7]):
            true_7.append(i)
true_8=[]
for i in x_8:
        if(df16.iloc[i,8]==df18.iloc[i,8]):
            true_8.append(i)
# true_9=[]
# for i in x_9:
#         if(df16.iloc[i,9]==df18.iloc[i,9]):
#             true_9.append(i)
# true_10=[]
# for i in x_10:
#         if(df16.iloc[i,10]==df18.iloc[i,10]):
#             true_10.append(i)
            
# true_11=[]
# for i in x_11:
#         if(df16.iloc[i,11]==df18.iloc[i,11]):
#             true_11.append(i)
            
            
# true_12=[]
# for i in x_12:
#         if(df16.iloc[i,12]==df18.iloc[i,12]):
#             true_12.append(i)
            
# true_13=[]
# for i in x_13:
#         if(df16.iloc[i,13]==df18.iloc[i,13]):
#             true_13.append(i)
            
            
# true_14=[]
# for i in x_14:
#         if(df16.iloc[i,14]==df18.iloc[i,14]):
#             true_14.append(i)
            
# true_15=[]
# for i in x_15:
#         if(df16.iloc[i,15]==df18.iloc[i,15]):
#             true_15.append(i)    
            
# true_16=[]
# for i in x_16:
#         if(df16.iloc[i,16]==df18.iloc[i,16]):
#             true_16.append(i)
            
# true_17=[]
# for i in x_17:
#         if(df16.iloc[i,17]==df18.iloc[i,17]):
#             true_17.append(i)
            
# true_18=[]
# for i in x_18:
#         if(df16.iloc[i,18]==df18.iloc[i,18]):
#             true_18.append(i)      
            
            
# true_19=[]
# for i in x_19:
#         if(df16.iloc[i,19]==df18.iloc[i,19]):
#             true_19.append(i)   
            
            
# true_20=[]
# for i in x_20:
#         if(df16.iloc[i,20]==df18.iloc[i,20]):
#             true_20.append(i)   
            
            
# true_21=[]
# for i in x_21:
#         if(df16.iloc[i,21]==df18.iloc[i,21]):
#             true_21.append(i)   
            
            
            

In [None]:
## Accuracy in each column 
print(1-((len(x_0)-len(true_0))/len(x_0)))
print(1-((len(x_1)-len(true_1))/len(x_1)))
# print(1-((len(x_2)-len(true_2))/len(x_2)))
print(1-((len(x_3)-len(true_3))/len(x_3)))
print(1-((len(x_4)-len(true_4))/len(x_4)))
# print(1-((len(x_5)-len(true_5))/len(x_5)))
# print(1-((len(x_6)-len(true_6))/len(x_6)))
print(1-((len(x_7)-len(true_7))/len(x_7)))
print(1-((len(x_8)-len(true_8))/len(x_8)))
# print(1-((len(x_9)-len(true_9))/len(x_9)))
# print(1-((len(x_10)-len(true_10))/len(x_10)))
# print(1-((len(x_11)-len(true_11))/len(x_11)))
# print(1-((len(x_12)-len(true_12))/len(x_12)))
# print(1-((len(x_13)-len(true_13))/len(x_13)))
# print(1-((len(x_14)-len(true_14))/len(x_14)))
# print(1-((len(x_15)-len(true_15))/len(x_15)))
# print(1-((len(x_16)-len(true_16))/len(x_16)))
# print(1-((len(x_17)-len(true_17))/len(x_17)))
# print(1-((len(x_18)-len(true_18))/len(x_18)))
# print(1-((len(x_19)-len(true_19))/len(x_19)))
# print(1-((len(x_20)-len(true_20))/len(x_20)))
# print(1-((len(x_21)-len(true_21))/len(x_21)))


### Generative Adversarial Imputation Networks (GAIN)

In [None]:
df19=pd.read_csv('/home/sameerahtalafha/new_project/Missing-Experments/GAIN/data/original_data.csv')
df19.isnull().sum()

In [None]:
df19.iloc[:,0] = df19.iloc[:,0].astype(float).astype(int)
df19.iloc[:,1] = df19.iloc[:,1].astype(float).astype(int)
df19.iloc[:,2] = df19.iloc[:,2].astype(float).astype(int)
df19.iloc[:,3] = df19.iloc[:,3].astype(float).astype(int)
df19.iloc[:,4] = df19.iloc[:,4].astype(float).astype(int)
df19.iloc[:,5] = df19.iloc[:,5].astype(float).astype(int)
df19.iloc[:,6] = df19.iloc[:,6].astype(float).astype(int)
df19.iloc[:,7]= df19.iloc[:,7].astype(float).astype(int)
df19.iloc[:,8] = df19.iloc[:,8].astype(float).astype(int)
df19.iloc[:,9] = df19.iloc[:,9].astype(float).astype(int)
df19.iloc[:,10] = df19.iloc[:,10].astype(float).astype(int)
df19.iloc[:,11]= df19.iloc[:,11].astype(float).astype(int)
df19.iloc[:,12]= df19.iloc[:,12].astype(float).astype(int)
df19.iloc[:,13] = df19.iloc[:,13].astype(float).astype(int)
###############################################################
df19.iloc[:,14] = df19.iloc[:,14].astype(float).astype(int)
df19.iloc[:,15] = df19.iloc[:,15].astype(float).astype(int)
df19.iloc[:,16] = df19.iloc[:,16].astype(float).astype(int)

df19.iloc[:,17] = df19.iloc[:,17].astype(float).astype(int)
df19.iloc[:,18]= df19.iloc[:,18].astype(float).astype(int)
df19.iloc[:,19] = df19.iloc[:,19].astype(float).astype(int)

df19.iloc[:,20] = df19.iloc[:,20].astype(float).astype(int)
df19.iloc[:,21]= df19.iloc[:,21].astype(float).astype(int)
df19

In [None]:
df20=pd.read_csv('/home/sameerahtalafha/new_project/Missing-Experments/GAIN/data/missing_data.csv')
df20.isnull().sum()

In [None]:
x_0=np.where(df20['DAST'].isnull())[0]
x_1=np.where(df20['SEX'].isnull())[0]
x_2=np.where(df20['HISPANIC'].isnull())[0]
x_3=np.where(df20['RACE'].isnull())[0]
x_4=np.where(df20['VET'].isnull())[0]
x_5=np.where(df20['ACTIVE'].isnull())[0]
x_6=np.where(df20['DEPLOY'].isnull())[0]
x_7=np.where(df20['AUDIT'].isnull())[0]
x_8=np.where(df20['COSCREEN'].isnull())[0]
x_9=np.where(df20['BI'].isnull())[0]
x_10=np.where(df20['BT'].isnull())[0]
x_11=np.where(df20['RT'].isnull())[0]
x_12=np.where(df20['ANYALC'].isnull())[0]
x_13=np.where(df20['BINGEDAYS'].isnull())[0]
x_14=np.where(df20['DRUGDAYS'].isnull())[0]
x_15=np.where(df20['ALCDRUGS'].isnull())[0]
x_16=np.where(df20['DAYSCOCAINE'].isnull())[0]
x_17=np.where(df20['MARYJDAYS'].isnull())[0]
x_18=np.where(df20['METHDAYS'].isnull())[0]
x_19=np.where(df20['INJECT'].isnull())[0]
x_20=np.where(df20['AGE'].isnull())[0]
x_21=np.where(df20['TOBMONTH'].isnull())[0]

In [None]:
df21=pd.read_csv('/home/sameerahtalafha/new_project/Missing-Experments/GAIN/data/imputed_data.csv')
df21.isnull().sum()

In [None]:
df21.iloc[:,0] = df21.iloc[:,0].astype(float).astype(int)
df21.iloc[:,1] = df21.iloc[:,1].astype(float).astype(int)
df21.iloc[:,2] = df21.iloc[:,2].astype(float).astype(int)
df21.iloc[:,3] = df21.iloc[:,3].astype(float).astype(int)
df21.iloc[:,4] = df21.iloc[:,4].astype(float).astype(int)
df21.iloc[:,5] = df21.iloc[:,5].astype(float).astype(int)
df21.iloc[:,6] = df21.iloc[:,6].astype(float).astype(int)
df21.iloc[:,7]= df21.iloc[:,7].astype(float).astype(int)
df21.iloc[:,8] = df21.iloc[:,8].astype(float).astype(int)
df21.iloc[:,9] = df21.iloc[:,9].astype(float).astype(int)
df21.iloc[:,10] = df21.iloc[:,10].astype(float).astype(int)
df21.iloc[:,11]= df21.iloc[:,11].astype(float).astype(int)
df21.iloc[:,12]= df21.iloc[:,12].astype(float).astype(int)
df21.iloc[:,13] = df21.iloc[:,13].astype(float).astype(int)
###############################################################
df21.iloc[:,14] = df21.iloc[:,14].astype(float).astype(int)
# df21.iloc[:,15] = df21.iloc[:,15].astype(float).astype(int)
# df21.iloc[:,16] = df21.iloc[:,16].astype(float).astype(int)

# df21.iloc[:,17] = df21.iloc[:,17].astype(float).astype(int)
# df21.iloc[:,18]= df21.iloc[:,18].astype(float).astype(int)
# df21.iloc[:,19] = df21.iloc[:,19].astype(float).astype(int)

# df21.iloc[:,20] = df21.iloc[:,20].astype(float).astype(int)
# df21.iloc[:,21]= df21.iloc[:,21].astype(float).astype(int)


In [None]:
true_0=[]
for i in x_0:
        if(df19.iloc[i,0]==df21.iloc[i,0]):
            true_0.append(i)
true_1=[]
for i in x_1:
        if(df19.iloc[i,1]==df21.iloc[i,1]):
            true_1.append(i)
true_2=[]
for i in x_2:
        if(df19.iloc[i,2]==df21.iloc[i,2]):
            true_2.append(i)
true_3=[]
for i in x_3:
        if(df19.iloc[i,3]==df21.iloc[i,3]):
            true_3.append(i)

true_4=[]
for i in x_4:
        if(df19.iloc[i,4]==df21.iloc[i,4]):
            true_4.append(i)
   
            
true_5=[]
for i in x_5:
        if(df19.iloc[i,5]==df21.iloc[i,5]):
            true_5.append(i)
    
true_6=[]
for i in x_6:
        if(df19.iloc[i,6]==df21.iloc[i,6]):
            true_6.append(i)

true_7=[]
for i in x_7:
        if(df19.iloc[i,7]==df21.iloc[i,7]):
            true_7.append(i)
true_8=[]
for i in x_8:
        if(df19.iloc[i,8]==df21.iloc[i,8]):
            true_8.append(i)
true_9=[]
for i in x_9:
        if(df19.iloc[i,9]==df21.iloc[i,9]):
            true_9.append(i)
true_10=[]
for i in x_10:
        if(df19.iloc[i,10]==df21.iloc[i,10]):
            true_10.append(i)
            
true_11=[]
for i in x_11:
        if(df19.iloc[i,11]==df21.iloc[i,11]):
            true_11.append(i)
            
            
true_12=[]
for i in x_12:
        if(df19.iloc[i,12]==df21.iloc[i,12]):
            true_12.append(i)
            
true_13=[]
for i in x_13:
        if(df19.iloc[i,13]==df21.iloc[i,13]):
            true_13.append(i)
            
            
true_14=[]
for i in x_14:
        if(df19.iloc[i,14]==df21.iloc[i,14]):
            true_14.append(i)
            
true_15=[]
for i in x_15:
        if(df19.iloc[i,15]==df21.iloc[i,15]):
            true_15.append(i)    
            
# true_16=[]
# for i in x_16:
#         if(df19.iloc[i,16]==df21.iloc[i,16]):
#             true_16.append(i)
            
# true_17=[]
# for i in x_17:
#         if(df19.iloc[i,17]==df21.iloc[i,17]):
#             true_17.append(i)
            
# true_18=[]
# for i in x_18:
#         if(df19.iloc[i,18]==df21.iloc[i,18]):
#             true_18.append(i)      
            
            
# true_19=[]
# for i in x_19:
#         if(df19.iloc[i,19]==df21.iloc[i,19]):
#             true_19.append(i)   
            
            
# true_20=[]
# for i in x_20:
#         if(df19.iloc[i,20]==df21.iloc[i,20]):
#             true_20.append(i)   
            
            
# true_21=[]
# for i in x_21:
#         if(df19.iloc[i,21]==df21.iloc[i,21]):
#             true_21.append(i)   
        
            

In [None]:
## Accuracy in each column 
print(1-((len(x_0)-len(true_0))/len(x_0)))
print(1-((len(x_1)-len(true_1))/len(x_1)))
print(1-((len(x_2)-len(true_2))/len(x_2)))
print(1-((len(x_3)-len(true_3))/len(x_3)))
print(1-((len(x_4)-len(true_4))/len(x_4)))
print(1-((len(x_5)-len(true_5))/len(x_5)))
print(1-((len(x_6)-len(true_6))/len(x_6)))
print(1-((len(x_7)-len(true_7))/len(x_7)))
print(1-((len(x_8)-len(true_8))/len(x_8)))
print(1-((len(x_9)-len(true_9))/len(x_9)))
print(1-((len(x_10)-len(true_10))/len(x_10)))
print(1-((len(x_11)-len(true_11))/len(x_11)))
print(1-((len(x_12)-len(true_12))/len(x_12)))
print(1-((len(x_13)-len(true_13))/len(x_13)))
print(1-((len(x_14)-len(true_14))/len(x_14)))
print(1-((len(x_15)-len(true_15))/len(x_15)))
# print(1-((len(x_16)-len(true_16))/len(x_16)))
# print(1-((len(x_17)-len(true_17))/len(x_17)))
# print(1-((len(x_18)-len(true_18))/len(x_18)))
# print(1-((len(x_19)-len(true_19))/len(x_19)))
# print(1-((len(x_20)-len(true_20))/len(x_20)))
# print(1-((len(x_21)-len(true_21))/len(x_21)))


In [None]:
df19.columns

In [None]:
df21['AGE']

In [None]:
df20['AGE']

In [None]:
x_0=np.where(df20['AGE'].isnull())[0]
true_0=[]
for i in x_0:
    if(df19.iloc[i,0]==df21.iloc[i,0]):
        true_0.append(i)
len(true_0)

In [None]:
df21.iloc[:,0]

In [None]:
x_0=np.where(df20['AGE'].isnull())[0]
len(x_0)

In [None]:
473/2836

In [None]:
x_0