In [None]:
# Mount Google Drive (Colab only) so files under /content/gdrive can be read and written.
from google.colab import drive
drive.mount("/content/gdrive")
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the raw CSV from the mounted Drive folder into the working DataFrame.
df = pd.read_csv("/content/gdrive/MyDrive/diabetic_data.csv")

In [None]:
# (rows, columns) of the raw dataset.
df.shape

(101766, 50)

# 1- Seleccion de datos


## Borrar columnas que contienen únicamente una sola clase

In [None]:
# Remove every column whose non-null values all belong to a single class.
# Iterate over a snapshot of the column labels because columns are deleted inside the loop.
for nombre_col in list(df.columns):
  n_clases=len(df[nombre_col].value_counts())
  if n_clases!=1:
    continue
  print("Columna ",nombre_col,"eliminada.")
  del df[nombre_col]


Columna  examide eliminada.
Columna  citoglipton eliminada.


## Se borran las columnas weight y payer code

In [None]:
 # Remove the weight and payer_code columns in a single call.
 df = df.drop(columns=["weight", "payer_code"])

## Borrar columnas en las que aparece en más del 95% de registros una misma clase

Se puede ver que todas las columnas son las relacionadas con el cambio de medicinas

In [None]:
# Collect the object-dtype columns that are dominated by a single class.
categorical_columns = df.select_dtypes(include=[object]).columns
drop_mayority95_cols=[]
for col in categorical_columns:
  v_count=df[col].value_counts()
  if v_count.empty:
    # Column with no non-null values: nothing to compare.
    continue
  # value_counts() is sorted in descending order and indexed by class label,
  # so the two most frequent classes must be read by position with .iloc.
  mas_frecuente=v_count.iloc[0]
  segunda=v_count.iloc[1] if len(v_count)>1 else 0
  # NOTE(review): the share is computed against the two most frequent classes only,
  # not against all records; kept as-is so the list of dropped columns does not change.
  porcentaje=100*mas_frecuente/(mas_frecuente+segunda)
  if(porcentaje>95):
    drop_mayority95_cols.append(col)

print(drop_mayority95_cols)

['max_glu_serum', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'tolbutamide', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']


Se define una función para que elimine todas las columnas en las que aparece en más del 95% de registros una misma clase

In [None]:
def borrar_columnas(df, columnas=None):
  """Return a copy of ``df`` without the given columns.

  Args:
    df: DataFrame to drop columns from; it is not modified.
    columnas: labels of the columns to drop. When omitted, the notebook-level
      list ``drop_mayority95_cols`` is used, which is the original behaviour.

  Returns:
    A new DataFrame without the requested columns.

  Raises:
    KeyError: if any requested column is not present in ``df``.
  """
  if columnas is None:
    # Fall back to the list built earlier in the notebook.
    columnas=drop_mayority95_cols
  return df.drop(columns=columnas)

## Se borran encounter ID y patient number porque estos atributos no afectan en los reingresos.

In [None]:
# Drop the encounter and patient identifier columns.
columnas_id=["encounter_id","patient_nbr"]
df = df.drop(columns=columnas_id)

# 2-Limpieza de datos

## Valores faltantes

In [None]:
# Columns in which at least one value is the placeholder '?'.
NanColumns=df.columns[(df == '?').mean()>0]
for col in NanColumns:
  # Count the placeholder directly instead of scanning value_counts() by position.
  n_faltantes=(df[col]=="?").sum()
  print(col,"?",n_faltantes)

race ? 2273
medical_specialty ? 49949
diag_1 ? 21
diag_2 ? 358
diag_3 ? 1423


Rellenar los valores faltantes de la columna race y los valores unknown de gender con la moda 

In [None]:
# Replace the invalid gender label and the '?' race placeholder with each column's mode.
moda_gender=df["gender"].mode()[0]
moda_race=df["race"].mode()[0]
df["gender"]=df["gender"].replace(['Unknown/Invalid'], moda_gender)
df["race"]=df["race"].replace(['?'], moda_race)

Borramos los registros que tienen tres diagnosticos faltantes

In [None]:
# Rows where diag_1, diag_2 and diag_3 are all the '?' placeholder.
sin_diagnostico=(df["diag_1"]=="?") & (df["diag_2"]=="?") & (df["diag_3"]=="?")
indices_a_borrar=df[sin_diagnostico].index
df=df.drop(indices_a_borrar)

En los demás, reemplazar ? con Unknown para que sea más comprensible

In [None]:
# Recompute which columns still contain the '?' placeholder and relabel it as "Unknown".
tiene_faltantes=(df == '?').mean()>0
NanColumns=df.columns[tiene_faltantes]
for col in NanColumns:
  print("Los valores faltantes de la columna",col,"han sido rellenados con Unknown")
  sin_placeholder=df[col].replace(['?'], "Unknown")
  df[col]=sin_placeholder

Los valores faltantes de la columna medical_specialty han sido rellenados con Unknown
Los valores faltantes de la columna diag_1 han sido rellenados con Unknown
Los valores faltantes de la columna diag_2 han sido rellenados con Unknown
Los valores faltantes de la columna diag_3 han sido rellenados con Unknown


## Borrar encuentros que han acabado en Hospice o ha fallecido el paciente

In [None]:
# Keep only encounters whose discharge id is not in the excluded set:
# 13, 14, 19, 20, 21 (discharge to hospice, per the original note) and 11 (patient died).
ids_excluidos=[11,13,14,19,20,21]
df=df.loc[~df["discharge_disposition_id"].isin(ids_excluidos)]

# 3- Construcción de datos

### Construccion columna numero de visitas a centros hospitalarios




In [None]:
# Total prior visits: emergency + inpatient + outpatient.
df["number_hospital"]=df["number_emergency"].add(df["number_inpatient"]).add(df["number_outpatient"])


### Construccion de columna número de procedimientos totales

In [None]:
# Total procedures: lab procedures plus other procedures.
df["total_procedures"]=df["num_lab_procedures"].add(df["num_procedures"])

### Construcción de columna número de cambios de medicinas

In [None]:
# Medication columns scanned when building the dose-change features below.
medications_change=[
  "repaglinide","nateglinide","chlorpropamide","glimepiride","acetohexamide",
  "tolbutamide","acarbose","miglitol","tolazamide",
  "glyburide-metformin","glipizide-metformin","glimepiride-pioglitazone",
  "metformin-rosiglitazone","metformin-pioglitazone","troglitazone",
  "insulin","pioglitazone","metformin","glipizide","glyburide","rosiglitazone",
]

In [None]:
# Per encounter, count the medications whose dose changed ("Up" or "Down").
# Vectorised replacement for the nested Python loops; the result is aligned on df's index.
dosis_changes=df[medications_change].isin(["Up","Down"]).sum(axis=1)
df["dosis_changes"]=dosis_changes

In [None]:
# Frequency of each number of dose changes per encounter.
df["dosis_changes"].value_counts()

0    72323
1    25609
2     1299
3      107
4        5
Name: dosis_changes, dtype: int64

### Construccion de columna numero de medicinas que toma


In [None]:
# Per encounter, count the medications whose value is anything other than "No".
# Vectorised replacement for the nested Python loops; the result is aligned on df's index.
num_medicines=df[medications_change].ne("No").sum(axis=1)
df["num_medicines"]=num_medicines

In [None]:
# Frequency of each number of medications per encounter.
df["num_medicines"].value_counts()

1    46030
0    22624
2    21595
3     7707
4     1324
5       58
6        5
Name: num_medicines, dtype: int64

### Construcción de columna que indica si se aumentó o disminuyó la dosis de los medicamentos

In [None]:
# Net dose direction per encounter: +1 for every "Up" and -1 for every "Down".
# Vectorised replacement for the nested Python loops; the result is aligned on df's index.
subidas=df[medications_change].eq("Up").sum(axis=1)
bajadas=df[medications_change].eq("Down").sum(axis=1)
dosis_plus_minus=subidas-bajadas
df["dosis_plus_minus"]=dosis_plus_minus

In [None]:
# Frequency of each net dose-direction value.
df["dosis_plus_minus"].value_counts()

 0    72890
-1    12921
 1    12760
 2      501
-2      236
 3       30
-3        5
Name: dosis_plus_minus, dtype: int64

### Se borran las columnas de cambios de medicamentos en las que una misma clase aparece en más del 95% de los registros

In [None]:
# Drop the majority-class columns collected earlier in drop_mayority95_cols;
# done only now because the features built above still needed those medication columns.
df=borrar_columnas(df)

# 4-Formateo de datos

## Tratamiento de los diagnosticos

In [None]:
 def formatear_icd9(col):
   """Map raw ICD-9 diagnosis codes to coarse diagnosis groups.

   Args:
     col: iterable of code strings (the notebook passes a DataFrame column).

   Returns:
     A NumPy array with one entry per input value: "E–V" for codes starting
     with E or V (any case), "Unknown" for the literal "Unknown", and the group
     label of the numeric range containing the code otherwise. A numeric code
     outside every range is reported with print() and kept as its float value.

   Raises:
     ValueError: if a value is neither E/V-prefixed, "Unknown", nor numeric.
   """
   # Half-open [low, high) numeric ranges and the group each one maps to.
   rangos=(
     (1,140,"Infectious"),
     (140,240,"Neoplasm"),
     (240,250,"nutritional"),
     (250,251,"Diabetes"),
     (251,280,"nutritional"),
     (280,290,"blood"),
     (290,320,"Mental"),
     (320,360,"nervous_system"),
     (360,390,"sense_organs"),
     (390,460,"Circulatory"),
     (460,520,"Respiratory"),
     (520,580,"Digestive"),
     (580,630,"Genitourinary"),
     (630,680,"pregnancy"),
     (680,710,"skin"),
     (710,740,"Musculoskeletal"),
     (740,760,"Congenital"),
     (780,782,"Other_symptoms"),
     (782,783,"skin"),
     (783,784,"nutritional"),
     (784,785,"Other_symptoms"),
     (785,786,"Circulatory"),
     (786,787,"Respiratory"),
     (787,788,"Digestive"),
     (788,789,"Genitourinary"),
     (789,790,"pelvis_abdomen"),
     (790,800,"Other_symptoms"),
     (800,1000,"Injury"),
   )
   resultado=[]
   for crudo in col:
     # The E/V prefix is checked before the "Unknown" literal.
     if crudo[0].capitalize() in ("E","V"):
       resultado.append("E–V")
       continue
     if crudo=="Unknown":
       resultado.append("Unknown")
       continue
     codigo=float(crudo)
     etiqueta=next((nombre for bajo,alto,nombre in rangos if bajo<=codigo<alto),None)
     if etiqueta is None:
       # No range matched: report it and keep the numeric value itself.
       print(codigo,"not included")
       etiqueta=codigo
     resultado.append(etiqueta)
   return np.array(resultado)

In [None]:
def formatear_minorias_icd9(col):
  """Relabel rare categories of a column as "Other".

  A category is rare when it accounts for less than 3% of the non-null values.

  Args:
    col: pandas Series of category labels.

  Returns:
    A list with the values of ``col`` in order, rare labels replaced by "Other".
  """
  v_counts=col.value_counts()
  total=v_counts.sum()
  # Iterate over (label, count) pairs: the counts Series is indexed by label,
  # so integer subscripts on it are not reliable positional access.
  minorias={etiqueta for etiqueta,cuenta in v_counts.items() if (100*cuenta)/total<3}
  return ["Other" if val in minorias else val for val in col]

In [None]:
# For each diagnosis column: group the ICD-9 codes, then fold rare groups into "Other".
for diag_col in ("diag_1","diag_2","diag_3"):
  df[diag_col] = formatear_icd9(df[diag_col])
  df[diag_col] = formatear_minorias_icd9(df[diag_col])

Distribución de diag_1

In [None]:
# Percentage of rows in each diag_1 group, most frequent first.
v_c=df["diag_1"].value_counts()
# Iterate over (label, count) pairs instead of subscripting the label-indexed Series with integers.
for etiqueta,cuenta in v_c.items():
  valor="{:.2f}".format(cuenta*100/df.shape[0])
  print(etiqueta,":",valor)

Circulatory : 29.88
Other : 17.93
Respiratory : 14.03
Digestive : 9.39
Diabetes : 8.72
Injury : 6.90
Genitourinary : 5.04
Musculoskeletal : 4.97
Neoplasm : 3.15


Distribución de diag_2

In [None]:
# Percentage of rows in each diag_2 group, most frequent first.
v_c=df["diag_2"].value_counts()
# Iterate over (label, count) pairs instead of subscripting the label-indexed Series with integers.
for etiqueta,cuenta in v_c.items():
  valor="{:.2f}".format(cuenta*100/df.shape[0])
  print(etiqueta,":",valor)

Circulatory : 31.36
Other : 21.32
Diabetes : 12.79
Respiratory : 10.46
Genitourinary : 8.20
nutritional : 8.12
Digestive : 4.12
skin : 3.63


Distribución de diag_3

In [None]:
# Percentage of rows in each diag_3 group, most frequent first.
v_c=df["diag_3"].value_counts()
# Iterate over (label, count) pairs instead of subscripting the label-indexed Series with integers.
for etiqueta,cuenta in v_c.items():
  valor="{:.2f}".format(cuenta*100/df.shape[0])
  print(etiqueta,":",valor)

Circulatory : 29.80
Other : 18.52
Diabetes : 17.09
nutritional : 9.01
Respiratory : 7.05
Genitourinary : 6.48
E–V : 5.05
Digestive : 3.88
Mental : 3.12


## Tratamiento de medical_specialty

Se clasifican como Other las clases que representan menos del 1% de los registros

In [None]:
# Percentage of rows per medical specialty, most frequent first.
v_count=df["medical_specialty"].value_counts()
# Iterate over (label, count) pairs instead of subscripting the label-indexed Series with integers.
for especialidad,cuenta in v_count.items():
  print(especialidad," : ",cuenta*100/df.shape[0])
    



Unknown  :  48.937519503135604
InternalMedicine  :  14.33115569290237
Emergency/Trauma  :  7.468065188287046
Family/GeneralPractice  :  7.299960742075435
Cardiology  :  5.3139124044975485
Surgery-General  :  3.079230544678538
Nephrology  :  1.5491781001177738
Orthopedics  :  1.401205922913542
Orthopedics-Reconstructive  :  1.2381345439537763
Radiologist  :  1.1284136778635636
Pulmonology  :  0.859647886615061
Psychiatry  :  0.8586412731646921
Urology  :  0.686510373151606
ObstetricsandGynecology  :  0.67342439829681
Surgery-Cardiovascular/Thoracic  :  0.6462458351368491
Gastroenterology  :  0.541558036298481
Surgery-Vascular  :  0.528472061443685
Surgery-Neuro  :  0.4650554140704428
PhysicalMedicineandRehabilitation  :  0.3935858590942492
Oncology  :  0.3211096906676867
Pediatrics  :  0.25467320294333773
Neurology  :  0.20232930352415368
Hematology/Oncology  :  0.18823671521898874
Pediatrics-Endocrinology  :  0.1600515386086589
Otolaryngology  :  0.1258266812961155
Endocrinology  :  0.

In [None]:
def formatear_especialidad(col):
  """Relabel infrequent classes of a column as "Other".

  A class is infrequent when it appears in less than 1% of the rows of ``col``.

  Args:
    col: pandas Series of class labels.

  Returns:
    A list with the values of ``col`` in order, infrequent labels replaced by "Other".
  """
  v_count=col.value_counts()
  # Use the column's own length rather than the notebook-global DataFrame,
  # so the function works on any Series passed in.
  total=len(col)
  lista_other={etiqueta for etiqueta,cuenta in v_count.items() if cuenta*100/total<1}
  return ["Other" if val in lista_other else val for val in col]

In [None]:
# Report the number of distinct specialties before and after folding rare ones into "Other".
n_antes=len(df["medical_specialty"].value_counts())
print(n_antes)
df["medical_specialty"]=formatear_especialidad(df["medical_specialty"])
n_despues=len(df["medical_specialty"].value_counts())
print(n_despues)

73
11


In [None]:
# Percentage of rows per medical specialty after grouping, most frequent first.
v_c=df["medical_specialty"].value_counts()
# Iterate over (label, count) pairs instead of subscripting the label-indexed Series with integers.
for etiqueta,cuenta in v_c.items():
  valor="{:.2f}".format(cuenta*100/df.shape[0])
  print(etiqueta,":",valor)

Unknown : 48.94
InternalMedicine : 14.33
Other : 8.25
Emergency/Trauma : 7.47
Family/GeneralPractice : 7.30
Cardiology : 5.31
Surgery-General : 3.08
Nephrology : 1.55
Orthopedics : 1.40
Orthopedics-Reconstructive : 1.24
Radiologist : 1.13


## Formatear las features admission type, discharge disposition, admission source, A1Cresult y los cambios de medicina

a. formateo de admission_type_id

In [None]:
# Merge admission type ids into fewer groups: {2, 7} -> 1 and {6, 8} -> 5.
print("numero de categorias antes de ser formateada:" ,len(df['admission_type_id'].value_counts()))
grupos_tipo={1:[2,7],5:[6,8]}
mapa_tipo={origen:destino for destino,origenes in grupos_tipo.items() for origen in origenes}
df['admission_type_id'] = df['admission_type_id'].replace(mapa_tipo)
print("numero de categorias despues de ser formateada:" ,len(df['admission_type_id'].value_counts()))


numero de categorias antes de ser formateada: 8
numero de categorias despues de ser formateada: 4


b. formateo de admission_source_id

In [None]:
# Merge admission source ids into fewer groups (target id -> ids folded into it).
print("numero de categorias antes de ser formateada:" ,len(df['admission_source_id'].value_counts()))
grupos_origen={
  1:[2,3],
  2:[4,5,6,10,22,25],
  3:[9,15,17,20,21],
  11:[13,14],
}
# All ids are looked up against the original values, so an id that is both a source
# and a target (2, 3) is not remapped twice.
mapa_origen={origen:destino for destino,origenes in grupos_origen.items() for origen in origenes}
df['admission_source_id'] = df['admission_source_id'].replace(mapa_origen)
print("numero de categorias despues de ser formateada:" ,len(df['admission_source_id'].value_counts()))


numero de categorias antes de ser formateada: 17
numero de categorias despues de ser formateada: 6


c. formateo de discharge_disposition_id

In [None]:
# Merge discharge disposition ids into fewer groups (target id -> ids folded into it).
# The stray statement that rewrote admission_type_id in this cell was removed:
# it touched a different column.
print("numero de categorias antes de ser formateada:" ,len(df['discharge_disposition_id'].value_counts()))
grupos_alta={
  1:[6,8,9],
  2:[3,4,5,22,23,24],
  3:[12,15,16,17],
  4:[18,25,26],
}
# All ids are looked up against the original values, so an id that is both a source
# and a target (3, 4) is not remapped twice.
mapa_alta={origen:destino for destino,origenes in grupos_alta.items() for origen in origenes}
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(mapa_alta)
print("numero de categorias despues de ser formateada:" ,len(df['discharge_disposition_id'].value_counts()))


numero de categorias antes de ser formateada: 21
numero de categorias despues de ser formateada: 8


d. formateo de A1Cresult

In [None]:
# Encode the A1C result: '>7' and '>8' -> 1, 'Norm' -> 0, 'None' -> -1.
codigos_a1c={'>7': 1, '>8': 1, 'Norm': 0, 'None': -1}
df['A1Cresult'] = df['A1Cresult'].replace(codigos_a1c)


In [None]:
# List the columns that are still object-dtype at this point.
print(df.select_dtypes(include=[object]).columns)

Index(['race', 'gender', 'age', 'medical_specialty', 'diag_1', 'diag_2',
       'diag_3', 'metformin', 'glipizide', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'insulin', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')


e. Formateo de los cambios de medicina

In [None]:
# Encode each remaining medication column as a flag:
# 'No' -> 0 and any of 'Up', 'Down', 'Steady' -> 1.
# 'rosiglitazone' was listed twice; the duplicate entry was removed.
medicine_cols=['metformin', 'glipizide', 'glyburide', 'rosiglitazone','insulin','pioglitazone']
codigos_medicina={'No': 0, 'Up': 1, 'Down': 1, 'Steady': 1}
for col in medicine_cols:
    df[col] = df[col].replace(codigos_medicina)

## Cambio de tipo de variable

Cambio de variable age de str a int

In [None]:
# Frequency of each age bracket before converting it to a number.
df["age"].value_counts()

[70-80)     25331
[60-70)     22059
[50-60)     17060
[80-90)     16434
[40-50)      9607
[30-40)      3764
[90-100)     2589
[20-30)      1649
[10-20)       690
[0-10)        160
Name: age, dtype: int64

In [None]:
def cambiar_age(column):
  """Convert age brackets such as "[40-50)" to their upper bound as an integer.

  Args:
    column: iterable of bracket strings from "[0-10)" to "[90-100)".

  Returns:
    A list with the upper bound (10, 20, ..., 100) of every recognised
    bracket, in input order. An unrecognised value is reported with print()
    and contributes no element, so the result can be shorter than the input.
  """
  # "[0-10)" -> 10, "[10-20)" -> 20, ..., "[90-100)" -> 100
  limite_superior={"[{}-{})".format(inicio,inicio+10):inicio+10 for inicio in range(0,100,10)}
  lista_final=[]
  for val in column:
    if val in limite_superior:
      lista_final.append(limite_superior[val])
    else:
      print("ELSE ",val)
  return lista_final

In [None]:
# Replace the age brackets by the integers returned by cambiar_age.
df["age"]=cambiar_age(df["age"])


Se cambia el tipo de "admission_type_id","discharge_disposition_id" y "admission_source_id" a categorica

In [None]:
# Cast the three id columns to object dtype so the later steps treat them as categorical.
columns=["admission_type_id","discharge_disposition_id", "admission_source_id"]
df[columns]=df[columns].astype(object)

## Escalar los datos y convertir las variables categoricas a enteros

In [None]:
# Display the DataFrame as it stands before scaling and encoding.
df

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted,number_hospital,total_procedures,dosis_changes,num_medicines,dosis_plus_minus
0,Caucasian,Female,10,5,4,1,1,Other,41,0,1,0,0,0,Diabetes,Other,Other,1,-1,0,0,0,0,0,0,No,No,NO,0,41,0,0,0
1,Caucasian,Female,20,1,1,7,3,Unknown,59,0,18,0,0,0,Other,Diabetes,nutritional,9,-1,0,0,0,0,0,1,Ch,Yes,>30,0,59,1,1,1
2,AfricanAmerican,Female,30,1,1,7,2,Unknown,11,5,13,2,0,1,Other,Diabetes,E–V,6,-1,0,1,0,0,0,0,No,Yes,NO,3,16,0,1,0
3,Caucasian,Male,40,1,1,7,2,Unknown,44,1,16,0,0,0,Other,Diabetes,Circulatory,7,-1,0,0,0,0,0,1,Ch,Yes,NO,0,45,1,1,1
4,Caucasian,Male,50,1,1,7,1,Unknown,51,0,8,0,0,0,Neoplasm,Other,Diabetes,5,-1,0,1,0,0,0,1,Ch,Yes,NO,0,51,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,80,1,2,7,3,Unknown,51,0,16,0,0,0,Diabetes,Other,Circulatory,9,1,1,0,0,0,0,1,Ch,Yes,>30,0,51,1,2,-1
101762,AfricanAmerican,Female,90,1,2,2,5,Unknown,33,3,18,0,0,1,Digestive,nutritional,Digestive,9,-1,0,0,0,0,0,1,No,Yes,NO,1,36,0,1,0
101763,Caucasian,Male,80,1,1,7,1,Unknown,53,0,9,1,0,0,Other,Genitourinary,Mental,13,-1,1,0,0,0,0,1,Ch,Yes,NO,1,53,1,2,-1
101764,Caucasian,Female,90,1,2,7,10,Surgery-General,45,2,21,0,0,1,Injury,Other,Other,9,-1,0,1,0,1,0,1,Ch,Yes,NO,1,47,1,3,1


numero total de variables numericas y categoricas

In [None]:
# Categorical columns: every object-dtype column plus the already 0/1-encoded medication flags.
# Numerical columns: every other column (Index.difference returns them sorted).
medicine_cols=['metformin', 'glipizide', 'glyburide', 'rosiglitazone','insulin','pioglitazone']
columnas_objeto=df.select_dtypes(include=[object]).columns.tolist()
categorical_columns = columnas_objeto+medicine_cols
numerical_columns=df.columns.difference(categorical_columns)

print("Categoricas:",len(categorical_columns))
print(categorical_columns)
print()
print("Numericas:",len(numerical_columns))
print(numerical_columns.tolist())



Categoricas: 18
['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'change', 'diabetesMed', 'readmitted', 'metformin', 'glipizide', 'glyburide', 'rosiglitazone', 'insulin', 'pioglitazone']

Numericas: 15
['A1Cresult', 'age', 'dosis_changes', 'dosis_plus_minus', 'num_lab_procedures', 'num_medications', 'num_medicines', 'num_procedures', 'number_diagnoses', 'number_emergency', 'number_hospital', 'number_inpatient', 'number_outpatient', 'time_in_hospital', 'total_procedures']


In [None]:
# Column labels before scaling and one-hot encoding.
df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'A1Cresult', 'metformin', 'glipizide', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'insulin', 'change', 'diabetesMed', 'readmitted',
       'number_hospital', 'total_procedures', 'dosis_changes', 'num_medicines',
       'dosis_plus_minus'],
      dtype='object')

a. Escalado de los datos numericos

In [None]:
# Min-max scale every numerical column: (x - min) / (max - min).
valores_numericos=df[numerical_columns]
minimos=valores_numericos.min()
maximos=valores_numericos.max()
df[numerical_columns]=(valores_numericos-minimos)/(maximos-minimos)

b.1 Las columnas que contienen solo dos clases se formatean de la siguiente manera:


*   Se establece 0 para una clase
*   Se establece 1 para la otra clase



In [None]:
# Show the class counts of every object-dtype categorical column with exactly two classes.
for col in categorical_columns:
  v_count=df[col].value_counts()
  es_binaria=len(v_count)==2
  es_objeto=df[col].dtypes=='O'
  if es_binaria and es_objeto:
    print(col," : ",v_count)
    print()

gender  :  Female    53457
Male      45886
Name: gender, dtype: int64

change  :  No    53221
Ch    46122
Name: change, dtype: int64

diabetesMed  :  Yes    76719
No     22624
Name: diabetesMed, dtype: int64



In [None]:
# Encode the two-class columns as 0/1.
df["gender"] = df["gender"].replace({"Female": 0, "Male": 1})
df["diabetesMed"] = df["diabetesMed"].replace({"No": 0, "Yes": 1})
# The classes of `change` are "No" and "Ch" (there is no "Yes"), so "Ch" is the
# label that must map to 1; otherwise the column stays mixed and gets one-hot encoded later.
df["change"] = df["change"].replace({"No": 0, "Ch": 1})

b.2 Para las demas columnas se crea una columna nueva para cada clase

In [None]:
# One-hot encode every remaining object-dtype column except the target `readmitted`.
categorical_columns = df.select_dtypes(include=[object]).columns
columnas_a_codificar=[nombre for nombre in categorical_columns if nombre!="readmitted"]
df=pd.get_dummies(df,columns=columnas_a_codificar,prefix=columnas_a_codificar)

In [None]:
# (rows, columns) after one-hot encoding.
df.shape

(99343, 86)

In [None]:
# Display the scaled and encoded DataFrame.
df

Unnamed: 0,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,diabetesMed,readmitted,number_hospital,total_procedures,dosis_changes,num_medicines,dosis_plus_minus,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,admission_type_id_1,admission_type_id_3,admission_type_id_4,admission_type_id_5,discharge_disposition_id_1,discharge_disposition_id_2,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_7,discharge_disposition_id_10,discharge_disposition_id_27,...,admission_source_id_11,medical_specialty_Cardiology,medical_specialty_Emergency/Trauma,medical_specialty_Family/GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Nephrology,medical_specialty_Orthopedics,medical_specialty_Orthopedics-Reconstructive,medical_specialty_Other,medical_specialty_Radiologist,medical_specialty_Surgery-General,medical_specialty_Unknown,diag_1_Circulatory,diag_1_Diabetes,diag_1_Digestive,diag_1_Genitourinary,diag_1_Injury,diag_1_Musculoskeletal,diag_1_Neoplasm,diag_1_Other,diag_1_Respiratory,diag_2_Circulatory,diag_2_Diabetes,diag_2_Digestive,diag_2_Genitourinary,diag_2_Other,diag_2_Respiratory,diag_2_nutritional,diag_2_skin,diag_3_Circulatory,diag_3_Diabetes,diag_3_Digestive,diag_3_E–V,diag_3_Genitourinary,diag_3_Mental,diag_3_Other,diag_3_Respiratory,diag_3_nutritional,change_0,change_Ch
0,0,0.000000,0.000000,0.305344,0.000000,0.0000,0.000000,0.0,0.000000,0.000000,0.0,0,0,0,0,0,0,0,NO,0.0000,0.298507,0.00,0.000000,0.500000,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0.111111,0.153846,0.442748,0.000000,0.2125,0.000000,0.0,0.000000,0.533333,0.0,0,0,0,0,0,1,1,>30,0.0000,0.432836,0.25,0.166667,0.666667,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,0,0.222222,0.076923,0.076336,0.833333,0.1500,0.047619,0.0,0.047619,0.333333,0.0,0,1,0,0,0,0,1,NO,0.0375,0.111940,0.00,0.166667,0.500000,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,1,0.333333,0.076923,0.328244,0.166667,0.1875,0.000000,0.0,0.000000,0.400000,0.0,0,0,0,0,0,1,1,NO,0.0000,0.328358,0.25,0.166667,0.666667,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,1,0.444444,0.000000,0.381679,0.000000,0.0875,0.000000,0.0,0.000000,0.266667,0.0,0,1,0,0,0,1,1,NO,0.0000,0.373134,0.00,0.333333,0.500000,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,0.777778,0.153846,0.381679,0.000000,0.1875,0.000000,0.0,0.000000,0.533333,1.0,1,0,0,0,0,1,1,>30,0.0000,0.373134,0.25,0.333333,0.333333,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
101762,0,0.888889,0.307692,0.244275,0.500000,0.2125,0.000000,0.0,0.047619,0.533333,0.0,0,0,0,0,0,1,1,NO,0.0125,0.261194,0.00,0.166667,0.500000,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0
101763,1,0.777778,0.000000,0.396947,0.000000,0.1000,0.023810,0.0,0.000000,0.800000,0.0,1,0,0,0,0,1,1,NO,0.0125,0.388060,0.25,0.333333,0.333333,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
101764,0,0.888889,0.692308,0.335878,0.333333,0.2500,0.000000,0.0,0.047619,0.533333,0.0,0,1,0,1,0,1,1,NO,0.0125,0.343284,0.25,0.500000,0.666667,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1


## Formatear columna de readmitted

In [None]:
# Binary target: 1 only for '<30'; '>30' and 'NO' are both 0.
codigos_readmitted={'>30': 0, '<30': 1, 'NO': 0}
df['readmitted'] = df['readmitted'].replace(codigos_readmitted)

# Guardar datos CSV

In [None]:
# Write the processed data to Drive. The DataFrame index is written as the first CSV column
# because index=False is not passed.
df.to_csv('/content/gdrive/MyDrive/datosTratados_last.csv')