## 7.1 - *Adult* [dataset](https://archive.ics.uci.edu/ml/datasets/Adult)

In [1]:
# importamos las bibliotecas necesarias
import os # Para obtener el directorio activo
import requests # Para descargar ficheros
import re
import pandas as pd
import numpy as np

In [2]:
# Move up one directory unless we are already inside the project folder "adult".
# os.path.basename is used instead of splitting on '\\' so the check also works
# on Linux/macOS, where paths are separated by '/'.
if os.path.basename(os.getcwd()) != 'adult':
    os.chdir('..')
# Create the download folder if it does not exist yet and make it the working directory
os.makedirs('imports_dataset/', exist_ok=True)
os.chdir('imports_dataset/')
# Download the data file into the working directory
response = requests.get(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently saving an error page as data
response.raise_for_status()
with open("adults.data", "wb") as f:
    f.write(response.content)
# Download the metadata that describes the dataset
response = requests.get(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names',
    timeout=30,
)
response.raise_for_status()
with open("adults.names", "wb") as f:
    f.write(response.content)
# Read the data: one list of fields per line of the file.
# NOTE: blank lines are NOT removed here; a blank line becomes a row whose
# remaining columns are filled with None when the DataFrame is built.
with open('adults.data', 'r') as f:
    data = [line.split(',') for line in f.read().splitlines()]
# Read the metadata, one entry per line
with open('adults.names', 'r') as f:
    metadata = f.read().splitlines()


def regex_fn(text):
    """Return the leading ``name:`` token of ``text`` as a one-element list, or ``[]``."""
    return re.findall('^[a-zA-Z-]+:{1}', text)


def reg_text_fn(text):
    """Return every run of letters/hyphens found in ``text``."""
    return re.findall('[a-zA-Z-]+', text)


# Keep the ``name:`` token of every metadata line that starts with one
# (the regex is evaluated only once per line).
metadata_list = [found[0] for found in map(regex_fn, metadata) if found]
# Strip the trailing colon to get the column names; the target column is appended by hand
col_names = [reg_text_fn(token)[0] for token in metadata_list] + ["label"]
# Build the pd.DataFrame
df = pd.DataFrame(data=data, columns=col_names)
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [4]:
# Inspect the dtype pandas assigned to each column
df.dtypes

age               object
workclass         object
fnlwgt            object
education         object
education-num     object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain      object
capital-loss      object
hours-per-week    object
native-country    object
label             object
dtype: object

In [5]:
# Convert the numeric-looking columns to real numbers.
# pd.to_numeric(..., errors='ignore') is deprecated in recent pandas releases, so
# the conversion is attempted column by column and a column that cannot be parsed
# (i.e. a text column) is explicitly left unchanged.
for col in df.columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # Not a numeric column: keep the original values
        continue
# Check that the conversion succeeded
df.info()

  df[col] = pd.to_numeric(df[col], errors='ignore')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32562 entries, 0 to 32561
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  float64
 1   workclass       32561 non-null  object 
 2   fnlwgt          32561 non-null  float64
 3   education       32561 non-null  object 
 4   education-num   32561 non-null  float64
 5   marital-status  32561 non-null  object 
 6   occupation      32561 non-null  object 
 7   relationship    32561 non-null  object 
 8   race            32561 non-null  object 
 9   sex             32561 non-null  object 
 10  capital-gain    32561 non-null  float64
 11  capital-loss    32561 non-null  float64
 12  hours-per-week  32561 non-null  float64
 13  native-country  32561 non-null  object 
 14  label           32561 non-null  object 
dtypes: float64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Count the missing values in each column
df.isna().sum()

age               1
workclass         1
fnlwgt            1
education         1
education-num     1
marital-status    1
occupation        1
relationship      1
race              1
sex               1
capital-gain      1
capital-loss      1
hours-per-week    1
native-country    1
label             1
dtype: int64

In [7]:
# The null counts above point to a single row (the last one) holding missing data,
# so dropping every row that contains any null value removes exactly that row.
df = df.dropna(axis=0, how='any')

In [8]:
# Verify again that no missing values remain in any column
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
label             0
dtype: int64