## *Adult* [dataset](https://archive.ics.uci.edu/ml/datasets/Adult)

In [1]:
# import the necessary packages
import os # to manage directories
import requests # to make requests to the web
import re
import pandas as pd
import numpy as np

In [11]:
# go back to the root directory
if not os.getcwd().split('\\')[-1]=='adult': os.chdir('..')
# localize a directory to save the dataset
if not os.path.isdir('imports_dataset/'): os.mkdir('imports_dataset/')
os.chdir('imports_dataset/')



In [12]:
# download the dataset
response = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')
# save the dataset
with open("adults.data", "wb") as f:
    f.write(response.content)

In [13]:
# download the metadata
response = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names')
# save the metadata
with open("adults.names", "wb") as f:
    f.write(response.content)


In [15]:
# read the dataset
with open('adults.data','r') as f:
    data = f.read().splitlines() # split the data by lines
    data = [elem.split(',') for elem in data] # split the lines by commas and delete the '\n' character
data


[['39',
  ' State-gov',
  ' 77516',
  ' Bachelors',
  ' 13',
  ' Never-married',
  ' Adm-clerical',
  ' Not-in-family',
  ' White',
  ' Male',
  ' 2174',
  ' 0',
  ' 40',
  ' United-States',
  ' <=50K'],
 ['50',
  ' Self-emp-not-inc',
  ' 83311',
  ' Bachelors',
  ' 13',
  ' Married-civ-spouse',
  ' Exec-managerial',
  ' Husband',
  ' White',
  ' Male',
  ' 0',
  ' 0',
  ' 13',
  ' United-States',
  ' <=50K'],
 ['38',
  ' Private',
  ' 215646',
  ' HS-grad',
  ' 9',
  ' Divorced',
  ' Handlers-cleaners',
  ' Not-in-family',
  ' White',
  ' Male',
  ' 0',
  ' 0',
  ' 40',
  ' United-States',
  ' <=50K'],
 ['53',
  ' Private',
  ' 234721',
  ' 11th',
  ' 7',
  ' Married-civ-spouse',
  ' Handlers-cleaners',
  ' Husband',
  ' Black',
  ' Male',
  ' 0',
  ' 0',
  ' 40',
  ' United-States',
  ' <=50K'],
 ['28',
  ' Private',
  ' 338409',
  ' Bachelors',
  ' 13',
  ' Married-civ-spouse',
  ' Prof-specialty',
  ' Wife',
  ' Black',
  ' Female',
  ' 0',
  ' 0',
  ' 40',
  ' Cuba',
  ' <=50K'],


In [16]:
# read the metadata
with open('adults.names','r') as f:
    metadata = f.read().splitlines()
metadata

['| This data was extracted from the census bureau database found at',
 '| http://www.census.gov/ftp/pub/DES/www/welcome.html',
 '| Donor: Ronny Kohavi and Barry Becker,',
 '|        Data Mining and Visualization',
 '|        Silicon Graphics.',
 '|        e-mail: ronnyk@sgi.com for questions.',
 '| Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).',
 '| 48842 instances, mix of continuous and discrete    (train=32561, test=16281)',
 '| 45222 if instances with unknown values are removed (train=30162, test=15060)',
 '| Duplicate or conflicting instances : 6',
 '| Class probabilities for adult.all file',
 "| Probability for the label '>50K'  : 23.93% / 24.78% (without unknowns)",
 "| Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)",
 '|',
 '| Extraction was done by Barry Becker from the 1994 Census database.  A set of',
 '|   reasonably clean records was extracted using the following conditions:',
 '|   ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0

In [17]:
# Regex functions - extract information from text, using a pattern.
regex_fn = lambda text: re.findall('^[a-zA-Z-]+:{1}', text)
reg_text_fn = lambda text : re.findall('[a-zA-Z-]+', text)
metadata_list = [regex_fn(elem)[0] for elem in metadata if regex_fn(elem)]
col_names = [reg_text_fn(elem)[0] for elem in metadata_list if regex_fn(elem)] + ["label"]
col_names

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'label']

In [18]:
# build the dataframe
df = pd.DataFrame(data=data, columns=col_names)
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [19]:
df.dtypes

age               object
workclass         object
fnlwgt            object
education         object
education-num     object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain      object
capital-loss      object
hours-per-week    object
native-country    object
label             object
dtype: object

In [20]:
# change the numeric columns to numeric
# iterate over the columns and convert them to numeric, if possible, otherwise ignore the errors
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='ignore')
# check the types of the columns again
df.info()

  df[col] = pd.to_numeric(df[col], errors='ignore')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32562 entries, 0 to 32561
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  float64
 1   workclass       32561 non-null  object 
 2   fnlwgt          32561 non-null  float64
 3   education       32561 non-null  object 
 4   education-num   32561 non-null  float64
 5   marital-status  32561 non-null  object 
 6   occupation      32561 non-null  object 
 7   relationship    32561 non-null  object 
 8   race            32561 non-null  object 
 9   sex             32561 non-null  object 
 10  capital-gain    32561 non-null  float64
 11  capital-loss    32561 non-null  float64
 12  hours-per-week  32561 non-null  float64
 13  native-country  32561 non-null  object 
 14  label           32561 non-null  object 
dtypes: float64(6), object(9)
memory usage: 3.7+ MB


In [6]:
#shearch for missing values
df.isna().sum()

age               1
workclass         1
fnlwgt            1
education         1
education-num     1
marital-status    1
occupation        1
relationship      1
race              1
sex               1
capital-gain      1
capital-loss      1
hours-per-week    1
native-country    1
label             1
dtype: int64

In [7]:
# we see that the last row has missing values, so we drop the missing values and so, we will drop the last row.
df = df.dropna()

In [8]:
# check the missing values again
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
label             0
dtype: int64