In [1]:
import pandas as pd
# read in data
df = pd.read_csv("adult.data",names = ["age","workclass","fnlwgt","education","education_num","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country","income"])

In [2]:
# Display for exploring data
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Display for exploring data
df.shape

(32561, 15)

In [4]:
# Display for exploring data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
# checking missing values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [6]:
# separate numerical data
df_numeric = df[['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']].copy()

In [7]:
from sklearn.preprocessing import StandardScaler

# standardization of numeric values
scaler = StandardScaler()
df_numeric_sc = pd.DataFrame(scaler.fit_transform(df_numeric))
df_numeric_sc.rename(mapper=dict(zip(df_numeric_sc.columns, df_numeric.columns)), axis=1, inplace=True)
df_numeric_sc.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,-2.705915e-17,-1.001625e-16,1.471887e-16,1.309314e-17,1.0169e-16,-1.5493550000000002e-17
std,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.582206,-1.681631,-3.529656,-0.1459205,-0.2166595,-3.19403
25%,-0.7757679,-0.681691,-0.4200596,-0.1459205,-0.2166595,-0.03542945
50%,-0.1159546,-0.1082193,-0.03136003,-0.1459205,-0.2166595,-0.03542945
75%,0.6904838,0.4478765,0.7460392,-0.1459205,-0.2166595,0.3695194
max,3.769612,12.26856,2.300838,13.39458,10.59351,4.742967


In [8]:
from sklearn.preprocessing import Normalizer

# normalize numeric data
normal = Normalizer()
df_numeric_nrm = pd.DataFrame(normal.fit_transform(df_numeric))
df_numeric_nrm.rename(mapper=dict(zip(df_numeric_nrm.columns, df_numeric)), axis=1, inplace=True)
df_numeric_nrm.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.000308,0.998796,8e-05,0.00705,0.000691,0.000321
std,0.000311,0.016682,7.6e-05,0.045375,0.004263,0.000325
min,1.7e-05,0.246804,2e-06,0.0,0.0,3e-06
25%,0.000135,1.0,3.9e-05,0.0,0.0,0.000153
50%,0.000213,1.0,5.6e-05,0.0,0.0,0.000225
75%,0.000349,1.0,8.7e-05,0.0,0.0,0.000348
max,0.00398,1.0,0.001008,0.969065,0.09435,0.004177


In [9]:
# create dataframe for categorical data
df_cat = df[['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']].copy()

In [10]:
# check how many work-classes exists. If there are more than two different values One-Hot-Encoding is used, otherwise Label-Encoding
df_cat['workclass'].unique()
# TODO there is a null - field which was not recognized from the further check - function !!!

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [11]:
df_cat['education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [12]:
df_cat['marital_status'].unique()

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [13]:
df_cat['occupation'].unique()
# TODO contains ? -> is it an null value?

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', ' ?', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [14]:
df_cat['relationship'].unique()

array([' Not-in-family', ' Husband', ' Wife', ' Own-child', ' Unmarried',
       ' Other-relative'], dtype=object)

In [15]:
df_cat['race'].unique()

array([' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo',
       ' Other'], dtype=object)

In [16]:
df_cat['native_country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [17]:
df_cat['sex'].unique()

array([' Male', ' Female'], dtype=object)

In [18]:
df_cat['income'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
# start training

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# because income has also only two entries -> use LabelEncoder
lbencod = LabelEncoder()

df_encode = pd.DataFrame()
df_encode['sex_encode'] = pd.DataFrame(lbencod.fit_transform(df_cat['sex']))
df_encode['income_encode'] = pd.DataFrame(lbencod.fit_transform(df_cat['income']))

onehotenc = OneHotEncoder(categories='auto', sparse_output=False)
census_onehot_workclass = pd.DataFrame(onehotenc.fit_transform(df[['workclass']]))
census_onehot_workclass.columns = onehotenc.get_feature_names_out()

census_onehot_education = pd.DataFrame(onehotenc.fit_transform(df[['education']]))
census_onehot_education.columns = onehotenc.get_feature_names_out()

census_onehot_marital = pd.DataFrame(onehotenc.fit_transform(df[['marital_status']]))
census_onehot_marital.columns = onehotenc.get_feature_names_out()

census_onehot_occupation = pd.DataFrame(onehotenc.fit_transform(df[['occupation']]))
census_onehot_occupation.columns = onehotenc.get_feature_names_out()

census_onehot_relationship = pd.DataFrame(onehotenc.fit_transform(df[['relationship']]))
census_onehot_relationship.columns = onehotenc.get_feature_names_out()

census_onehot_race = pd.DataFrame(onehotenc.fit_transform(df[['race']]))
census_onehot_race.columns = onehotenc.get_feature_names_out()

census_onehot_country = pd.DataFrame(onehotenc.fit_transform(df[['native_country']]))
census_onehot_country.columns = onehotenc.get_feature_names_out()

# concat all one-hot categories
census_onehot = pd.concat([census_onehot_workclass,census_onehot_education ], axis=1)
census_onehot = pd.concat([census_onehot, census_onehot_marital], axis=1)
census_onehot = pd.concat([census_onehot, census_onehot_relationship], axis=1)
census_onehot = pd.concat([census_onehot, census_onehot_race], axis=1)
census_onehot = pd.concat([census_onehot, census_onehot_country], axis=1)

# concat one-hot with label encode dataframes
df_encode = pd.concat([df_encode, census_onehot], axis=1)

# concat categories with normalized numeric dataframes
df_encode = pd.concat([df_numeric_nrm, df_encode])

df_encode.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,sex_encode,income_encode,workclass_ ?,workclass_ Federal-gov,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,...,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.000308,0.998796,8e-05,0.00705,0.000691,0.000321,0.669205,0.24081,0.056386,0.029483,...,0.001136,0.003501,0.000369,0.002457,0.001566,0.000553,0.000584,0.895857,0.002058,0.000491
std,0.000311,0.016682,7.6e-05,0.045375,0.004263,0.000325,0.470506,0.427581,0.23067,0.169159,...,0.033691,0.059068,0.019194,0.049507,0.039546,0.023506,0.024149,0.305451,0.045316,0.022162
min,1.7e-05,0.246804,2e-06,0.0,0.0,3e-06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000135,1.0,3.9e-05,0.0,0.0,0.000153,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.000213,1.0,5.6e-05,0.0,0.0,0.000225,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.000349,1.0,8.7e-05,0.0,0.0,0.000348,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,0.00398,1.0,0.001008,0.969065,0.09435,0.004177,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# split data into training and test set

x = df_encode.iloc[:, [1, 2, 3, 4]].dropna() # TODO drop data before encoding ?
y = df_encode.loc[:, 'income_encode'].dropna() # TODO drop data before encoding ?

# This has to be done, otherwise Error will occur
#y = y.astype('float')

x_training, x_test, y_training, y_test = train_test_split(x, y, train_size=0.75)
y_test.describe()

count    8141.000000
mean        0.240142
std         0.427196
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: income_encode, dtype: float64

In [21]:
# using the kNN classifier
model = KNeighborsClassifier(n_neighbors=10)
model.fit(x_training, y_training)

In [22]:
from sklearn.metrics import accuracy_score
# predict
y_prediction = model.predict(x_test)

# evaluation
acc = accuracy_score(y_test, y_prediction)
print(f'accurancy: {acc}')

accurancy: 0.7895835892396511
