In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name="") :
    if id_name == "" :
        df = df.reset_index().rename(columns = {"index" : "id"})
        id_name = 'id'
    else :
        id_name = id_name

    if null_name != "" :
        df[df==null_name] = np.nan

    X_train, X_test = train_test_split(df, test_size=.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])

    return X_train, X_test, y_train, y_test

df = pd.read_csv('archive/adult.csv')

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [20]:
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((26048, 15), (26048, 2), (6513, 15), (6513, 2))

In [21]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26048 entries, 21851 to 25716
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26048 non-null  int64 
 1   age             26048 non-null  int64 
 2   workclass       24592 non-null  object
 3   fnlwgt          26048 non-null  int64 
 4   education       26048 non-null  object
 5   education.num   26048 non-null  int64 
 6   marital.status  26048 non-null  object
 7   occupation      24585 non-null  object
 8   relationship    26048 non-null  object
 9   race            26048 non-null  object
 10  sex             26048 non-null  object
 11  capital.gain    26048 non-null  int64 
 12  capital.loss    26048 non-null  int64 
 13  hours.per.week  26048 non-null  int64 
 14  native.country  25587 non-null  object
dtypes: int64(7), object(8)
memory usage: 3.2+ MB


In [22]:
Numeric_features = ['age', 'fnlwgt', 'education.num','capital.gain', 'capital.loss', 'hours.per.week']

cat_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']



In [23]:
X_train[Numeric_features].describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,26048.0,26048.0,26048.0,26048.0,26048.0,26048.0
mean,38.610335,189574.1,10.082118,1081.193796,88.477695,40.420224
std,13.628346,104384.8,2.574608,7404.962675,404.689981,12.354707
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,118247.2,9.0,0.0,0.0,40.0
50%,37.0,178575.5,10.0,0.0,0.0,40.0
75%,48.0,236596.8,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [24]:
X_train.isnull().sum()

id                   0
age                  0
workclass         1456
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1463
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     461
dtype: int64

In [25]:
X_test.isnull().sum()

id                  0
age                 0
workclass         380
fnlwgt              0
education           0
education.num       0
marital.status      0
occupation        380
relationship        0
race                0
sex                 0
capital.gain        0
capital.loss        0
hours.per.week      0
native.country    122
dtype: int64

In [26]:
X_train['workclass'].value_counts()

workclass
Private             18160
Self-emp-not-inc     2049
Local-gov            1648
State-gov            1037
Self-emp-inc          909
Federal-gov           770
Without-pay            12
Never-worked            7
Name: count, dtype: int64

In [27]:
X_train['occupation'].value_counts()

occupation
Exec-managerial      3323
Prof-specialty       3306
Craft-repair         3296
Adm-clerical         3037
Sales                2898
Other-service        2624
Machine-op-inspct    1584
Transport-moving     1257
Handlers-cleaners    1080
Farming-fishing       786
Tech-support          746
Protective-serv       521
Priv-house-serv       119
Armed-Forces            8
Name: count, dtype: int64

In [28]:
X_train['native.country'].value_counts()

native.country
United-States                 23381
Mexico                          516
Philippines                     158
Germany                         108
Canada                           88
Puerto-Rico                      87
El-Salvador                      76
India                            73
Cuba                             73
England                          69
Italy                            63
South                            62
Jamaica                          59
Vietnam                          57
China                            57
Guatemala                        54
Dominican-Republic               51
Japan                            49
Poland                           47
Columbia                         44
Taiwan                           37
Haiti                            37
Iran                             34
Portugal                         32
Peru                             29
Nicaragua                        27
Ecuador                          25
Greece       

In [29]:
def data_fillna(df) :
    df['workclass'] = df['workclass'].fillna(df['workclass'].mode()[0])
    df['occupation'] = df['occupation'].fillna('null')
    df['native.country'] = df['native.country'].fillna(df['native.country'].mode()[0])

    return df

X_train = data_fillna(X_train)
X_test = data_fillna(X_test)

X_train.isnull().sum()


id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

In [30]:
from sklearn.preprocessing import LabelEncoder

all_df = pd.concat([X_train.assign(ind='train'), X_test.assign(ind='test')])
le = LabelEncoder()
all_df[cat_features] = all_df[cat_features].apply(le.fit_transform)

X_train = all_df[all_df['ind'] == 'train']
X_train = X_train.drop('ind', axis=1)
X_train

X_test = all_df[all_df['ind'] == 'test']
X_test = X_test.drop('ind', axis=1)





In [None]:
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler()
X_train[Numeric_features] = sc.fit_transform(X_train[Numeric_features])
X_test[Numeric_features] = sc.transform(X_test[Numeric_features])

X_train

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,0.260274,3,0.156011,9,0.800000,2,2,0,4,1,0.000000,0.0,0.500000,38
7632,7632,0.493151,3,0.062255,12,0.866667,0,9,1,4,0,0.000000,0.0,0.397959,38
27878,27878,0.027397,3,0.129566,15,0.600000,4,12,1,4,0,0.000000,0.0,0.244898,38
14121,14121,0.041096,3,0.061343,11,0.533333,4,5,3,4,1,0.000000,0.0,0.295918,38
32345,32345,0.506849,6,0.085958,11,0.533333,2,9,0,4,1,0.000000,0.0,0.397959,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2669,2669,0.383562,3,0.118910,12,0.866667,0,3,4,4,1,0.074301,0.0,0.704082,38
17536,17536,0.260274,3,0.110039,1,0.400000,0,13,1,4,1,0.000000,0.0,0.397959,38
6201,6201,0.410959,3,0.178669,7,0.733333,6,7,3,4,0,0.000000,0.0,0.346939,38
27989,27989,0.452055,5,0.125113,10,1.000000,2,9,0,4,1,0.000000,0.0,0.224490,38


In [33]:
y = (y_train['income'] != '<=50k').astype(int)

y[:5]

21851    1
7632     1
27878    1
14121    1
32345    1
Name: income, dtype: int64

In [35]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=.15, random_state=2021)

X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((22140, 15), (3908, 15), (22140,), (3908,))

In [36]:
X_tr.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
1437,1437,0.191781,3,0.216501,9,0.8,4,3,1,4,0,0.0,0.323232,0.397959,38
7151,7151,0.287671,5,0.127591,11,0.533333,2,2,0,4,1,0.0,0.0,0.602041,38
30296,30296,0.424658,3,0.217452,9,0.8,2,9,0,4,1,0.0,0.0,0.346939,38
15372,15372,0.452055,3,0.142442,11,0.533333,2,2,0,4,1,0.0,0.0,0.397959,38
13800,13800,0.178082,3,0.187243,15,0.6,4,5,1,4,1,0.0,0.0,0.397959,38


In [37]:
X_tr = X_tr.drop(columns=['id'])
X_val = X_val.drop('id', axis=1)

In [38]:
X_tr

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
1437,0.191781,3,0.216501,9,0.800000,4,3,1,4,0,0.0,0.323232,0.397959,38
7151,0.287671,5,0.127591,11,0.533333,2,2,0,4,1,0.0,0.000000,0.602041,38
30296,0.424658,3,0.217452,9,0.800000,2,9,0,4,1,0.0,0.000000,0.346939,38
15372,0.452055,3,0.142442,11,0.533333,2,2,0,4,1,0.0,0.000000,0.397959,38
13800,0.178082,3,0.187243,15,0.600000,4,5,1,4,1,0.0,0.000000,0.397959,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20375,0.260274,3,0.151510,11,0.533333,4,11,1,4,0,0.0,0.000000,0.397959,38
14775,0.383562,4,0.183118,11,0.533333,2,2,0,4,1,0.0,0.000000,0.479592,38
21698,0.534247,3,0.092164,15,0.600000,3,13,1,4,1,0.0,0.000000,0.397959,38
6824,0.520548,3,0.120375,5,0.200000,2,2,0,4,1,0.0,0.000000,0.602041,38


In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(random_state=2022)
model.fit(X_tr, y_tr)

pred = model.predict(X_val)

print('accuracy score : ', (accuracy_score(y_val, pred)))

accuracy score :  1.0


In [40]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=2022)
rf.fit(X_tr, y_tr)

pred = rf.predict(X_val)

print('accuracy_score : ', (accuracy_score(y_val, pred)))

accuracy_score :  1.0


In [41]:
X_test_id = X_test.pop('id')
pred = model.predict(X_test)

In [42]:
output = pd.DataFrame({'id' : X_test_id, 'income' : pred})
output.to_csv('asa.csv', index=False)
output.head()

Unnamed: 0,id,income
20901,20901,1
14170,14170,1
1776,1776,1
30428,30428,1
8602,8602,1
