In [89]:
import pandas as pd

In [90]:
# Read the raw CSV into `df`; the summary cells below all start from this frame.

csv_path = 'data.csv'
df = pd.read_csv(csv_path)

### Basic Stats of the Data

In [93]:
# Let's have a glimpse of the data set: show the first five rows.

df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [91]:
# quick stats
#
# `df.info()` prints its report directly and returns None, so it must not be
# handed to display() (that is what rendered the stray "None" in the output).
# Call it for its printed side effect, then let the describe() table be the
# cell's rich output.

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  31978 non-null  object
 14  target          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


None

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [None]:
# Alternative summary: one transposed table for the object (string) columns
# and one for everything else.

categorical_summary = df.describe(include='object').T
numeric_summary = df.describe(exclude='object').T
display(categorical_summary, numeric_summary)

### Check missing values

In [94]:
# Count the rows that contain at least one missing value in each data set.
# NOTE(review): `train` and `test` are not created in any cell visible in this
# notebook — confirm where they are loaded before relying on Run All.

nans = len(train) - len(train.dropna())
print ("%d rows have missing values in the train data" % nans)

nand = len(test) - len(test.dropna())
print ("%d rows have missing values in the test data" % nand)

2399 rows have missing values in the train data
1221 rows have missing values in the test data


In [95]:
# Number of missing values in each column of the training data.

train.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

In [96]:
# Count the distinct (non-null) values of every object-typed column.

cat = train.select_dtypes(include=['O'])
cat.nunique()

workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64

In [97]:
# Missing values occur only in these three character variables.
# Show their frequency tables so the mode (top row) used for imputation is visible.

for column in ('workclass', 'occupation', 'native.country'):
    display(train[column].value_counts(sort=True))

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64

 United-States                 29170
 Mexico                          643
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 Greece                           29
 

In [98]:
# impute missing values with modes
#
# Two changes from the hard-coded version:
# * The filled column is assigned back to `train`. Calling
#   `train.col.fillna(..., inplace=True)` is chained assignment: pandas warns
#   about it and, with copy-on-write enabled, it no longer updates `train`.
# * The mode is computed from the data rather than typed in by hand.
#   NOTE(review): the value_counts output appears to show a leading space in
#   the labels (" Private"); if so, a hand-typed 'Private' would have created
#   a new category instead of filling with the real mode — confirm.

for column in ('workclass', 'occupation', 'native.country'):
    # mode() returns a Series of the most frequent values; take the first one
    train[column] = train[column].fillna(train[column].mode()[0])

### Impute with Mode

In [None]:
# Columns of `train` that contain at least one missing value.
vars_with_na = [var for var in train.columns if train[var].isnull().sum() > 0]

for var in vars_with_na:

    # Learn the fill value from the training split only. Using a separate mode
    # for the validation/test splits would impute the same column differently
    # per split and leak information from the held-out data.
    mode_val = x_train[var].mode()[0]

    # replace missing values in every split by the training mode
    x_train[var] = x_train[var].fillna(mode_val)
    x_val[var] = x_val[var].fillna(mode_val)
    x_test[var] = x_test[var].fillna(mode_val)

### Date Variables

In [None]:
# Expand every column whose name contains '_time' into minute / month / hour /
# day features, then drop the original datetime columns.
# NOTE(review): the parse format is date-only ('%Y-%m-%d'), so the hour and
# minute features will always be 0 for values that match it — confirm the
# real timestamp format of these columns.
time_vars = [name for name in train.columns if '_time' in name]

for name in time_vars:
    stamps = pd.to_datetime(train[name], format='%Y-%m-%d')
    train[name] = stamps
    train[name + 'minute'] = stamps.dt.round('1min').dt.minute
    train[name + 'month'] = stamps.dt.month
    train[name + 'hour'] = stamps.dt.hour
    train[name + 'day'] = stamps.dt.day

# the derived features replace the raw datetime columns
train.drop(columns=time_vars, inplace=True)

### Crosstab

In [100]:
# Cross-tabulate education against the target (with row/column totals) and
# express every cell as a share of all rows, to see how education relates to
# the target variable.

education_by_target = pd.crosstab(train['education'], train['target'], margins=True)
education_by_target / len(train)

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,0.02675,0.001904,0.028654
11th,0.034243,0.001843,0.036086
12th,0.012285,0.001013,0.013298
1st-4th,0.004975,0.000184,0.00516
5th-6th,0.009736,0.000491,0.010227
7th-8th,0.018611,0.001228,0.01984
9th,0.014957,0.000829,0.015786
Assoc-acdm,0.024631,0.008139,0.032769
Assoc-voc,0.031357,0.011087,0.042443
Bachelors,0.09625,0.06821,0.164461


### Dropping Unwanted Rows

In [None]:
# Drop every row that belongs to a key having at least one missing Temperature.
#
# The previous version called `np.setdiff`, which does not exist (the NumPy
# function is `setdiff1d`), relied on `np` without importing it, and used 0 as
# a "no key" sentinel, which would silently skip a genuine key equal to 0.
# Plain pandas masks avoid all three problems.

# keys that have a missing Temperature in at least one of their rows
keys_with_missing = train.loc[train['Temperature'].isnull(), 'key'].unique()

# index labels of all rows that belong to those keys
keys_removed = train.index[train['key'].isin(keys_with_missing)]

# drop the identified rows from the original dataframe
train.drop(keys_removed, axis=0, inplace=True)

### Calculate Range in a Certain Column among the Identified Rows

In [None]:
# formatting the date column first

train['time'] = pd.to_datetime(train['time'], format='%Y-%m-%d')

# Difference between the last and the first Temperature of every key.
# "First" and "last" follow the current row order of `train`.
# NOTE(review): sort by 'time' beforehand if chronological order is intended.

keys = train.key.unique()
diff = []

for each in keys:

    # select this key's temperatures once instead of re-querying three times
    temperatures = train.loc[train.key == each, 'Temperature']

    # last value minus first value
    diff.append(temperatures.iloc[-1] - temperatures.iloc[0])

# form the resulting dataframe (the keyword is `columns`, not `column`)
diff = pd.DataFrame(diff, index=keys, columns=['Temperature diff'])

### Label encoding

In [101]:
from sklearn import preprocessing

# prepare an encoder

encoder = preprocessing.OrdinalEncoder()

# Encode every object-typed column as ordinal codes.
# OrdinalEncoder only accepts 2-D input, so each column is selected as a
# one-column frame (`train[[each]]`) and the (n, 1) result is flattened back
# to 1-D before it is stored.

for each in train.columns:
    if train[each].dtype == 'object':
        train[each] = encoder.fit_transform(train[[each]]).ravel()

In [None]:
# method 2: encode all categorical columns in a single call
#
# Select and encode from `train` itself. The previous version picked the
# columns and the values from `df`, a different frame that may not have had
# the same cleaning applied, and wrote the result into `train`.

encoder = preprocessing.OrdinalEncoder()
cat_vars = [var for var in train.columns if train[var].dtypes == 'O']

# nothing to encode if a previous method already converted every column
if cat_vars:
    train[cat_vars] = encoder.fit_transform(train[cat_vars])

In [None]:
# method 3: replace each object column by the integer codes of its categories
# (missing values receive the code -1)

cat_vars = [name for name, dtype in train.dtypes.items() if dtype == 'O']

for each in cat_vars:
    train[each] = pd.Categorical(train[each]).codes

In [None]:
# method 4
# sort categorical labels based on mean value of the target
#
# Only object-typed columns are encoded. Looping over every column would also
# rank-encode numeric features and the target column itself.

target_col = 'SalePrice'
cat_vars = [col for col in x_train.columns
            if x_train[col].dtype == 'object' and col != target_col]

for each in cat_vars:
    # categories ordered from lowest to highest mean target in the training data
    ordered_categorical = x_train.groupby(each)[target_col].mean().sort_values().index
    ordered_label = {value: rank for rank, value in enumerate(ordered_categorical)}

    # categories that appear only in x_test are not in the mapping and become NaN
    x_train[each] = x_train[each].map(ordered_label)
    x_test[each] = x_test[each].map(ordered_label)

### Check target variable

In [99]:
# Share of rows belonging to each target class.

train['target'].value_counts() / len(train)

 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64

### Train-test Split

In [103]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# train-test split
#
# Build the feature matrix with a non-mutating drop, so `train` keeps its
# target column and this cell can be re-run (deleting the column in place made
# a second run fail with a KeyError).

y = train['target']
x = train.drop(columns='target')

# stratify on y so both splits keep the original class proportions
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1, stratify=y
)

# report the shapes of the splits that were just created
print ("The train data has", x_train.shape)
print ("The test data has", x_test.shape)

RandomForestClassifier(max_depth=6, n_estimators=500)

### Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to the validation and test splits. Re-fitting on each split
# would scale every split differently and leak held-out statistics.
# Note: the scaler returns NumPy arrays, so column names are not available on
# the scaled outputs.
scaler = MinMaxScaler()

x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

### Build a model

In [None]:
# train the RF classifier
# A fixed random_state makes the fitted forest, and therefore the reported
# accuracy, reproducible across runs.

clf = RandomForestClassifier(n_estimators=500, max_depth=6, random_state=123)
clf.fit(x_train, y_train)

# make predictions on the held-out split and check the model's accuracy

pred = clf.predict(x_test)
acc = accuracy_score(y_test, pred)
print ('The accuracy of Random Forest is {}'.format(acc))

### Cross-validated Score

In [None]:
from sklearn.linear_model import ElasticNet

# 5-fold cross-validated baseline; one score per fold.
baseline = cross_val_score(ElasticNet(random_state=123), x_train, y_train,
                           scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

# 'neg_mean_absolute_error' yields negated MAE values, so flip the sign before
# reporting. The format field also needs a colon: '{:.3f}', not '{.3f}'.
print('Baseline MAE from using ElasticNet is: {:.3f}'.format(-baseline.mean()))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

# 5-fold cross-validation; error_score='raise' surfaces fit failures instead
# of silently recording NaN scores.
baseline = cross_validate(RandomForestRegressor(n_estimators=100, random_state=123), x_train, y_train,
                          scoring='neg_mean_absolute_error', cv=5, n_jobs=-1, error_score='raise')

# mean of the per-fold scores; with this scorer the value is the NEGATED MAE
baseline['test_score'].mean()

### LightGBM

In [None]:
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

# LGB as a potentially improved model
#
# The estimator gets its own name: binding it to `lgb` would shadow the
# lightgbm module that `lgb.Dataset` / `lgb.cv` need further down.
# Early stopping and logging are passed as callbacks, because recent LightGBM
# releases no longer accept them as fit()/cv() keyword arguments.

lgb_model = LGBMRegressor(random_state=123)
lgb_model.fit(
    x_train, y_train,
    eval_set=[(x_val, y_val)],
    eval_metric='mae',
    callbacks=[
        lgb.early_stopping(stopping_rounds=30),   # stop after 30 rounds without improvement
        lgb.log_evaluation(period=20),            # log the metric every 20 rounds
    ],
)
y_pred = lgb_model.predict(x_val)

print('Improved MAE from using LightGBM Regressor is: {:.3f}'.format(mean_absolute_error(y_val, y_pred)))

# or we can use the cv method
# (feature_name is read from x_train.columns, so x_train must still be a DataFrame here)

dftrainLGB = lgb.Dataset(data = x_train, label = y_train, feature_name = x_train.columns.tolist())
params = {'objective': 'regression'}
lgb.cv(
    params,
    dftrainLGB,
    num_boost_round=100,
    nfold=3,
    stratified=False,   # stratified folds are only defined for classification targets
    metrics='mae',
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)