In [242]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer



In [243]:
data = pd.read_csv('/home/hasan/Downloads/gender-height-weight-index.csv')

In [244]:
data.head()

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174.0,96,4.0
1,Male,189.0,87,2.0
2,Female,185.0,110,4.0
3,Female,195.0,104,3.0
4,Male,149.0,61,3.0


In [245]:
data.shape

(515, 4)

In [246]:
#checking null value
data.isnull().sum()

Gender    15
Height    15
Weight     9
Index     15
dtype: int64

I am seeing this dataset have some null value

In [247]:
#seperating feature data and label data
X = data.drop('Index', axis=1)
y = data['Index']

print('Shape of X is :',X.shape)
print('Shape of y is :',y.shape)

Shape of X is : (515, 3)
Shape of y is : (515,)


In [248]:
#splitting dataset into train and test
xtrain,xtest, ytrain,ytest = train_test_split(X,y, test_size=.25, random_state=0)

I splitted datast before feature engineering. Because if I split dataset after feature engineering, model 
could face data leakage. data leakage means our model will give a good accuracy because some train data will 
repeat in test dataset. But When we deploy model it will not predict correctly.

# Handling Missing/nan value in train dataset

In [249]:
#adding xtrain and ytrain data
train = pd.concat([xtrain,ytrain], axis=1)

In [250]:
#checking nan value
train.isnull().sum()

Gender    8
Height    8
Weight    4
Index     8
dtype: int64

In [251]:
#filling the null value
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(train[['Height','Weight']])
train[['Height','Weight']] = imputer.transform(train[['Height','Weight']])
#train['Weight'] = imputer.transform(train[['Weight']])

In [252]:
train['Gender'] = train['Gender'].fillna(method='ffill')
train['Index'] = train['Index'].fillna(method='bfill')

In [253]:
#checking null value
train.isnull().sum()

Gender    0
Height    0
Weight    0
Index     0
dtype: int64

now there is no null value

# Handling missing/nan values in test dataset

In [254]:
#adding xtest and ytest
test = pd.concat([xtest,ytest], axis=1)

In [255]:
#checking null
test.isnull().sum()

Gender    7
Height    7
Weight    5
Index     7
dtype: int64

In [256]:
#handling missing/nan value using SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(test[['Height','Weight']])
test[['Height','Weight']] = imputer.transform(test[['Height','Weight']])

In [257]:
test['Gender'] = test['Gender'].fillna(method='ffill')
test['Index'] = test['Index'].fillna(method='bfill')

In [258]:
test.isnull().sum()

Gender    0
Height    0
Weight    0
Index     0
dtype: int64

# Handling String value in Gender column

In [259]:
#for train data
dummy_gender = pd.get_dummies(train['Gender'])

In [260]:
train = pd.concat([train,dummy_gender], axis=1)
print('Shape of the train :',train.shape)

Shape of the train : (386, 6)


In [261]:
train.head()

Unnamed: 0,Gender,Height,Weight,Index,Female,Male
414,Female,195.0,104.0,3.0,1,0
59,Male,196.0,116.0,4.0,0,1
468,Female,183.0,79.0,2.0,1,0
405,Female,180.0,70.0,2.0,1,0
426,Male,181.0,154.0,5.0,0,1


In [262]:
#for test data
dummy_gender = pd.get_dummies(test['Gender'])

In [263]:
test = pd.concat([test,dummy_gender], axis=1)
print('Shape of the train :',test.shape)

Shape of the train : (129, 6)


In [264]:
test.head()

Unnamed: 0,Gender,Height,Weight,Index,Female,Male
284,Female,187.0,121.0,4.0,1,0
100,Female,194.0,111.0,3.0,1,0
37,Female,157.0,153.0,5.0,1,0
46,Male,157.0,56.0,2.0,0,1
441,Female,187.0,130.0,4.0,1,0


In [265]:
#spliting train dataset
xtrain = train.drop(['Gender','Index'], axis=1)
ytrain = train['Index']

print('Shape of xtrain :',xtrain.shape)
print('Shape of ytrain :',ytrain.shape)

Shape of xtrain : (386, 4)
Shape of ytrain : (386,)


In [266]:
#spliting test dataset
xtest = test.drop(['Gender','Index'], axis=1)
ytest = test['Index']

print('Shape of xtest :',xtest.shape)
print('Shape of ytest :',ytest.shape)

Shape of xtest : (129, 4)
Shape of ytest : (129,)


In [267]:
model = RandomForestClassifier(n_estimators=100)
model.fit(xtrain,ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [268]:
model.score(xtest,ytest)

0.8294573643410853

In [269]:
predicted = model.predict(xtest)
predicted

array([4., 3., 5., 2., 4., 3., 4., 2., 5., 4., 4., 0., 5., 1., 4., 4., 5.,
       3., 5., 5., 2., 4., 5., 5., 4., 5., 4., 4., 4., 5., 4., 4., 5., 4.,
       3., 4., 4., 5., 2., 0., 5., 3., 4., 5., 4., 4., 5., 3., 5., 2., 1.,
       4., 2., 4., 2., 3., 2., 5., 5., 4., 5., 4., 5., 3., 4., 5., 2., 4.,
       4., 2., 2., 5., 1., 5., 4., 4., 5., 5., 2., 5., 4., 3., 5., 4., 1.,
       5., 3., 4., 5., 4., 4., 5., 5., 4., 5., 3., 2., 3., 3., 4., 2., 3.,
       3., 4., 3., 5., 5., 4., 5., 5., 5., 3., 4., 3., 4., 5., 4., 3., 4.,
       5., 4., 2., 4., 2., 5., 2., 5., 3., 3.])