### This program does the following:
1. reads the dataset and determines the weight of a person based on height and gender
2. looks for nulls - Data Preprocessing
3. looks for outliers and eliminates outliers - Data Preprocessing
4. Transforms categorical data using pandas `get_dummies`, which performs one-hot encoding - Data Preprocessing
5. separates the dataset into x (features) and y (label/target)
6. splits datasets into train and test
7. trains a LinearRegression model on the training set
8. tests the model
9. lastly, it applies K-Nearest Neighbors regression (`KNeighborsRegressor`) and compares its score with the linear model

In [1]:
import pandas as pd

In [3]:
# Load the height/weight dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('weight-height.csv')

In [4]:
# Peek at the first rows to confirm the Gender/Height/Weight columns parsed.
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [5]:
# Per-column count of missing values; all zeros means nothing needs imputing.
df.isna().sum()

Gender    0
Height    0
Weight    0
dtype: int64

In [6]:
# (rows, columns) before any filtering; the baseline for the outlier counts below.
df.shape

(8555, 3)

### check for outliers

In [14]:
# Centre of the outlier band: the average of the Weight column.
mean = df['Weight'].mean()
mean

165.6327353266768

In [16]:
# Spread of the Weight column. pandas' Series.std defaults to ddof=1, i.e. the
# sample standard deviation.
std = df['Weight'].std()
std

32.04392221721551

In [17]:
# Upper cutoff: weights more than 2.5 standard deviations above the mean are
# treated as outliers.
std_positive_25 = mean + 2.5 * std
std_positive_25

245.74254086971558

In [18]:
# Lower cutoff: weights more than 2.5 standard deviations below the mean are
# treated as outliers.
std_negative_25 = mean - 2.5 * std
std_negative_25

85.52292978363802

In [21]:
# Rows whose weight lies strictly above the upper cutoff.
outlier_right = df.loc[df['Weight'].gt(std_positive_25)]
outlier_right

Unnamed: 0,Gender,Height,Weight
994,Male,78.095867,255.690835
1506,Male,75.698618,249.565628
1610,Male,76.617546,255.863326
1750,Male,74.604668,249.946283
1896,Male,76.47288,246.232321
2014,Male,78.998742,269.989698
2070,Male,77.465569,252.556689
2971,Male,75.156879,250.317151
3228,Male,76.268668,247.386741
3285,Male,78.52821,253.889004


In [22]:
# Number of high-side outliers (rows, columns).
outlier_right.shape

(14, 3)

In [23]:
# Rows whose weight lies strictly below the lower cutoff.
outlier_left = df.loc[df['Weight'].lt(std_negative_25)]
outlier_left

Unnamed: 0,Gender,Height,Weight
151,Male,65.350411,65.78
160,Male,64.333648,68.64
5026,Female,56.547975,84.872124
5214,Female,56.785434,83.993077
5360,Female,55.668202,68.98253
5740,Female,56.108902,80.531259
6624,Female,54.616858,71.393749
6982,Female,56.764456,79.174376
7294,Female,54.873728,78.60667
7324,Female,56.810317,84.170695


In [24]:
# Number of low-side outliers (rows, columns).
outlier_left.shape

(13, 3)

In [25]:
# Stack the high-side and low-side outliers into one frame; the original row
# labels are kept because ignore_index is left at its default.
total_outliers = pd.concat([outlier_right, outlier_left])

In [26]:
# Show the combined outlier rows for a visual sanity check.
total_outliers

Unnamed: 0,Gender,Height,Weight
994,Male,78.095867,255.690835
1506,Male,75.698618,249.565628
1610,Male,76.617546,255.863326
1750,Male,74.604668,249.946283
1896,Male,76.47288,246.232321
2014,Male,78.998742,269.989698
2070,Male,77.465569,252.556689
2971,Male,75.156879,250.317151
3228,Male,76.268668,247.386741
3285,Male,78.52821,253.889004


In [27]:
# Row count should equal the high-side count plus the low-side count.
total_outliers.shape

(27, 3)

In [47]:
# Keep only the rows whose weight falls strictly inside the +/-2.5 sigma band;
# everything else was classified as an outlier above.
total_valid_data = df.loc[
    df['Weight'].lt(std_positive_25) & df['Weight'].gt(std_negative_25)
]
total_valid_data.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [48]:
# Rows remaining after outlier removal (original row count minus the outliers).
total_valid_data.shape

(8528, 3)

### Transform categorical data (Gender column) with one-hot encoding (pd.get_dummies)

In [49]:
# Encode Gender as 0/1 integers. drop_first=True drops the first category level,
# leaving the single 'Male' indicator column shown in the output below.
new_gender = pd.get_dummies(total_valid_data['Gender'], drop_first=True, dtype=int)
new_gender

Unnamed: 0,Male
0,1
1,1
2,1
3,1
4,1
...,...
8550,0
8551,0
8552,0
8553,0


In [50]:
# Attach the indicator column(s) to the filtered frame.
# This cell reassigns total_valid_data from itself, so running it a second time
# would append a duplicate 'Male' column. Dropping any column that already
# carries a dummy name first makes the cell idempotent; on the first run nothing
# matches and errors='ignore' turns the drop into a no-op.
total_valid_data = pd.concat(
    [total_valid_data.drop(columns=new_gender.columns, errors='ignore'), new_gender],
    axis=1,
)
total_valid_data.head()

Unnamed: 0,Gender,Height,Weight,Male
0,Male,73.847017,241.893563,1
1,Male,68.781904,162.310473,1
2,Male,74.110105,212.740856,1
3,Male,71.730978,220.04247,1
4,Male,69.881796,206.349801,1


In [51]:
# The text Gender column is redundant now that 'Male' encodes it, so it is
# removed before modelling.
df_revised = total_valid_data.drop(columns='Gender')

In [52]:
# Confirm that only Height, Weight and the Male indicator remain.
df_revised.head()

Unnamed: 0,Height,Weight,Male
0,73.847017,241.893563,1
1,68.781904,162.310473,1
2,74.110105,212.740856,1
3,71.730978,220.04247,1
4,69.881796,206.349801,1


In [69]:
# Feature matrix: every column except the Weight target.
x = df_revised.drop(columns=['Weight'])
x.head()

Unnamed: 0,Height,Male
0,73.847017,1
1,68.781904,1
2,74.110105,1
3,71.730978,1
4,69.881796,1


In [55]:
# Target vector: the Weight column the models are trained to predict.
y = df_revised['Weight']

### Train Test Split and Train model

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.30, random_state=1)

In [72]:
# Size of the training portion (rows, features).
xtrain.shape

(5969, 2)

In [73]:
# Size of the held-out test portion (rows, features).
xtest.shape

(2559, 2)

In [58]:
from sklearn.linear_model import LinearRegression

In [59]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
lr_model = LinearRegression()

In [60]:
# Fit on the training split only; the test split stays unseen until evaluation.
lr_model.fit(xtrain, ytrain)

### Test Model

In [61]:
# R^2 on the held-out test set (for regressors, score() returns the coefficient
# of determination).
lr_model.score(xtest, ytest)

0.9017919517062599

In [62]:
# Predicted weights for the test rows, kept for the explicit metrics below.
y_predict = lr_model.predict(xtest)

In [65]:
from sklearn.metrics import r2_score, mean_squared_error

In [64]:
# Same R^2 computed from the explicit predictions; it matches lr_model.score.
r2_score(ytest, y_predict)

0.9017919517062599

In [67]:
# Mean squared error of the linear model on the test set (in squared units of
# the Weight column).
MSE = mean_squared_error(ytest, y_predict)
MSE

97.10200590470174

In [68]:
# Fitted slopes, in the column order of xtrain (Height, then Male).
lr_model.coef_

array([ 5.91264237, 20.07336249])

In [71]:
# Fitted intercept of the linear model.
lr_model.intercept_

-241.09237958372282

### Apply K-Nearest Neighbors regression (KNeighborsRegressor)

In [74]:
from sklearn.neighbors import KNeighborsRegressor

In [75]:
# k = 100 neighbours is roughly 1.7% of the 5,969 training rows (not 5%); treat
# it as a tunable starting value rather than a rule.
# NOTE(review): Height and the 0/1 Male flag are on different scales and reach
# the distance computation unscaled; confirm this is intended.
knn = KNeighborsRegressor(n_neighbors=100)

In [76]:
# Fit on the same training split used for the linear model so the scores are comparable.
knn.fit(xtrain, ytrain)

In [78]:
# KNN predictions for the same held-out test rows.
y_predict2 = knn.predict(xtest)

In [79]:
# Mean squared error of the KNN model on the test set, for comparison with MSE
# from the linear model.
MSE2 = mean_squared_error(ytest, y_predict2)
MSE2

99.32487410697597

In [80]:
# R^2 of the KNN model on the test set, for comparison with the linear model's R^2.
r2_score(ytest, y_predict2)

0.8995437638781545