In [38]:
import numpy as np
import pandas as pd

In [43]:
#LinearRegressionModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [21]:
bmi = pd.read_csv("BMI.csv")
bmi.head()  

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3


In [23]:
bmi.head(10)

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3
5,Male,189,104,3
6,Male,147,92,5
7,Male,154,111,5
8,Male,174,90,3
9,Female,169,103,4


In [24]:
bmi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
Gender    500 non-null object
Height    500 non-null int64
Weight    500 non-null int64
Index     500 non-null int64
dtypes: int64(3), object(1)
memory usage: 15.7+ KB


In [29]:
bmi.isnull().any()     #to detect null values in a dataset

Gender    False
Height    False
Weight    False
Index     False
dtype: bool

In [28]:
#to check if there's any uppercase or lowercase differences in categorical attribute
bmi["Gender"].unique()
bmi["Gender"].value_counts()

0    255
1    245
Name: Gender, dtype: int64

In [25]:
bmi = bmi.replace(to_replace = "Male", value = 1)         #to replace male by digit 1
bmi = bmi.replace(to_replace = "Female", value = 0)    #to replace female by digit 0
bmi.head()        #check the modified dataset

Unnamed: 0,Gender,Height,Weight,Index
0,1,174,96,4
1,1,189,87,2
2,0,185,110,4
3,0,195,104,3
4,1,149,61,3


In [26]:
# Split the dataset into train_set and test_set which are used to train the model and test the model respectively.
train_set, test_set = train_test_split(bmi, test_size = 0.1, train_size = 0.9, random_state = 42, shuffle = False)
#train_set looks like
train_set.head(10)
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 450 to 499
Data columns (total 4 columns):
Gender    50 non-null int64
Height    50 non-null int64
Weight    50 non-null int64
Index     50 non-null int64
dtypes: int64(4)
memory usage: 2.0 KB


In [27]:
#correlation matrix reveals some interesting patterns in attributes
corr_matrix = bmi.corr()
corr_matrix

Unnamed: 0,Gender,Height,Weight,Index
Gender,1.0,-0.017677,0.009523,0.028786
Height,-0.017677,1.0,0.000446,-0.422223
Weight,0.009523,0.000446,1.0,0.804569
Index,0.028786,-0.422223,0.804569,1.0


In [30]:
#separate the  labels from the training examples
X_train = train_set.drop("Index", axis = 1)
y_train = train_set["Index"].copy()
X_train.tail()

Unnamed: 0,Gender,Height,Weight
445,1,159,140
446,1,146,70
447,0,176,121
448,0,146,101
449,1,159,145


In [67]:
X_train.shape     #dimensions of dataFrame

(450, 3)

In [31]:
#Training a linear regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [32]:
#Making test_set unlabeled to check the predictions
X_test = test_set.drop("Index", axis = 1)
y_test = test_set["Index"].copy()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 450 to 499
Data columns (total 3 columns):
Gender    50 non-null int64
Height    50 non-null int64
Weight    50 non-null int64
dtypes: int64(3)
memory usage: 1.6 KB


In [36]:
#Now,the test set is ready and linear regression model's too for making predictions
some_data = X_test.iloc[ :5]
some_labels = y_test.iloc[ :5]
print("Predictions:",lin_reg.predict(some_data).round())

Predictions: [6. 3. 4. 1. 2.]


In [37]:
print("Labels:",list(some_labels))

Labels: [5, 4, 5, 0, 2]


In [42]:
bmi_predictions = lin_reg.predict(X_test)
lin_mse = mean_squared_error(y_test, bmi_predictions.round())
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.7071067811865476

In [47]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lin_reg, X_test, y_test, scoring = "neg_mean_squared_error", cv = 10)
lin_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    print("Scores:",scores)
    print("Mean:",scores.mean())
    print("Standard Deviation:",scores.std())
display_scores(lin_rmse_scores)

Scores: [0.95671006 0.49761252 0.51496599 0.51792401 0.49611129 0.36093845
 0.7706192  0.44153621 0.86897806 0.88195869]
Mean: 0.630735448071423
Standard Deviation: 0.20402468217324154


In [49]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
print("Predictions: ",tree_reg.predict(some_data))

Predictions:  [5. 4. 5. 0. 2.]


In [50]:
print("Labels:", list(some_labels))

Labels: [5, 4, 5, 0, 2]


In [51]:
bmi_predictions_tree = tree_reg.predict(X_test)
tree_mse = mean_squared_error(y_test, bmi_predictions_tree)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.5099019513592785

In [52]:
scores_tree = cross_val_score(tree_reg, X_test, y_test, scoring = "neg_mean_squared_error", cv = 10)
tree_rmse_scores = np.sqrt(-scores_tree)
display_scores(tree_rmse_scores)

Scores: [0.4472136  1.         0.4472136  0.63245553 0.63245553 0.63245553
 1.67332005 0.63245553 0.4472136  0.4472136 ]
Mean: 0.6991996563202687
Standard Deviation: 0.3621047370604508


In [54]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
print("Predictions:", forest_reg.predict(some_data).round())

Predictions: [5. 3. 5. 0. 2.]




In [55]:
print("Labels:",list(some_labels))

Labels: [5, 4, 5, 0, 2]


In [56]:
bmi_predictions_forest = forest_reg.predict(X_test)
forest_mse = mean_squared_error(bmi_predictions_forest, y_test)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.31208973068654466

In [57]:
scores_forest = cross_val_score(forest_reg, X_test, y_test, scoring = "neg_mean_squared_error", cv = 10)
forest_rmse_scores = np.sqrt(-scores_forest)
display_scores(forest_rmse_scores)

Scores: [0.55497748 0.97672924 0.4494441  0.50596443 0.52345009 0.60827625
 0.74966659 0.14832397 0.36331804 0.44497191]
Mean: 0.5325122099679194
Standard Deviation: 0.21031107016769815


In [58]:
from sklearn.svm import SVR
svr_reg = SVR(kernel = "linear" )
svr_reg.fit(X_train, y_train)
print("Predictions:", svr_reg.predict(some_data).round())

Predictions: [6. 3. 4. 2. 2.]


In [59]:
print("Labels:",list(some_labels))

Labels: [5, 4, 5, 0, 2]


In [61]:
bmi_predictions_svr = svr_reg.predict(X_test)
svr_mse = mean_squared_error(bmi_predictions_svr, y_test)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

0.6258930034552042

In [63]:
scores_svr = cross_val_score(svr_reg, X_test, y_test, scoring = "neg_mean_squared_error", cv = 10)
svr_rmse_scores = np.sqrt(-scores_svr)
display_scores(svr_rmse_scores)

Scores: [0.90273801 0.49336718 0.66040681 0.61644549 0.53153169 0.37876297
 0.84294637 0.44820189 0.83444409 1.04976393]
Mean: 0.6758608418567441
Standard Deviation: 0.21031941954620445
