In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression

In [72]:
df1 = pd.read_csv('ANSUR_II_FEMALE.csv')
df1.head()
# Non-numerical columns in the dataset
non_numeric = ['Component','Gender', 'BMI_class', 'Height_class', 'Branch']
# Drop the non-numerical columns from df
df = df1.drop(non_numeric, axis=1)
df.head()

Unnamed: 0,abdominalextensiondepthsitting,acromialheight,acromionradialelength,anklecircumference,axillaheight,balloffootcircumference,balloffootlength,biacromialbreadth,bicepscircumferenceflexed,bicristalbreadth,...,waistbreadth,waistcircumference,waistdepth,waistfrontlengthsitting,waistheightomphalion,wristcircumference,wristheight,weight_kg,stature_m,BMI
0,231,1282,301,204,1180,222,177,373,315,263,...,295,850,217,345,942,152,756,65.7,1.56,26.997041
1,194,1379,320,207,1292,225,178,372,272,250,...,254,708,168,329,1032,155,815,53.4,1.665,19.262506
2,183,1369,329,233,1271,237,196,397,300,276,...,269,727,159,367,1035,162,799,66.3,1.711,22.647148
3,261,1356,306,214,1250,240,188,384,364,276,...,302,923,235,371,999,173,818,78.2,1.66,28.378575
4,309,1303,308,214,1210,217,182,378,320,336,...,405,1163,300,380,911,152,762,88.6,1.572,35.853259


In [73]:
# Import train_test_split()
from sklearn.model_selection import train_test_split

# Select the Gender column as the feature to be predicted (y)
y = df['BMI']

# Remove the Gender column to create the training data
X = df.drop('BMI', axis=1)

# Perform a 70% train and 30% test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("{} rows in test set vs. {} in training set. {} Features.".format(X_test.shape[0], X_train.shape[0], X_test.shape[1]))

596 rows in test set vs. 1390 in training set. 93 Features.


In [74]:

# Fit the scaler on the training features and transform these in one go
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

# Create the Lasso model
la = Lasso(alpha=0.01, random_state=0)

# Fit it to the standardized training data
la.fit(X_train_std,y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=0,
      selection='cyclic', tol=0.0001, warm_start=False)

In [75]:
# Transform the test set with the pre-fitted scaler
X_test_std = scaler.transform(X_test)

# Calculate the coefficient of determination (R squared) on X_test_std
r_squared = la.score(X_test_std, y_test)
print("The model can predict {0:.1%} of the variance in the test set.".format(r_squared))

# Create a list that has True values when coefficients equal 0
zero_coef = la.coef_ == 0

# Calculate how many features have a zero coefficient
n_ignored = sum(zero_coef)
print("The model has ignored {} out of {} features.".format(n_ignored, len(la.coef_)))

The model can predict 99.4% of the variance in the test set.
The model has ignored 63 out of 93 features.


In [76]:
from sklearn.linear_model import LassoCV

# Create and fit the LassoCV model on the training set
lcv = LassoCV()
lcv.fit(X_train,y_train)
print('Optimal alpha = {0:.3f}'.format(lcv.alpha_))

# Calculate R squared on the test set
r_squared = lcv.score(X_test,y_test)
print('The model explains {0:.1%} of the test set variance'.format(r_squared))

# Create a mask for coefficients not equal to zero
lcv_mask = lcv.coef_!= 0
print('{} features out of {} selected'.format(sum(lcv_mask), len(lcv_mask)))



Optimal alpha = 0.297
The model explains 98.8% of the test set variance
41 features out of 93 selected


In [77]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))

# Assign the support array to gb_mask
gb_mask = rfe_gb.support_

Fitting estimator with 93 features.
Fitting estimator with 90 features.
Fitting estimator with 87 features.
Fitting estimator with 84 features.
Fitting estimator with 81 features.
Fitting estimator with 78 features.
Fitting estimator with 75 features.
Fitting estimator with 72 features.
Fitting estimator with 69 features.
Fitting estimator with 66 features.
Fitting estimator with 63 features.
Fitting estimator with 60 features.
Fitting estimator with 57 features.
Fitting estimator with 54 features.
Fitting estimator with 51 features.
Fitting estimator with 48 features.
Fitting estimator with 45 features.
Fitting estimator with 42 features.
Fitting estimator with 39 features.
Fitting estimator with 36 features.
Fitting estimator with 33 features.
Fitting estimator with 30 features.
Fitting estimator with 27 features.
Fitting estimator with 24 features.
Fitting estimator with 21 features.
Fitting estimator with 18 features.
Fitting estimator with 15 features.
Fitting estimator with 12 fe

In [78]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

# Select 10 features with RFE on a RandomForestRegressor, drop 3 features on each step
rfe_rf = RFE(estimator=RandomForestRegressor(), 
             n_features_to_select=10, step=3, verbose=1)
rfe_rf.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))

# Assign the support array to gb_mask
rf_mask = rfe_rf.support_

Fitting estimator with 93 features.




Fitting estimator with 90 features.




Fitting estimator with 87 features.




Fitting estimator with 84 features.




Fitting estimator with 81 features.




Fitting estimator with 78 features.




Fitting estimator with 75 features.




Fitting estimator with 72 features.




Fitting estimator with 69 features.




Fitting estimator with 66 features.




Fitting estimator with 63 features.




Fitting estimator with 60 features.




Fitting estimator with 57 features.




Fitting estimator with 54 features.




Fitting estimator with 51 features.




Fitting estimator with 48 features.




Fitting estimator with 45 features.




Fitting estimator with 42 features.




Fitting estimator with 39 features.




Fitting estimator with 36 features.




Fitting estimator with 33 features.




Fitting estimator with 30 features.




Fitting estimator with 27 features.




Fitting estimator with 24 features.




Fitting estimator with 21 features.




Fitting estimator with 18 features.




Fitting estimator with 15 features.




Fitting estimator with 12 features.




The model can explain 94.9% of the variance in the test set


In [79]:
lm = LinearRegression()
# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)

# Create a mask for features selected by all 3 models
meta_mask = votes >= 3

# Apply the dimensionality reduction on X
X_reduced = X.loc[:, meta_mask]

# Plug the reduced dataset into a linear regression pipeline
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
lm.fit(scaler.fit_transform(X_train), y_train)
r_squared = lm.score(scaler.transform(X_test), y_test)
print('The model can explain {0:.1%} of the variance in the test set using {1:} features.'.format(r_squared, len(lm.coef_)))

The model can explain 91.1% of the variance in the test set using 7 features.
