In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import load_img
from keras.preprocessing import image
from keras_vggface.vggface import VGGFace
from keras_vggface.utils import preprocess_input

### import data

In [2]:
import pandas as pd
data = pd.read_csv('BMI/Data/data.csv')
data = data.drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,bmi,gender,is_training,name
0,34.207396,Male,1,img_0.bmp
1,26.45372,Male,1,img_1.bmp
2,34.967561,Female,1,img_2.bmp
3,22.044766,Female,1,img_3.bmp
4,37.758789,Female,1,img_4.bmp


In [3]:
data.shape

(4206, 4)

In [4]:
import os
import numpy as np
from PIL import Image

folder_path = 'BMI/Data/Images/'

images = []
filenames = []

for filename in os.listdir(folder_path):
    if filename.endswith('.bmp'):
        img_path = os.path.join(folder_path, filename)
        img = load_img(img_path)
        img = img.resize((224, 224))
        img_array = np.array(img)
        img_array = img_array.astype(np.float64)
        images.append(img_array)
        filenames.append(filename)

In [5]:
if len(filenames) == len(images):
    print(len(filenames),'images has been imported')

3963 images has been imported


### data processing

In [6]:
data = data[data['name'].isin(filenames)]
data.shape

(3962, 4)

- 1 image that has been imported does not exist in the bmi dataset, so we will not use it in the following modeling process.

In [7]:
name_img = pd.DataFrame({'name': filenames, 'img_array': images})
name_img.head()

Unnamed: 0,name,img_array
0,img_1561.bmp,"[[[142.0, 102.0, 66.0], [147.0, 104.0, 69.0], ..."
1,img_581.bmp,"[[[184.0, 193.0, 190.0], [184.0, 193.0, 190.0]..."
2,img_2068.bmp,"[[[77.0, 90.0, 96.0], [77.0, 90.0, 96.0], [78...."
3,img_3410.bmp,"[[[197.0, 182.0, 153.0], [197.0, 182.0, 151.0]..."
4,img_1207.bmp,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,..."


In [8]:
#combine the bmi data with the img_array by filenames
img_data = pd.merge(data, name_img, on='name', how='inner')
img_data.head()

Unnamed: 0,bmi,gender,is_training,name,img_array
0,34.207396,Male,1,img_0.bmp,"[[[176.0, 194.0, 216.0], [174.0, 194.0, 219.0]..."
1,26.45372,Male,1,img_1.bmp,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,..."
2,34.967561,Female,1,img_2.bmp,"[[[189.0, 111.0, 107.0], [188.0, 109.0, 105.0]..."
3,22.044766,Female,1,img_3.bmp,"[[[111.0, 106.0, 100.0], [111.0, 106.0, 100.0]..."
4,25.845588,Female,1,img_6.bmp,"[[[255.0, 255.0, 255.0], [255.0, 255.0, 255.0]..."


In [9]:
#reshape the img_array for VGG model
img_data['img_array'] = img_data['img_array'].apply(lambda x: np.resize(x, (224,224,3)))

### fc6 feature extraction using VGGFace

In [10]:
from keras.preprocessing import image
from keras_vggface import utils
from tensorflow.keras.models import Model

vggface = VGGFace(model='vgg16', include_top=True, input_shape=(224, 224, 3), pooling='avg')
vggface_model = Model(inputs=vggface.input, outputs=vggface.get_layer('fc6').output)

In [11]:
def get_fc6_feature(img):
    img = np.expand_dims(img, axis=0)
    img = utils.preprocess_input(img, version=2) 
    fc6_feature = vggface_model.predict(img)
    return fc6_feature[0]

In [12]:
%%time
img_data['fc6_feature'] = img_data['img_array'].apply(get_fc6_feature)

2023-05-17 16:43:02.526087: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




















































CPU times: user 37min 54s, sys: 1min 5s, total: 38min 59s
Wall time: 6min 34s


In [13]:
img_data.head()

Unnamed: 0,bmi,gender,is_training,name,img_array,fc6_feature
0,34.207396,Male,1,img_0.bmp,"[[[176.0, 194.0, 216.0], [174.0, 194.0, 219.0]...","[-1.7464029, -0.5159634, -21.824884, -0.858642..."
1,26.45372,Male,1,img_1.bmp,"[[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...","[-0.25845996, -0.7205424, -64.32637, -0.726638..."
2,34.967561,Female,1,img_2.bmp,"[[[189.0, 111.0, 107.0], [188.0, 109.0, 105.0]...","[-0.6931532, -0.5180315, -25.64009, -0.5815235..."
3,22.044766,Female,1,img_3.bmp,"[[[111.0, 106.0, 100.0], [111.0, 106.0, 100.0]...","[-0.73582923, -0.4978965, -33.2909, -0.4335094..."
4,25.845588,Female,1,img_6.bmp,"[[[255.0, 255.0, 255.0], [255.0, 255.0, 255.0]...","[0.9893446, -0.33865315, -20.944767, -0.479238..."


### predict BMI using SVR Model

In [14]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
train_set = img_data[img_data['is_training']==1]
test_set = img_data[img_data['is_training']!=1]

In [16]:
X_train = train_set['fc6_feature'].to_list()
y_train = train_set['bmi'].to_list()
X_test = test_set['fc6_feature'].to_list()
y_test = test_set['bmi'].to_list()

In [17]:
svr_model = SVR(kernel='rbf', C=0.1, epsilon=1) 
svr_model.fit(X_train, y_train)

In [18]:
y_pred = svr_model.predict(X_test)

In [19]:
from scipy.stats import pearsonr
corr, p = pearsonr(y_test, y_pred)
corr

0.6214479501453156

- the goal is to exceed 0.65, so hyperparameter tuning is needed to find best parameters
- use RandomizedSearch to do this

In [20]:
%%time
from sklearn.model_selection import RandomizedSearchCV

params = {
    'C': [0.1,1,10],
    'epsilon': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
}

svr = SVR()
svr_model = RandomizedSearchCV(svr, params, cv=5, random_state=42)
svr_model.fit(X_train, y_train)

best_params = svr_model.best_params_
best_params

CPU times: user 12min 18s, sys: 4.25 s, total: 12min 23s
Wall time: 12min 23s


In [21]:
%%time
best_svr = svr_model.best_estimator_
y_pred_tuned = best_svr.predict(X_test)

CPU times: user 5.01 s, sys: 36.8 ms, total: 5.05 s
Wall time: 5.05 s


In [22]:
corr_tuned, p_tuned = pearsonr(y_test, y_pred_tuned)
corr_tuned

0.651490754665141

In [24]:
import joblib

joblib.dump(best_svr, 'svr_model.pkl')

['svr_model.pkl']