## Generate a dataset with 10 features, 1 regression target, 150 000 rows

In [1]:
!pip install faker -q
import pandas as pd
import numpy as np
from faker import Faker

from sklearn import preprocessing

faker = Faker()

def make_fake_data(num):
    """Build `num` synthetic records as a list of dicts.

    Every record carries ten randomly generated feature values, drawn from the
    module-level `faker` instance, plus a `target` placeholder initialised to 0.
    """
    def _one_record():
        # Values are drawn in key order, one Faker call per feature.
        return {
            'pregnancies': faker.random_int(0, 5),
            'glucose': faker.pyfloat(min_value=3, max_value=8, right_digits=1),
            'blood_pressure': faker.random_int(60, 120),
            'skin_thickness': faker.random_int(0, 100),
            'insulin': faker.random_int(60, 140),
            'age': faker.random_int(21, 80),
            'sex': faker.boolean(),
            'body_mass_index': faker.random_int(15, 35),
            'pedigree_function': faker.boolean(),
            'blood_sugar': faker.random_int(10, 60),
            'target': 0,
        }

    return [_one_record() for _ in range(num)]

# Generate fake data.
# Seed both random sources used by this notebook (Faker for the records, NumPy for the
# np.random draws) so that a fresh "Restart & Run All" reproduces the same dataset.
SEED = 42
Faker.seed(SEED)
np.random.seed(SEED)

num = 150000
df = pd.DataFrame(make_fake_data(num))

# Build a relationship between the features and the target.
# The rules are applied to whole columns at once rather than row by row.

# Blood pressure is redrawn from the low band [60, 90) when the pedigree flag is set
# or insulin is above 100, and from the high band [90, 120) otherwise
# (np.random.randint excludes the upper bound).
low_bp = (df['pedigree_function'] == 1) | (df['insulin'] > 100)
n_rows = len(df)
df['blood_pressure'] = np.where(
    low_bp,
    np.random.randint(60, 90, size=n_rows),
    np.random.randint(90, 120, size=n_rows),
)

# (condition, points) pairs: every satisfied condition adds its points to the target.
# The blood_pressure rule reads the column redrawn just above.
# NOTE(review): blood_sugar is generated in the range 10-60, so the `> 6` rule holds for
# every row and only adds a constant 10 - confirm the intended threshold.
rules = [
    (df['sex'] == 1, 5),
    (df['pedigree_function'] == 1, 10),
    (df['glucose'] > 5, 10),
    (df['body_mass_index'] > 20, 5),
    (df['skin_thickness'] > 50, 5),
    (df['insulin'] > 100, 10),
    (df['age'] > 50, 10),
    (df['blood_sugar'] > 6, 10),
    (df['blood_pressure'] < 90, 10),
    (df['pregnancies'] > 3, 10),
]
df['target'] = sum(mask.astype(int) * points for mask, points in rules)

df.head()

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,age,sex,body_mass_index,pedigree_function,blood_sugar,target
0,0,6.4,91,9,67,45,False,28,False,20,25
1,1,6.7,97,64,61,48,False,15,False,35,25
2,0,4.2,108,54,65,60,True,20,False,49,30
3,5,5.7,60,97,123,49,False,27,True,39,70
4,0,3.0,103,24,90,39,True,18,False,45,15


In [2]:
# Transform features by scaling each column to a given range (MinMaxScaler).
# NOTE(review): the scaler is fitted on every row, and on the target column as well,
# before the train/test split performed later in the notebook - confirm this is intended.
x = df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Reuse the frame's own labels instead of a second hand-typed list that would have to
# be kept in sync with the keys produced by make_fake_data.
df = pd.DataFrame(x_scaled, columns=df.columns)
df.head()

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,age,sex,body_mass_index,pedigree_function,blood_sugar,target
0,0.0,0.693878,0.525424,0.09,0.0875,0.40678,0.0,0.65,0.0,0.2,0.2
1,0.2,0.755102,0.627119,0.64,0.0125,0.457627,0.0,0.0,0.0,0.5,0.2
2,0.0,0.244898,0.813559,0.54,0.0625,0.661017,1.0,0.25,0.0,0.78,0.266667
3,1.0,0.55102,0.0,0.97,0.7875,0.474576,0.0,0.6,1.0,0.58,0.8
4,0.0,0.0,0.728814,0.24,0.375,0.305085,1.0,0.15,0.0,0.7,0.066667


## Design Linear Regression

In [3]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def print_evaluate(true, predicted):
    """Print the mean squared error of `predicted` against `true`, then a separator line."""
    separator = '__________________________________'
    print('MSE:', mean_squared_error(true, predicted))
    print(separator)
    
X = df[['pregnancies', 'glucose', 'blood_pressure', 'skin_thickness', 'insulin', 'age', 'sex','body_mass_index', 'pedigree_function', 'blood_sugar']]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Report each split's share of the full dataset as a quick sanity check.
for label, part in (('Train X', X_train), ('Train Y', y_train), ('Test X', X_test), ('Test Y', y_test)):
    print(f'{label} = {round(part.shape[0]/df.shape[0]*100)}%')
print('__________________________________')

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. The default estimator is used instead.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
pred = lin_reg.predict(X_test)
print_evaluate(y_test, pred)

Train X = 80%
Train Y = 80%
Test X = 20%
Test Y = 20%
__________________________________
MSE: 0.006295969155164378
__________________________________


## Design 3 Dense Neural Networks (1, 5, 25 hidden layers). 
## Prepare 3 optimizers:
- (OPT1) SGD
- (OPT2) RMSProp
- (OPT3) Adam


## Explore the following learning rates:
- (lr1) 0.1
- (lr2) 0.01
- (lr3) 0.001

In [5]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD, RMSprop, Adam
from tensorflow.keras.metrics import MeanSquaredError as MSE
from tensorflow.keras.losses import MeanSquaredError

import matplotlib.pyplot as plt

def create_model(hidden_layers, optimizer, features):
    """Build and compile a fully connected regression network.

    Args:
        hidden_layers: number of 8-unit ReLU Dense layers stacked after the
            12-unit first layer.
        optimizer: Keras optimizer instance handed to `compile`.
        features: number of input features per sample.

    Returns:
        The compiled `Sequential` model, using MSE as both loss and metric.
    """
    model = Sequential()
    # input_shape describes a single sample and must not include the batch size.
    model.add(Dense(12, activation='relu', input_shape=(features,)))
    for _ in range(hidden_layers):
        model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='relu'))
    # Metrics are passed as a list, the form `compile` documents.
    model.compile(optimizer=optimizer, loss='mse', metrics=[MSE()])
    return model

learning_rates = [0.1, 0.01, 0.001]
optimizers = [SGD, RMSprop, Adam]
layers = [1, 5, 25]

# Per-epoch training histories, one column per run.
# accs / val_accs hold the mean-squared-error metric (not an accuracy);
# losses / val_losses hold the loss.
accs = pd.DataFrame()
val_accs = pd.DataFrame()
losses = pd.DataFrame()
val_losses = pd.DataFrame()
models = []

# Take the input width from the training data instead of a literal.
n_features = X_train.shape[1]

for num_layers in layers:
    for learning_rate in learning_rates:
        for optimizer in optimizers:
            # Short label from the class repr: "<class 'pkg.module.Name'>" -> "module-Name".
            opt = '-'.join(str(optimizer).split("'")[1].split(".")[-2:])
            run_name = f'{num_layers}-{learning_rate}-{opt}'
            model = create_model(num_layers, optimizer(learning_rate=learning_rate), n_features)
            # Only validation_data is passed: Keras ignores validation_split when both are given.
            history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test),
                                batch_size=32, verbose=1).history
            accs[run_name] = history['mean_squared_error']
            val_accs[run_name] = history['val_mean_squared_error']
            losses[run_name] = history['loss']
            val_losses[run_name] = history['val_loss']
            models.append(model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

## Plot learning curves for all combinations

In [9]:
import os

# Save one loss-curve figure per run into a folder relative to the working directory,
# creating it if needed, so the cell does not depend on a user-specific absolute path.
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)

for i in accs.columns:
    # Epochs are numbered from 1; the count follows the recorded history length.
    x_plot = np.arange(1, len(losses[i]) + 1)
    fig, axes = plt.subplots()
    axes.plot(x_plot, losses[i], label='loss')
    axes.plot(x_plot, val_losses[i], label='val_loss')
    axes.set_title(str(i))
    axes.set_xlabel('epochs')
    axes.set_ylabel('losses')
    fig.legend(['train', 'test'], loc='upper right')
    fig.savefig(os.path.join(results_dir, f'{i}.png'))
    # Close each figure so 27 open figures do not accumulate in memory.
    plt.close(fig)

## Collect results

In [10]:
# Evaluate every trained model on the test split. Each model was compiled with a loss and
# one metric, and element [0] of the evaluate() result is kept (presumably the loss - confirm).
# Pairing run names with models via zip avoids hard-coding the number of runs.
results = [[run_name, model.evaluate(X_test, y_test, verbose=0)[0]]
           for run_name, model in zip(accs.columns, models)]
results_df = pd.DataFrame(results)
# The context manager saves and closes the workbook; ExcelWriter.save() was removed in
# pandas 2.0. The file is written relative to the working directory.
with pd.ExcelWriter('results.xlsx', engine='xlsxwriter') as writer:
    results_df.to_excel(writer, sheet_name='results', index=False)

