In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
import joblib
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.svm import SVR, LinearSVR
from sklearn.base import clone
import pickle
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler


In [2]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Selcects the features (numerical, categorical or all)
    """

    def __init__(self, select):
        """
        select has to be "num features", "cat features" or "all features"
        """

        if select not in ["num features", "cat features", "all features"]:
            raise TypeError("for select only num features, cat features or all features")

        self.select = select
        self.num_attr = None
        self.cat_attr = None

    def fit(self, x: pd.DataFrame, _y=None):
        """fits the parameter"""

        if not isinstance(x, pd.DataFrame):
            raise TypeError("Selector needs Pandas Dataframe!")

        self.num_attr = list(x.select_dtypes(include=[np.number]).columns)
        self.cat_attr = list(x.select_dtypes(exclude=[np.number]).columns)

        return self

    def transform(self, x: pd.DataFrame, _y=None):
        """does the transformation"""

        if not isinstance(x, pd.DataFrame):
            raise TypeError("Selector needs Pandas Dataframe!")

        if self.select == "num features":
            x_new = x[self.num_attr].copy()
        elif self.select == "cat features":
            x_new = x[self.cat_attr].copy()
        elif self.select == "all features":
            x_new = x[self.num_attr + self.cat_attr].copy()
        else:
            raise TypeError("for select only num features, cat features or all features")

        return x_new

    def get_feature_names_out(self):
        """this method is needed, otherwise we cannot use set_ouput"""
        pass

In [3]:



class NumAttributesAdder(BaseEstimator, TransformerMixin):
    """Adds new numeric features"""

    def __init__(self):
        pass

    def fit(self, x: pd.DataFrame, _y=None):
        """fits the parameter"""

        if not isinstance(x, pd.DataFrame):
            raise TypeError("NumAttributesAdder needs Pandas Dataframe!")

        return self

    @staticmethod  # static because in transform self is not used
    def transform(_self, x: pd.DataFrame, _y=None):
        """does the transformation"""

        if not isinstance(x, pd.DataFrame):
            raise TypeError("NumAttributesAdder needs Pandas Dataframe!")

        x_new = x.copy()
        x_new["rooms_per_household"] = x_new["total_rooms"] / x_new["households"]
        x_new["rooms_per_household"].clip(lower=1, inplace=True)  # hard to find an upper value (consider touristic region with hotels only)
        x_new["population_per_household"] = x_new["population"] / x_new["households"]
        x_new["population_per_household"].clip(0, 10, inplace=True)
        x_new["bedrooms_per_room"] = x_new["total_bedrooms"] / x_new["total_rooms"]
        x_new["bedrooms_per_room"].clip(0, 1, inplace=True)

        return x_new

    def get_feature_names_out(self):
        """this method is needed, otherwise we cannot use set_ouput"""
        pass

In [42]:
df = pd.read_csv('project_1_train.csv')

In [44]:
df.head()

Unnamed: 0,NR,Gender,Age,AgeDecade,AgeMonths,Race1,Race3,Education,MaritalStatus,HHIncome,...,RegularMarij,AgeRegMarij,HardDrugs,SexEver,SexAge,SexNumPartnLife,SexNumPartYear,SameSex,SexOrientation,PregnantNow
0,9217,male,44,40-49,,White,White,High School,Divorced,25000-34999,...,No,,Yes,Yes,17.0,20.0,1.0,No,Heterosexual,
1,7325,male,50,50-59,,White,White,College Grad,NeverMarried,,...,No,,No,Yes,24.0,1.0,0.0,Yes,Homosexual,
2,919,female,59,50-59,718.0,Black,,High School,Widowed,45000-54999,...,No,,No,Yes,17.0,3.0,1.0,No,Heterosexual,
3,5903,female,40,40-49,,White,White,College Grad,Divorced,0-4999,...,,,,,,,,,,No
4,2808,female,13,10-19,166.0,Hispanic,,,,,...,,,,,,,,,,


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 71 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   NR                8000 non-null   int64  
 1   Gender            8000 non-null   object 
 2   Age               8000 non-null   int64  
 3   AgeDecade         7740 non-null   object 
 4   AgeMonths         3975 non-null   float64
 5   Race1             8000 non-null   object 
 6   Race3             3994 non-null   object 
 7   Education         5800 non-null   object 
 8   MaritalStatus     5806 non-null   object 
 9   HHIncome          7359 non-null   object 
 10  HHIncomeMid       7359 non-null   float64
 11  Poverty           7423 non-null   float64
 12  HomeRooms         7943 non-null   float64
 13  HomeOwn           7948 non-null   object 
 14  Work              6241 non-null   object 
 15  Weight            7936 non-null   float64
 16  Height            7727 non-null   float64


In [46]:
df.describe()

Unnamed: 0,NR,Age,AgeMonths,HHIncomeMid,Poverty,HomeRooms,Weight,Height,BMI,Pulse,...,TVHrsDayChild,CompHrsDayChild,AlcoholDay,AlcoholYear,SmokeAge,AgeFirstMarij,AgeRegMarij,SexAge,SexNumPartnLife,SexNumPartYear
count,8000.0,8000.0,3975.0,7359.0,7423.0,7943.0,7936.0,7727.0,7716.0,6862.0,...,507.0,507.0,3936.0,4734.0,2478.0,2339.0,1086.0,4464.0,4600.0,3969.0
mean,5012.55425,36.6675,421.24,57291.411877,2.80781,6.249024,71.160622,161.934088,26.673793,73.484115,...,1.95069,2.230769,2.931911,75.25771,17.761905,17.050876,17.753223,17.433468,14.243261,1.356513
std,2887.242281,22.265459,257.011081,33114.586076,1.684218,2.27817,29.010355,20.15435,7.346642,12.090593,...,1.418238,2.560725,3.187721,102.078749,5.170704,3.944924,4.907613,3.695085,47.260997,2.709867
min,1.0,0.0,0.0,2500.0,0.0,1.0,2.8,83.6,12.89,40.0,...,0.0,0.0,1.0,0.0,6.0,1.0,5.0,9.0,0.0,0.0
25%,2528.75,18.0,204.0,30000.0,1.24,5.0,56.3,157.1,21.6,64.0,...,1.0,0.0,1.0,3.0,15.0,15.0,15.0,15.0,2.0,1.0
50%,4994.0,36.0,420.0,50000.0,2.72,6.0,72.75,166.1,26.0,72.0,...,2.0,1.0,2.0,24.0,17.0,16.0,17.0,17.0,5.0,1.0
75%,7504.25,54.0,619.0,87500.0,4.76,8.0,89.2,174.5,30.93,82.0,...,3.0,6.0,3.0,104.0,19.0,19.0,19.0,19.0,12.0,1.0
max,10000.0,80.0,959.0,100000.0,5.0,13.0,230.7,200.4,81.25,134.0,...,6.0,6.0,82.0,364.0,72.0,48.0,52.0,50.0,1000.0,69.0


In [47]:
df['DirectChol']

0       1.11
1       1.01
2       1.63
3       1.47
4       1.19
        ... 
7995    1.34
7996    1.09
7997    1.84
7998    1.50
7999    1.27
Name: DirectChol, Length: 8000, dtype: float64

In [36]:
# Applying the condition, but leave nan for now
# df['Diabetes'].mask(df['Diabetes'] == 'Yes', 0, inplace=True)
# df['Diabetes'].mask(df['Diabetes'] == 'No', 1, inplace=True)
# df["Diabetes"] = df["Diabetes"].astype(float)

In [37]:
#df["Diabetes"].info()

In [10]:
# numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#
# newdf = df.select_dtypes(include=numerics)
#
# newdf.info()

In [66]:
max_number_of_nas = 1500
newdf = df.loc[:, (df.isnull().sum(axis=0) <= max_number_of_nas)]

newdf.dropna(inplace=True)
newdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4697 entries, 0 to 7996
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   NR           4697 non-null   int64  
 1   Gender       4697 non-null   object 
 2   Age          4697 non-null   int64  
 3   AgeDecade    4697 non-null   object 
 4   Race1        4697 non-null   object 
 5   HHIncome     4697 non-null   object 
 6   HHIncomeMid  4697 non-null   float64
 7   Poverty      4697 non-null   float64
 8   HomeRooms    4697 non-null   float64
 9   HomeOwn      4697 non-null   object 
 10  Weight       4697 non-null   float64
 11  Height       4697 non-null   float64
 12  BMI          4697 non-null   float64
 13  BMI_WHO      4697 non-null   object 
 14  Pulse        4697 non-null   float64
 15  BPSysAve     4697 non-null   float64
 16  BPDiaAve     4697 non-null   float64
 17  BPSys1       4697 non-null   float64
 18  BPDia1       4697 non-null   float64
 19  BPSys2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf.dropna(inplace=True)


In [19]:
y_reg = newdf['DirectChol']

x_reg = newdf.drop(columns=['DirectChol'])



In [20]:
X_train, X_test, y_train, y_test = train_test_split(x_reg, y_reg, test_size=0.2, random_state=123)

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(x_clas, y_clas, test_size=0.2, random_state=123)

In [21]:
scores = {}
def display_scores(model_score):
    """print the list of scores, the mean and the standard deviation"""
    print("SCORES OF CROSS VALIDATION:")
    print(np.round(model_score, decimals=1))
    print("MEAN SCORE: %0.1f" % model_score.mean())
    print("STD SCORE: %0.1f\n" % model_score.std())


In [22]:
def lin_reg(x, y):
    """define model 1"""
    start = time.time()

    # numeric feature preparation
    pipeline_num = Pipeline([
        ("selector", Selector("num features")),  # select numeric attributes
        ("imputer", SimpleImputer(strategy="median")),  # missing values are replaced by column median of train_set toDo better to drop with our dataset?
        ("scaler", StandardScaler()),  # scale to mean 0 and std 1 toDo: Maybe min_max_scaler?
    ])

    # categorical feature preparation toDo: is this necessary?
    pipeline_cat = Pipeline([
        ("selector", Selector("cat features")),  # select categorical attributes
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first", dtype=bool)),  # one-hot-encoding
    ])

    # combine numeric and categorical feature preparation
    pipeline_full = FeatureUnion(transformer_list=[
        ("pipeline_num", pipeline_num),
        ("pipeline_cat", pipeline_cat),
    ])

    # set output to pandas, so we can use pandas tools again (e.g. for feature_names_in_)
    pipeline_full.set_output(transform="pandas")

    model = Pipeline([
        ("pipeline_full", pipeline_full),
        ("regressor", LinearRegression()),
    ])

    model.fit(x, y)

    model_predictions = model.predict(x)

    # attention: it is not allowed to use a fitted model (like model) in cross_val_predict because information gets through
    # unfitted model (clone(model)) is used for cross validation

    model_new = clone(model)
    model_scores = cross_val_score(model_new, x, y, scoring="neg_mean_squared_error", cv=3, n_jobs=-1)

    end = time.time()
    model_time = end - start

    # store model
    with open("lin_reg.pkl", "wb") as file:
        pickle.dump(model, file)
    joblib.dump(model_predictions, filename="lin_reg_predictions.pkl")
    joblib.dump(model_scores, filename="lin_reg_scores.pkl")
    joblib.dump(model_time, filename="lin_reg_time.pkl")

In [23]:
lin_reg(X_train, y_train)

In [24]:
model_1 = joblib.load("lin_reg.pkl")
model_1_predictions = joblib.load("lin_reg_predictions.pkl")
model_1_scores = joblib.load("lin_reg_scores.pkl")
model_1_time = joblib.load("lin_reg_time.pkl")

print("TIME: ", round(model_1_time, 2), " sec\n")

print("RMSE: %0.1f\n" % np.sqrt(mean_squared_error(y_train, model_1_predictions)))

scores["model_1"] = [np.sqrt(-model_1_scores).mean(), model_1]
print("VALUE FOR COMPARISON: CV RMSE", round(scores["model_1"][0], 2), "\n")
display_scores(np.sqrt(-model_1_scores))

print("USED FEATURES:", model_1["regressor"].n_features_in_)

model_1_feature_importances = pd.DataFrame(
    {
        "name": model_1["regressor"].feature_names_in_,
        "value": model_1["regressor"].coef_.flatten(),
        "abs_value": np.abs(model_1["regressor"].coef_).flatten()
    }).sort_values(by="abs_value")

print("FEATURE VALUE IMPORTANCE:")
print("LOWEST ABS SCORE:")
print(round(model_1_feature_importances.head(6)[["name", "value"]], 2), "\n")
print("HIGHEST ABS SCORE:")
print(round(model_1_feature_importances.tail(6)[["name", "value"]], 2), "\n")

TIME:  12.22  sec

RMSE: 0.3

VALUE FOR COMPARISON: CV RMSE 0.34 

SCORES OF CROSS VALIDATION:
[0.3 0.3 0.4]
MEAN SCORE: 0.3
STD SCORE: 0.0

USED FEATURES: 48
FEATURE VALUE IMPORTANCE:
LOWEST ABS SCORE:
            name  value
42  HomeOwn_Rent  -0.00
17     UrineVol1   0.00
11        BPSys1   0.00
0             NR  -0.00
8          Pulse  -0.01
12        BPDia1   0.01 

HIGHEST ABS SCORE:
                    name         value
39  HHIncome_75000-99999 -7.485871e+08
40   HHIncome_more 99999 -8.586734e+08
9               BPSysAve -1.259208e+09
14                BPDia2 -1.098259e+10
16                BPDia3 -1.140055e+10
10              BPDiaAve  2.177340e+10 



In [25]:
def decicion_tree_reg(x, y):
    """define model 3"""
    start = time.time()

    # numeric feature preparation
    pipeline_num = Pipeline([
        ("selector", Selector("num features")),
        ("imputer", SimpleImputer(strategy="median")),  # missing values are replaced by column median of train_set
    ])

    # categorical feature preparation
    pipeline_cat = Pipeline([
        ("selector", Selector("cat features")),  # select categorical attributes
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first", dtype=bool)),  # one-hot-encoding
    ])

    # combine numeric and categorical feature preparation
    pipeline_full = FeatureUnion(transformer_list=[
        ("pipeline_num", pipeline_num),
        ("pipeline_cat", pipeline_cat),
    ])

    # set output to pandas, so we can use pandas tools again (e.g. for feature_names_in_)
    pipeline_full.set_output(transform="pandas")

    # define full pipeline
    base_model = Pipeline([
        ("pipeline_full", pipeline_full),
        ("regressor", DecisionTreeRegressor(random_state=123)),
    ])

    # possible fine-tuned parameters
    model_param = [
        {
            "regressor__max_depth": [8],
            "regressor__min_samples_leaf": [0.0009],  # [0.0008, 0.0009, 0.001],
        }
    ]

    model_gs = GridSearchCV(base_model, model_param, cv=10, scoring="neg_root_mean_squared_error", n_jobs=-1) #ToDo: how does gS work?

    model_gs.fit(x, y)

    model = model_gs.best_estimator_

    model_best_params = model_gs.best_params_

    model_predictions = model.predict(x)

    # attention: it is not allowed to use a fitted model (like model) in cross_val_predict because information gets through
    # unfitted model (clone(model)) is used for cross validation

    model_new = clone(model)
    model_scores = cross_val_score(model_new, x, y, scoring="neg_mean_squared_error", cv=3, n_jobs=-1)

    end = time.time()
    model_time = end - start

    # store model
    with open("decicion_tree.pkl", "wb") as file:
        pickle.dump(model, file)

    # use the following to store the best params of grid search
    with open("decicion_tree_best_params.pkl", "wb") as ff:
        pickle.dump(model_best_params, ff)

    joblib.dump(model_predictions, filename="decicion_tree_predictions.pkl")
    joblib.dump(model_scores, filename="decicion_tree_scores.pkl")
    joblib.dump(model_time, filename="decicion_tree_time.pkl")

In [26]:
decicion_tree_reg(X_train, y_train)
model_3 = joblib.load("decicion_tree.pkl")

with open("decicion_tree_best_params.pkl", 'rb') as f:
    model_3_best_params = pickle.load(f)

model_3_predictions = joblib.load("decicion_tree_predictions.pkl")
model_3_scores = joblib.load("decicion_tree_scores.pkl")
model_3_time = joblib.load("decicion_tree_time.pkl")

print("TIME: ", round(model_3_time, 2), " sec\n")

print("RMSE: %0.1f\n" % np.sqrt(mean_squared_error(y_train, model_3_predictions)))

scores["model_3"] = [np.sqrt(-model_3_scores).mean(), model_3]
print("VALUE FOR COMPARISON: CV RMSE", round(scores["model_3"][0], 2), "\n")
display_scores(np.sqrt(-model_3_scores))

print("USED FEATURES: ", model_3["regressor"].n_features_in_)

model_3_feature_importances = pd.DataFrame(
    {
        "name": model_3["regressor"].feature_names_in_,
        "value": model_3["regressor"].feature_importances_,
        "abs_value": np.abs(model_3["regressor"].feature_importances_)
    }).sort_values(by="abs_value")

print("FEATURE VALUE IMPORTANCE:")
print("LOWEST ABS SCORE:")
print(round(model_3_feature_importances.head(6)[["name", "value"]], 2), "\n")
print("HIGHEST ABS SCORE:")
print(round(model_3_feature_importances.tail(6)[["name", "value"]], 2), "\n")

print("OPTIMAL PARAMETERS:")
for param, value in model_3_best_params.items():
    print(param, ": ", value, sep="")
print("\n")


TIME:  3.42  sec

RMSE: 0.3

VALUE FOR COMPARISON: CV RMSE 0.38 

SCORES OF CROSS VALIDATION:
[0.4 0.4 0.4]
MEAN SCORE: 0.4
STD SCORE: 0.0

USED FEATURES:  48
FEATURE VALUE IMPORTANCE:
LOWEST ABS SCORE:
                    name  value
23      AgeDecade_ 50-59    0.0
32  HHIncome_15000-19999    0.0
31  HHIncome_10000-14999    0.0
28           Race1_Other    0.0
27         Race1_Mexican    0.0
26        Race1_Hispanic    0.0 

HIGHEST ABS SCORE:
           name  value
6        Height   0.04
18   UrineFlow1   0.05
19  Gender_male   0.06
7           BMI   0.08
1           Age   0.10
5        Weight   0.31 

OPTIMAL PARAMETERS:
regressor__max_depth: 8
regressor__min_samples_leaf: 0.0009




In [109]:

def SVR_class(x, y):
    """Define model 3"""
    start = time.time()

    base_model = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # missing values are replaced by column median of train_set
        ("scaler", MinMaxScaler()),  # scale to interval [0, 1]
        ("pca", PCA()),
        ("classifier", SVC()),
    ])

    # possible fine-tuned parameters
    model_param = [
        # softmax regression
        {
            "pca__n_components": [0.9],
            "classifier__kernel": ["rbf"],
            "classifier__degree": [2],  # [2, 3],
            "classifier__C": [10],  # [1, 1e-1, 1e-2, 10, 100],
        }
    ]

    model_gs = GridSearchCV(base_model, model_param, cv=10, n_jobs=-1, scoring="f1_weighted")
    model_gs.fit(x, y)

    model = model_gs.best_estimator_

    model_best_params = model_gs.best_params_

    model_predictions = model.predict(x)

    # attention: it is not allowed to use a fitted model (like model) in cross_val_predict because information gets through
    # unfitted model (clone(model)) is used for cross validation

    model_new = clone(model)
    model_predictions_cv = cross_val_predict(model_new, x, y, cv=3, n_jobs=-1)

    end = time.time()
    model_time = end - start

    # store model
    with open("SVR_class.pkl", "wb") as ff:
        pickle.dump(model, ff)

    # use the following to store the best params of grid search
    with open("SVR_class_best_params.pkl", "wb") as ff:
        pickle.dump(model_best_params, ff)

    joblib.dump(model_predictions, filename="SVR_class_predictions.pkl")
    joblib.dump(model_predictions_cv, filename="SVR_class_predictions_cv.pkl")
    joblib.dump(model_time, filename="SVR_class_time.pkl")

In [110]:
SVR_class(X_train, y_train)

model_3 = joblib.load("SVR_class.pkl")

with open("SVR_class_best_params.pkl", 'rb') as f:
    model_3_best_params = pickle.load(f)

model_3_predictions = joblib.load("SVR_class_predictions.pkl")
model_3_predictions_cv = joblib.load("SVR_class_predictions_cv.pkl")
model_3_time = joblib.load("SVR_class_time.pkl")

print("TIME: ", round(model_3_time, 2), " sec\n")

print("CONFUSION MATRIX:")
print(confusion_matrix(y_train, model_3_predictions), "\n")

print("CROSS VALIDATION CONFUSION MATRIX:")
print(confusion_matrix(y_train, model_3_predictions_cv), "\n")

print("CLASSIFICATION REPORT:")
print(classification_report(y_train, model_3_predictions))

print("CROSS CLASSIFICATION REPORT:")
print(classification_report(y_train, model_3_predictions_cv))

model_3_score = f1_score(y_train, model_3_predictions_cv, average="weighted")
print("VALUE FOR COMPARISON: WEIGHTED F1_SCORE:", round(model_3_score, 2), "\n")
scores["model_3"] = [model_3_score, model_3]

print("OPTIMAL PARAMETERS:")
for param, value in model_3_best_params.items():
    print(param, ": ", value, sep="")
print("\n")

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\johan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\johan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\johan\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 201, in fit
    y = self._validate_targets(y)
  File "C:\Users\johan\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 745, in _validate_targets
    check_classification_targets(y)
  File "C:\Users\johan\anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 207, in check_classification_targets
    raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'


In [27]:
def random_f_class(x, y):
    """Define model 4"""
    start = time.time()

    base_model = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # missing values are replaced by column median of train_set
        ("scaler", MinMaxScaler()),  # scale to interval [0, 1]
        ("pca", PCA()),
        ("classifier", RandomForestClassifier(random_state=123)),
    ])

    # possible fine-tuned parameters
    model_param = [
        {
            "pca__n_components": [0.9],
            "classifier__n_estimators": [600],  # [10, 50, 100, 200, 400, 600, 800],
            "classifier__max_depth": [20],  # [None, 5, 10, 20, 30, 50],
        }
    ]

    model_gs = GridSearchCV(base_model, model_param, cv=10, n_jobs=-1, scoring="f1_weighted")
    model_gs.fit(x, y)

    model = model_gs.best_estimator_

    model_best_params = model_gs.best_params_

    model_predictions = model.predict(x)

    # attention: it is not allowed to use a fitted model (like model) in cross_val_predict because information gets through
    # unfitted model (clone(model)) is used for cross validation

    model_new = clone(model)
    model_predictions_cv = cross_val_predict(model_new, x, y, cv=3, n_jobs=-1)

    end = time.time()
    model_time = end - start

    # store model
    joblib.dump(model, filename="random_f_class.pkl")

    # use the following to store the best params of grid search
    with open("random_f_class_best_params.pkl", "wb") as ff:
        pickle.dump(model_best_params, ff)

    joblib.dump(model_predictions, filename="random_f_class_predictions.pkl")
    joblib.dump(model_predictions_cv, filename="random_f_class_predictions_cv.pkl")
    joblib.dump(model_time, filename="random_f_class_time.pkl")

In [41]:
newdf

Unnamed: 0,NR,Age,AgeMonths,HHIncomeMid,Poverty,HomeRooms,Weight,Height,BMI,Pulse,...,TVHrsDayChild,CompHrsDayChild,AlcoholDay,AlcoholYear,SmokeAge,AgeFirstMarij,AgeRegMarij,SexAge,SexNumPartnLife,SexNumPartYear
0,9217,44,,30000.0,1.31,8.0,93.3,178.7,29.20,62.0,...,,,1.0,1.0,17.0,17.0,,17.0,20.0,1.0
1,7325,50,,,,7.0,118.0,182.2,35.50,72.0,...,,,1.0,3.0,,,,24.0,1.0,0.0
2,919,59,718.0,50000.0,4.62,7.0,88.1,171.2,30.06,60.0,...,,,2.0,2.0,,,,17.0,3.0,1.0
3,5903,40,,2500.0,0.12,7.0,97.9,167.8,34.80,92.0,...,,,,,,,,,,
4,2808,13,166.0,,,6.0,45.2,151.9,19.59,64.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,9786,80,,50000.0,3.48,6.0,69.8,149.1,31.40,92.0,...,,,,,,,,,,
7996,7764,27,,22500.0,0.89,7.0,97.0,157.4,39.20,70.0,...,,,1.0,12.0,16.0,15.0,,17.0,3.0,1.0
7997,5219,10,,100000.0,5.00,11.0,34.5,142.3,17.00,94.0,...,,,,,,,,,,
7998,1347,23,281.0,,,,51.6,155.7,21.28,80.0,...,,,3.0,72.0,16.0,16.0,,14.0,15.0,3.0


In [80]:
df = pd.read_csv('project_1_train.csv')
#Applying the condition, but leave nan for now
df['Diabetes'].mask(df['Diabetes'] == 'Yes', 0, inplace=True)
df['Diabetes'].mask(df['Diabetes'] == 'No', 1, inplace=True)
df["Diabetes"] = df["Diabetes"].astype(float)
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df = df.select_dtypes(include=numerics)



max_number_of_nas = 1500
newdf = df.loc[:, (df.isnull().sum(axis=0) <= max_number_of_nas)]

newdf.dropna(inplace=True)
newdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5142 entries, 0 to 7997
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   NR           5142 non-null   int64  
 1   Age          5142 non-null   int64  
 2   HHIncomeMid  5142 non-null   float64
 3   Poverty      5142 non-null   float64
 4   HomeRooms    5142 non-null   float64
 5   Weight       5142 non-null   float64
 6   Height       5142 non-null   float64
 7   BMI          5142 non-null   float64
 8   Pulse        5142 non-null   float64
 9   BPSysAve     5142 non-null   float64
 10  BPDiaAve     5142 non-null   float64
 11  BPSys1       5142 non-null   float64
 12  BPDia1       5142 non-null   float64
 13  BPSys2       5142 non-null   float64
 14  BPDia2       5142 non-null   float64
 15  BPSys3       5142 non-null   float64
 16  BPDia3       5142 non-null   float64
 17  DirectChol   5142 non-null   float64
 18  UrineVol1    5142 non-null   float64
 19  UrineF

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf.dropna(inplace=True)


In [81]:
y_clas = newdf['Diabetes']

x_clas = newdf.drop(columns=['Diabetes'])
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(x_clas, y_clas, test_size=0.2, random_state=123)

In [82]:
X_train_c

Unnamed: 0,NR,Age,HHIncomeMid,Poverty,HomeRooms,Weight,Height,BMI,Pulse,BPSysAve,BPDiaAve,BPSys1,BPDia1,BPSys2,BPDia2,BPSys3,BPDia3,DirectChol,UrineVol1,UrineFlow1
5211,3897,50,87500.0,3.63,10.0,108.9,180.5,33.43,76.0,125.0,72.0,124.0,70.0,128.0,72.0,122.0,72.0,1.06,64.0,0.753
1475,291,65,100000.0,5.00,7.0,67.8,175.5,22.01,82.0,114.0,53.0,112.0,54.0,114.0,52.0,114.0,54.0,1.60,21.0,0.368
6290,5526,24,50000.0,2.33,4.0,92.6,176.4,29.80,70.0,102.0,56.0,102.0,58.0,102.0,56.0,102.0,56.0,1.06,33.0,0.068
7967,2429,64,100000.0,5.00,10.0,98.0,176.0,31.64,66.0,116.0,67.0,114.0,64.0,116.0,66.0,116.0,68.0,0.93,67.0,0.677
3271,9723,57,70000.0,3.13,5.0,70.8,169.3,24.70,48.0,113.0,65.0,114.0,66.0,116.0,66.0,110.0,64.0,1.14,97.0,0.890
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,407,22,60000.0,1.39,5.0,63.8,162.0,24.31,62.0,119.0,49.0,118.0,46.0,120.0,46.0,118.0,52.0,2.12,242.0,1.485
6275,8296,73,87500.0,2.20,5.0,82.9,169.9,28.70,78.0,123.0,65.0,124.0,62.0,126.0,62.0,120.0,68.0,1.06,23.0,0.299
2020,5615,50,87500.0,5.00,7.0,94.2,175.5,30.60,78.0,110.0,79.0,112.0,84.0,108.0,80.0,112.0,78.0,0.85,56.0,0.339
5340,6515,22,30000.0,1.67,5.0,76.1,154.9,31.70,70.0,119.0,51.0,116.0,50.0,120.0,48.0,118.0,54.0,1.34,54.0,0.551


In [83]:
random_f_class(X_train_c, y_train_c)

In [86]:
from sklearn.metrics import confusion_matrix, f1_score, classification_report
model_4 = joblib.load("random_f_class.pkl")

with open("random_f_class_best_params.pkl", 'rb') as f:
    model_4_best_params = pickle.load(f)

model_4_predictions = joblib.load("random_f_class_predictions.pkl")
model_4_predictions_cv = joblib.load("random_f_class_predictions_cv.pkl")
model_4_time = joblib.load("random_f_class_time.pkl")

print("TIME: ", round(model_4_time, 2), " sec\n")

print("CONFUSION MATRIX:")
print(confusion_matrix(y_train_c, model_4_predictions), "\n")

print("CROSS VALIDATION CONFUSION MATRIX:")
print(confusion_matrix(y_train_c, model_4_predictions_cv), "\n")

print("CLASSIFICATION REPORT:")
print(classification_report(y_train_c, model_4_predictions))

print("CROSS CLASSIFICATION REPORT:")
print(classification_report(y_train_c, model_4_predictions_cv))

model_4_score = f1_score(y_train_c, model_4_predictions_cv, average="weighted")
print("VALUE FOR COMPARISON: WEIGHTED F1_SCORE:", round(model_4_score, 2), "\n")
scores["model_4"] = [model_4_score, model_4]

print("OPTIMAL PARAMETERS:")
for param, value in model_4_best_params.items():
    print(param, ": ", value, sep="")
print("\n")


TIME:  21.17  sec

CONFUSION MATRIX:
[[ 348    0]
 [   0 3765]] 

CROSS VALIDATION CONFUSION MATRIX:
[[ 103  245]
 [  11 3754]] 

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       348
         1.0       1.00      1.00      1.00      3765

    accuracy                           1.00      4113
   macro avg       1.00      1.00      1.00      4113
weighted avg       1.00      1.00      1.00      4113

CROSS CLASSIFICATION REPORT:
              precision    recall  f1-score   support

         0.0       0.90      0.30      0.45       348
         1.0       0.94      1.00      0.97      3765

    accuracy                           0.94      4113
   macro avg       0.92      0.65      0.71      4113
weighted avg       0.94      0.94      0.92      4113

VALUE FOR COMPARISON: WEIGHTED F1_SCORE: 0.92 

OPTIMAL PARAMETERS:
classifier__max_depth: 20
classifier__n_estimators: 600
pca__n_components: 0.9




In [89]:
from sklearn.linear_model import LogisticRegression

def run_model_2(x, y):
    """Define model 2"""
    start = time.time()

    base_model = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # missing values are replaced by column median of train_set
        ("scaler", MinMaxScaler()),  # scale to interval [0, 1]
        ("pca", PCA()),
        ("classifier", LogisticRegression()),
    ])

    # possible fine-tuned parameters
    model_param = [
        # binary fit for each label
        {
            "pca__n_components": [0.9],
            "classifier__multi_class": ["ovr"],  # binary fit for each label
            "classifier__penalty": ["l2"],
            "classifier__solver": ["lbfgs"],
            "classifier__C": [0.1],
        },
        # softmax regression
        {
            "pca__n_components": [0.9],
            "classifier__multi_class": ["multinomial"],  # softmax regression
            "classifier__penalty": ["l2"],
            "classifier__solver": ["saga"],
            "classifier__C": [0.1],
        }
    ]

    model_gs = GridSearchCV(base_model, model_param, cv=10, n_jobs=-1, scoring="f1_weighted")
    model_gs.fit(x, y)

    model = model_gs.best_estimator_

    model_best_params = model_gs.best_params_

    model_predictions = model.predict(x)

    # attention: it is not allowed to use a fitted model (like model) in cross_val_predict because information gets through
    # unfitted model (clone(model)) is used for cross validation

    model_new = clone(model)
    model_predictions_cv = cross_val_predict(model_new, x, y, cv=3, n_jobs=-1)

    end = time.time()
    model_time = end - start

    # store model
    joblib.dump(model, filename="log_reg_cl.pkl")

    # use the following to store the best params of grid search
    with open("log_reg_cl_best_params.pkl", "wb") as ff:
        pickle.dump(model_best_params, ff)

    joblib.dump(model_predictions, filename="log_reg_cl_predictions.pkl")
    joblib.dump(model_predictions_cv, filename="log_reg_cl_predictions_cv.pkl")
    joblib.dump(model_time, filename="log_reg_cl_time.pkl")


In [91]:
run_model_2(X_train_c, y_train_c)


model_2 = joblib.load("log_reg_cl.pkl")

with open("log_reg_cl_best_params.pkl", 'rb') as f:
    model_2_best_params = pickle.load(f)

model_2_predictions = joblib.load("log_reg_cl_predictions.pkl")
model_2_predictions_cv = joblib.load("log_reg_cl_predictions_cv.pkl")
model_2_time = joblib.load("log_reg_cl_time.pkl")

print("TIME: ", round(model_2_time, 2), " sec\n")

print("CONFUSION MATRIX:")
print(confusion_matrix(y_train_c, model_2_predictions), "\n")

print("CROSS VALIDATION CONFUSION MATRIX:")
print(confusion_matrix(y_train_c, model_2_predictions_cv), "\n")

print("CLASSIFICATION REPORT:")
print(classification_report(y_train_c, model_2_predictions))

print("CROSS CLASSIFICATION REPORT:")
print(classification_report(y_train_c, model_2_predictions_cv))

model_2_score = f1_score(y_train_c, model_2_predictions_cv, average="weighted")
scores["model_2"] = [model_2_score, model_2]
print("VALUE FOR COMPARISON: WEIGHTED F1_SCORE:", round(model_2_score, 2), "\n")

print("OPTIMAL PARAMETERS:")
for param, value in model_2_best_params.items():
    print(param, ": ", value, sep="")
print("\n")

TIME:  0.17  sec

CONFUSION MATRIX:
[[   0  348]
 [   0 3765]] 

CROSS VALIDATION CONFUSION MATRIX:
[[   0  348]
 [   0 3765]] 

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       348
         1.0       0.92      1.00      0.96      3765

    accuracy                           0.92      4113
   macro avg       0.46      0.50      0.48      4113
weighted avg       0.84      0.92      0.87      4113

CROSS CLASSIFICATION REPORT:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       348
         1.0       0.92      1.00      0.96      3765

    accuracy                           0.92      4113
   macro avg       0.46      0.50      0.48      4113
weighted avg       0.84      0.92      0.87      4113

VALUE FOR COMPARISON: WEIGHTED F1_SCORE: 0.87 

OPTIMAL PARAMETERS:
classifier__C: 0.1
classifier__multi_class: ovr
classifier__penalty: l2
classifier__solver: lbfgs

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
