In [None]:

import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from matplotlib import cm
from scipy import stats



Here, in contrast to the example we solved in class, we focus on a classification problem.
To do so, we assign house prices to three classes: cheap, medium and expensive.
In the next step we build a model that predicts the class of a house from its features. A robust implementation must consider feature engineering, data cleaning, and cross-validation.

In [None]:
# scaling and train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder, OneHotEncoder

# components for ANN model
from keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from tensorflow.keras.utils import plot_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# evaluation on test data
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score

In [None]:
 from google.colab import drive
drive.mount('/content/drive')

In [None]:
import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation notices, so comment it out while debugging.
warnings.filterwarnings('ignore')

In [None]:
!ls /content/drive/MyDrive/Colab\ Notebooks/Housing\ data

Read training data with labels into `train_full_df` and data without labels into `new_input_df`



In [None]:
# Both CSV files live in the same Drive folder; keeping the folder in one place
# means that moving the data only requires editing a single line.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Housing data"

train_full_df = pd.read_csv(f"{DATA_DIR}/train.csv")  # data with labels
new_input_df = pd.read_csv(f"{DATA_DIR}/test.csv")  # data without labels

In [None]:
# Display the labelled data for a first visual inspection.
train_full_df

In [None]:
# (rows, columns) of the labelled data.
train_full_df.shape

In [None]:
# (rows, columns) of the unlabelled data read from test.csv.
new_input_df.shape

# Data pre-processing

As a first step in developing the ML model, we pre-process the data in the same way as for the regression problem in the class assignment.

In [None]:
# Column to predict; kept as a list so it can be concatenated with other column lists.
target = ['SalePrice']

# Every column except the row identifier and the target is a model input.
features = train_full_df.drop(columns=['Id'] + target).columns

# One row per feature: its name (column "index") and its pandas dtype (column "datatype").
dataset_types = pd.DataFrame(train_full_df[features].dtypes, columns=['datatype']).reset_index()
dataset_types

We need to separate numerical and categorical features, since they are treated differently by scaling and by the models.

In [None]:
# Names of the features stored as float64 or int64.
numeric_features = (
    dataset_types
    .rename(columns={"index": "feature"})
    .feature[dataset_types.datatype.eq('float64') | dataset_types.datatype.eq('int64')]
)

# Numeric part of the labelled data; missing entries are replaced by the column mean.
num_data = train_full_df[numeric_features]
num_features = num_data.fillna(num_data.mean()).to_numpy()


We apply `StandardScaler` to standardize the numerical features by removing the mean and scaling to unit variance.


In [None]:
# Learn the per-column statistics from the numeric matrix, then standardise that same matrix.
# NOTE(review): the scaler is fitted on all labelled rows, i.e. before the train/test split.
scaler = StandardScaler().fit(num_features)
num_features_scaled = scaler.transform(num_features)
num_features_scaled

We apply `LabelEncoder` followed by `OneHotEncoder` to the categorical features: each category is first mapped to an integer between 0 and n_categories-1 and then expanded into a one-hot numeric array.




In [None]:
def encode_one_categorical_feature(column):
    """One-hot encode a single categorical column.

    Missing entries are replaced by the placeholder category 'unk'; the
    categories are then mapped to integer codes and each code is expanded
    into its own indicator column.

    Parameters
    ----------
    column : pandas.Series (or any object offering ``fillna``)
        Categorical values to encode.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per entry of ``column`` and one column per
        distinct category (including 'unk' when values were missing).
    """
    filled = column.fillna('unk')
    integer_codes = LabelEncoder().fit_transform(filled)
    # OneHotEncoder expects 2-D input, hence the single-column reshape.
    codes_as_column = integer_codes.reshape(-1, 1)
    return OneHotEncoder(sparse_output=False).fit_transform(codes_as_column)

In [None]:
# Features whose dtype is "object" are treated as categorical.
categorical_features = (
    dataset_types
    .rename(columns={"index": "feature"})
    .feature[dataset_types.datatype.eq('object')]
)
cat_data = train_full_df[categorical_features]
cat_data_new = new_input_df[categorical_features]

# Stack labelled and unlabelled rows so that every category seen in either file
# gets its own one-hot column; the first len(cat_data) rows are the labelled ones.
cat_data_combined = pd.concat([cat_data, cat_data_new], ignore_index=True)
cat_features_combined = np.hstack(
    [encode_one_categorical_feature(cat_data_combined[name]) for name in cat_data.columns]
)


In [None]:
# Keep only the rows that came from the labelled data: the combined matrix
# stacks the len(cat_data) labelled rows first, followed by the unlabelled rows.
# A plain integer index would pick out one single row instead.
cat_features = cat_features_combined[:len(cat_data), :]

In [None]:
# Shape of the one-hot block belonging to the labelled rows (the first len(cat_data) rows).
cat_features_combined[:len(cat_data),:].shape

In [None]:
# Shape of the one-hot block belonging to the unlabelled rows (everything after the first len(cat_data) rows).
cat_features_combined[len(cat_data):,:].shape

In [None]:
# Number of labelled rows.
len(cat_data)

In [None]:
# Rebuild the categorical slices of both files and their row-wise stack,
# then print the three shapes to check that the row counts add up.
cat_data, cat_data_new = (frame[categorical_features] for frame in (train_full_df, new_input_df))
cat_data_combined = pd.concat([cat_data, cat_data_new], ignore_index=True)

print(*(frame.shape for frame in (cat_data, cat_data_new, cat_data_combined)))

In [None]:
# Features whose dtype is "object" are treated as categorical.
categorical_features = (
    dataset_types
    .rename(columns={"index": "feature"})
    .feature[dataset_types.datatype.eq('object')]
)
cat_data = train_full_df[categorical_features]

# NOTE: this rebinds cat_features, here encoded from the labelled rows only,
# so the one-hot columns reflect just the categories present in train.csv.
cat_features = np.hstack(
    [encode_one_categorical_feature(cat_data[name]) for name in cat_data.columns]
)
cat_features

In [None]:
# (rows, columns) of the categorical part of the labelled data.
cat_data.shape

In [None]:
# (rows, columns) of the categorical part of the unlabelled data.
new_input_df[categorical_features].shape

In [None]:
# Names of the features treated as categorical.
categorical_features

In [None]:
# For every categorical feature, print its name followed by the values that
# occur in the labelled data but never in the unlabelled data.
for feature_name in categorical_features:
    labelled_values = set(train_full_df[feature_name].unique())
    unlabelled_values = set(new_input_df[feature_name].unique())
    print(feature_name)
    print(labelled_values - unlabelled_values)

In [None]:
# Distinct Condition2 values in the labelled data.
train_full_df["Condition2"].unique()

In [None]:
# Distinct Condition2 values in the unlabelled data, for comparison with the labelled data.
new_input_df["Condition2"].unique()

In [None]:
# Summarise how the columns of the labelled data split into numeric and categorical features.
feature_counts = [
    ("There are {} features in this dataset", len(train_full_df.columns)),
    ("{} features are numeric", len(numeric_features)),
    ("{} features are categorical.", len(categorical_features)),
]
for template, count in feature_counts:
    print(template.format(count))
print("The last two are the target, which is numeric, and the id column.")

Next we stack the numerical and categorical features together into the `X` variable and the targets into `y`. Then we split the data into train and test sets.





In [None]:
# Feature matrix: scaled numeric columns first, one-hot categorical columns after them.
X = np.concatenate([num_features_scaled, cat_features], axis=1)


Next we need to transform the numerical SalePrice target into a categorical one. For this we first calculate the quartiles of SalePrice.
Then we assign each house to a category depending on which quartile range its SalePrice falls into.
We label cheap houses with 0, medium with 1 and expensive with 2.

In [None]:
# Minimum, the three quartiles and the maximum of the sale price;
# qn[1] and qn[3] are the class boundaries used by the labelling cells.
qn = np.quantile(a=train_full_df[target].to_numpy(), q=[0, 0.25, 0.5, 0.75, 1])

We create `Pricelabel` column with the default value 0. This would represent the lowest "cheap" houses class with label `0`.



In [None]:
# Start every house in class 0 ("cheap"); the following cells overwrite the label for the other classes.
train_full_df["Pricelabel"]=0


We set `Pricelabel` to `1` (medium price class) for houses whose SalePrice lies strictly between the 1st and the 3rd quartile.




In [None]:
# Class 1 ("medium"): sale price strictly between the first and third quartile.
sale_price = train_full_df["SalePrice"]
train_full_df.loc[sale_price.gt(qn[1]) & sale_price.lt(qn[3]), "Pricelabel"] = 1

We set `Pricelabel` to `2` (expensive price class) for houses whose SalePrice is at or above the 3rd quartile.


In [None]:
# Class 2 ("expensive"): sale price at or above the third quartile.
train_full_df.loc[train_full_df["SalePrice"].ge(qn[3]), "Pricelabel"] = 2


In [None]:
# Number of houses assigned to class 2 (expensive).
len(train_full_df[train_full_df["Pricelabel"]==2])

In [None]:
# Total number of labelled houses, for comparison with the per-class count.
len(train_full_df)

# Question part 1



1. Which quantile range of SalePrice do the data with Pricelabel = 0 belong to?
2. How many samples (data points) are there in each class? Is the dataset balanced?



In [None]:
# Integer class labels (0, 1, 2) as the target vector.
y = train_full_df["Pricelabel"].to_numpy()

# Hold out a quarter of the labelled rows for the final evaluation; the fixed
# seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=404,
)


# ANN model

We create a function that plots the training and validation loss as a function of the number of epochs, so that we can see when the model starts to overfit.

In [None]:
def plot_history(history):
    """Plot the training and validation loss recorded during ``model.fit``.

    Parameters
    ----------
    history : object
        Result of ``model.fit``; its ``history`` attribute must map the keys
        ``'loss'`` and ``'val_loss'`` to one value per epoch.
    """
    plt.plot(history.history['loss'], 'b')  # training loss in blue
    plt.plot(history.history['val_loss'], 'r')  # validation loss in red
    # Both curves are losses, so the title names the loss rather than accuracy.
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

Next we create a Keras model. The function takes the train and test splits, compiles the model, fits it, plots the training history, calculates the model loss on the test data and returns the training history and the trained model (in that order).

In [None]:
def keras_model(X_train, X_test, y_train, y_test,
                hidden_units=(128, 128, 128), dropout_rate=0.2,
                num_epochs=50, batch_size=128, num_classes=3):
    """Build, train and evaluate a fully connected softmax classifier.

    Parameters
    ----------
    X_train, X_test : 2-D arrays
        Feature matrices. The width of the input layer is taken from
        ``X_train.shape[1]``, so the function keeps working when the number of
        one-hot columns changes.
    y_train, y_test : 1-D arrays of int
        Class labels in ``0 .. num_classes - 1`` (not one-hot encoded, as
        required by the sparse categorical cross-entropy loss).
    hidden_units : sequence of int, optional
        Number of neurons of each hidden Dense layer, in order.
    dropout_rate : float, optional
        Fraction of units dropped after the input and after every hidden layer.
    num_epochs : int, optional
        Number of training epochs.
    batch_size : int, optional
        Number of samples per gradient update.
    num_classes : int, optional
        Number of output classes (width of the softmax layer).

    Returns
    -------
    history, model
        In this order: the object returned by ``model.fit`` and the trained model.
    """
    # Input layer: one input per feature column.
    inputs = Input(shape=(X_train.shape[1], ))
    # Dropout makes the network more robust against overfitting by randomly
    # switching off a fraction `dropout_rate` of the units during training.
    x = Dropout(dropout_rate)(inputs)

    # Hidden blocks: Dense layer -> ReLU activation -> dropout.
    for units in hidden_units:
        x = Dense(units)(x)
        x = Activation("relu")(x)
        x = Dropout(dropout_rate)(x)

    # Output layer: one neuron per class; softmax turns the outputs into
    # pseudo-probabilities (the multi-class analogue of the sigmoid).
    x = Dense(num_classes)(x)
    predictions = Activation("softmax")(x)

    # Collect the layers into a model by naming its input and output.
    model = Model(inputs=[inputs], outputs=[predictions])

    # sparse_categorical_crossentropy is the common loss for integer-labelled
    # multi-class classification.
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

    # validation_split holds out 20% of the training set for validation; this
    # set is different from the test set used for the final evaluation.
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs,
                        validation_split=0.2, verbose=1)

    plot_history(history)  # plot training curves

    # Evaluate on the hold-out test set, which was not used for training or
    # validation. With no extra metrics compiled, the score is the loss value.
    score = model.evaluate(X_test, y_test, verbose=0)
    print("Test loss (sparse categorical cross-entropy) is {:.2e}".format(score))
    return history, model

In [None]:
# NOTE(review): keras_model returns (history, model) in that order, so after this
# unpacking `model` holds the object returned by model.fit and `history` holds the
# trained network. The prediction cell relies on this binding (it calls
# model.model.predict), so both places must be changed together if the names are fixed.
model, history = keras_model(X_train, X_test, y_train, y_test)

# Question part 2



1.   Summarize the differences in the Keras implementation between the regression model and the classification model.
2.   Analyse the training and validation curves. At which epoch should we stop training?



# Model performance analysis

In the next section we analyse prediction of the model on the test (hold-out) set.

In [None]:
# `model` is bound to the first value returned by keras_model (the result of
# model.fit), so the trained network is presumably reached through its `.model`
# attribute here — keep in sync with the unpacking in the training cell.
predicted = model.model.predict(X_test) # prediction of the model on hold-out test data set


As a result of prediction we get matrix with 3 columns with numbers from 0 to 1. The higher the number, the higher is pseudo-probability to belong to one of 3 classes.

# Question part 3


1. Print the `predicted` matrix. How would you interpret the 3 column values in `predicted`? Can they be negative? Which number is their sum close to?

In order to assign the exact class we apply the `np.argmax` function, which returns the class with the highest probability.


In [None]:
# For every test sample pick the index of the largest entry along the last axis.
predicted_classes = predicted.argmax(axis=-1)


Next we compute and plot the confusion matrix. To compute it we call
```
confusion_matrix(y_true, y_predicted)
```


In [None]:
# Count, for every actual class, how often each class was predicted on the test set.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predicted_classes)
print(conf_matrix)

In the next section we write a function that plots the confusion matrix as a colour-coded grid with the count written in each cell.

In [None]:
def plot_conf_matrix(conf_matrix):
    """Draw a confusion matrix as a colour-coded grid with the count written in each cell.

    Parameters
    ----------
    conf_matrix : 2-D numpy array
        Counts; rows are labelled "Actuals" and columns "Predictions".
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.matshow(conf_matrix, cmap=cm.jet, alpha=0.3)

    # Text position: x is the column index, y is the row index.
    for (row, col), count in np.ndenumerate(conf_matrix):
        ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

    ax.set_xlabel('Predictions', fontsize=18)
    ax.set_ylabel('Actuals', fontsize=18)
    ax.set_title('Confusion Matrix', fontsize=18)
    plt.show()


plot_conf_matrix(conf_matrix)

# Your code part 1

Analyse confusion matrix and fill in numbers for the print out statements below

In [None]:
# Exercise template: fill in the second argument of each print call.
# conf_matrix was built as confusion_matrix(y_test, predicted_classes); rows are
# the actual classes and columns the predicted classes (as labelled in
# plot_conf_matrix). Class 0 = cheap, 1 = medium, 2 = expensive.
print('Number of correct predicted cheap houses: ',     )
print('Number of correct predicted medium houses: ',     )
print('Number of correct predicted expensive houses: ',     )

print('Number of predicted false negative cheap houses: ',     )
print('Number of predicted false positives cheap houses: ',     )

print('Number of predicted false negative medium houses: ',     )
print('Number of predicted false positives medium houses: ',     )

print('Number of predicted false negative expensive houses: ',     )
print('Number of predicted false positives expensive houses: ',     )

print('Class with the best accuracy: ',     )
print('Class with the worst accuracy: ',     )

# Your code part 2

In order to quantify how good our classification is, we can use the following metrics:


*   precision - the number of true positives
divided by the total number of elements labeled as belonging to the positive class: `precision=true_positive/(true_positive+false_positive)`
*   recall - the number of true positives
divided by the total number of elements that actually belong to the positive class: `recall=true_positive/(true_positive+false_negative)`
*    F1 score - is given by the ratio of numerator:
`2*precision*recall` to denominator: `(precision + recall)`. This is a better choice for unbalanced classes.

Implement these metrics below for each class, based on the confusion matrix above. Then calculate the average F1 score over the 3 classes.

In [None]:
# Per-class metrics derived from conf_matrix, whose rows are the actual classes
# and whose columns are the predicted classes. class_1 / class_2 / class_3
# correspond to the labels 0 (cheap), 1 (medium) and 2 (expensive).
true_positives = np.diag(conf_matrix)
predicted_totals = conf_matrix.sum(axis=0)  # true positives + false positives, per class
actual_totals = conf_matrix.sum(axis=1)  # true positives + false negatives, per class

# NOTE: a class that is never predicted has predicted_totals == 0, which makes
# its precision (and F1 score) nan.
precision_class_1, precision_class_2, precision_class_3 = true_positives / predicted_totals

recall_class_1, recall_class_2, recall_class_3 = true_positives / actual_totals

F1_class_1 = 2 * precision_class_1 * recall_class_1 / (precision_class_1 + recall_class_1)
F1_class_2 = 2 * precision_class_2 * recall_class_2 / (precision_class_2 + recall_class_2)
F1_class_3 = 2 * precision_class_3 * recall_class_3 / (precision_class_3 + recall_class_3)

# Unweighted mean of the three per-class F1 scores.
F1_average = (F1_class_1 + F1_class_2 + F1_class_3) / 3

# Your code part 3



1.  Implement below following architectures of ANNs for classification:


*   3 hidden layers with 256 neurons each, dropout with 40% probability and relu activation function
*    5 hidden layers with 128 neurons each, dropout with 20% probability and relu activation function
*    2 hidden layers with 512 neurons each, dropout with 20% probability and relu activation function

2.   Analyse performance of the models above based on the test data set:

*   Plot training curves for each model
*   Calculate and draw confusion matrix
*   Calculate average F1 score for each of the ANNs above
*   Compare training curves, confusion matrix and F1 score between the models

3. Choose the best model based on the F1 score. At which epoch should training be stopped?

4. Apply the best performing model to predict classes for the `new_input_df` dataframe, which we read from `test.csv` at the beginning of the assignment. Important: apply the same pre-processing to the input features, so that they are compatible with the developed model.





# Question part 4

Summarize below the possible applications of the classification model you have just built:


1.   In which case it could be applied? When one should prefer classification model over regression model for houses price prediction?
2.   How one can improve this model? What additional data one can collect and include ?
3.   How you can imagine to deploy or use such a model?
4.   Suggest a way to measure business impact of using the model for houses price prediction.



