In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets
from numpy.linalg import eig

* **1.Download and read the "Real estate valuation data set.xlsx" dataset from the UCI repository. You can find the description of the features and targets on the UCI repository website. Split the dataset in train and test set (use your choice of splitting). Train a linear regression model and report the performance (use your choice of at least four performance metrics)**

In [2]:
#Read data
df = pd.read_excel('Real estate valuation data set.xlsx')

In [3]:
df.shape

(414, 8)

In [4]:
df.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [5]:
#Get overview of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   No                                      414 non-null    int64  
 1   X1 transaction date                     414 non-null    float64
 2   X2 house age                            414 non-null    float64
 3   X3 distance to the nearest MRT station  414 non-null    float64
 4   X4 number of convenience stores         414 non-null    int64  
 5   X5 latitude                             414 non-null    float64
 6   X6 longitude                            414 non-null    float64
 7   Y house price of unit area              414 non-null    float64
dtypes: float64(6), int64(2)
memory usage: 26.0 KB


In [6]:
df.describe()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,207.5,2013.148953,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,119.655756,0.281995,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,1.0,2012.666667,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,104.25,2012.916667,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,207.5,2013.166667,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,310.75,2013.416667,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,414.0,2013.583333,43.8,6488.021,10.0,25.01459,121.56627,117.5


In [7]:
df.isnull().sum()

No                                        0
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
#Standardize the dataset
scaler = StandardScaler()
scaler.fit(df)
df.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
#Get features and target
X = df.drop('Y house price of unit area', axis = 1)
y = df['Y house price of unit area']

In [12]:
#Split train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
lr_model = LinearRegression()

In [15]:
lr_model.fit(X_train, y_train)

In [16]:
lr_pred = lr_model.predict(X_test)

In [17]:
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report

print(f'Mean absolute error: {mean_absolute_error(y_test, lr_pred):.2f}')
print(f'Mean squared error:{mean_squared_error(y_test, lr_pred):.2f}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test, lr_pred)):.2f}')
print(f'R2 score: {r2_score(y_test, lr_pred):.2f}')

Mean absolute error: 6.56
Mean squared error:71.58
Root mean squared error: 8.46
R2 score: 0.58


* **2.Apply PCA on the dataset and select the first three principal components. Split the dataset into train and test using the same method used in Q1. Compare the performance of this model with the performance obtained in Q1.  Explain the outcome.**

In [18]:
from sklearn.decomposition import PCA

In [19]:
pca = PCA(n_components=3)

In [20]:
#train the model after implementing PCA
X_train_PCA = pca.fit_transform(X_train)
X_test_PCA = pca.transform(X_test)
lm = LinearRegression()
lm.fit(X_train_PCA, y_train)
y_PCA_pred = lm.predict(X_test_PCA)

In [21]:
#Let's see how those 3 components contribute to the whole set
cov_matrix = np.cov(X.T)

eigenvalues, eigenvectors = eig(cov_matrix)
total_values = sum(eigenvalues)
var_percentage = [(e/total_values)*100 for e in sorted(eigenvalues, reverse=True)]
print(var_percentage)

percentage_3_components = 0
for i in range(3):
    percentage_3_components += round(var_percentage[i],2)
print('The first 3 components would contribute {}% of the dataset'.format(percentage_3_components ))

[99.1010104906357, 0.8905803130362797, 0.008063017235087759, 0.00034126691072898513, 4.901063393515153e-06, 6.399684720224462e-09, 4.719125410964692e-09]
The first 3 components would contribute 100.0% of the dataset


In [22]:
#Get metric scores
print(f'Mean absolute error: {mean_absolute_error(y_test, y_PCA_pred):.2f}')
print(f'Mean squared error:{mean_squared_error(y_test, y_PCA_pred):.2f}')
print(f'Root mean squared error: {np.sqrt(mean_squared_error(y_test, y_PCA_pred)):.2f}')
print(f'R2 score: {r2_score(y_test, y_PCA_pred):.2f}')

Mean absolute error: 7.21
Mean squared error:92.03
Root mean squared error: 9.59
R2 score: 0.46


As we can see that the errors after doing PCA are greater than not applying. This could be because the collinearity was lost during the dimnensional reduction.

* **3.Load "IRIS " datasets from SKlearn and follow this link for the data description (features and target variable). Apply PCA on the dataset and select the first three principal components. Split the dataset in train and test set (use your choice of splitting). Train a logistic regression model and report the performance (use your choice of at least 4 performance metric)**

In [23]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [24]:
iris_dataset = datasets.load_iris()

In [25]:
X_iris = iris_dataset.data
y_iris = iris_dataset.target

In [26]:
scaler = StandardScaler()
X_iris = scaler.fit_transform(X_iris)

In [27]:
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(X_iris, y_iris, test_size=0.3, random_state=101)

* *Training the model without PCA*

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
lgr = LogisticRegression()

In [30]:
lgr.fit(X_iris_train, y_iris_train)

In [31]:
lgr_predict = lgr.predict(X_iris_test)

In [32]:
print(f'Classification report: {classification_report(y_iris_test, lgr_predict)}')

Classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.95      0.95      0.95        20
           2       0.92      0.92      0.92        12

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45



* *Training the model after implementing PCA*

In [33]:
pca = PCA(n_components=3)

In [34]:
X_PCA = pca.fit_transform(X_iris)

In [35]:
X_iris_PCA_train, X_iris_PCA_test, y_iris_PCA_train, y_iris_PCA_test = train_test_split(X_PCA, y_iris, test_size=0.3, random_state=101)
lgr_PCA = LogisticRegression()
lgr_PCA.fit(X_iris_PCA_train, y_iris_PCA_train)
y_iris_PCA_pred = lgr_PCA.predict(X_iris_PCA_test)

In [36]:
print(f'Classification report: {classification_report(y_iris_test, y_iris_PCA_pred)}')

Classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.95      0.97        20
           2       0.92      1.00      0.96        12

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



* **4. Apply L1 or L2 regulariser on the logistic regression model developed using the same train and test data used in Q3 and calculate performance of the new model. Compare performance of this model with the performance reported in Q3. Explain the outcome**

In [37]:
#Get the scores of the training and test set of linear regression
print(f"Linear Regression-Training set score: {lgr.score(X_iris_train, y_iris_train):.2f}")
print(f"Linear Regression-Test set score: {lgr.score(X_iris_test, y_iris_test):.2f}")

Linear Regression-Training set score: 0.97
Linear Regression-Test set score: 0.96


In [55]:
#Get the scores of training and test set from L2 regularization
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso

alpha = []
ridge_score = []

#find the suitable alpha for Ridge formula
for i in range(1, 20):
    ridge_model = Ridge(alpha = i * 0.1)
    ridge_model.fit(X_iris_train, y_iris_train)
    scores = cross_val_score(ridge_model, X_iris, y_iris, cv = 10)
    avg_score = np.mean(scores)*100
    ridge_score.append(avg_score)
    alpha.append(i * 0.1)
    
for i in range(0, len(alpha)):
    print(str(alpha[i])+' : '+str(ridge_score[i]))

0.1 : 15.98775363515023
0.2 : 15.992832918654104
0.30000000000000004 : 15.997307640793862
0.4 : 16.001236268097912
0.5 : 16.004669810651244
0.6000000000000001 : 16.00765298816279
0.7000000000000001 : 16.010225183868876
0.8 : 16.012421229849682
0.9 : 16.014272057430173
1.0 : 16.015805238887804
1.1 : 16.01704544103893
1.2000000000000002 : 16.01801480695582
1.3 : 16.01873327873785
1.4000000000000001 : 16.019218871677918
1.5 : 16.019487908147518
1.6 : 16.019555217937633
1.7000000000000002 : 16.0194343105376
1.8 : 16.01913752383547
1.9000000000000001 : 16.01867615292471


It looks like that alpha=1.6 returns the highest score.

In [57]:
ridge = Ridge(alpha=1.6).fit(X_iris_train, y_iris_train)
print(f"Ridge Regression-Training set score: {ridge.score(X_iris_train, y_iris_train):.2f}")
print(f"Ridge Regression-Test set score: {ridge.score(X_iris_test, y_iris_test):.2f}")

Ridge Regression-Training set score: 0.94
Ridge Regression-Test set score: 0.90


In [62]:
#find the suitable lambda for lasso regularisation
lasso_score = []
Lambda = []
 
for i in range(1, 20):
    lasso_model = Lasso(alpha = i * 0.1, tol = 0.0925)
    lasso_model.fit(X_iris_train, y_iris_train)
    scores = cross_val_score(lasso_model, X_iris, y_iris, cv = 10)
    avg_score = np.mean(scores)*100
    lasso_score.append(avg_score)
    Lambda.append(i * 0.1)

for i in range(0, len(Lambda)):
    print(str(Lambda[i])+' : '+str(lasso_score[i]))

0.1 : 15.238024702993075
0.2 : 14.501493748671296
0.30000000000000004 : 12.918036680481556
0.4 : 10.257741553964566
0.5 : 6.585704626068341
0.6000000000000001 : 1.842939042722297
0.7000000000000001 : -4.011659034536805
0.8 : -10.81629335384771
0.9 : 7.654320987654314
1.0 : 7.654320987654314
1.1 : 7.654320987654314
1.2000000000000002 : 7.654320987654314
1.3 : 7.654320987654314
1.4000000000000001 : 7.654320987654314
1.5 : 7.654320987654314
1.6 : 7.654320987654314
1.7000000000000002 : 7.654320987654314
1.8 : 7.654320987654314
1.9000000000000001 : 7.654320987654314


It looks like that alpha=0.1 returns the highest score.

In [65]:
#Get the scores of training and test set from L1 regularization
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1).fit(X_iris_train, y_iris_train)
print(f"Lasso Regression-Training set score: {lasso.score(X_iris_train, y_iris_train):.2f}")
print(f"Lasso Regression-Test set score: {lasso.score(X_iris_test, y_iris_test):.2f}")

Lasso Regression-Training set score: 0.92
Lasso Regression-Test set score: 0.89


The weight of lambda is still a bit too much. Let's try smaller lambda

In [68]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01).fit(X_iris_train, y_iris_train)
print(f"Lasso Regression-Training set score: {lasso.score(X_iris_train, y_iris_train):.2f}")
print(f"Lasso Regression-Test set score: {lasso.score(X_iris_test, y_iris_test):.2f}")

Lasso Regression-Training set score: 0.94
Lasso Regression-Test set score: 0.90


As we can see that, in this case both scores from L1 and L2 regularization are lower than linear regression. This could be because the iris dataset is pretty small and regularization may affect the training.