In [1]:
# Libraries to be used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from sklearn import preprocessing
from sklearn.cluster import KMeans

### Task 1.
In this exercise, you will perform tasks faced by a data scientist working in the real estate
industry. Your job is to build a predictive model to estimate selling price for houses. 
You need to perform the following tasks:
- Exploratory data analysis: try to understand the different variables in the data. Identify the variables, based on exploratory data analysis methods, that you think have an effect on the price of the house
- Develop a regression model that the company can use to predict the selling price for new houses on the market. The model should contain only the variables that were found as potentially important in the previous step.

In [3]:
# Loading data
housing = pd.read_csv('./1/Data for practice exercises/Housing.csv')
housing.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [4]:
# Structure, missing values
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       10000 non-null  int64  
 1   numberOfRooms      10000 non-null  int64  
 2   hasYard            10000 non-null  int64  
 3   hasPool            10000 non-null  int64  
 4   floors             10000 non-null  int64  
 5   cityCode           10000 non-null  int64  
 6   cityPartRange      10000 non-null  int64  
 7   numPrevOwners      10000 non-null  int64  
 8   made               10000 non-null  int64  
 9   isNewBuilt         10000 non-null  int64  
 10  hasStormProtector  10000 non-null  int64  
 11  basement           10000 non-null  int64  
 12  attic              10000 non-null  int64  
 13  garage             10000 non-null  int64  
 14  hasStorageRoom     10000 non-null  int64  
 15  hasGuestRoom       10000 non-null  int64  
 16  price              1000

In [5]:
# Correlation, it seems square meeters is the only highly correlated variable
housing.corr()['price']

squareMeters         0.999999
numberOfRooms        0.009591
hasYard             -0.006119
hasPool             -0.005070
floors               0.001654
cityCode            -0.001539
cityPartRange        0.008813
numPrevOwners        0.016619
made                -0.007210
isNewBuilt          -0.010643
hasStormProtector    0.007496
basement            -0.003967
attic               -0.000600
garage              -0.017229
hasStorageRoom      -0.003485
hasGuestRoom        -0.000644
price                1.000000
Name: price, dtype: float64

In [None]:
# Some other plots, such as boxplot, but I would just conclude to square meters as the only predictor based on this dataset

In [6]:
# Create predictor and target
housing_X, housing_y = housing[['squareMeters']], housing.price

In [7]:
# Create the regression model

reg_1 = sm.add_constant(housing_X)
est_1 = sm.OLS(housing_y, reg_1)
est_1fit = est_1.fit()

# We can create a perfect model using only one predictor variable
# It seems in the area only the size of the real estate is important
print(est_1fit.summary())

  x = pd.concat(x[::order], 1)


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 7.775e+09
Date:                Mon, 21 Mar 2022   Prob (F-statistic):               0.00
Time:                        18:57:48   Log-Likelihood:                -95093.
No. Observations:               10000   AIC:                         1.902e+05
Df Residuals:                    9998   BIC:                         1.902e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         6460.8797     65.294     98.950   

### Task 2.
In this exercise, you will have to analyze a dataset (haberman.csv) that contains cases from
a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on
patients who had undergone surgery for breast cancer. The task is to determine if the patient survived
5 years or longer (positive) or if the patient died within 5 year (negative).
- Build a logistic regression classification model using all the three predictor variables (Age, Year of operation, Number of positive axillary nodes detected). Divide the data set into training (70 %) and test set (30 %), use random state = 0, and follow the process of building a classification model as discussed in the course.
- Create the confusion matrix, calculate classification performance measures, and check the accuracy for the test set.
- Perform the previous steps but now using only the two variables Age, and Number of positive axillary nodes detected. How did the accuracy of the model on the test set change? Based on this, do you think Year is an important predictor variable?

In [8]:
# Load the data

haberman = pd.read_csv('./1/Data for practice exercises/haberman.csv')
haberman.head()

Unnamed: 0,Age,Year_of_operation,Number_of_aux_nodes,Outcome
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [9]:
haberman.Outcome.value_counts()

1    225
2     81
Name: Outcome, dtype: int64

In [10]:
# Create data for model building

X = haberman[['Age', ' Year_of_operation', 'Number_of_aux_nodes']]
y = haberman.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train

Unnamed: 0,Age,Year_of_operation,Number_of_aux_nodes
213,58,58,3
188,55,69,22
137,51,59,3
249,63,63,0
76,44,61,0
...,...,...,...
251,63,61,9
192,56,60,0
117,49,62,0
47,41,59,8


In [11]:
# Creating the model

logistic_regression = LogisticRegression(solver = 'lbfgs')

model_1 = logistic_regression.fit(X_train, y_train)

# Predict for train and test set

y_train_predict = logistic_regression.predict(X_train)
y_test_predict = logistic_regression.predict(X_test)

# Print results

print(metrics.classification_report(y_train, y_train_predict))

print(metrics.classification_report(y_test, y_test_predict))


              precision    recall  f1-score   support

           1       0.80      0.96      0.87       167
           2       0.50      0.15      0.23        47

    accuracy                           0.78       214
   macro avg       0.65      0.55      0.55       214
weighted avg       0.73      0.78      0.73       214

              precision    recall  f1-score   support

           1       0.63      0.98      0.77        58
           2       0.50      0.03      0.06        34

    accuracy                           0.63        92
   macro avg       0.57      0.51      0.41        92
weighted avg       0.58      0.63      0.51        92



In [13]:
# Model without year

X = haberman[['Age', 'Number_of_aux_nodes']]
y = haberman.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

logistic_regression = LogisticRegression(solver = 'lbfgs')

model_1 = logistic_regression.fit(X_train, y_train)

# Predict for train and test set

y_train_predict = logistic_regression.predict(X_train)
y_test_predict = logistic_regression.predict(X_test)

# Print results
# We get better results without year in terms of accuracy at least
print(metrics.classification_report(y_train, y_train_predict))

print(metrics.classification_report(y_test, y_test_predict))

              precision    recall  f1-score   support

           1       0.80      0.96      0.88       167
           2       0.54      0.15      0.23        47

    accuracy                           0.79       214
   macro avg       0.67      0.56      0.55       214
weighted avg       0.74      0.79      0.73       214

              precision    recall  f1-score   support

           1       0.64      0.98      0.78        58
           2       0.67      0.06      0.11        34

    accuracy                           0.64        92
   macro avg       0.65      0.52      0.44        92
weighted avg       0.65      0.64      0.53        92

