# Homework

In [1]:
# Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

from IPython.display import display

# Data preparation

- Select only the features from above and fill in the missing values with 0.
- Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
- Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
- Create a new column population_per_household by dividing the column population by the column households from dataframe.

In [2]:
# import dataset
df = pd.read_csv('/Users/admin/Documents/Data Science/ml-zoomcamp/Module 3 ML for classification/californaia-housing-price.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# convert the observations of ocean_proximity to lower case and replace the empty space with _

df['ocean_proximity'] = df['ocean_proximity'].str.replace(' ', '_').str.lower()

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,near_bay
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,near_bay
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,near_bay
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,near_bay
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,near_bay


In [5]:
#check for missing values

df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
# filling the missing values with zero (0)
df['total_bedrooms'] = df['total_bedrooms'].fillna(0)

In [7]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [8]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']

In [9]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,near_bay,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,near_bay,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,near_bay,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,near_bay,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,near_bay,6.281853,0.172096,2.181467


# Question 1

What is the most frequent observation (mode) for the column ocean_proximity?

Options:

- NEAR BAY
- <1H OCEAN
- INLAND
- NEAR OCEAN

In [10]:
df['ocean_proximity'].mode()

0    <1h_ocean
Name: ocean_proximity, dtype: object

# Split the data

- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (median_house_value) is not in your dataframe.


In [11]:
# Splitting the data into train/val/test dataset
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Resetting the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


In [12]:
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-119.67,34.43,39.0,1467.0,381.0,1404.0,374.0,2.3681,241400.0,<1h_ocean,3.92246,0.259714,3.754011
1,-118.32,33.74,24.0,6097.0,794.0,2248.0,806.0,10.1357,500001.0,near_ocean,7.564516,0.130228,2.789082
2,-121.62,39.13,41.0,1317.0,309.0,856.0,337.0,1.6719,64100.0,inland,3.908012,0.234624,2.540059
3,-118.63,34.24,9.0,4759.0,924.0,1884.0,915.0,4.8333,277200.0,<1h_ocean,5.201093,0.194158,2.059016
4,-122.3,37.52,38.0,2769.0,387.0,994.0,395.0,5.5902,417000.0,near_ocean,7.010127,0.139762,2.516456


# Question 2

- Create the correlation matrix for the numerical features of your train dataset.
- In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

Options:

- total_bedrooms and households
- total_bedrooms and total_rooms
- population and households
- population_per_household and total_rooms

In [13]:
# correlation matrix

df_train.select_dtypes('number').corr().drop(columns='median_house_value')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.925005,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
latitude,-0.925005,1.0,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
housing_median_age,-0.099812,0.002477,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,0.036449,-0.025914,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,0.06384,-0.05973,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,0.09167,-0.100272,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,0.049762,-0.063529,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.016426,-0.076805,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
median_house_value,-0.044328,-0.144455,0.106235,0.128442,0.041154,-0.033653,0.056161,0.689202,0.179062,-0.256201,-0.030561
rooms_per_household,-0.034814,0.119118,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801


total_bedrooms and households with 0.979399 correlation coefficient

# Make median_house_value binary

- We need to turn the median_house_value variable from numeric into binary.
- Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.

In [14]:
mean_vale = df['median_house_value'].mean()

In [15]:
df_train['above_average'] = (df_train['median_house_value'] > mean_vale).astype('int')
df_test['above_average'] = (df_test['median_house_value'] > mean_vale).astype('int')
df_val['above_average'] = (df_val['median_house_value'] > mean_vale).astype('int')

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [16]:
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,-119.67,34.43,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1h_ocean,3.92246,0.259714,3.754011,1
1,-118.32,33.74,24.0,6097.0,794.0,2248.0,806.0,10.1357,near_ocean,7.564516,0.130228,2.789082,1
2,-121.62,39.13,41.0,1317.0,309.0,856.0,337.0,1.6719,inland,3.908012,0.234624,2.540059,0
3,-118.63,34.24,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1h_ocean,5.201093,0.194158,2.059016,1
4,-122.3,37.52,38.0,2769.0,387.0,994.0,395.0,5.5902,near_ocean,7.010127,0.139762,2.516456,1


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 20640 non-null  float64
 1   latitude                  20640 non-null  float64
 2   housing_median_age        20640 non-null  float64
 3   total_rooms               20640 non-null  float64
 4   total_bedrooms            20640 non-null  float64
 5   population                20640 non-null  float64
 6   households                20640 non-null  float64
 7   median_income             20640 non-null  float64
 8   median_house_value        20640 non-null  float64
 9   ocean_proximity           20640 non-null  object 
 10  rooms_per_household       20640 non-null  float64
 11  bedrooms_per_room         20640 non-null  float64
 12  population_per_household  20640 non-null  float64
dtypes: float64(12), object(1)
memory usage: 2.0+ MB


# Question 3

- Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
- What is the value of mutual information?
- Round it to 2 decimal digits using round(score, 2)

Options:

- 0.263
- 0.00001
- 0.101
- 0.15555

In [18]:
round(mutual_info_score(df_train.above_average, df_train.ocean_proximity), 2)

0.1

# Question 4

- Now let's train a logistic regression
- Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
- model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

Options:

- 0.60
- 0.72
- 0.84
- 0.95


In [19]:
# getting y vector
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [20]:
# Encoding categorical variable

dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [21]:
# Instantiate Logistic Regression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

# Fit model
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [22]:
# Model accuracy score
round(model.score(X_val, y_val), 2)

0.84

In [23]:
# Model accuracy score
acc_score = accuracy_score(y_val, model.predict(X_val))
round(acc_score, 2)

0.84

# Question 5

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?

    - total_rooms
    - total_bedrooms
    - population
    - households
    
    
- note: the difference doesn't have to be positive



In [24]:
# Drop columns
df_total_room = df_train.drop(columns='total_rooms')
df_total_bedrooms = df_train.drop(columns='total_bedrooms')
df_population = df_train.drop(columns='population')
df_households = df_train.drop(columns='households')

In [25]:
# Accuracy score with all features

# Encoding categorical variable
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Instantiate Logistic Regression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
# Fit model
model.fit(X_train, y_train)

# Model accuracy score
original_score = accuracy_score(y_val, model.predict(X_val))
original_score = round(original_score, 4)
original_score

0.836

In [26]:
# Accuracy score without total_rooms

# Encoding categorical variable
dv = DictVectorizer(sparse=False)
train_dict = df_total_room.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Instantiate Logistic Regression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
# Fit model
model.fit(X_train, y_train)

# Model accuracy score
total_rooms_score = accuracy_score(y_val, model.predict(X_val))
total_rooms_score = round(total_rooms_score, 4)
total_rooms_score

0.8375

In [27]:
# Accuracy score without total_bedrooms

# Encoding categorical variable
dv = DictVectorizer(sparse=False)
train_dict = df_total_bedrooms.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Instantiate Logistic Regression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
# Fit model
model.fit(X_train, y_train)

# Model accuracy score
total_bedrooms_score = accuracy_score(y_val, model.predict(X_val))
total_bedrooms_score = round(total_bedrooms_score, 4)
total_bedrooms_score

0.8362

In [28]:
# Accuracy score without population

# Encoding categorical variable
dv = DictVectorizer(sparse=False)
train_dict = df_population.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Instantiate Logistic Regression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
# Fit model
model.fit(X_train, y_train)

# Model accuracy score
population_score = accuracy_score(y_val, model.predict(X_val))
population_score = round(population_score, 6)
population_score

0.826308

In [29]:
# Accuracy score without households

# Encoding categorical variable
dv = DictVectorizer(sparse=False)
train_dict = df_households.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Instantiate Logistic Regression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
# Fit model
model.fit(X_train, y_train)

# Model accuracy score
households_score = accuracy_score(y_val, model.predict(X_val))
households_score = round(households_score, 6)
households_score

0.833333

In [30]:
# Difference between original accuracy and accuracy with feature

original_total_rooms = original_score - total_rooms_score
original_total_bedrooms = original_score - total_bedrooms_score
original_populations = original_score - population_score
original_households = original_score - households_score

print('The difference between original accuracy score and accuracy score without total_rooms :', round(original_total_rooms, 4))
print('The difference between original accuracy score and accuracy score without total_bedrooms :', round(original_total_bedrooms, 4))
print('The difference between original accuracy score and accuracy score without populations :', round(original_populations, 4))
print('The difference between original accuracy score and accuracy score without households :', round(original_households, 4))

The difference between original accuracy score and accuracy score without total_rooms : -0.0015
The difference between original accuracy score and accuracy score without total_bedrooms : -0.0002
The difference between original accuracy score and accuracy score without populations : 0.0097
The difference between original accuracy score and accuracy score without households : 0.0027


total_bedrooms has the smallest difference

# Question 6

- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
- If there are multiple options, select the smallest alpha.

Options:

- 0
- 0.01
- 0.1
- 1
- 10

In [31]:
# Splitting the data into train/val/test dataset
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Resetting the index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# we make it our y vector
y_train = np.log1p(df_train['median_house_value']).values
y_val = np.log1p(df_val['median_house_value']).values
y_test = np.log1p(df_test['median_house_value']).values

# We drop msrp from our dataset to make them our X features
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [32]:
# Encoding categorical variable
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [33]:
alpha_para = [0, 0.01, 0.1, 1, 10]

In [34]:
for p in alpha_para:
    
    # Instantiate model
    model = Ridge(alpha=p, solver="sag", random_state=42)
    
    # Fit model
    model.fit(X_train, y_train)
    
    # predict X_val
    pred = model.predict(X_val)
    
    # RMSE
    mse = mean_squared_error(y_val, pred)
    rmse = round(np.sqrt(mse), 3)

    print('The RMSE of',p, ':', rmse)

The RMSE of 0 : 0.524
The RMSE of 0.01 : 0.524
The RMSE of 0.1 : 0.524
The RMSE of 1 : 0.524
The RMSE of 10 : 0.524
