In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
df = pd.read_csv('AB_NYC_2019.csv')
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


### Exploratory Data Analysis

In [3]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [4]:
features = ['neighbourhood_group', 'room_type','latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 
            'calculated_host_listings_count', 'availability_365']

In [5]:
df = df[features]
df['above_average'] = np.where(df.price >= 152, 1,0)
df = df.fillna(0)
df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365,0
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355,1
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,0.0,1,365,0
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194,0
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.1,1,0,0


### Question 1

What is the most frequent observation (mode) for the column ```neighbourhood_group```?

In [6]:
df.neighbourhood_group.mode()

0    Manhattan
dtype: object

### Split the data

- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the ```train_test_split``` function) and set the seed to 42.
- Make sure that the target value ('price') is not in your dataframe.

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42) #20% of 80%

In [9]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

### Question 2

- Create the correlation matrix for the numerical features of your train dataset.
    - In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [10]:
df_corr = df.corr()

In [11]:
df_corr

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
latitude,1.0,0.084788,0.033939,0.024869,-0.015389,-0.018758,0.019517,-0.010983,0.056385
longitude,0.084788,1.0,-0.150019,-0.062747,0.059094,0.138516,-0.114713,0.082731,-0.267001
price,0.033939,-0.150019,1.0,0.042799,-0.047954,-0.050564,0.057472,0.081829,0.41815
minimum_nights,0.024869,-0.062747,0.042799,1.0,-0.080116,-0.124905,0.12796,0.144303,0.034715
number_of_reviews,-0.015389,0.059094,-0.047954,-0.080116,1.0,0.589407,-0.072376,0.172028,-0.051785
reviews_per_month,-0.018758,0.138516,-0.050564,-0.124905,0.589407,1.0,-0.047312,0.163732,-0.05604
calculated_host_listings_count,0.019517,-0.114713,0.057472,0.12796,-0.072376,-0.047312,1.0,0.225701,0.170075
availability_365,-0.010983,0.082731,0.081829,0.144303,0.172028,0.163732,0.225701,1.0,0.104255
above_average,0.056385,-0.267001,0.41815,0.034715,-0.051785,-0.05604,0.170075,0.104255,1.0


In [12]:
corr = df.corr().abs().unstack()
corr.sort_values(ascending = False)[:12]

above_average                   above_average                     1.000000
availability_365                availability_365                  1.000000
longitude                       longitude                         1.000000
price                           price                             1.000000
minimum_nights                  minimum_nights                    1.000000
reviews_per_month               reviews_per_month                 1.000000
calculated_host_listings_count  calculated_host_listings_count    1.000000
number_of_reviews               number_of_reviews                 1.000000
latitude                        latitude                          1.000000
number_of_reviews               reviews_per_month                 0.589407
reviews_per_month               number_of_reviews                 0.589407
price                           above_average                     0.418150
dtype: float64

### Make price binary

- We need to turn the price variable from numeric into binary.
- Let's create a variable ```above_average``` which is ```1``` if the price is above (or equal to) ```152```.

### Question 3

- Calculate the mutual information score with the (binarized) price for the two categorial variables that we have. Use the training set only.
- Which of these two variables has bigger score?
- Round it to 2 decimal digit using ```round(score, 2)```

In [13]:
from sklearn.metrics import mutual_info_score

In [14]:
def mutual_info_churn_score(series):
    return mutual_info_score(series,df_full_train.above_average)

In [15]:
mi = df_full_train[['neighbourhood_group', 'room_type']].apply(mutual_info_churn_score) #mutual information
mi.sort_values(ascending = False)

room_type              0.142390
neighbourhood_group    0.046223
dtype: float64

### Question 4

- Now let's train a logistic regression
- Remember that we have two categorical variables in the data. Include them using one-hot encoding.
- Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters.
    - ```model = LogisticRegression(solver='lbfgs', C = 1.0, random_state=42)```
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [16]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [17]:
y_train_bin = df_train.above_average.values
y_val_bin = df_val.above_average.values
y_test_bin = df_test.above_average.values

In [18]:
categorical = ['neighbourhood_group', 'room_type']
numerical = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']

In [19]:
dv = DictVectorizer(sparse = False)

train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [20]:
model = LogisticRegression(solver='lbfgs', C = 1.0, random_state=42, max_iter=10000)
model.fit(X_train,y_train_bin)

LogisticRegression(max_iter=10000, random_state=42)

In [21]:
model.coef_[0].round(3)

array([ 3.000e-03,  4.000e-03, -5.201e+00, -2.827e+00, -1.100e-02,
       -2.590e-01,  1.710e-01,  1.573e+00, -8.000e-03, -1.530e+00,
       -3.000e-03, -4.300e-02,  1.908e+00, -8.710e-01, -1.089e+00])

In [22]:
model.intercept_[0]

-0.0553244698209447

In [23]:
y_pred = model.predict_proba(X_val)[:,1]

In [24]:
price_decision = (y_pred >= 0.5).astype(int)

In [25]:
y_val_bin

array([0, 0, 1, ..., 0, 0, 0])

In [26]:
accuracy = (y_val_bin == price_decision).mean()
round(accuracy,2)

0.79

### Question 5

- We have 9 featuresL 7 numerical features and 2 categorical.
- Let's find the least useful one using the *feature elimination* technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference>
    - ```neighbourhood_group```
    - ```room_type```
    - ```number_of_reviews```
    - ```reviews_per_month```
    
note: the difference doesn't have to be positive.

In [27]:
def calculate_accuracy(features):
    dv = DictVectorizer(sparse = False)

    train_dicts = df_train[features].to_dict(orient = 'records')
    X_train = dv.fit_transform(train_dicts)

    val_dicts = df_val[features].to_dict(orient = 'records')
    X_val = dv.transform(val_dicts)
    
    model = LogisticRegression(solver='lbfgs', C = 1.0, random_state=42, max_iter=10000)
    model.fit(X_train,y_train_bin)
    
    y_pred = model.predict_proba(X_val)[:,1]
    price_decision = (y_pred >= 0.5).astype(int)
    accuracy = (y_val_bin == price_decision).mean()
    
    return accuracy

In [28]:
all_features = categorical + numerical

In [29]:
original_accuracy = calculate_accuracy(all_features)

In [30]:
accuracy_df = pd.DataFrame()
accuracy_df['index'] = ['accuracy', 'difference']

for i in range(len(all_features)):
    filtered_features = np.array(all_features[0:i] + all_features[i+1:])
    accuracy = calculate_accuracy(filtered_features)
    accuracy_df[all_features[i]] = [accuracy, np.abs(accuracy - original_accuracy)]
accuracy_df = accuracy_df.set_index('index').T

In [31]:
accuracy_df.sort_values(by=['difference'])

index,accuracy,difference
reviews_per_month,0.790572,0.000205
minimum_nights,0.791185,0.000818
number_of_reviews,0.79139,0.001023
calculated_host_listings_count,0.789345,0.001023
longitude,0.78689,0.003477
latitude,0.786788,0.003579
availability_365,0.781266,0.009101
neighbourhood_group,0.750997,0.03937
room_type,0.728807,0.06156


### Question 6 

- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column ```price```. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data.
- This model has a parameter ```alpha```. Let's try the following values: ```[0, 0.01, 0.1, 1, 10]```
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimals digits.

If there are multiple options, select the smallest ```alpha```.

In [32]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [33]:
dv = DictVectorizer(sparse = False)

train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [34]:
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)
y_test_log = np.log1p(y_test)

In [35]:
RMSE_df = pd.DataFrame()
RMSE_df['index'] = ['RMSE']

alpha = [0, 0.01, 0.1, 1, 10]

for i in alpha:
    model = Ridge(alpha = i)
    model.fit(X_train,y_train_log)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val_log,y_pred, squared = False)
    
    RMSE_df[str(i)] = round(rmse,3)
    
RMSE_df = RMSE_df.set_index('index').T

In [36]:
RMSE_df.sort_values(by = ['RMSE'])

index,RMSE
0.0,0.497
0.01,0.497
0.1,0.497
1.0,0.497
10.0,0.498
