## Homework

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw listings file, relative to this notebook.
data_file = '../data/airbnb_data.csv'

# Columns kept for the homework; 'price' is turned into the target further down.
feature_set = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'neighbourhood_group',
    'room_type',
    'price',
]

# Read the CSV, keep only the selected columns (in the order listed above)
# and replace every missing value with 0.
df = pd.read_csv(data_file)[feature_set].fillna(0)
df.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group,room_type,price
0,40.64749,-73.97237,1,9,0.21,6,365,Brooklyn,Private room,149
1,40.75362,-73.98377,1,45,0.38,2,355,Manhattan,Entire home/apt,225
2,40.80902,-73.9419,3,0,0.0,1,365,Manhattan,Private room,150
3,40.68514,-73.95976,1,270,4.64,1,194,Brooklyn,Entire home/apt,89
4,40.79851,-73.94399,10,9,0.1,1,0,Manhattan,Entire home/apt,80


In [3]:
# Show the dtype of each selected column to see which are numeric and which are object-typed.
df.dtypes

latitude                          float64
longitude                         float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
neighbourhood_group                object
room_type                          object
price                               int64
dtype: object

### Question 1
What is the most frequent observation (mode) for the column 'neighbourhood_group'?

In [4]:
# Number of rows per neighbourhood group; the group with the largest count is the mode.
df.groupby('neighbourhood_group')['neighbourhood_group'].agg(['count'])

Unnamed: 0_level_0,count
neighbourhood_group,Unnamed: 1_level_1
Bronx,1091
Brooklyn,20104
Manhattan,21661
Queens,5666
Staten Island,373


Answer: **Manhattan**

In [5]:
from sklearn.model_selection import train_test_split

# 60% / 20% / 20% split: hold out 20% for test first, then take 25% of the
# remaining 80% (i.e. 20% of the whole) as validation. Both splits are seeded.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Drop the shuffled original index so every split is numbered from 0.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Binary target: True when the listing price is at least 152.
# NOTE(review): 152 is presumably the average price in the dataset - confirm.
# `pop` removes the 'price' column from each feature frame and returns it,
# so the target cannot leak into the features.
y_train = df_train.pop('price') >= 152
y_val = df_val.pop('price') >= 152
y_test = df_test.pop('price') >= 152

### Question 2
What are the two features that have the biggest correlation in this dataset?

In [6]:
# Pairwise correlation between the numeric features.
# df_train still contains the object-typed columns 'neighbourhood_group' and
# 'room_type'; newer pandas releases no longer drop non-numeric columns
# silently in DataFrame.corr(), so restrict the frame to numeric columns
# explicitly. This form works on both old and new pandas versions.
df_train.select_dtypes(include='number').corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


Answer: **'number_of_reviews' and 'reviews_per_month'**

### Question 3
Which of the two categorical variables, 'neighbourhood_group' or 'room_type', has the bigger mutual information score with the target?

In [7]:
from sklearn.metrics import mutual_info_score

In [8]:
# Mutual information between the binary price target and the neighbourhood group.
score1 = mutual_info_score(
    labels_true=y_train,
    labels_pred=df_train['neighbourhood_group'],
)
round(score1, 2)

0.05

In [9]:
# Mutual information between the binary price target and the room type.
score2 = mutual_info_score(
    labels_true=y_train,
    labels_pred=df_train['room_type'],
)
round(score2, 2)

0.14

Answer: **'room_type'**

### Question 4

In [10]:
from sklearn.feature_extraction import DictVectorizer

# Vectorizer that turns per-row feature dicts into a feature matrix;
# sparse=False requests a dense array rather than a sparse matrix.
dv = DictVectorizer(sparse=False)

In [11]:
# Learn the feature mapping (column order and one-hot categories) from the
# training records only, and encode the training set with it.
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Encode the validation records with the mapping learned on the training set.
# The vectorizer must NOT be refitted here: refitting would derive the columns
# from the validation data, so they could differ in number or order from the
# columns the model is trained on.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [12]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the vectorized training features.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(
    solver='lbfgs',
    C=1.0,
    random_state=42,
    max_iter=10000,
).fit(X_train, y_train)
model

LogisticRegression(max_iter=10000, random_state=42)

In [13]:
# Bias term of the fitted model (first and only entry of intercept_ for this binary target).
model.intercept_[0]

-0.062178018760393215

In [14]:
# Fitted weights, one per column produced by the vectorizer, rounded to 3 decimals.
np.round(model.coef_[0], 3)

array([ 3.000e-03,  4.000e-03, -5.784e+00, -3.147e+00, -1.100e-02,
       -1.860e-01,  1.690e-01,  1.620e+00,  1.800e-02, -1.679e+00,
       -3.000e-03, -4.300e-02,  1.992e+00, -7.870e-01, -1.264e+00])

In [15]:
# Predicted probability of the positive class for every validation row.
y_pred = model.predict_proba(X_val)[:, 1]

# Hard decision at the 0.5 probability threshold.
price_decision = y_pred >= 0.5

# Accuracy = share of validation rows whose decision matches the true label.
accuracy = y_val.eq(price_decision).mean()
round(accuracy, 2)

0.79

In [16]:
from sklearn.metrics import accuracy_score

# Cross-check the manual accuracy computation with scikit-learn's implementation.
accuracy = accuracy_score(y_true=y_val, y_pred=price_decision)
round(accuracy, 2)

0.79

### Question 5

In [18]:
# Leave-one-feature-out ablation: retrain the same model with each feature
# removed in turn and compare its validation accuracy with the baseline
# `accuracy` computed for the full feature set in the preceding cell.
for excl_feat in df_train.columns:
    print('excluding', excl_feat)
    # Every column except the one being excluded, in the original order.
    subset_feat = [col for col in df_train.columns if col != excl_feat]

    # A fresh vectorizer per subset, because the set of columns changes.
    subset_dv = DictVectorizer(sparse=False)

    subset_train_dict = df_train[subset_feat].to_dict(orient='records')
    subset_val_dict = df_val[subset_feat].to_dict(orient='records')

    # Fit the feature mapping on the training data only, then reuse it for
    # validation. Refitting on the validation records could yield columns
    # that do not line up with the ones the model was trained on.
    X_subset_train = subset_dv.fit_transform(subset_train_dict)
    X_subset_val = subset_dv.transform(subset_val_dict)

    subset_model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000)
    subset_model.fit(X_subset_train, y_train)

    # Positive-class probability, thresholded at 0.5 like the baseline model.
    y_subset_pred = subset_model.predict_proba(X_subset_val)[:, 1]
    subset_price_decision = (y_subset_pred >= 0.5)
    subset_accuracy = accuracy_score(y_val, subset_price_decision)
    # Printed as: baseline accuracy, accuracy without the feature, difference.
    print("Accuracy:", round(accuracy, 4), round(subset_accuracy, 4), round(accuracy - subset_accuracy, 4))
    print()

excluding latitude
Accuracy: 0.7907 0.7868 0.0039

excluding longitude
Accuracy: 0.7907 0.7869 0.0038

excluding minimum_nights
Accuracy: 0.7907 0.7913 -0.0006

excluding number_of_reviews
Accuracy: 0.7907 0.7914 -0.0007

excluding reviews_per_month
Accuracy: 0.7907 0.7906 0.0001

excluding calculated_host_listings_count
Accuracy: 0.7907 0.7897 0.001

excluding availability_365
Accuracy: 0.7907 0.7812 0.0095

excluding neighbourhood_group
Accuracy: 0.7907 0.751 0.0397

excluding room_type
Accuracy: 0.7907 0.7288 0.0619



### Question 6