In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("AB_NYC_2019.csv")
var = ['neighbourhood_group','room_type','latitude','longitude','price','minimum_nights',
      'number_of_reviews','reviews_per_month','calculated_host_listings_count','availability_365']

df = df[var]

# fill missing values with zero
def fillMissing(df):
    for var in df.columns:
        df[[var]] = df[[var]].fillna(0)
    
    return df

data = fillMissing(df)

In [3]:
# Question 1: Mode for 'neighbourhood_group'
data[['neighbourhood_group']].mode()

Unnamed: 0,neighbourhood_group
0,Manhattan


In [4]:
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(data, test_size=0.4, random_state=42)
data_test, data_val = train_test_split(data_test, test_size=0.5, random_state=42)

data_train = data_train.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

y_train = data_train.price.values
y_val = data_val.price.values
y_test = data_test.price.values

del data_train['price']
del data_val['price']
del data_test['price']

len(data_train), len(data_val), len(data_test)

(29337, 9779, 9779)

In [5]:
# Question 2: correlation matrix 
data_train.corr() # reviews_per_month * number_of_reviews, availability_365*calculated_host_listings_count

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.087732,0.027252,-0.01008,-0.014699,0.019442,-0.005975
longitude,0.087732,1.0,-0.067251,0.058775,0.132226,-0.116669,0.080776
minimum_nights,0.027252,-0.067251,1.0,-0.085092,-0.127316,0.12406,0.141089
number_of_reviews,-0.01008,0.058775,-0.085092,1.0,0.581124,-0.072687,0.176481
reviews_per_month,-0.014699,0.132226,-0.127316,0.581124,1.0,-0.047254,0.166533
calculated_host_listings_count,0.019442,-0.116669,0.12406,-0.072687,-0.047254,1.0,0.222986
availability_365,-0.005975,0.080776,0.141089,0.176481,0.166533,0.222986,1.0


In [6]:
#Make price binary

above_average = (y_train > 152).astype(int)

In [7]:
# Question 3: mutual information score 
from sklearn.metrics import mutual_info_score

(
    round(mutual_info_score(y_train, data_train.neighbourhood_group), 2), # neighbourhood_group
    round(mutual_info_score(y_train, data_train.room_type), 2) # room_type
)

(0.11, 0.32)

In [8]:
# Question 4: logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

numerical = ['latitude','longitude','minimum_nights', 'number_of_reviews','reviews_per_month',
             'calculated_host_listings_count','availability_365']
categorical = ['neighbourhood_group','room_type']

# one hot encode input variables
dv = DictVectorizer(sparse=False)

train_dict = data_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = data_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [9]:
# Logistic regression

model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train, y_train>152)

# Prediction and accuracy
y_pred = model.predict(X_val)
accuracy_base = ((y_val > 152).astype(int) == y_pred).mean()
accuracy_base

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7937416913794867

In [10]:
# Question 5: feature elimination

# 'neighbourhood_group'
numerical = ['latitude','longitude','minimum_nights', 'number_of_reviews','reviews_per_month',
             'calculated_host_listings_count','availability_365']
categorical = ['room_type']
train_dict1 = data_train[categorical + numerical].to_dict(orient='records')
X_train1 = dv.fit_transform(train_dict1)
test_dict1 = data_val[categorical + numerical].to_dict(orient='records')
X_val1 = dv.transform(test_dict1)

model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train1, y_train>152)    
y_pred1 = model.predict(X_val1)
accuracy_new = ((y_val > 152).astype(int) == y_pred1).astype(int).mean()
print("neighbourhood_group" + ": " + str(abs(accuracy_base - accuracy_new)))

    
# 'room_type'
numerical = ['latitude','longitude','minimum_nights', 'number_of_reviews','reviews_per_month',
             'calculated_host_listings_count','availability_365']
categorical = ['neighbourhood_group']
train_dict2 = data_train[categorical + numerical].to_dict(orient='records')
X_train2 = dv.fit_transform(train_dict2)
test_dict2 = data_val[categorical + numerical].to_dict(orient='records')
X_val2 = dv.transform(test_dict2)

model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train2, y_train>152)    
y_pred2 = model.predict(X_val2)
accuracy_new = ((y_val > 152).astype(int) == y_pred2).astype(int).mean()
print("room_type" + ": " + str(abs(accuracy_base - accuracy_new)))


# 'number_of_reviews'
numerical = ['latitude','longitude','minimum_nights', 'reviews_per_month',
             'calculated_host_listings_count','availability_365']
categorical = ['neighbourhood_group', 'room_type']
train_dict3 = data_train[categorical + numerical].to_dict(orient='records')
X_train3 = dv.fit_transform(train_dict3)
test_dict3 = data_val[categorical + numerical].to_dict(orient='records')
X_val3 = dv.transform(test_dict3)

model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train3, y_train>152)    
y_pred3 = model.predict(X_val3)
accuracy_new = ((y_val > 152).astype(int) == y_pred3).astype(int).mean()
print("number_of_reviews" + ": " + str(abs(accuracy_base - accuracy_new)))


# 'reviews_per_month'
numerical = ['latitude','longitude','minimum_nights', 'number_of_reviews',
             'calculated_host_listings_count','availability_365']
categorical = ['neighbourhood_group', 'room_type']
train_dict4 = data_train[categorical + numerical].to_dict(orient='records')
X_train4 = dv.fit_transform(train_dict4)
test_dict4 = data_val[categorical + numerical].to_dict(orient='records')
X_val4 = dv.transform(test_dict4)

model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train4, y_train>152)    
y_pred4 = model.predict(X_val4)
accuracy_new = ((y_val > 152).astype(int) == y_pred4).astype(int).mean()
print("reviews_per_month" + ": " + str(abs(accuracy_base - accuracy_new)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


neighbourhood_group: 0.044687595868698304


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


room_type: 0.07464975968912979


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


number_of_reviews: 0.0003067798343387995
reviews_per_month: 0.00040903977911843636


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Question 6:  Ridge regression model 
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

for r in [0, 0.01, 0.1, 1, 10]:
    ridge_model = Ridge(alpha=r)
    ridge_model.fit(X_train, np.log1p(y_train))
    y_pred =  ridge_model.predict(X_val)
    rmsval = rmse(np.log1p(y_val), y_pred)
    print('%6s' %r, rmsval)

     0 0.4921342046671336
  0.01 0.4921313886801161
   0.1 0.4921300301810149
     1 0.49212964470349824
    10 0.4927778569958547
