In [110]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection    import train_test_split
from sklearn.metrics            import mutual_info_score
from sklearn.linear_model       import LogisticRegression, Ridge
from sklearn.feature_extraction import DictVectorizer

from functools import partial

%matplotlib inline

In [19]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'
!wget $data

--2021-09-26 15:15:32--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7077973 (6.8M) [text/plain]
Saving to: ‘AB_NYC_2019.csv.1’


2021-09-26 15:15:33 (11.9 MB/s) - ‘AB_NYC_2019.csv.1’ saved [7077973/7077973]



In [87]:
df = pd.read_csv('AB_NYC_2019.csv')

categorical_columns = [ 
    'neighbourhood_group',
    'room_type'
]

numerical_columns = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'    
]

target_column = ['price']
data_columns = categorical_columns + numerical_columns + target_column
df = df[data_columns]

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price
0,brooklyn,private_room,40.64749,-73.97237,1,9,0.21,6,365,149
1,manhattan,entire_home/apt,40.75362,-73.98377,1,45,0.38,2,355,225
2,manhattan,private_room,40.80902,-73.9419,3,0,,1,365,150
3,brooklyn,entire_home/apt,40.68514,-73.95976,1,270,4.64,1,194,89
4,manhattan,entire_home/apt,40.79851,-73.94399,10,9,0.1,1,0,80


In [88]:
df.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
price                               int64
dtype: object

## Missing Values

In [89]:
df.isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
price                                 0
dtype: int64

In [90]:
df.reviews_per_month = df.reviews_per_month.fillna(0)

## Q1. Most frequent value of neighbourhood_group

In [91]:
df.neighbourhood_group.value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

## Split Data

In [92]:
prop_val   = 0.2
prop_test  = 0.2
prop_train = 1.0 - prop_test - prop_val
seed = 42

df_full_train, df_test = train_test_split(df           , test_size=prop_test                          , random_state=seed)
df_train, df_val       = train_test_split(df_full_train, test_size=prop_val / (prop_train + prop_test), random_state=seed)

def setup_tensors(df):
    df = df.reset_index(drop=True)
    y  = df.price.values
    del df['price']
    return df, y

df_full_train, y_full_train = setup_tensors(df_full_train)
df_train     , y_train      = setup_tensors(df_train     )
df_val       , y_val        = setup_tensors(df_val       )
df_test      , y_test       = setup_tensors(df_test      )

## Q2: Correlation matrix

In [93]:
corr = df_train[numerical_columns].corr()
corr

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


## Binary price

In [94]:
average_price = 152.0
y2_full_train = (y_full_train >= average_price).astype(int)
y2_train      = (y_train      >= average_price).astype(int)
y2_val        = (y_val        >= average_price).astype(int)
y2_test       = (y_test       >= average_price).astype(int)

## Q3 Mutual information

In [95]:
df_train[t].apply(partial(mutual_info_score,y2_train)).round(2)

neighbourhood_group    0.05
room_type              0.14
dtype: float64

## Q4 Logistical regression

In [102]:
dv = DictVectorizer(sparse=False)
def transform_set(columns, df):
    dicts = df[columns].to_dict(orient='records')
    X     = dv.fit_transform(dicts)
    return dicts, X

dicts_train, X_train = transform_set(categorical_columns + numerical_columns, df_train)
dicts_val  , X_val   = transform_set(categorical_columns + numerical_columns, df_val  )
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=bronx',
 'neighbourhood_group=brooklyn',
 'neighbourhood_group=manhattan',
 'neighbourhood_group=queens',
 'neighbourhood_group=staten_island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=entire_home/apt',
 'room_type=private_room',
 'room_type=shared_room']

In [79]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=seed)
model.fit(X_train,y2_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [101]:
y2_val_pred = model.predict_proba(X_val)[:, 1]
val_acc = ((y2_val_pred >= 0.5) == y2_val).mean().round(2)
val_acc

0.79

## Q5 Feature of minimum importance

In [109]:
full_columns = categorical_columns + numerical_columns

def test_model(columns):
    dv = DictVectorizer(sparse=False)
    dicts_train, X_train = transform_set(columns, df_train)
    dicts_val  , X_val   = transform_set(columns, df_val  )
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=seed)
    model.fit(X_train,y2_train)
    y2_val_pred = model.predict_proba(X_val)[:, 1]
    val_acc = ((y2_val_pred >= 0.5) == y2_val).mean()
    return val_acc

base_accuracy = test_model(full_columns)
exclusion_accuracies = [base_accuracy - test_model(list(filter(lambda x: x != c, full_columns))) for c in full_columns]

for k in zip(full_columns,exclusion_accuracies):
    print(k)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

('neighbourhood_group', 0.035381940893751906)
('room_type', 0.07004806217404635)
('latitude', 0.0)
('longitude', -0.0004090397791185474)
('minimum_nights', 0.0006135596686777101)
('number_of_reviews', -0.0005112997238981842)
('reviews_per_month', 0.0009203395030166206)
('calculated_host_listings_count', -0.00030677983433891054)
('availability_365', 0.004806217404642599)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Q6: Scikit linear regression

In [123]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

def test_ridge_model(alpha):
    dv = DictVectorizer(sparse=False)
    dicts_train, X_train = transform_set(full_columns, df_train)
    dicts_val  , X_val   = transform_set(full_columns, df_val  )
    
    model = Ridge(alpha=alpha, random_state=seed)
    model.fit(X_train,np.log1p(y_train))
    
    y_val_pred = model.predict(X_val)
    score = rmse(y_val_pred,np.log1p(y_val)).round(3)
    return score

[ (alpha,test_ridge_model(alpha)) for alpha in [0, 0.01, 0.1, 1, 10] ]

[(0, 0.497), (0.01, 0.497), (0.1, 0.497), (1, 0.497), (10, 0.498)]