In [1]:
import pandas as pd
import numpy as np

In [2]:
"""
read file csv and save to variable df
"""
df = pd.read_csv('AB_NYC_2019.csv')

In [5]:
df_features = df[['latitude','longitude','price','minimum_nights','number_of_reviews','reviews_per_month'
                  ,'calculated_host_listings_count','availability_365']]

In [6]:
"""
1. number of missing values
"""
df_features.isnull().sum()

latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [13]:
"""
2. median for minimum nights

method 1: using series and median
method 2: using numpy percentile 50
"""
df_features['minimum_nights'].median()

np.percentile(df_features['minimum_nights'], 50)

3.0
3.0


In [37]:
"""
Split the data
    Shuffle the initial dataset, use seed 42.
    Split your data in train/val/test sets, with 60%/20%/20% distribution.
    Make sure that the target value ('price') is not in your dataframe.
    Apply the log transformation to the price variable using the np.log1p() function.
"""

n = len(df_features)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test


df_train = df_features.iloc[:n_train]
df_val = df_features.iloc[n_train:n_train+n_val]
df_test = df_features.iloc[n_train+n_val:]

idx = np.arange(n)

np.random.seed(42)
np.random.shuffle(idx)

df_train = df_features.iloc[idx[:n_train]]
df_val = df_features.iloc[idx[n_train:n_train+n_val]]
df_test = df_features.iloc[idx[n_train+n_val:]]


df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

del df_train['price']
del df_val['price']
del df_test['price']

In [45]:
"""
3. Best way to fill NA?
    We need to deal with missing values for the column from Q1.
    We have two options: fill it with 0 or with the mean of this variable.
    Try both options. For each, train a linear regression model without regularization using the code from the lessons.
    For computing the mean, use the training only!
    Use the validation dataset to evaluate the models and compare the RMSE of each option.
    Round the RMSE scores to 2 decimal digits using round(score, 2)
    Which option gives better RMSE?
"""

def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

# fill NA with 0
X_train_0 = df_train.fillna(0).values
w0_0, w_0 = train_linear_regression(X_train_0, y_train)
y_pred_0 = w0_0 + X_train_0.dot(w_0)

# fill NA with mean
X_train_mean = df_train.fillna(value=df_train.mean()).values
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)
y_pred_mean = w0_mean + X_train_mean.dot(w_mean)


def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return round(np.sqrt(mse),2)

In [48]:
print("RMSE Score fill NA with 0 :{} ".format(rmse(y_train, y_pred_0)))
print("RMSE Score fill NA with mean :{} ".format(rmse(y_train, y_pred_mean)))

RMSE Score fill NA with 0 :0.64 
RMSE Score fill NA with mean :0.64 


In [54]:
"""
4. Best regularization parameter r
    Now let's train a regularized linear regression.
    For this question, fill the NAs with 0.
    Try different values of r from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
    Use RMSE to evaluate the model on the validation dataset.
    Round the RMSE scores to 2 decimal digits.
    Which r gives the best RMSE?

    If there are multiple options, select the smallest r.
"""
def prepare_X(df):
    df_num = df
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

for r in [0.0, 0.00001, 0.0001, 0.001, 0.1, 1, 10]:
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    
    print(r, w0, score)

0.0 -419.9126551485841 0.64
1e-05 -419.41376041078365 0.64
0.0001 -414.97648935850873 0.64
0.001 -375.27364997733105 0.64
0.1 -32.562560537850175 0.68
1 -3.4992168409932445 0.68
10 -0.35127675989597107 0.68


In [87]:
"""
5. STD of RMSE scores for different seeds 
    We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
    Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
    For each seed, do the train/validation/test split with 60%/20%/20% distribution.
    Fill the missing values with 0 and train a model without regularization.
    For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
    What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
    Round the result to 3 decimal digits (round(std, 3))
"""

def split_different_seed(seeds):
    n = len(df_features)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test


    df_train = df_features.iloc[:n_train]
    df_val = df_features.iloc[n_train:n_train+n_val]
    df_test = df_features.iloc[n_train+n_val:]

    idx = np.arange(n)

    np.random.seed(seeds)
    np.random.shuffle(idx)

    df_train = df_features.iloc[idx[:n_train]]
    df_val = df_features.iloc[idx[n_train:n_train+n_val]]
    df_test = df_features.iloc[idx[n_train+n_val:]]


    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train = np.log1p(df_train.price.values)
    y_val = np.log1p(df_val.price.values)
    y_test = np.log1p(df_test.price.values)

    del df_train['price']
    del df_val['price']
    del df_test['price']
    return df_train, y_train, df_val, y_val, df_test, y_test

def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

xlist = []
for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    df_train, y_train, df_val, y_val, df_test, y_test = split_different_seed(i)

    # fill NA with 0
    X_train_0 = df_train.fillna(0).values
    w0_0, w_0 = train_linear_regression(X_train_0, y_train)
    y_pred_0 = w0_0 + X_train_0.dot(w_0)
    xlist.append(rmse(y_train, y_pred_0))

round(np.std(np.array(xlist)),3)

0.004

In [82]:
"""
6. RMSE on test
    Split the dataset like previously, use seed 9.
    Combine train and validation datasets.
    Fill the missing values with 0 and train a model with r=0.001.
    What's the RMSE on the test dataset?
"""

df_train, y_train, df_val, y_val, df_test, y_test = split_different_seed(9)
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.reset_index(drop=True)
X_full_train = prepare_X(df_full_train)

y_full_train = np.concatenate([y_train, y_val])
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
score

0.6452771345741725