# Transform Test Data and apply the model trained on Train Data 

## Importing dependencies

In [74]:
import os
import sys

app_path = os.getcwd().rsplit(os.sep, 1)[0]

if app_path not in sys.path:
    sys.path.insert(0, app_path)

import math
import numpy as np
import pandas as pd
import pickle

from datetime import datetime, timedelta
from geopy.distance import great_circle
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

from utils.paths import *
from utils.helpers import *

"""
DEPENDENCIES:
    - pandas
    - numpy
    - geopy
"""


'\nDEPENDENCIES:\n    - pandas\n    - numpy\n    - geopy\n'

In [75]:

def transform_last_review(df: pd.DataFrame) -> pd.DataFrame:
    """Transforms last_review column from datetime obj to no of days from latest review -> int"""

    try:
        df['last_review']

    except TypeError:
        print(" NO last_review COLUMN FOUND")
        return
    
    lastest_review = datetime.strptime(df.last_review.sort_values(ascending=False).iloc[0], "%Y-%m-%d")

    def get_timedelta(reference: str, origin=lastest_review) -> int:
        reference = datetime.strptime(reference, "%Y-%m-%d")
        delta = origin - reference
        return delta.days

    last_review_days = []

    for review in df.last_review:
        last_review_days.append(get_timedelta(review))

    df['last_review'] = last_review_days
    return df

In [76]:
def create_radius_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Calculates distance of each listing with the centre of NYC. stored in radius column"""

    try:
        lat_lon = df[['latitude', 'longitude']]

    except TypeError:
        print(" NO latitue or longitude COLUMN FOUND ")
        return

    def get_radius(lat, lon):
        nyc_centre    = (74.0060, 40.7128)
        accommodation = (lat, lon)
        return great_circle(nyc_centre, accommodation).km

    radii = []

    for value in lat_lon.values:
        dist = get_radius(value[0], value[1])
        radii.append(round(dist, 3))

    df['radius'] = radii
    
    return df

In [77]:
def run_model(model, df: pd.DataFrame, to_csv=False, filename='submit.csv') -> pd.Series:

    scaler = MinMaxScaler()
    df     = scaler.fit_transform(df)

    y_pred      = model.predict(df)
    test_path   = raw_data_path('Test.csv')       
    test_df     = pd.read_csv(test_path)
    predictions = pd.DataFrame({'id': test_df.id, 'price': y_pred}, columns=['id', 'price'])

    if to_csv:
        submit_path = processed_data_path(filename)
        predictions.to_csv(submit_path, index=False)

    return predictions


In [78]:
def import_trained_model(model_name):
    model_path = storage_path(model_name)

    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    return model



In [79]:
def transform_df(df: pd.DataFrame) -> pd.DataFrame:
    # Renaming columns
    df.rename(columns={'calculated_host_listings_count': 'listing_per_host'}, inplace=True)
    # Replacing last_review NaN with '2001-01-01'
    lastest_review = datetime.strptime(df.last_review.sort_values(ascending=False).iloc[0], "%Y-%m-%d")
    # print(" LATEST REVIEW DATE: ", lastest_review)
    df.last_review = df.last_review.fillna('2001-01-01')  # Replace with latest review date
    df = transform_last_review(df)
    # Calculating radius of each listing from centre of NYC
    df = create_radius_feature(df)
    # Dropping unwated columns
    unwanted_cols = [
        'id',
        'host_id', 
        'name',
        'host_name', 
        'latitude',
        'longitude',
        'neighbourhood',
        'reviews_per_month',
        'last_review',
        ]
    df.drop(unwanted_cols, 1, inplace=True)
    # Label encoding
    # label_cols = ['neighbourhood']
    # df = label_encode(df, label_cols)
    # One-Hot Encoding categorical columns
    one_hot_cols = ['room_type', 'neighbourhood_group']
    df = one_hot_encode(df, one_hot_cols)

    return df
    # Importing Trained model
    # model = import_trained_model('lasso_1005.obj')
    # Running model on test_df
    # return run_model(model, df, to_csv=True, filename='submit.csv')


## Importing Train and Test Data

In [80]:
test_path = raw_data_path('Test.csv')
test_df   = pd.read_csv(test_path)

train_path = raw_data_path('Train.csv')
train_df   = pd.read_csv(train_path)

## Transforming Train Data

In [81]:

# train_df.head()

In [82]:
train_df = transform_df(train_df)

X = train_df.drop('price', 1)
y = train_df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

scaler  = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# run_regression_models(X, y)

std_scaler = StandardScaler()
X_train    = std_scaler.fit_transform(X_train)
X_test     = std_scaler.transform(X_test)

model = Lasso()

lasso_params = {'alpha': [0.089, 0.90, 0.91, 0.92]}

lasso_grid = GridSearchCV(model, lasso_params, cv=3, n_jobs=-1)
lasso_grid.fit(X_train, y_train)

alpha = lasso_grid.best_params_['alpha']

# alpha = 0.1

# 0.9 = 0.0974
# 1.1 = 0.09207
# 1.2 = 0.0915
# 1.3 = 0.0913
# 2   = 0.0896
# 3   = 0.0873
# 5   = 0.080
# 6   = 0.076

model = Lasso(alpha=alpha)  # alpha)

print(" BEST ALPHA: ", alpha)
# print(lasso_grid.best_score_)

# print(lasso_grid)

model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print(score)



BEST ALPHA:  0.9
0.09076874217760078


In [83]:
model

Lasso(alpha=0.9, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [86]:
train_df.head()

Unnamed: 0,price,minimum_nights,number_of_reviews,listing_per_host,availability_365,radius,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island
0,120,7,3,3,0,6374.418,0,1,0,0,0,1,0,0
1,1500,1,0,1,0,6365.273,1,0,0,0,0,1,0,0
2,200,2,4,1,365,6368.323,1,0,0,0,0,0,1,0
3,30,2,0,4,82,6389.364,0,0,1,0,0,0,0,1
4,115,1,16,1,365,6363.764,0,0,1,0,0,1,0,0


## Transforming Test Data

In [84]:
test_df = transform_df(test_df)

# scaler  = MinMaxScaler()
test_df = scaler.transform(test_df)

## Predict and save CSV

In [85]:
y_pred      = model.predict(test_df)

raw_test_df = pd.read_csv(test_path)
predictions = pd.DataFrame({'id': raw_test_df.id, 'price': y_pred}, columns=['id', 'price'])

submit_path = processed_data_path('lasso-339.csv')
predictions.to_csv(submit_path, index=False)
