# Transform Test Data and apply the model trained on Train Data 

## Importing dependencies

In [1]:
import os
import sys

# Parent directory of the current working directory (everything before the
# last path separator).
app_path = os.getcwd().rsplit(os.sep, 1)[0]

# Put that directory at the front of the import search path, once.
# Presumably this is what makes the `utils` package imported below resolvable
# when the notebook is run from a sub-folder -- confirm against the repo layout.
if app_path not in sys.path:
    sys.path.insert(0, app_path)

import math
import numpy as np
import pandas as pd
import pickle

from datetime import datetime, timedelta
from geopy.distance import great_circle
from utils.paths import *
from utils.helpers import *

"""
DEPENDENCIES:
    - pandas
    - numpy
    - geopy
"""


'\nDEPENDENCIES:\n    - pandas\n    - numpy\n    - geopy\n'

In [2]:

def transform_last_review(df: pd.DataFrame) -> pd.DataFrame:
    """Replace ``last_review`` dates with their age in days.

    Each value in the ``last_review`` column (a ``YYYY-MM-DD`` date) is
    replaced by the number of whole days between it and the most recent
    review date found in the same column, so the newest review becomes 0.

    Args:
        df: Frame holding a ``last_review`` column. It is modified in place.

    Returns:
        The same ``df`` object, with ``last_review`` holding day counts.

    Raises:
        KeyError: If ``df`` has no ``last_review`` column.
    """
    # A missing column raises KeyError (not TypeError), so check explicitly
    # and fail loudly instead of returning None to the caller.
    if 'last_review' not in df.columns:
        raise KeyError("NO last_review COLUMN FOUND")

    # Parse the whole column once instead of calling strptime per row.
    review_dates = pd.to_datetime(df['last_review'], format="%Y-%m-%d")

    # Distance of every review from the latest one, in whole days.
    df['last_review'] = (review_dates.max() - review_dates).dt.days
    return df

In [14]:
def create_radius_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``radius`` column: distance of each listing from the centre of NYC.

    The distance is the great-circle distance in kilometres, rounded to
    three decimals, between each row's (``latitude``, ``longitude``) and
    the NYC reference point.

    Args:
        df: Frame holding ``latitude`` and ``longitude`` columns in decimal
            degrees. It is modified in place.

    Returns:
        The same ``df`` object with the new ``radius`` column.

    Raises:
        KeyError: If ``latitude`` or ``longitude`` is missing from ``df``.
    """
    # A missing column raises KeyError (not TypeError), so check explicitly
    # and name the absent column(s) instead of returning None.
    missing = [col for col in ('latitude', 'longitude') if col not in df.columns]
    if missing:
        raise KeyError("NO {} COLUMN FOUND".format(" or ".join(missing)))

    # geopy points are (latitude, longitude) in degrees. New York City sits at
    # 40.7128 N, 74.0060 W, so the longitude is negative and comes second.
    nyc_centre = (40.7128, -74.0060)

    df['radius'] = [
        round(great_circle(nyc_centre, (lat, lon)).km, 3)
        for lat, lon in zip(df['latitude'], df['longitude'])
    ]
    return df

In [40]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

def run_model(model, df: pd.DataFrame, to_csv=False, filename='submit.csv', scaler=None) -> pd.DataFrame:
    """Scale the feature frame, predict prices and pair them with listing ids.

    Args:
        model: Fitted estimator exposing ``predict``.
        df: Feature frame, already transformed and encoded.
        to_csv: When true, also write the predictions to
            ``processed_data_path(filename)``.
        filename: Name of the CSV written when ``to_csv`` is true.
        scaler: Optional scaler that was already fitted on the training
            data; when given, only its ``transform`` is applied. When
            omitted, a new ``MinMaxScaler`` is fitted on ``df`` itself
            (the historical behaviour).

    Returns:
        DataFrame with columns ``id`` (read from ``Test.csv``) and ``price``.

    Raises:
        ValueError: If the model records how many features it was trained
            on and ``df`` has a different number of columns.
    """
    # Fail early with a readable message instead of an opaque matmul error
    # from inside predict(). Models without n_features_in_ skip this check.
    expected = getattr(model, 'n_features_in_', None)
    if expected is not None and df.shape[1] != expected:
        raise ValueError(
            "model expects {} features but received {}: {}".format(
                expected, df.shape[1], list(getattr(df, 'columns', []))
            )
        )

    if scaler is None:
        # NOTE: fitting on the test data gives a different scaling than the
        # one used at training time; pass the training scaler when available.
        features = MinMaxScaler().fit_transform(df)
    else:
        features = scaler.transform(df)

    y_pred = model.predict(features)

    # The ids are re-read from the raw file; this function does not take them
    # from ``df``.
    test_path = raw_data_path('Test.csv')
    test_df = pd.read_csv(test_path)
    predictions = pd.DataFrame({'id': test_df.id, 'price': y_pred}, columns=['id', 'price'])

    if to_csv:
        submit_path = processed_data_path(filename)
        predictions.to_csv(submit_path, index=False)

    return predictions


In [41]:
def import_trained_model(model_name):
    """Load a pickled model from the storage directory.

    Args:
        model_name: File name passed to ``storage_path`` to locate the pickle.

    Returns:
        The object unpickled from that file.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(storage_path(model_name), 'rb') as handle:
        return pickle.load(handle)



In [43]:
def main() -> pd.DataFrame:
    """Prepare ``Test.csv``, score it with the stored model and write the result.

    Steps: read the raw test file, rename and transform columns, add the
    ``radius`` feature, drop unused columns, one-hot encode the categorical
    columns, load ``lasso_1005.obj`` and run it, writing ``submit.csv``.

    Returns:
        The predictions frame returned by ``run_model``.
    """
    path = raw_data_path('Test.csv')
    df   = pd.read_csv(path)
    # Renaming columns
    df.rename(columns={'calculated_host_listings_count': 'listing_per_host'}, inplace=True)
    # Replacing last_review NaN with '2001-01-01'
    df.last_review = df.last_review.fillna('2001-01-01')
    df = transform_last_review(df)
    # Calculating radius of each listing from centre of NYC
    df = create_radius_feature(df)
    # Dropping unwanted columns
    unwanted_cols = [
        'id',
        'host_id',
        'name',
        'host_name',
        'latitude',
        'longitude',
        'neighbourhood',
        'reviews_per_month',
    ]
    # `columns=` instead of a positional axis: pandas 2.x no longer accepts
    # the axis as a second positional argument.
    df.drop(columns=unwanted_cols, inplace=True)
    # One-Hot Encoding categorical columns
    one_hot_cols = ['room_type', 'neighbourhood_group']
    df = one_hot_encode(df, one_hot_cols)
    # NOTE(review): the resulting columns must match the features the model
    # was trained on, in number and order -- verify against the training
    # notebook if predict() reports a dimension mismatch.
    # Importing Trained model
    model = import_trained_model('lasso_1005.obj')
    # Running model on test_df
    return run_model(model, df, to_csv=True, filename='submit.csv')

# Run the whole pipeline: prepare Test.csv, predict, and write submit.csv.
main()



ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 16 is different from 14)