In [1]:
import os
import sys

# Put the parent of the notebook's working directory on the import path
# (presumably the project root that holds the `utils` package imported below).
# os.path.dirname is used instead of splitting on os.sep by hand: the manual
# split yields "" when the working directory sits directly under the root.
app_path = os.path.dirname(os.getcwd())

# Guard keeps re-running this cell from stacking duplicate entries.
if app_path not in sys.path:
    sys.path.insert(0, app_path)

In [2]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from utils.paths import raw_data_path
from utils.helpers import *

In [3]:
# Resolve the raw train/test CSV locations, then read each into a DataFrame.
path, test_path = (raw_data_path(fname) for fname in ("train.csv", "test.csv"))
df, test_df = (pd.read_csv(csv_path) for csv_path in (path, test_path))

In [4]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,5728806,Large private room in Nolita,4271676,Nat,Manhattan,Nolita,40.72217,-73.99481,Private room,120,7,3,2015-09-01,0.06,3,0
1,2243769,Super Bowl New York City Apartment,11460768,Brian,Manhattan,Upper West Side,40.8002,-73.96045,Entire home/apt,1500,1,0,,,1,0
2,35515415,COZY APT CLOSE TO LGA & JFK AND TRANSPORTATION,267193767,Jenny,Queens,East Elmhurst,40.75558,-73.89316,Entire home/apt,200,2,4,2019-07-05,4.0,1,365
3,36202006,Stream-Pressed Paradise (Laundry Room Setup),43392243,Rita,Staten Island,Concord,40.6011,-74.0783,Shared room,30,2,0,,,4,82
4,3780951,Charming Harlem apartment,16065171,Gina,Manhattan,Harlem,40.81022,-73.94266,Shared room,115,1,16,2019-05-26,0.27,1,365


## Replacing NaN values 

In [5]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

id                                   0
name                                12
host_id                              0
host_name                           13
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       6982
reviews_per_month                 6982
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In [6]:
# Drop the free-text `name` column. The axis is given by keyword because
# pandas 2.0 no longer accepts it positionally in DataFrame.drop.
df = df.drop(columns="name")

# Impute the remaining gaps:
#   host_name         -> placeholder string "AAA"
#   reviews_per_month -> mean of the non-missing values in this frame
#   last_review       -> sentinel date string '2001-01-01'
mean_rpm = df["reviews_per_month"].mean()
df = df.fillna(
    {
        "host_name": "AAA",
        "reviews_per_month": mean_rpm,
        "last_review": '2001-01-01',
    }
)



In [7]:
# Re-check after imputation: every column should now report zero missing values.
df.isna().sum()


id                                0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

## Label-encoding the DataFrame

In [8]:
# NOTE(review): label_encode comes from utils.helpers; presumably it replaces
# the string-valued columns with integer codes — confirm against the helper.
df = df.pipe(label_encode)

## Running models

In [9]:
# Separate the feature matrix from the regression target (`price`).
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
X = df.drop(columns=["price"])
y = df["price"]

# NOTE(review): run_regression_models comes from utils.helpers; judging by the
# printed output it evaluates several regressors and reports one score per
# model — confirm the metric and the meaning of the third argument (None).
result = run_regression_models(X, y, None)

Linear Regression : 0.10171007672054144
Ridge : 0.10166474422099392
Lasso : 0.09117489189423755
Decision Tree : -0.1945214722904769
Random Forest : 0.22718570076696043
SVR : -0.030817805278376076


## Saving models to a list

In [10]:
# Gather the object stored under the 'model' key of every entry in `result`.
model_scores = result.values()
models = [entry['model'] for entry in model_scores]

## Transforming Test Data

In [11]:
# Loading test data

df = test_df

# Apply the same cleaning steps as were applied to the training frame.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
# drop() returns a new frame, so `test_df` itself is left untouched.
df = df.drop(columns="name")

# NOTE(review): the fill value here is the mean of the *test* column, not the
# training mean computed earlier — confirm this is intended.
mean_rpm = df["reviews_per_month"].mean()
df = df.fillna(
    {
        "host_name": "AAA",
        "reviews_per_month": mean_rpm,
        "last_review": '2001-01-01',
    }
)

# NOTE(review): label_encode is called on the test frame separately from the
# training frame; if it fits fresh encoders on each call, the integer codes
# may not line up between the two — verify against utils.helpers.
df = label_encode(df)

df.head()

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,1525602,1200603,267,1,49,40.66751,-73.95867,0,2,121,1351,1.7,1,39
1,30430185,224414117,1596,2,93,40.75655,-73.9969,1,1,18,1325,2.49,30,364
2,21354525,11743513,1858,1,24,40.69252,-73.99121,1,1,87,1367,4.29,1,108
3,35995074,4128829,4258,3,52,40.77292,-73.90101,1,9,0,0,1.364172,2,365
4,34392081,259630588,164,1,200,40.71863,-73.9498,1,1,28,1377,14.0,2,20


## Running models on test data