In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns

In [2]:
# Load the AB_NYC_2019 listings CSV and tidy up its schema

# Columns that are not needed for this analysis
drop_cols = ['host_id', 'last_review', 'host_name', 'latitude', 'longitude',
             'calculated_host_listings_count']

# Read the file, discard the unnecessary columns, then give the remaining
# columns display-friendly names, all in a single chain.
df = (
    pd.read_csv('Datasets/AB_NYC_2019.csv')
    .drop(columns=drop_cols)
    .rename(columns={
        "id": "ID",
        "name": "Name",
        "neighbourhood_group": "Borough",
        "neighbourhood": "Neighborhood",
        "room_type": "Room_Type",
        "price": "Price",
        "minimum_nights": "Minimum_Nights",
        "number_of_reviews": "Num_of_Reviews",
        "reviews_per_month": "Reviews_per_Month",
        "availability_365": "Days_Available_A_Year",
    })
)

df

Unnamed: 0,ID,Name,Borough,Neighborhood,Room_Type,Price,Minimum_Nights,Num_of_Reviews,Reviews_per_Month,Days_Available_A_Year
0,2539,Clean & quiet apt home by the park,Brooklyn,Kensington,Private room,149,1,9,0.21,365
1,2595,Skylit Midtown Castle,Manhattan,Midtown,Entire home/apt,225,1,45,0.38,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,Harlem,Private room,150,3,0,,365
3,3831,Cozy Entire Floor of Brownstone,Brooklyn,Clinton Hill,Entire home/apt,89,1,270,4.64,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,Manhattan,East Harlem,Entire home/apt,80,10,9,0.10,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,Manhattan,Murray Hill,Entire home/apt,200,3,74,0.59,129
6,5121,BlissArtsSpace!,Brooklyn,Bedford-Stuyvesant,Private room,60,45,49,0.40,0
7,5178,Large Furnished Room Near B'way,Manhattan,Hell's Kitchen,Private room,79,2,430,3.47,220
8,5203,Cozy Clean Guest Room - Family Apt,Manhattan,Upper West Side,Private room,79,2,118,0.99,0
9,5238,Cute & Cozy Lower East Side 1 bdrm,Manhattan,Chinatown,Entire home/apt,150,1,160,1.33,188


In [3]:
# Price will be predicted from the room type, so first look at which
# categories exist in that column and how many listings fall into each.
room = 'Room_Type'
print("Unique Values in column: {}\n".format(room))
print(df[room].value_counts(), '\n')

Unique Values in column: Room_Type

Entire home/apt    25409
Private room       22326
Shared room         1160
Name: Room_Type, dtype: int64 



In [4]:
# Size check: .shape is (rows, columns) -- around 49,000 listing rows here
df.shape 

(48895, 10)

In [5]:
# Inspect the dtype pandas inferred for each remaining column
df.dtypes

ID                         int64
Name                      object
Borough                   object
Neighborhood              object
Room_Type                 object
Price                      int64
Minimum_Nights             int64
Num_of_Reviews             int64
Reviews_per_Month        float64
Days_Available_A_Year      int64
dtype: object

In [40]:
# For now the prediction uses the room type only, so the location columns
# (and the other listing attributes named below) are left out.
drop_locations = [
    "Name",
    "Borough",
    "Neighborhood",
    "Reviews_per_Month",
    "Minimum_Nights",
    "Num_of_Reviews",
    "Days_Available_A_Year",
]

# df.drop already returns a new DataFrame holding only the remaining columns
no_location = df.drop(columns=drop_locations)

# Peek at rows 10 through 19
no_location.iloc[10:20]
# Row filter kept for reference. Note that the label in the data is
# 'Private room' (lowercase "r"), so this exact comparison would match nothing:
#no_location[no_location['Room_Type']=='Private Room']

Unnamed: 0,ID,Room_Type,Price
10,5295,Entire home/apt,135
11,5441,Private room,85
12,5803,Private room,89
13,6021,Private room,85
14,6090,Entire home/apt,120
15,6848,Entire home/apt,140
16,7097,Entire home/apt,215
17,7322,Private room,140
18,7726,Entire home/apt,99
19,7750,Entire home/apt,190


In [43]:
# Import LabelEncoder
from sklearn import preprocessing

# Select the room-type labels by column name. A positional index (iloc[:, 1])
# would silently pick a different column once the dummy-encoded columns have
# been prepended to no_location; a name lookup fails loudly instead.
room_variable = no_location["Room_Type"].values

# Alternative kept for reference: integer-encode the labels with LabelEncoder.
# le = preprocessing.LabelEncoder()
# # Converting string labels into numbers.
# room_encoded = le.fit_transform(no_location['Room_Type'])
# print(room_encoded)

# Show the first eight labels
room_variable[0:8]

array(['Private room', 'Entire home/apt', 'Private room',
       'Entire home/apt', 'Entire home/apt', 'Entire home/apt',
       'Private room', 'Private room'], dtype=object)

In [7]:
#Converting nominal features into numerical features requires encoding them as dummy variables

#Get the Dummy Variables

nominal_column = ["Room_Type"]

# Encode only while the nominal column is still present. Without this guard a
# second run of the cell raises a KeyError, because the first run has already
# replaced Room_Type with its indicator columns.
if set(nominal_column).issubset(no_location.columns):
    # dtype=int keeps the indicators as 0/1 integers; pandas 2.x would
    # otherwise return boolean True/False columns.
    dummy_room = pd.get_dummies(no_location[nominal_column], dtype=int)

    # Indicator columns first, followed by the remaining original columns
    no_location = pd.concat(
        [dummy_room, no_location.drop(columns=nominal_column)],
        axis=1,
    )

no_location[10:20]

Unnamed: 0,Room_Type_Entire home/apt,Room_Type_Private room,Room_Type_Shared room,ID,Price
10,1,0,0,5295,135
11,0,1,0,5441,85
12,0,1,0,5803,89
13,0,1,0,6021,85
14,1,0,0,6090,120
15,1,0,0,6848,140
16,1,0,0,7097,215
17,0,1,0,7322,140
18,1,0,0,7726,99
19,1,0,0,7750,190


In [15]:
#Split into feature (X) and target (y) variables

#Another dummy variable method we can use in case this one doesnt work out
## Ref: https://www.datacamp.com/community/tutorials/k-nearest-neighbor-classification-scikit-learn

# Select by column name rather than by position (previously iloc[:, 4] and
# iloc[:, 5]) so the selection does not depend on the column order of df.
# NOTE: X holds the raw Room_Type strings taken from df, not the dummy-encoded
# columns stored in no_location. It is a 1-D, non-numeric array, so it will
# presumably need encoding and reshaping before scaling or model fitting.
X = df["Room_Type"].values
y = df["Price"].values

In [17]:
# # Splitting the dataset into the Training set and Test set
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# # Feature Scaling
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)