In [240]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [241]:
X_full = pd.read_csv('AB_NYC_2019.csv', index_col=False)

In [242]:
X_full.index

RangeIndex(start=0, stop=29987, step=1)

In [243]:
X_full.head()

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9.0,19/10/18,0.21,6.0,365
1,2595,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45.0,21/05/19,0.38,2.0,355
2,3647,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0.0,,,1.0,365
3,3831,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270.0,05/07/19,4.64,1.0,194
4,5022,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9.0,19/11/18,0.1,1.0,0


In [244]:
X_full.shape

(29987, 15)

In [245]:
missing_val_by_col = (X_full.isnull().sum())

In [246]:
print(missing_val_by_col[missing_val_by_col>0])

host_name                           18
neighbourhood_group                  2
latitude                             2
longitude                            3
room_type                            3
price                               26
minimum_nights                      14
number_of_reviews                   17
last_review                       4981
reviews_per_month                 4981
calculated_host_listings_count       2
dtype: int64


In [247]:
cols = X_full.columns[X_full.isnull().any()]

In [248]:
cols

Index(['host_name', 'neighbourhood_group', 'latitude', 'longitude',
       'room_type', 'price', 'minimum_nights', 'number_of_reviews',
       'last_review', 'reviews_per_month', 'calculated_host_listings_count'],
      dtype='object')

In [249]:
reduced_X = (X_full.drop(['last_review','reviews_per_month'], axis=1))

In [250]:
reduced_X.head()

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,2539,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9.0,6.0,365
1,2595,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45.0,2.0,355
2,3647,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0.0,1.0,365
3,3831,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270.0,1.0,194
4,5022,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9.0,1.0,0


In [251]:
# reduced_X = reduced_X.fillna(0)

In [252]:
reduced_X.shape

(29987, 13)

In [253]:
missing_val_by_col = (reduced_X.isnull().sum())

In [254]:
missing_val_by_col

id                                 0
host_id                            0
host_name                         18
neighbourhood_group                2
neighbourhood                      0
latitude                           2
longitude                          3
room_type                          3
price                             26
minimum_nights                    14
number_of_reviews                 17
calculated_host_listings_count     2
availability_365                   0
dtype: int64

In [255]:
X = reduced_X.iloc[:,:-4]

In [256]:
X

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price
0,2539,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149
1,2595,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225
2,3647,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150
3,3831,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89
4,5022,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80
...,...,...,...,...,...,...,...,...,...
29982,23084751,67375976,Mymy,Manhattan,Midtown,40.75544,-73.96901,Entire home/apt,180
29983,23085504,24796602,Wesly/Jessica,Brooklyn,Crown Heights,40.67180,-73.92678,Private room,55
29984,23086483,86771798,Daia,Manhattan,Murray Hill,40.74742,-73.97660,Entire home/apt,365
29985,23087403,5788047,Andrea,Manhattan,Lower East Side,40.71269,-73.98713,Entire home/apt,150


In [257]:
Y = reduced_X.iloc[:,9:]

In [258]:
Y

Unnamed: 0,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,1,9.0,6.0,365
1,1,45.0,2.0,355
2,3,0.0,1.0,365
3,1,270.0,1.0,194
4,10,9.0,1.0,0
...,...,...,...,...
29982,4,0.0,1.0,0
29983,2,34.0,4.0,341
29984,5,14.0,2.0,50
29985,2,10.0,1.0,6


In [259]:
from sklearn.linear_model import LinearRegression

In [260]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, shuffle= False)

In [261]:
X_train

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price
0,2539,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149
1,2595,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225
2,3647,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150
3,3831,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89
4,5022,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80
...,...,...,...,...,...,...,...,...,...
20086,16085436,61804662,Niki,Queens,Flushing,40.76904,-73.81687,Private room,48
20087,16085817,104835356,Jerry,Brooklyn,Sunset Park,40.65563,-74.00197,Private room,73
20088,16086039,104838182,Larissa,Manhattan,Upper West Side,40.80130,-73.96669,Private room,85
20089,16086320,104835356,Jerry,Brooklyn,Sunset Park,40.65477,-74.00100,Private room,69


In [262]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in X_train.columns:
    if X_train[column_name].dtype == object:
         X_train[column_name] = le.fit_transform(X_train[column_name])
    else:
        pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [263]:
X_train = X_train.fillna(0)

In [264]:
# X_train[X_train.isnull() == True]
# X_train.fillna(method = 'ffill', inplace = True)
X_train.isnull().sum()

id                     0
host_id                0
host_name              0
neighbourhood_group    0
neighbourhood          0
latitude               0
longitude              0
room_type              0
price                  0
dtype: int64

In [265]:
X_train

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price
0,2539,2787,2671,1,97,40.64749,-73.97237,1,66
1,2595,2845,2557,2,115,40.75362,-73.98377,0,158
2,3647,4632,1552,2,85,40.80902,-73.94190,1,68
3,3831,4869,3321,1,36,40.68514,-73.95976,0,446
4,5022,7192,3166,2,55,40.79851,-73.94399,0,429
...,...,...,...,...,...,...,...,...,...
20086,16085436,61804662,4170,3,70,40.76904,-73.81687,1,345
20087,16085817,104835356,2582,1,173,40.65563,-74.00197,1,414
20088,16086039,104838182,3152,2,184,40.80130,-73.96669,1,439
20089,16086320,104835356,2582,1,173,40.65477,-74.00100,1,407


In [266]:
print(X_train.isnull().any())

id                     False
host_id                False
host_name              False
neighbourhood_group    False
neighbourhood          False
latitude               False
longitude              False
room_type              False
price                  False
dtype: bool


In [267]:
Y_train = Y_train.fillna(0)

In [268]:
print(Y_train.isnull().any())

minimum_nights                    False
number_of_reviews                 False
calculated_host_listings_count    False
availability_365                  False
dtype: bool


In [269]:
Y_train

Unnamed: 0,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,1,9.0,6.0,365
1,1,45.0,2.0,355
2,3,0.0,1.0,365
3,1,270.0,1.0,194
4,10,9.0,1.0,0
...,...,...,...,...
20086,1,54.0,2.0,113
20087,1,10.0,3.0,121
20088,7,14.0,1.0,0
20089,1,32.0,3.0,292


In [270]:


# lineReg = LinearRegression()
# lineReg.fit(X_train, y_train)
# lineReg.score(X_test, y_test )
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

classifier = MultiOutputRegressor(SVR())
classifier.fit(X_train,Y_train)

MultiOutputRegressor(estimator=SVR())

In [272]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in X_test.columns:
    if X_test[column_name].dtype == object:
         X_test[column_name] = le.fit_transform(X_test[column_name])
    else:
        pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [277]:
print(X_test.isnull().any())
X_test = X_test.fillna(0)
print(X_test.isnull().any())

id                     False
host_id                False
host_name              False
neighbourhood_group    False
neighbourhood          False
latitude               False
longitude              False
room_type              False
price                  False
dtype: bool
id                     False
host_id                False
host_name              False
neighbourhood_group    False
neighbourhood          False
latitude               False
longitude              False
room_type              False
price                  False
dtype: bool


In [278]:
print(Y_test.isnull().any())
Y_test = Y_test.fillna(0)
print(Y_test.isnull().any())

minimum_nights                     True
number_of_reviews                  True
calculated_host_listings_count     True
availability_365                  False
dtype: bool
minimum_nights                    False
number_of_reviews                 False
calculated_host_listings_count    False
availability_365                  False
dtype: bool


In [288]:
result = classifier.predict(X_test)

In [289]:
# result = result*10

In [290]:
print(result)

[[ 2.10166732 23.19823723  1.10032768 78.70763491]
 [ 2.68409897  6.87284261  1.1000476   7.11010745]
 [ 2.25485829  8.99109001  1.10016879 17.83953453]
 ...
 [ 2.1542312  17.03707826  1.09902482 54.28357865]
 [ 3.09605343  8.40867751  1.0990644  -0.55793008]
 [ 3.46111057 11.76853098  1.09929917 19.88746183]]


In [291]:
print(Y_test)

      minimum_nights  number_of_reviews  calculated_host_listings_count  \
20091              1               24.0                             3.0   
20092              2                1.0                             3.0   
20093              3                1.0                            12.0   
20094              3                2.0                            12.0   
20095              4               39.0                             8.0   
...              ...                ...                             ...   
29982              4                0.0                             1.0   
29983              2               34.0                             4.0   
29984              5               14.0                             2.0   
29985              2               10.0                             1.0   
29986              3               16.0                             3.0   

       availability_365  
20091               324  
20092                 0  
20093               3