In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# import the dataframe
montreal_listing =pd.read_csv('montreal_airbnb.csv')

# clean Data
montreal_listing = montreal_listing.drop(['name','id','neighbourhood_group','host_name','last_review'], axis=1)
montreal_listing.isnull().sum()
montreal_listing.dropna(how='any',inplace=True)

#creating a sub-dataframe with no extreme values / less than 400 
sub_montreal_listing=montreal_listing[montreal_listing.price < 400]

# Features Engineering
feature_sub_montreal_listing = sub_montreal_listing.copy()
feature_sub_montreal_listing.drop(['latitude','longitude'],axis=1,inplace=True)

# Encoding categorical features (proposed 1)
categorical_features=['room_type', 'neighbourhood']

for feature in categorical_features:
    labels_ordered=feature_sub_montreal_listing.groupby([feature])['price'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    feature_sub_montreal_listing[feature]=feature_sub_montreal_listing[feature].map(labels_ordered)

# Normalise Dataframe (proposed 2)
#num_features=['host_id','reviews_per_month','number_of_reviews','calculated_host_listings_count', 'minimum_nights', 'availability_365', 'price']

feature_scale=[feature for feature in feature_sub_montreal_listing.columns if feature not in ['host_id','price']]
data = pd.DataFrame()
for feature in feature_scale:
    data[feature] = (feature_sub_montreal_listing[feature] - feature_sub_montreal_listing[feature].mean())/ (feature_sub_montreal_listing[feature].std())

data.insert(loc=0, column='host_id', value=feature_sub_montreal_listing['host_id'])
data.insert(loc=1, column='price', value=feature_sub_montreal_listing['price'])
feature_sub_montreal_listing = data.copy()

# Feature selection
# Data filtering
# Filter the dataset for prices between 0 and $120
feature_sub_montreal_listing = feature_sub_montreal_listing.loc[(feature_sub_montreal_listing['price'] < 120)]

## Split data and feature slection data (proposed 1)
from sklearn.model_selection import train_test_split

x_train = feature_sub_montreal_listing.iloc[0:10000]
y_train = feature_sub_montreal_listing.iloc[0:10000]['price'].values
#y_train = np.log10(y_train)
x_test = feature_sub_montreal_listing.iloc[10000:]
y_test = feature_sub_montreal_listing.iloc[10000:]['price'].values
#y_test = np.log10(y_test)

selected_feat = ['neighbourhood', 'room_type', 'availability_365']
x_train=x_train[selected_feat]
x_test =x_test[selected_feat] 

# LR Prediction Model
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import r2_score
#from sklearn.metrics import * # importer tout les metrics d'erreurs

#Prepare a Linear Regression (LR) Model
reg=LinearRegression()
reg.fit(x_train,y_train)

# Saving model to disk
pickle.dump(reg, open('model.pkl','wb')) 

In [25]:
y_pred=reg.predict(x_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Compare predicted and actual values
print('Comparison between the 10 real annonces prices and the predicted prices ')
df = pd.DataFrame({'Actual': np.round(10 ** y_test, 0), 
                   'Predicted': np.round(10 ** y_pred, 0), 'neighbourhood': x_test['neighbourhood'], 'room_type': x_test['room_type']})
df.head(10)

Comparison between the 10 real annonces prices and the predicted prices 


Unnamed: 0,Actual,Predicted,neighbourhood,room_type
15787,-5296233161787703296,2.4947890000000002e+48,0.935247,-1.52824
15790,3136633892082024448,2.1323979999999998e+43,-0.876285,-1.52824
15791,0,2.7965699999999998e+51,0.572941,-1.52824
15792,0,1.241202e+70,-1.480129,0.585261
15793,0,1.372382e+68,-1.480129,0.585261
15795,0,1.372382e+68,-1.480129,0.585261
15797,0,6.311624000000001e+75,0.572941,0.585261
15799,0,3.1128020000000004e+77,0.572941,0.585261
15800,3875820019684212736,6.864469000000001e+38,-1.480129,-1.52824
15802,687399551400673280,2.2520059999999997e+51,0.572941,-1.52824


In [10]:
data_test = pd.DataFrame({'neighbourhood': ['0.935484'], 'room_type': ['-1.528240'], 'availability_365': ['0.496609']})
y_data_pred=reg.predict(data_test)
print('The prediction price is:',np.round(10 ** y_data_pred, 0) )

The prediction price is: [44.]


In [7]:
x_test['availability_365']

15787   -0.157239
15790    0.543313
15791    1.664196
15792   -0.040481
15793   -0.850007
           ...   
20379    0.496609
20381    0.488825
20383    0.465474
20391    1.780954
20397    1.010347
Name: availability_365, Length: 2225, dtype: float64

In [23]:
y_pred=reg.predict(x_test)
y_pred

array([48.39703381, 43.32886827, 51.4466257 , ..., 49.90193195,
       51.7287941 , 79.84527926])