In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# import the dataframe
montreal_listing =pd.read_csv('montreal_airbnb.csv')

# clean Data
montreal_listing = montreal_listing.drop(['name','id','neighbourhood_group','host_name','last_review'], axis=1)
montreal_listing.isnull().sum()
montreal_listing.dropna(how='any',inplace=True)

#creating a sub-dataframe with no extreme values / less than 400 
sub_montreal_listing=montreal_listing[montreal_listing.price < 400]

# Features Engineering
feature_sub_montreal_listing = sub_montreal_listing.copy()
feature_sub_montreal_listing.drop(['latitude','longitude'],axis=1,inplace=True)

# Encoding categorical features (proposed 1)
categorical_features=['room_type', 'neighbourhood']

for feature in categorical_features:
    labels_ordered=feature_sub_montreal_listing.groupby([feature])['price'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    feature_sub_montreal_listing[feature]=feature_sub_montreal_listing[feature].map(labels_ordered)

# Normalise Dataframe (proposed 2)

feature_scale=[feature for feature in feature_sub_montreal_listing.columns if feature not in ['host_id','price']]
data = pd.DataFrame()
for feature in feature_scale:
    data[feature] = (feature_sub_montreal_listing[feature] - feature_sub_montreal_listing[feature].mean())/ (feature_sub_montreal_listing[feature].std())

data.insert(loc=0, column='host_id', value=feature_sub_montreal_listing['host_id'])
data.insert(loc=1, column='price', value=feature_sub_montreal_listing['price'])
feature_sub_montreal_listing = data.copy()

# Feature selection
# Data filtering
# Filter the dataset for prices between 0 and $120
feature_sub_montreal_listing = feature_sub_montreal_listing.loc[(feature_sub_montreal_listing['price'] < 120)]

## Split data and feature slection data (proposed 1)
from sklearn.model_selection import train_test_split

x_train = feature_sub_montreal_listing.iloc[0:10000]
y_train = feature_sub_montreal_listing.iloc[0:10000]['price'].values
#y_train = np.log10(y_train)
x_test = feature_sub_montreal_listing.iloc[10000:]
y_test = feature_sub_montreal_listing.iloc[10000:]['price'].values
#y_test = np.log10(y_test)

selected_feat = ['neighbourhood', 'room_type', 'availability_365']
x_train=x_train[selected_feat]
x_test =x_test[selected_feat] 

# LR Prediction Model
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import r2_score
#from sklearn.metrics import * # importer tout les metrics d'erreurs

#Prepare a Linear Regression (LR) Model
reg=LinearRegression()
reg.fit(x_train,y_train)

# Saving model to disk
pickle.dump(reg, open('model.pkl','wb')) 

In [265]:
y_pred=reg.predict(x_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Compare predicted and actual values
#print('Comparison between the 10 real annonces prices and the predicted prices ')
df = pd.DataFrame({'Actual': np.round(y_test, 0), 
                   'Predicted': np.round(y_pred, 0), 'neighbourhood': x_test['neighbourhood'], 'room_type': x_test['room_type']})
df.head(10)
#y_pred

Unnamed: 0,Actual,Predicted,neighbourhood,room_type
15787,55,48.0,0.935247,-1.52824
15790,35,43.0,-0.876285,-1.52824
15791,78,51.0,0.572941,-1.52824
15792,87,70.0,-1.480129,0.585261
15793,90,68.0,-1.480129,0.585261
15795,74,68.0,-1.480129,0.585261
15797,80,76.0,0.572941,0.585261
15799,101,77.0,0.572941,0.585261
15800,21,39.0,-1.480129,-1.52824
15802,38,51.0,0.572941,-1.52824


In [266]:
data_test = pd.DataFrame({'neighbourhood': ['0.935484'], 'room_type': ['-1.528240'], 'availability_365': ['0.496609']})
y_data_pred=reg.predict(data_test)
print('The prediction price is:',np.round(y_data_pred, 0) )

The prediction price is: [50.]


In [267]:
def normalise_predect_feat(features):
    # import the dataframe
    montreal_listing =pd.read_csv('montreal_airbnb.csv')

    # clean Data
    montreal_listing = montreal_listing.drop(['name','id','neighbourhood_group','host_name','last_review'], axis=1)
    montreal_listing.isnull().sum()
    montreal_listing.dropna(how='any',inplace=True)

    #creating a sub-dataframe with no extreme values / less than 400 
    sub_montreal_listing=montreal_listing[montreal_listing.price < 400]

    # Features Engineering
    feature_sub_montreal_listing = sub_montreal_listing.copy()
    feature_sub_montreal_listing.drop(['latitude','longitude'],axis=1,inplace=True)

    feature_sub_montreal_listing

    # Encoding categorical features (proposed 1)
    categorical_features=['room_type', 'neighbourhood']

    for feature in categorical_features:
        labels_ordered=feature_sub_montreal_listing.groupby([feature])['price'].mean().sort_values().index
        labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
        feature_sub_montreal_listing[feature]=feature_sub_montreal_listing[feature].map(labels_ordered)

    # Normalise Dataframe (proposed 2)
    # add the predict features at the end of data
    # Pass the row elements as key value pairs to append() function 
    feature_sub_montreal_listing = feature_sub_montreal_listing.append({'neighbourhood' : features[0], 'room_type' : features[1], 'availability_365': features[2]} , ignore_index=True)

    feature_scale=[feature for feature in feature_sub_montreal_listing.columns if feature not in ['host_id','price', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count']]
    data = pd.DataFrame()
    for feature in feature_scale:
        data[feature] = (feature_sub_montreal_listing[feature] - feature_sub_montreal_listing[feature].mean())/ (feature_sub_montreal_listing[feature].std())
    
    features_data = pd.DataFrame()
    row = len(data)-1
    features_data = data.loc[row,:]
    return(features_data.to_numpy()) # to_numpy(): convert df to array

In [268]:
int_features = [26,1,226]
final_features_1 = np.array(int_features)
final_features_normalise = normalise_predect_feat(final_features_1)
final_features_normalise

array([ 0.57291701, -1.52808199,  0.90910515])

In [269]:
final_features_data = pd.DataFrame()
final_features_data = [np.array(final_features_normalise)]
final_features_data
y_pred=reg.predict(final_features_data)
y_pred

array([49.62399375])

In [270]:
int_features = [26,1,226]

In [271]:
type(int_features)

list