In [29]:
from airbnb_mysql import *
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [30]:
# Connection settings for the MySQL instance that holds the Airbnb data.
# NOTE(review): credentials are hard-coded here; prefer environment variables
# or getpass before sharing this notebook.
port_name = '3306'
user_name = 'root'
user_password = 'root'
host_name = 'localhost'
db = 'airbnb'

# Pass the password variable: the previous call passed `user_name` as the
# password, which only worked because both values happen to be 'root'.
connection = create_db_connection(user=user_name, password=user_password, host=host_name, db_name=db, port=port_name)

query = """
        SELECT * FROM rented_apartment;
"""

# Column labels for the resulting DataFrame.
# NOTE(review): presumably these must match the column order returned by
# `SELECT *` — confirm against the table schema / db_to_df.
col_name = ["id", "neighbour", "lat", "long", "r_type", "price", "month", "city"]
data = db_to_df(connection, query, col_name)

MySQL Database connection successful


In [31]:
# Number of listings per neighbourhood.
# NOTE(review): accented names in the recorded output look mis-decoded
# (e.g. 'EntrepÃ´t'); presumably a charset mismatch when reading from the DB — confirm.
data['neighbour'].value_counts()


Buttes-Montmartre      10450
Popincourt              8855
Vaugirard               7244
Batignolles-Monceau     6774
EntrepÃ´t               6314
MÃ©nilmontant           5525
Buttes-Chaumont         5471
Passy                   5398
Temple                  4329
OpÃ©ra                  4269
Reuilly                 4027
Observatoire            3706
Gobelins                3356
Bourse                  3181
PanthÃ©on               3166
HÃ´tel-de-Ville         2980
Luxembourg              2754
Palais-Bourbon          2729
Ãx89lysÃ©e              2660
Louvre                  1958
Name: neighbour, dtype: int64

# Preprocessing


In [3]:
# Work on a copy of the raw data without the "id" and "city" columns.
df = data.drop(columns=["id", "city"])

In [4]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import RobustScaler

In [5]:
def encoding(df_col):
    """One-hot encode a column of labels with ``LabelBinarizer``.

    Parameters
    ----------
    df_col : array-like
        Labels to encode (e.g. a pandas Series).

    Returns
    -------
    X_t : numpy.ndarray
        Indicator matrix with one column per entry of ``column``.
    column : numpy.ndarray
        The fitted ``encoder.classes_``, in the same order as the columns
        of ``X_t``.
    """
    encoder = LabelBinarizer()
    X_t = encoder.fit_transform(df_col)
    column = encoder.classes_
    # LabelBinarizer collapses a two-class input into a single 0/1 column
    # (1 marks classes_[1]). Expand it to two indicator columns so the matrix
    # width always equals len(column) and callers can build a DataFrame
    # from the returned pair without a shape mismatch.
    if len(column) == 2 and X_t.shape[1] == 1:
        X_t = np.hstack([1 - X_t, X_t])
    return X_t, column

In [6]:
def scaling(df_col):
    """Robust-scale a single column and return it as a one-column 2-D array.

    The column is converted to NumPy, reshaped to ``(-1, 1)``, and passed
    through a ``RobustScaler`` fitted on those same values.
    """
    column_values = df_col.to_numpy().reshape(-1, 1)
    return RobustScaler().fit_transform(column_values)

In [7]:
# One-hot encode each categorical column. The indicator frames reuse df's
# index so the later column-wise concat lines up row-for-row even when df's
# index is not the default 0..n-1 range.
encod_room, room_col = encoding(df['r_type'])
room = pd.DataFrame(encod_room, columns=room_col, index=df.index)

encod_month, month_col = encoding(df['month'])
month = pd.DataFrame(encod_month, columns=month_col, index=df.index)

encod_neigh, neigh_col = encoding(df['neighbour'])
neigh = pd.DataFrame(encod_neigh, columns=neigh_col, index=df.index)

# Robust-scaled price, used as the regression target further down.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling — confirm this is intended.
scaled_price = scaling(df['price'])

In [14]:
# Join the raw frame with its one-hot encoded columns, then strip every raw
# column so only the indicator columns remain; the scaled price is appended
# last as the target column.
frames = [df, room, neigh, month]
clean_df = pd.concat(frames, axis=1).drop(
    columns=["neighbour", "r_type", "price", "month", "lat", "long"]
)
clean_df['scaled_price'] = scaled_price

# Train Test Split


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# 70/30 split with a fixed seed, then separate features from the target
# ('scaled_price') in each partition.
train_set, test_set = train_test_split(clean_df, test_size=0.3, random_state=0)
X_train, y_train = train_set.drop(columns='scaled_price'), train_set['scaled_price']
X_test, y_test = test_set.drop(columns='scaled_price'), test_set['scaled_price']

In [17]:
# Display the training partition (indicator features plus the scaled_price target) as a sanity check.
train_set

Unnamed: 0,Entire home/apt,Hotel room,Private room,Shared room,Batignolles-Monceau,Bourse,Buttes-Chaumont,Buttes-Montmartre,EntrepÃ´t,Gobelins,...,february,january,july,june,march,may,november,october,september,scaled_price
60464,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,-0.433333
47267,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,-0.083333
66440,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,2.616667
30967,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,-0.500000
8115,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,3.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0.333333
45891,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,-0.333333
42613,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,-0.583333
43567,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,-0.583333


# Gradient boosting



In [18]:
from sklearn.ensemble import GradientBoostingRegressor

In [19]:

# Gradient-boosted decision stumps (max_depth=1), 100 boosting rounds,
# learning rate 1.0, fixed seed for reproducibility.
reg = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
reg.fit(X_train, y_train)


In [21]:
# Evaluate on the held-out test split; for scikit-learn regressors `score` returns R^2.
reg.score(X_test, y_test)

0.030732015433430293