<a href="https://colab.research.google.com/github/hanifaawd/airbnb-predict/blob/main/airbnb_amsterdam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### Import the libraries used in this notebook

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
### Read the dataset from airbnb_amsterdam.xlsx and display the first 5 rows

dataset_path = 'airbnb_amsterdam.xlsx'
df = pd.read_excel(dataset_path)
df.head(n=5)

Unnamed: 0,accommodates,bathrooms,bedrooms,calculated_host_listings_count,guests_included,host_listings_count,latitude,longitude,minimum_nights,number_of_reviews,d_centre,instant_bookable_t,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price_category
0,3,1.0,2,1,2,1,52.369802,4.85893,3,3,2.938979,0,1,0,0,Low
1,4,1.0,3,1,1,1,52.353641,4.90481,2,27,2.862356,0,1,0,0,Medium
2,2,1.0,1,2,2,2,52.367143,4.982494,2,13,5.792076,0,0,1,0,Low
3,3,1.0,1,1,2,1,52.351166,4.894002,3,6,3.135803,1,1,0,0,Low
4,2,1.0,1,1,1,1,52.354419,4.934173,6,34,3.624058,0,0,1,0,Low


In [None]:
### Data preprocessing: check whether the dataset has missing or duplicated values

# Per-column count of null cells.
missing_data = df.isna().sum()
# Number of rows that repeat an earlier row.
duplicated_data = df.duplicated().sum()
print(missing_data, duplicated_data, sep='\n')

accommodates                      0
bathrooms                         0
bedrooms                          0
calculated_host_listings_count    0
guests_included                   0
host_listings_count               0
latitude                          0
longitude                         0
minimum_nights                    0
number_of_reviews                 0
d_centre                          0
instant_bookable_t                0
room_type_Entire home/apt         0
room_type_Private room            0
room_type_Shared room             0
price_category                    0
dtype: int64
0


In [None]:
### Drop the columns that are less relevant to the desired prediction
# Columns are dropped by name and without inplace=True so the cell is idempotent:
# the previous positional drop (df.columns[[3, 5, ...]]) raised an IndexError when the
# cell was re-run on the already-reduced frame. errors='ignore' makes a re-run a no-op.

irrelevant_columns = [
    'calculated_host_listings_count',
    'host_listings_count',
    'latitude',
    'longitude',
    'instant_bookable_t',
    'room_type_Entire home/apt',
    'room_type_Private room',
    'room_type_Shared room',
]
df = df.drop(columns=irrelevant_columns, errors='ignore')
df.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,guests_included,minimum_nights,number_of_reviews,d_centre,price_category
0,3,1.0,2,2,3,3,2.938979,Low
1,4,1.0,3,1,2,27,2.862356,Medium
2,2,1.0,1,2,2,13,5.792076,Low
3,3,1.0,1,2,3,6,3.135803,Low
4,2,1.0,1,1,6,34,3.624058,Low


In [None]:
### Label-encode the price_category column
# The fitted encoder is kept in price_encoder (instead of being discarded) so that
# integer predictions can be mapped back to category names with
# price_encoder.inverse_transform(...) and the mapping inspected via
# price_encoder.classes_.

price_encoder = LabelEncoder()
df['price_category'] = price_encoder.fit_transform(df['price_category'])
df.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,guests_included,minimum_nights,number_of_reviews,d_centre,price_category
0,3,1.0,2,2,3,3,2.938979,1
1,4,1.0,3,1,2,27,2.862356,2
2,2,1.0,1,2,2,13,5.792076,1
3,3,1.0,1,2,3,6,3.135803,1
4,2,1.0,1,1,6,34,3.624058,1


In [None]:
### Split the dataset into 2 parts: 80% training data and 20% test data
# Features are every column except the target, selected by name rather than by the
# positional slice [:7], so the split does not depend on the target's column position.

target_column = 'price_category'
x = df.drop(columns=[target_column])
y = df[target_column]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

In [None]:
### Fit the naive Bayes model and run predictions on the test split

# GaussianNB.fit returns the fitted estimator, so construction and fitting are chained.
model = GaussianNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Show the predicted labels first, then the true labels.
print(y_pred, y_test, sep='\n')

[1 0 2 3 2 0 1 0 3 2 3 3 2 1 1 2 1 1 0 3 3 2 1 1 1 1 1 1 1 1 3 1 1 2 2 2 1
 2 1 3 3 1 3 2 3 1 3 3 0 3 3 1 1 1 2 1 1 0 3 0 1 2 1 0 1 3 3 1 3 1 1 2 1 2
 0 2 1 1 1 3 1 1 1 3 3 0 1 1 1 3 3 0 0 1 0 3 0 1 2 2 1 2 1 3 0 3 1 0 1 3 3
 2 3 2 3 3 3 0 1 1 1 3 3 1 3 1 2 1 3 2 3 3 3 1 3 2 3 2 2 3 1 0 1 2 1 3 1 3
 2 1 3 1 3 0 0 0 2 1 1 2]
110    0
503    0
113    1
413    3
775    1
      ..
513    0
652    2
767    3
625    2
359    3
Name: price_category, Length: 160, dtype: int64


In [None]:
### Compute the accuracy score

# Re-run the prediction on the test split, then compare it with the true labels.
y_pred = model.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.55

In [None]:
### Compute the confusion matrix
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are true labels and
# columns are predicted labels. The arguments were previously passed in the opposite
# order, which yields the transpose of the conventional matrix.

confusion_matrix(y_test, y_pred)

array([[12,  1,  1,  7],
       [ 1, 43, 16,  2],
       [ 3,  9,  7, 11],
       [14,  0,  7, 26]])

In [None]:
### Compute the classification report

# Build the text report from the true and predicted labels, then print it.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.57      0.40      0.47        30
           1       0.69      0.81      0.75        53
           2       0.23      0.23      0.23        31
           3       0.55      0.57      0.56        46

    accuracy                           0.55       160
   macro avg       0.51      0.50      0.50       160
weighted avg       0.54      0.55      0.54       160



In [None]:
# Display the first five rows of df again.
df.head(n=5)

Unnamed: 0,accommodates,bathrooms,bedrooms,guests_included,minimum_nights,number_of_reviews,d_centre,price_category
0,3,1.0,2,2,3,3,2.938979,1
1,4,1.0,3,1,2,27,2.862356,2
2,2,1.0,1,2,2,13,5.792076,1
3,3,1.0,1,2,3,6,3.135803,1
4,2,1.0,1,1,6,34,3.624058,1


In [None]:
# Predict a single sample:
#   accommodates = 2, bathrooms = 1, bedrooms = 1, guests included = 1,
#   minimum nights = 2, number of reviews = 2, d centre = 3.135971

data_sample = [[2, 1, 1, 1, 2, 2, 3.135971]]
# The model was fitted on a DataFrame, so predicting on a bare list triggers
# scikit-learn's "X does not have valid feature names" warning. Wrapping the sample
# in a DataFrame with the training columns avoids it.
sample_frame = pd.DataFrame(data_sample, columns=x_train.columns)
prediction = model.predict(sample_frame)
print("Data Sample = %s, Predicted = %s" % (data_sample[0], prediction[0]))

### Predicted = 1 -> Low

Data Sample = [2, 1, 1, 1, 2, 2, 3.135971], Predicted = 1


  "X does not have valid feature names, but"


In [None]:
# Predict a single sample:
#   accommodates = 4, bathrooms = 1, bedrooms = 2, guests included = 2,
#   minimum nights = 2, number of reviews = 5, d centre = 2.988217

data_sample = [[4, 1, 2, 2, 2, 5, 2.988217]]
# The model was fitted on a DataFrame, so predicting on a bare list triggers
# scikit-learn's "X does not have valid feature names" warning. Wrapping the sample
# in a DataFrame with the training columns avoids it.
sample_frame = pd.DataFrame(data_sample, columns=x_train.columns)
prediction = model.predict(sample_frame)
print("Data Sample = %s, Predicted = %s" % (data_sample[0], prediction[0]))

### Predicted = 2 -> Medium

Data Sample = [4, 1, 2, 2, 2, 5, 2.988217], Predicted = 2


  "X does not have valid feature names, but"
