# Importing the library

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [2]:
dataA = pd.read_csv('belgium.csv')
dataA.head()

Unnamed: 0,Row Labels,Count of Property Type
0,Apartment,5698.0
1,Bed & Breakfast,178.0
2,Boat,2.0
3,Boutique hotel,3.0
4,Cabin,7.0


In [3]:
dataA.shape

(25, 2)

In [4]:
dataA.columns

Index(['Row Labels', 'Count of Property Type'], dtype='object')

In [5]:
data = dataA[['Price','Room Type','Property Type','Bedrooms','Host Total Listings Count']]
data

KeyError: "None of [Index(['Price', 'Room Type', 'Property Type', 'Bedrooms',\n       'Host Total Listings Count'],\n      dtype='object')] are in the [columns]"

In [None]:
data.info()

In [None]:
data['Price']=data['Price'].fillna(data['Price'].mean())
data['Host Total Listings Count']=data['Host Total Listings Count'].fillna(data['Host Total Listings Count'].mean())
data['Bedrooms']=data['Bedrooms'].fillna(data['Bedrooms'].mean())

In [None]:
data.info()

In [None]:
# data = data[(data['Property Type'] == 'Apartment') | (data['Property Type'] == 'House') | (data['Property Type'] == 'Loft') | (data['Property Type'] == 'Bed & Breakfast')]
# data

In [None]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data['Room Type']=le.fit_transform(data['Room Type'])
data['Property Type']=le.fit_transform(data['Property Type'])

In [None]:
Bedrooms_mean = data['Bedrooms'].mean()
print(Bedrooms_mean)
data["Bedrooms"].fillna(Bedrooms_mean, inplace = True)

In [None]:
data

In [None]:
X = data.drop(columns = ['Property Type'])
y = data['Property Type']

In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
print(X.shape, y.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(X_train)
scaled_x_test = scaler.transform(X_test)

# KNN classifier model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_model = KNeighborsClassifier(n_neighbors = 4)
knn_model.fit(scaled_x_train,y_train)

In [None]:
y_pred = knn_model.predict(scaled_x_test)
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
accuracy_score(y_test,y_pred)
print(classification_report(y_test,y_pred))

# Elbow method for chossing the value of k

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
test_error_rates = []
for k in range(1,30):
    knn_model = KNeighborsClassifier(n_neighbors = k)
    knn_model.fit(scaled_x_train,y_train)
    y_pred_test = knn_model.predict(scaled_x_test)
    test_error = 1 - accuracy_score(y_test,y_pred_test)
    test_error_rates.append(test_error)

In [None]:
test_error_rates

In [None]:
plt.figure(figsize =(8,7),dpi = 200)
plt.plot(range(1,30) , test_error_rates, label = 'Test_error')
plt.legend()
plt.xlabel('k.value')
plt.ylabel('error rate')

From the above graph we can say optimal value for k is 6 because the graph of error rate increases after that

# Full cross validation grid search for k values

In [None]:
scaler = StandardScaler()
knn = KNeighborsClassifier()
knn.get_params().keys()

In [None]:
operation = [('scaler',scaler),('knn',knn)]

In [None]:
from sklearn.pipeline import Pipeline
pipe = Pipeline(operation)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
k_values = list(range(1,20))
k_values

In [None]:
param_grid = {'knn__n_neighbors': k_values}
full_cv_classifier = GridSearchCV(pipe,param_grid,cv = 11, scoring = 'accuracy' )
full_cv_classifier.fit(X_train,y_train)

In [None]:
full_cv_classifier.best_estimator_.get_params()

# Optimal value for k is 6 from the above method so we can bulid the model according to this

In [None]:
scaler = StandardScaler()
knn10 = KNeighborsClassifier(n_neighbors = 10)
operation = [('scaler',scaler),('knn10',knn10)]

In [None]:
pipe = Pipeline(operation)
pipe.fit(X_train,y_train)

In [None]:
y_pred_knn = knn10.predict(scaled_x_test)

# Model Evaluation

In [None]:
print(classification_report(y_test,y_pred_knn))

In [None]:
conf_mat = confusion_matrix(y_test,y_pred_knn)
conf_mat

In [None]:
import sklearn
acc = sklearn.metrics.accuracy_score(y_test, y_pred_knn)
prec = round(sklearn.metrics.precision_score(y_test, y_pred_knn ,average= 'micro'),2)
rec = round(sklearn.metrics.recall_score(y_test, y_pred_knn ,average= 'micro'),2)
f1 = round(sklearn.metrics.f1_score(y_test, y_pred_knn,average='micro'),2)
print('accuracy =',acc, ' precision =', prec, ' recall =', rec, ' f1 =',f1)

In [None]:
pd.DataFrame({'Actual':y_test,'Predicted':y_pred_knn})

# Naive_bayes model

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [None]:
y_pred_NB = gnb.predict(X_test)

# Model Evaluation

In [None]:
print(classification_report(y_test,y_pred_NB))

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_NB)
cm

In [None]:
import sklearn
acc = sklearn.metrics.accuracy_score(y_test, y_pred_NB)
prec = round(sklearn.metrics.precision_score(y_test, y_pred_NB ,average= 'micro'),2)
rec = round(sklearn.metrics.recall_score(y_test, y_pred_NB ,average= 'micro'),2)
f1 = round(sklearn.metrics.f1_score(y_test, y_pred_NB,average='micro'),2)
print('accuracy =',acc, ' precision =', prec, ' recall =', rec, ' f1 =',f1)

In [None]:
pd.DataFrame({'Actual':y_test,'Predicted':y_pred_NB})

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
y_pred_l = logreg.predict(X_test)

# Model Evaluation

In [None]:
print(classification_report(y_test,y_pred_l))

In [None]:
cm = confusion_matrix(y_test, y_pred_l)
cm

In [None]:
import sklearn
acc = sklearn.metrics.accuracy_score(y_test, y_pred_l)
prec = round(sklearn.metrics.precision_score(y_test, y_pred_l ,average= 'micro'),2)
rec = round(sklearn.metrics.recall_score(y_test, y_pred_l ,average= 'micro'),2)
f1 = round(sklearn.metrics.f1_score(y_test, y_pred_l,average='micro'),2)
print('accuracy =',acc, ' precision =', prec, ' recall =', rec, ' f1 =',f1)

In [None]:
pd.DataFrame({'Actual':y_test,'Predicted':y_pred_l})

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
rf = RandomForestClassifier(random_state=42)      
rf = rf.fit(X_train,y_train)
y_pred_RF=rf.predict(X_test)

# Model Evaluation

In [None]:
print(classification_report(y_test,y_pred_RF))

In [None]:
cm = confusion_matrix(y_test, y_pred_RF)
cm

In [None]:
import sklearn
acc = sklearn.metrics.accuracy_score(y_test, y_pred_RF)
prec = round(sklearn.metrics.precision_score(y_test, y_pred_RF ,average= 'micro'),2)
rec = round(sklearn.metrics.recall_score(y_test, y_pred_RF ,average= 'micro'),2)
f1 = round(sklearn.metrics.f1_score(y_test, y_pred_RF,average='micro'),2)
print('accuracy =',acc, ' precision =', prec, ' recall =', rec, ' f1 =',f1)

In [None]:
pd.DataFrame({'Actual':y_test,'Predicted':y_pred_RF})

In [None]:
df_res = pd.DataFrame({'Actual':y_test,'Predicted_Knn':y_pred_knn,'Predicted_logistic':y_pred_l,'Predicted_NB':y_pred_NB,'Predicted_RF':y_pred_RF})

In [None]:
df_res.head(15)