In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Airbnb_Price_Prediction_cleanedData.csv')

In [3]:
x = df.drop('log_price', axis = 1)
y = df['log_price']

In [4]:
x.columns

Index(['property_type', 'room_type', 'accommodates', 'bathrooms',
       'cancellation_policy', 'bedrooms', 'beds'],
      dtype='object')

In [5]:
df.head(10)

Unnamed: 0,log_price,property_type,room_type,accommodates,bathrooms,cancellation_policy,bedrooms,beds
0,5.010635,0,0,3,1.0,2,1.0,1.0
1,5.129899,0,0,7,1.0,2,3.0,3.0
2,4.976734,0,0,5,1.0,1,1.0,3.0
3,6.620073,17,0,4,1.0,0,2.0,2.0
4,4.744932,0,0,2,1.0,1,0.0,1.0
5,4.442651,0,1,2,1.0,2,1.0,1.0
6,4.418841,0,0,3,1.0,1,1.0,1.0
7,4.787492,11,0,2,1.0,1,1.0,1.0
8,4.787492,17,1,2,1.0,1,1.0,1.0
9,3.583519,17,1,2,1.0,1,1.0,1.0


In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=99833214)

In [8]:
def randomForest():
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(n_estimators=500,min_samples_split=72,min_samples_leaf=3,max_features=7,max_depth=90,bootstrap=True)
    model.fit(x_train,y_train)
    return model

In [7]:
def evaluate_model(model, name):
    predictions = model.predict(x_test)
    
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    
    # create a dictionary to hold the result along with the model name
    result = {
        "name": name,
        "MAE": mean_absolute_error(y_test, predictions),
        "MSE": mean_squared_error(y_test, predictions),
        "RMSE": np.sqrt(mean_squared_error(y_test, predictions)),
        "R2 Score": r2_score(y_test, predictions)
    }
    
    return result

In [10]:
evaluate_model(randomForest(),"Random Forest")

{'name': 'Random Forest',
 'MAE': 0.36749367726961146,
 'MSE': 0.2359513887380514,
 'RMSE': 0.4857482771333846,
 'R2 Score': 0.5515306328552049}

In [None]:

import pickle
# create the file to store the model
# w: write
# b: binary data
file1 = open('price_predict_model_rf.pkl', 'wb')
model = randomForest()
# store the model using pickle
pickle.dump(model, file1)

# save the file
file1.close()

In [11]:
def simple_linear():
    from sklearn.linear_model import LinearRegression

    # create the model
    model = LinearRegression()

    # train the model
    model.fit(x_train, y_train)
    
    return model


In [12]:
def ridge():
    from sklearn.linear_model import Ridge

    # create the model
    model = Ridge()

    # train the model
    model.fit(x_train, y_train)
    
    return model

In [13]:
def lasso():
    from sklearn.linear_model import Lasso

    # create the model
    model = Lasso()

    # train the model
    model.fit(x_train, y_train)
    
    return model

In [14]:
def grdBoost():
    from sklearn.ensemble import GradientBoostingRegressor

    # create the model
    model = GradientBoostingRegressor()

    # train the model
    model.fit(x_train, y_train)

    return model

In [8]:
def xgboost():
    from xgboost import XGBRegressor

    # create a model
    model = XGBRegressor()

    # train the model
    model.fit(x_train, y_train)
    
    return model

In [16]:
def knn():
    from sklearn.neighbors import KNeighborsRegressor
    model = KNeighborsRegressor(n_neighbors = 13)
    model.fit(x_train,y_train)
    return model

In [17]:
def svm():
    from sklearn.svm import SVR
    model = SVR()
    model.fit(x_train,y_train)
    return model

In [18]:
def polynomial():
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    poly = PolynomialFeatures(degree = 4)
    X_poly = poly.fit_transform(x_train)

    poly.fit(X_poly, y_train)
    model = LinearRegression()
    model.fit(X_poly, y_train)
    return model

In [19]:
evaluate_model(simple_linear(),"simple linear")

{'name': 'simple linear',
 'MAE': 0.3803190826071599,
 'MSE': 0.25200157618468383,
 'RMSE': 0.5019975858355136,
 'R2 Score': 0.5210242754006289}

In [20]:
evaluate_model(ridge(),"ridge")

{'name': 'ridge',
 'MAE': 0.38031919783372437,
 'MSE': 0.2520021210101956,
 'RMSE': 0.5019981284927222,
 'R2 Score': 0.521023239858716}

In [21]:
evaluate_model(lasso(),"lasso")

{'name': 'lasso',
 'MAE': 0.5663872503362921,
 'MSE': 0.5261907151472683,
 'RMSE': 0.7253900434574962,
 'R2 Score': -0.0001230265338401093}

In [22]:
evaluate_model(grdBoost(),"Gradient boost")

{'name': 'Gradient boost',
 'MAE': 0.36828148558566165,
 'MSE': 0.23616708396848035,
 'RMSE': 0.48597025008582606,
 'R2 Score': 0.5511206640730587}

In [9]:
evaluate_model(xgboost(),"XGBoost")

{'name': 'XGBoost',
 'MAE': 0.3671176168945255,
 'MSE': 0.23652457201699756,
 'RMSE': 0.4863379195754713,
 'R2 Score': 0.550441191747264}

In [24]:
evaluate_model(knn(),"KNN")

{'name': 'KNN',
 'MAE': 0.3802949427910731,
 'MSE': 0.2532701092532842,
 'RMSE': 0.5032594850107489,
 'R2 Score': 0.5186131930776128}

In [25]:
evaluate_model(svm(),"SVM")

{'name': 'SVM',
 'MAE': 0.3693693751342988,
 'MSE': 0.2419739643156757,
 'RMSE': 0.49190849181090146,
 'R2 Score': 0.5400836111939872}

In [11]:
import pickle

# create the file to store the model
# w: write
# b: binary data

# file = open('price_predict_model.pkl', 'wb')
model = xgboost()
# store the model using pickle
# pickle.dump(model, file)

pickle.dump(model, open("pima.pickle.dat", "wb"))

# save the file
# file.close()

In [12]:
print(5)

5


In [13]:
loaded_model = pickle.load(open("pima.pickle.dat", "rb"))

In [15]:
y_pred = loaded_model.predict(x_test)

In [17]:
accuracy = r2_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

NameError: name 'r2_score' is not defined

In [None]:
rf_random.best_params_
# {'bootstrap': True,
#  'max_depth': 70,
#  'max_features': 'auto',
#  'min_samples_leaf': 4,
#  'min_samples_split': 10,
#  'n_estimators': 400}

In [None]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [None]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(train_features, train_labels)
base_accuracy = evaluate(base_model, test_features, test_labels)

In [None]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_features, test_labels)

In [None]:
features = x.columns
type(features)

In [None]:
import matplotlib.pyplot as plt

In [None]:
for feature in features:
    plt.hist(df[feature], 50)
    print(feature + ":")
    plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
 
# Read Data from CSv
 
# Initialise the Scaler
scaler = StandardScaler()
 
# To scale data
scaled_accomodates= scaler.fit_transform(df[['accommodates']])

In [None]:
type(df)

In [None]:
scaled_df = scaler.fit_transform(df)

In [None]:
scaled_df = pd.DataFrame(scaled_df)
type(scaled_df)

In [None]:
df.head()

In [None]:
scaled_df.head()

In [None]:
for feature in features:
    plt.hist(scaled_df[feature], 50)
    print(feature + ":")
    plt.show()

In [None]:
plt.hist(scaled_accomodates, 50)
plt.show()

In [None]:
plt.hist(df['accommodates'], 50)
plt.show()