In [11]:
# Libraries used
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

%matplotlib inline

In [2]:
# Load the dataset from the CSV under ./data and print a per-column summary
# (non-null counts and dtypes) as a quick sanity check before any encoding.
df = pd.read_csv('./data/dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28132 entries, 0 to 28131
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   host_response_time           28132 non-null  object 
 1   host_response_rate           28132 non-null  object 
 2   host_acceptance_rate         28132 non-null  object 
 3   host_is_superhost            28132 non-null  object 
 4   host_listings_count          28132 non-null  float64
 5   property_type                28132 non-null  object 
 6   room_type                    28132 non-null  object 
 7   accommodates                 28132 non-null  int64  
 8   bedrooms                     28132 non-null  float64
 9   beds                         28132 non-null  float64
 10  price                        28132 non-null  float64
 11  minimum_nights               28132 non-null  int64  
 12  review_scores_rating         28132 non-null  float64
 13  review_scores_ac

In [3]:
# One-hot encode the data: pd.get_dummies expands the object-dtype columns into
# indicator columns and passes the numeric columns through unchanged.
df_dum = pd.get_dummies(df)

In [4]:
# Target is the `price` column; predictors are every other encoded column.
y = df_dum['price']
X = df_dum.drop(columns='price')

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [5]:
# Standardize the training features (zero mean, unit variance per column).
# fit_transform returns a bare ndarray, so the frame is rebuilt with the
# original column names AND the original row labels: without `index=` the
# frame would get a fresh 0..n-1 index while y_train keeps the shuffled labels
# from train_test_split, and any label-aligned pandas operation between the
# two would silently misalign rows.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=list(X_train.columns),
    index=X_train.index,
)

# Univariate F-test of every feature against the target. `k=5` only controls
# what get_support()/transform() would keep; `scores_` holds a score for every
# input column.
kbest_features = SelectKBest(f_regression, k=5).fit(X_train, y_train)

In [6]:
# Pair each training column name with its F-score from the fitted selector.
kbest_columns = pd.DataFrame(X_train.columns)
kbest_scores_df = pd.DataFrame(kbest_features.scores_)

# Both single-column frames are labelled 0, so let concat renumber the columns
# to 0 and 1 and then give them readable names.
kbest_features_scores = pd.concat(
    [kbest_columns, kbest_scores_df],
    axis=1,
    ignore_index=True,
).rename(columns={0: 'Features', 1: 'Score'})

# Show the ten highest-scoring features.
kbest_features_scores.nlargest(n=10, columns='Score')

Unnamed: 0,Features,Score
1,accommodates,1918.631589
2,bedrooms,1415.570453
3,beds,986.950745
33,property_type_Secondary unit,914.112204
37,room_type_Private room,721.137489
35,room_type_Entire home/apt,706.142705
32,property_type_House,331.476505
29,property_type_Apartment,266.27816
31,property_type_Boutique hotel,177.618471
13,host_response_time_unknown,73.633584


In [7]:
# Build the modelling frame from the ten highest-scoring features plus the
# target. The names are taken from the score table instead of being copied by
# hand from its printed output, so the selection cannot drift out of sync with
# the computed scores if the data or the encoding changes.
top_features = kbest_features_scores.nlargest(10, 'Score')['Features'].tolist()
df_learn = df_dum[top_features + ['price']]

In [9]:
# Separate the target from the selected predictors.
y_learn = df_learn['price']
X_learn = df_learn.drop(columns='price')

# Standardize the predictors and restore the column names lost when the scaler
# returns an ndarray.
# NOTE(review): this re-fits the shared `scaler` object on every row of
# df_learn, replacing whatever it was fitted on before - confirm that fitting
# on all rows (rather than on a training split) is intended.
X_learn = pd.DataFrame(
    scaler.fit_transform(X_learn),
    columns=X_learn.columns.tolist(),
)

In [17]:
# Split the *unscaled* selected features. X_learn was standardized with
# statistics computed over all rows, so splitting it would let information
# from the held-out rows leak into preprocessing. Same test_size and seed as
# before, so the rows assigned to each split do not change.
X_learn_train, X_learn_test, y_learn_train, y_learn_test = train_test_split(
    df_learn.drop(columns='price'), y_learn, test_size=.15, random_state=42
)

# Learn the scaling parameters from the training split only, then apply the
# same transform to both splits. Column names and row labels are kept so the
# feature frames stay aligned with the target series.
learn_scaler = StandardScaler().fit(X_learn_train)
X_learn_train = pd.DataFrame(
    learn_scaler.transform(X_learn_train),
    columns=X_learn_train.columns,
    index=X_learn_train.index,
)
X_learn_test = pd.DataFrame(
    learn_scaler.transform(X_learn_test),
    columns=X_learn_test.columns,
    index=X_learn_test.index,
)

# Model 1: Linear Regression
regr_model = LinearRegression()
regr_model.fit(X_learn_train, y_learn_train)

# Predict on the held-out rows
y_learn_test_preds = regr_model.predict(X_learn_test)

# Score the model on the held-out rows (rounded to 4 decimals for display)
test_r2 = round(r2_score(y_learn_test, y_learn_test_preds), 4)
test_mse = round(mean_squared_error(y_learn_test, y_learn_test_preds), 4)

print('r-squared score for test set was {}.'.format(test_r2))
print('MSE score for test set was {}.'.format(test_mse))

r-squared score for test set was 0.2882.
MSE score for test set was 22549.5265.
