# Superhost Classification ML Models

In [1]:
import pprint
import pandas as pd

pp = pprint.PrettyPrinter(indent=4)

df = pd.read_csv("../csv_data/listings_classification.csv")
df.head()

Unnamed: 0,host_is_superhost,host_response_time,host_response_rate,host_listings_count,host_has_profile_pic,host_identity_verified,zipcode,latitude,longitude,is_location_exact,...,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,reviews_per_month,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,total_hosting_years,region_by_zipcode
0,SUPERHOST,within a few hours,1.0,3.0,1,1,78702,30.2775,-97.71398,0,...,strict_14_with_grace_period,0,0,0.19,3,2,1,0,11,East
1,SUPERHOST,within a few hours,1.0,3.0,1,1,78702,30.27577,-97.71379,1,...,strict_14_with_grace_period,0,0,0.07,3,2,1,0,11,East
2,SUPERHOST,within an hour,1.0,1.0,1,1,78702,30.26112,-97.73448,1,...,strict_14_with_grace_period,0,1,3.89,1,1,0,0,10,East
3,SUPERHOST,within an hour,1.0,2.0,1,1,78702,30.24773,-97.72584,1,...,moderate,1,1,2.4,2,2,0,0,8,East
4,SUPERHOST,within an hour,1.0,1.0,1,1,78702,30.26775,-97.72695,1,...,moderate,0,0,2.33,1,1,0,0,8,East


## Preprocessing of the data

In [2]:
categorical_cols = ["host_response_time", "zipcode", "property_type", "room_type", "bed_type", 
                    "cancellation_policy", "region_by_zipcode"]

# separate categorical features into own dataframe
categorical_df = df[categorical_cols]
categorical_df["zipcode"] = categorical_df["zipcode"].astype(str)

# separate numerical features into own dataframe
numerical_df = df.drop(categorical_cols, axis=1)
numerical_df_cols = list(numerical_df.columns)
numerical_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,host_is_superhost,host_response_rate,host_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,is_location_exact,accommodates,bathrooms,...,review_scores_value,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,reviews_per_month,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,total_hosting_years
0,SUPERHOST,1.0,3.0,1,1,30.2775,-97.71398,0,4,2.0,...,9.0,0,0,0,0.19,3,2,1,0,11
1,SUPERHOST,1.0,3.0,1,1,30.27577,-97.71379,1,2,1.0,...,9.0,0,0,0,0.07,3,2,1,0,11
2,SUPERHOST,1.0,1.0,1,1,30.26112,-97.73448,1,3,1.0,...,10.0,0,0,1,3.89,1,1,0,0,10
3,SUPERHOST,1.0,2.0,1,1,30.24773,-97.72584,1,3,1.0,...,10.0,1,1,1,2.4,2,2,0,0,8
4,SUPERHOST,1.0,1.0,1,1,30.26775,-97.72695,1,10,3.0,...,10.0,1,0,0,2.33,1,1,0,0,8


In [3]:
# dummy encode categorical features into numerical types with descriptive header
categorical_df_encoded = pd.get_dummies(categorical_df)
categorical_df_encoded.head()

Unnamed: 0,host_response_time_a few days or more,host_response_time_unknown,host_response_time_within a day,host_response_time_within a few hours,host_response_time_within an hour,zipcode_78701,zipcode_78702,zipcode_78703,zipcode_78704,zipcode_78705,...,cancellation_policy_super_strict_60,region_by_zipcode_Central,region_by_zipcode_East,region_by_zipcode_Far East,region_by_zipcode_North,region_by_zipcode_North Suburbs,region_by_zipcode_South,region_by_zipcode_South Central,region_by_zipcode_South East,region_by_zipcode_West
0,0,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [4]:
# combine dataframes into one again
listings_class_df = pd.concat([numerical_df, categorical_df_encoded], axis=1)
listings_class_df.head()

Unnamed: 0,host_is_superhost,host_response_rate,host_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,is_location_exact,accommodates,bathrooms,...,cancellation_policy_super_strict_60,region_by_zipcode_Central,region_by_zipcode_East,region_by_zipcode_Far East,region_by_zipcode_North,region_by_zipcode_North Suburbs,region_by_zipcode_South,region_by_zipcode_South Central,region_by_zipcode_South East,region_by_zipcode_West
0,SUPERHOST,1.0,3.0,1,1,30.2775,-97.71398,0,4,2.0,...,0,0,1,0,0,0,0,0,0,0
1,SUPERHOST,1.0,3.0,1,1,30.27577,-97.71379,1,2,1.0,...,0,0,1,0,0,0,0,0,0,0
2,SUPERHOST,1.0,1.0,1,1,30.26112,-97.73448,1,3,1.0,...,0,0,1,0,0,0,0,0,0,0
3,SUPERHOST,1.0,2.0,1,1,30.24773,-97.72584,1,3,1.0,...,0,0,1,0,0,0,0,0,0,0
4,SUPERHOST,1.0,1.0,1,1,30.26775,-97.72695,1,10,3.0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
# define feature set and outputs
X = listings_class_df.drop("host_is_superhost", axis=1)
y = listings_class_df["host_is_superhost"].values

## Split Training, Test Sets and Scale the Data

In [6]:
# split data into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 41)

In [7]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Decision Tree

In [8]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)

print(f"Training Data Score: {clf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score:  {clf.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score:  0.8503881439661256


In [9]:
clf_ranked_features = sorted(zip(clf.feature_importances_, X.columns.values), reverse=True)

print("Top 10 features are:\n")
pp.pprint(clf_ranked_features[:10]) # top 10 features

print("\nBottom 50 features are:\n")
pp.pprint(clf_ranked_features[-50:]) # bottom 10 features

Top 10 features are:

[   (0.301442893905848, 'number_of_reviews_ltm'),
    (0.11196718083863051, 'calculated_host_listings_count'),
    (0.09628135869360062, 'review_scores_rating'),
    (0.03879706771566033, 'total_hosting_years'),
    (0.036693408880191555, 'number_of_reviews'),
    (0.03428316973075462, 'host_listings_count'),
    (0.034056592384790516, 'calculated_host_listings_count_entire_homes'),
    (0.03097480212275662, 'host_response_rate'),
    (0.02477806963720157, 'price'),
    (0.023855586547785895, 'reviews_per_month')]

Bottom 50 features are:

[   (0.0, 'zipcode_78739'),
    (0.0, 'zipcode_78738'),
    (0.0, 'zipcode_78734'),
    (0.0, 'zipcode_78733'),
    (0.0, 'zipcode_78732'),
    (0.0, 'zipcode_78730'),
    (0.0, 'zipcode_78728'),
    (0.0, 'zipcode_78725'),
    (0.0, 'zipcode_78724'),
    (0.0, 'zipcode_78719'),
    (0.0, 'zipcode_78717'),
    (0.0, 'zipcode_78712'),
    (0.0, 'zipcode_78704'),
    (0.0, 'room_type_Shared room'),
    (0.0, 'room_type_Private roo

In [10]:
from sklearn.metrics import classification_report
print(classification_report(clf.predict(X_test), y_test))

              precision    recall  f1-score   support

        HOST       0.50      0.72      0.59      1305
   SUPERHOST       0.62      0.40      0.49      1529

    accuracy                           0.55      2834
   macro avg       0.56      0.56      0.54      2834
weighted avg       0.57      0.55      0.53      2834



## Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=20)
rf = rf.fit(X_train, y_train)

print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

Training Data Score: 0.6472941176470588
Testing Data Score: 0.6541990119971771


In [12]:
rf_ranked_features = sorted(zip(rf.feature_importances_, X.columns.values), reverse=True)
print("Top 10 features are:\n")
pp.pprint(rf_ranked_features[:10]) # top 10 features

print("\nBottom 20 features are:\n")
pp.pprint(rf_ranked_features[-20:]) # bottom 20 features

Top 10 features are:

[   (0.0720541818111767, 'number_of_reviews'),
    (0.06753734072974449, 'number_of_reviews_ltm'),
    (0.06413376777535637, 'reviews_per_month'),
    (0.047130544891275813, 'availability_365'),
    (0.042695272840785406, 'review_scores_rating'),
    (0.03923094588178423, 'host_response_time_unknown'),
    (0.03914005453717647, 'host_response_rate'),
    (0.037979715752641426, 'calculated_host_listings_count'),
    (0.034982574233973315, 'host_response_time_within an hour'),
    (0.03330881506887057, 'calculated_host_listings_count_entire_homes')]

Bottom 20 features are:

[   (4.37176001932122e-05, 'property_type_Bus'),
    (4.331601222910799e-05, 'property_type_Barn'),
    (4.2066333262656404e-05, 'bed_type_Airbed'),
    (3.649321622866755e-05, 'property_type_Boutique hotel'),
    (3.4046264106443684e-05, 'property_type_Yurt'),
    (2.9586874164898936e-05, 'cancellation_policy_super_strict_60'),
    (2.4732300947134725e-05, 'zipcode_78730'),
    (2.4303426184159

In [13]:
print(classification_report(rf.predict(X_test), y_test))

              precision    recall  f1-score   support

        HOST       0.93      0.90      0.91      1913
   SUPERHOST       0.80      0.85      0.82       921

    accuracy                           0.88      2834
   macro avg       0.86      0.87      0.87      2834
weighted avg       0.88      0.88      0.88      2834



## Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg = logreg.fit(X_train, y_train)

print(f"Training Data Score: {logreg.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {logreg.score(X_test_scaled, y_test)}")

Training Data Score: 0.3591764705882353
Testing Data Score: 0.3507410021171489


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
print(classification_report(logreg.predict(X_test), y_test))

              precision    recall  f1-score   support

        HOST       0.95      0.72      0.82      2463
   SUPERHOST       0.29      0.77      0.42       371

    accuracy                           0.73      2834
   macro avg       0.62      0.74      0.62      2834
weighted avg       0.87      0.73      0.77      2834



## SVM Classification

In [16]:
from sklearn.svm import SVC

svc = SVC(kernel = 'rbf')
svc = svc.fit(X_train, y_train)

print(f"Training Data Score: {svc.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {svc.score(X_test_scaled, y_test)}")

Training Data Score: 0.6472941176470588
Testing Data Score: 0.6541990119971771


In [17]:
print(classification_report(svc.predict(X_test), y_test))

              precision    recall  f1-score   support

        HOST       1.00      0.65      0.79      2834
   SUPERHOST       0.00      0.00      0.00         0

    accuracy                           0.65      2834
   macro avg       0.50      0.33      0.40      2834
weighted avg       1.00      0.65      0.79      2834



  _warn_prf(average, modifier, msg_start, len(result))
