In [22]:
import pandas as pd
import numpy as np

In [44]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [72]:
data=pd.read_csv("train.csv")

In [6]:
data.head()

Unnamed: 0,id,region,latitude,longitude,accommodation_type,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability
0,13232,Manhattan,40.71854,-74.00439,Entire home/apt,170,5,7,0.56,929983,1,0
1,246,Brooklyn,40.64446,-73.9503,Entire home/apt,65,3,238,2.3,281764,1,0
2,19091,Queens,40.78573,-73.81062,Private room,85,1,0,,19923341,1,1
3,34305,Manhattan,40.73863,-73.98002,Private room,210,30,0,,200380610,65,1
4,444,Manhattan,40.82426,-73.9463,Shared room,75,3,38,0.42,745069,3,1


In [7]:
data.describe()

Unnamed: 0,id,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability
count,2870.0,2870.0,2870.0,2870.0,2870.0,2870.0,2194.0,2870.0,2870.0,2870.0
mean,26760.657143,40.731224,-73.950158,195.943206,11.530314,16.315331,1.157502,72021950.0,8.411498,0.498606
std,14140.930062,0.054942,0.049745,406.184714,37.972339,32.481722,1.355028,80765160.0,27.105522,0.500085
min,0.0,40.50708,-74.24285,10.0,1.0,0.0,0.01,2787.0,1.0,0.0
25%,15931.75,40.692462,-73.984003,75.0,1.0,1.0,0.24,7388002.0,1.0,0.0
50%,28946.5,40.72825,-73.95672,120.0,3.0,4.0,0.65,33527080.0,1.0,0.0
75%,38478.5,40.762658,-73.934202,200.0,6.0,16.0,1.53,120762500.0,3.0,1.0
max,48893.0,40.89873,-73.72173,9999.0,999.0,395.0,10.37,273812300.0,327.0,1.0


In [8]:
data.dtypes
# it seems like categorical variables are region, accomodation_type, owner_id

id                       int64
region                  object
latitude               float64
longitude              float64
accommodation_type      object
cost                     int64
minimum_nights           int64
number_of_reviews        int64
reviews_per_month      float64
owner_id                 int64
owned_hotels             int64
yearly_availability      int64
dtype: object

In [9]:
# check whether there's class imbalance 
data['yearly_availability'].value_counts()

0    1439
1    1431
Name: yearly_availability, dtype: int64

In [33]:
# Checking if any column have nulls
data.isnull().sum(axis=0)

id                     0
region                 0
latitude               0
longitude              0
accommodation_type     0
cost                   0
minimum_nights         0
number_of_reviews      0
reviews_per_month      0
owner_id               0
owned_hotels           0
yearly_availability    0
dtype: int64

In [73]:
data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

In [12]:
# it looks like the data is within nyc, can drop latitude/longitude later
# can use target encoding for region
data['region'].value_counts()

Manhattan        1333
Brooklyn         1075
Queens            370
Bronx              78
Staten Island      14
Name: region, dtype: int64

In [15]:
# see how many unique owners are there
data['owner_id'].value_counts().unique().sum()
# high cardinality - target encoding

208

In [16]:
def target_encoding(df, cat):
    y_avg=data['yearly_availability'].mean()
    y_reg=data.groupby(cat)['yearly_availability'].mean()
    num_reg=data.groupby(cat).size()
    smooth=(num_reg*y_reg+10*y_avg)/(num_reg+10)
    cat_target=cat+'_target'
    data[cat_target]=data[cat].map(smooth)

In [74]:
target_encoding(data,'owner_id')

In [75]:
target_encoding(data,'region')

In [76]:
data.head()

Unnamed: 0,id,region,latitude,longitude,accommodation_type,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability,owner_id_target,region_target
0,13232,Manhattan,40.71854,-74.00439,Entire home/apt,170,5,7,0.56,929983,1,0,0.453278,0.488448
1,246,Brooklyn,40.64446,-73.9503,Entire home/apt,65,3,238,2.3,281764,1,0,0.453278,0.447913
2,19091,Queens,40.78573,-73.81062,Private room,85,1,0,0.0,19923341,1,1,0.544188,0.618384
3,34305,Manhattan,40.73863,-73.98002,Private room,210,30,0,0.0,200380610,65,1,0.871438,0.488448
4,444,Manhattan,40.82426,-73.9463,Shared room,75,3,38,0.42,745069,3,1,0.544188,0.488448


In [20]:
# low cardinality, can use one-hot encoding 
data['accommodation_type'].value_counts()

Private room       1225
Entire home/apt    1002
Shared room         643
Name: accommodation_type, dtype: int64

In [77]:
data=pd.get_dummies(data, prefix=['acc'], columns=['accommodation_type'],drop_first=True)

In [78]:
original_data=data.copy()

In [79]:
data = data.drop(['id','region','latitude','longitude','owner_id'], axis = 1)

In [80]:
data.head()

Unnamed: 0,cost,minimum_nights,number_of_reviews,reviews_per_month,owned_hotels,yearly_availability,owner_id_target,region_target,acc_Private room,acc_Shared room
0,170,5,7,0.56,1,0,0.453278,0.488448,0,0
1,65,3,238,2.3,1,0,0.453278,0.447913,0,0
2,85,1,0,0.0,1,1,0.544188,0.618384,1,0
3,210,30,0,0.0,65,1,0.871438,0.488448,1,0
4,75,3,38,0.42,3,1,0.544188,0.488448,0,1


In [81]:
original_data.head()

Unnamed: 0,id,region,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability,owner_id_target,region_target,acc_Private room,acc_Shared room
0,13232,Manhattan,40.71854,-74.00439,170,5,7,0.56,929983,1,0,0.453278,0.488448,0,0
1,246,Brooklyn,40.64446,-73.9503,65,3,238,2.3,281764,1,0,0.453278,0.447913,0,0
2,19091,Queens,40.78573,-73.81062,85,1,0,0.0,19923341,1,1,0.544188,0.618384,1,0
3,34305,Manhattan,40.73863,-73.98002,210,30,0,0.0,200380610,65,1,0.871438,0.488448,1,0
4,444,Manhattan,40.82426,-73.9463,75,3,38,0.42,745069,3,1,0.544188,0.488448,0,1


In [46]:
X=data.drop(columns='yearly_availability',axis=0)
y=data['yearly_availability']


In [45]:
scoring={'accuracy':make_scorer(accuracy_score),
        'f1_score':make_scorer(f1_score)}

In [125]:
'''
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
'''
svc=LinearSVC(verbose=0,dual=False)
naive=GaussianNB()
rf=RandomForestClassifier()
xgb=xgboost.XGBClassifier(eval_metric='auc',use_label_encoder=False)

In [126]:
def model_evaluation(model,X,y):
    model_cv=cross_validate(model,X,y,cv=10,scoring=scoring)
    scoring_table=pd.DataFrame({'Accuracy':model_cv['test_accuracy'].mean(),
                               'F1 Score': model_cv['test_f1_score'].mean()},
                              index=[str(model)])
    return scoring_table

In [127]:
model_evaluation(svc,X,y)

Unnamed: 0,Accuracy,F1 Score
LinearSVC(dual=False),0.997561,0.997554


In [56]:
model_evaluation(naive,X,y)

Unnamed: 0,Accuracy,F1 Score
GaussianNB(),0.951568,0.951461


In [57]:
model_evaluation(rf,X,y)

Unnamed: 0,Accuracy,F1 Score
RandomForestClassifier(),0.99547,0.995465


In [62]:
model_evaluation(xgb,X,y)

Unnamed: 0,Accuracy,F1 Score
"XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,\n colsample_bynode=None, colsample_bytree=None, eval_metric='auc',\n gamma=None, gpu_id=None, importance_type='gain',\n interaction_constraints=None, learning_rate=None,\n max_delta_step=None, max_depth=None, min_child_weight=None,\n missing=nan, monotone_constraints=None, n_estimators=100,\n n_jobs=None, num_parallel_tree=None, random_state=None,\n reg_alpha=None, reg_lambda=None, scale_pos_weight=None,\n subsample=None, tree_method=None, use_label_encoder=False,\n validate_parameters=None, verbosity=None)",0.996516,0.996501


In [129]:
# looking at the performances of these models, it seems like svc has the best performance 
# with xgb and rf followed behind closely.
svc.fit(X,y)

LinearSVC(dual=False)

In [63]:
test=pd.read_csv('test.csv')

In [68]:
test.head()

Unnamed: 0,id,region,latitude,longitude,accommodation_type,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels
0,19215,Brooklyn,40.70912,-73.94513,Shared room,135,2,22,0.66,4360212,1
1,36301,Brooklyn,40.57646,-73.96641,Entire home/apt,69,2,8,0.9,181356989,2
2,40566,Manhattan,40.76616,-73.98228,Private room,225,30,0,0.0,13773574,12
3,33694,Manhattan,40.77668,-73.94587,Shared room,125,30,9,0.82,6788748,1
4,28873,Manhattan,40.80279,-73.9445,Entire home/apt,43,1,13,0.72,105061915,2


In [69]:
data.head()

Unnamed: 0,cost,minimum_nights,number_of_reviews,reviews_per_month,owned_hotels,yearly_availability,owner_id_target,region_target,acc_Private room,acc_Shared room
0,170,5,7,0.56,1,0,0.453278,0.488448,0,0
1,65,3,238,2.3,1,0,0.453278,0.447913,0,0
2,85,1,0,0.0,1,1,0.544188,0.618384,1,0
3,210,30,0,0.0,65,1,0.871438,0.488448,1,0
4,75,3,38,0.42,3,1,0.544188,0.488448,0,1


In [65]:
test['reviews_per_month'] = test['reviews_per_month'].fillna(0)

In [70]:
test=pd.get_dummies(test, prefix=['acc'], columns=['accommodation_type'],drop_first=True)

In [83]:
original_data.head()

Unnamed: 0,id,region,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,yearly_availability,owner_id_target,region_target,acc_Private room,acc_Shared room
0,13232,Manhattan,40.71854,-74.00439,170,5,7,0.56,929983,1,0,0.453278,0.488448,0,0
1,246,Brooklyn,40.64446,-73.9503,65,3,238,2.3,281764,1,0,0.453278,0.447913,0,0
2,19091,Queens,40.78573,-73.81062,85,1,0,0.0,19923341,1,1,0.544188,0.618384,1,0
3,34305,Manhattan,40.73863,-73.98002,210,30,0,0.0,200380610,65,1,0.871438,0.488448,1,0
4,444,Manhattan,40.82426,-73.9463,75,3,38,0.42,745069,3,1,0.544188,0.488448,0,1


In [102]:
target_data_region=original_data[['region','region_target']].drop_duplicates()

In [106]:
target_data_owner_id=original_data[['owner_id','owner_id_target']].drop_duplicates()

In [103]:
join_df=pd.merge(test,target_data_region,on='region',how='left')

In [107]:
join_df_2=pd.merge(join_df,target_data_owner_id,on='owner_id',how='left')

In [113]:
# Checking if any column have nulls
join_df_2.isnull().sum(axis=0)

id                     0
region                 0
latitude               0
longitude              0
cost                   0
minimum_nights         0
number_of_reviews      0
reviews_per_month      0
owner_id               0
owned_hotels           0
acc_Private room       0
acc_Shared room        0
region_target          0
owner_id_target      522
dtype: int64

In [114]:
join_df_2['owner_id_target'].fillna(data['yearly_availability'].mean(),inplace=True)

In [115]:
join_df_2.shape

(718, 14)

In [116]:
join_df_2.head()

Unnamed: 0,id,region,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,acc_Private room,acc_Shared room,region_target,owner_id_target
0,19215,Brooklyn,40.70912,-73.94513,135,2,22,0.66,4360212,1,0,1,0.447913,0.498606
1,36301,Brooklyn,40.57646,-73.96641,69,2,8,0.9,181356989,2,0,0,0.447913,0.498606
2,40566,Manhattan,40.76616,-73.98228,225,30,0,0.0,13773574,12,1,0,0.488448,0.498606
3,33694,Manhattan,40.77668,-73.94587,125,30,9,0.82,6788748,1,0,1,0.488448,0.498606
4,28873,Manhattan,40.80279,-73.9445,43,1,13,0.72,105061915,2,0,0,0.488448,0.498606


In [117]:
test = join_df_2.drop(['id','region','latitude','longitude','owner_id'], axis = 1)

In [122]:
test.head()

Unnamed: 0,cost,minimum_nights,number_of_reviews,reviews_per_month,owned_hotels,acc_Private room,acc_Shared room,region_target,owner_id_target
0,135,2,22,0.66,1,0,1,0.447913,0.498606
1,69,2,8,0.9,2,0,0,0.447913,0.498606
2,225,30,0,0.0,12,1,0,0.488448,0.498606
3,125,30,9,0.82,1,0,1,0.488448,0.498606
4,43,1,13,0.72,2,0,0,0.488448,0.498606


In [123]:
join_df_2.head()

Unnamed: 0,id,region,latitude,longitude,cost,minimum_nights,number_of_reviews,reviews_per_month,owner_id,owned_hotels,acc_Private room,acc_Shared room,region_target,owner_id_target
0,19215,Brooklyn,40.70912,-73.94513,135,2,22,0.66,4360212,1,0,1,0.447913,0.498606
1,36301,Brooklyn,40.57646,-73.96641,69,2,8,0.9,181356989,2,0,0,0.447913,0.498606
2,40566,Manhattan,40.76616,-73.98228,225,30,0,0.0,13773574,12,1,0,0.488448,0.498606
3,33694,Manhattan,40.77668,-73.94587,125,30,9,0.82,6788748,1,0,1,0.488448,0.498606
4,28873,Manhattan,40.80279,-73.9445,43,1,13,0.72,105061915,2,0,0,0.488448,0.498606


In [132]:
predictions = list(svc.predict(test))
ids = list(join_df_2['id'])
results = pd.DataFrame(list(zip(ids,predictions)), columns = ['id','yearly_availability'])

In [133]:
results

Unnamed: 0,id,yearly_availability
0,19215,0
1,36301,0
2,40566,1
3,33694,0
4,28873,0
...,...,...
713,26801,0
714,20110,0
715,31383,0
716,47135,1


In [None]:
results.to_csv('submissions.csv',index=False)