In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import requests
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
import re, string, time
from sklearn.metrics import log_loss
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

#  Load data

In [2]:
data = pd.read_csv('preprocessed.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41011 entries, 0 to 41010
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    41011 non-null  int64  
 1   bathrooms                41011 non-null  float64
 2   bedrooms                 41011 non-null  int64  
 3   latitude                 41011 non-null  float64
 4   listing_id               41011 non-null  int64  
 5   longitude                41011 non-null  float64
 6   price                    41011 non-null  int64  
 7   interest_level           41011 non-null  int64  
 8   hour                     41011 non-null  int64  
 9   num_of_photos            41011 non-null  int64  
 10  num_of_features          41011 non-null  int64  
 11  len_of_description       41011 non-null  int64  
 12  price_per_bedroom        41011 non-null  float64
 13  price_per_bathroom       41011 non-null  float64
 14  price_per_bed_bath_roo

# Common Functions


In [4]:
def _preprocess(dtrain, dtest):
    # replace np.inf to np.nan
    # dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    # dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest


def _preprocess_log(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest

# passing split_data according to features
def train_cv(clf, model, split_data, preprocess = 'linear'):
    X_train, X_test, y_train, y_test = split_data
    print()
    if preprocess == 'log':
        X_train, X_test = _preprocess_log(X_train, X_test)
    elif preprocess == 'linear':
        X_train, X_test = _preprocess(X_train, X_test)
    elif preprocess == 'no_preprocess': 
        #use original data
        pass
        
    cv_scores, n_folds = [], 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=816)
   
    
    for i, (train_ind, val_ind) in enumerate(skf.split(X_train, y_train)):
        print("Running Fold", i + 1, "/", n_folds)
        start = time.time()
        
        train_x, val_x = X_train.iloc[train_ind, :], X_train.iloc[val_ind, :]
        train_y, val_y = y_train.iloc[train_ind], y_train.iloc[val_ind]
        if model == 'svm':
            clf, train_loss, val_loss = svm_run_model(svm_clf, (train_x, train_y), (val_x, val_y))
        if model == 'lr':
            clf, train_loss, val_loss = lr_run_model(lr_clf, (train_x, train_y), (val_x, val_y))
        if model == 'dt':
            clf, train_loss, val_loss = dt_run_model(dt_clf, (train_x, train_y), (val_x, val_y))
        
        print("train_loss: {0:.6f}, val_loss: {1:.6f}".format(train_loss, val_loss), end="\t")
        
        end = time.time()
        m, s = divmod(end-start, 60)
        h, m = divmod(m, 60)

        print("time elapsed: %d:%02d:%02d" % (h, m, s))
        y_pred = clf.predict(val_x)
        accuracy_score = metrics.accuracy_score(val_y, y_pred)
        f_score = metrics.f1_score(val_y, y_pred, average='macro')
        cv_scores.append([train_loss, val_loss, f_score, accuracy_score])
        
        print("accuracy score: ", accuracy_score)
        print("f score: ", f_score)
        
    mean_train_loss = np.mean([cv_scores[i][0] for i in range(len(cv_scores))])
    mean_val_loss = np.mean([cv_scores[i][1] for i in range(len(cv_scores))])
    
    print("train_loss mean: {0:.6f}, val_loss mean: {1:.6f}".format(mean_train_loss, mean_val_loss))

    return clf, cv_scores

# Training the models

## 1. SVM

In [5]:
svm_features = ['price','bedrooms','bathrooms','latitude','longitude']
svm_x = data[svm_features]
svm_y = data['interest_level']
#svm_x_train,svm_x_test,svm_y_train,svm_y_test = train_test_split(svm_x,svm_y)

In [6]:
def svm_split_data():
    return train_test_split(svm_x,svm_y)

In [7]:
def svm_run_model(clf, train_data,test_data=None):
    # print(test_data)
    if test_data:
        clf.fit(train_data[0],train_data[1])
        y_train_predict, y_test_predict = clf.predict_proba(train_data[0]), clf.predict_proba(test_data[0])
        y_train_loss, y_test_loss = log_loss(train_data[1],y_train_predict), log_loss(test_data[1], y_test_predict)
        return clf, y_train_loss, y_test_loss
    else:
        clf.fit(train_data[0],train_data[1])
        y_train_predict = clf.predict_proba(train_data[0])
        y_train_loss = log_loss(train_data[1], y_train_predict)
        return clf, y_train_loss

In [8]:
# init svm clf
clf_init = LinearSVC()
params = {'penalty': 'l2',
        'loss': 'squared_hinge',
        'C':0.01,     
        'multi_class': 'ovr',
        'fit_intercept': True,
        'verbose': 0,
        'random_state': 36683,
        }
clf_init.set_params(**params)
svm_clf = CalibratedClassifierCV(clf_init)


svm_clf, svm_cv_scores = train_cv(svm_clf, 'svm',svm_split_data(), preprocess='log')



Running Fold 1 / 5
train_loss: 0.730135, val_loss: 0.736924	time elapsed: 0:00:00
accuracy score:  0.6875812743823146
f score:  0.3226797247514512
Running Fold 2 / 5
train_loss: 0.732083, val_loss: 0.729340	time elapsed: 0:00:00
accuracy score:  0.6898569570871261
f score:  0.31893650919231514
Running Fold 3 / 5
train_loss: 0.731297, val_loss: 0.732260	time elapsed: 0:00:00
accuracy score:  0.6874187256176854
f score:  0.3186379682819483
Running Fold 4 / 5
train_loss: 0.731517, val_loss: 0.731499	time elapsed: 0:00:00
accuracy score:  0.6904568362867827
f score:  0.32442087219187976
Running Fold 5 / 5
train_loss: 0.732228, val_loss: 0.728817	time elapsed: 0:00:00
accuracy score:  0.6919200130060152
f score:  0.32963058305467136
train_loss mean: 0.731452, val_loss mean: 0.731768


## 2. Logistic Regression

In [9]:
lr_features = ['bathrooms', 'bedrooms', 'price']
X = data[lr_features]
y = data['interest_level']

X_train, X_test, y_train, y_test = train_test_split(X,y)

In [10]:
def split_data():
    return train_test_split(X,y)

def lr_run_model(clf, dtrain, dtest=None):
    if dtest:
        clf.fit(dtrain[0], dtrain[1])
        y_train_pred, y_test_pred = clf.predict_proba(dtrain[0]), clf.predict_proba(dtest[0])
        y_train_loss, y_test_loss = log_loss(dtrain[1], y_train_pred), log_loss(dtest[1], y_test_pred)
        return clf, y_train_loss, y_test_loss
    else:
        clf.fit(dtrain[0], dtrain[1])
        y_train_pred = clf.predict_proba(dtrain[0])
        y_train_loss = log_loss(dtrain[1], y_train_pred)
        return clf, y_train_loss

In [11]:
# init logistic regression clf
lr_clf = LogisticRegression()
params = {'C': 0.01,
        'solver': 'liblinear',
        'multi_class': 'ovr',
        'n_jobs': -1,
        'verbose': 1,
        'max_iter': 10000,
        'random_state': 36883
        }
lr_clf.set_params(**params)

train_cv(lr_clf, 'lr',split_data(), preprocess='no_preprocess')


Running Fold 1 / 5
[LibLinear]train_loss: 0.747840, val_loss: 0.742989	time elapsed: 0:00:00
accuracy score:  0.6874187256176854
f score:  0.30171406468256745
Running Fold 2 / 5
[LibLinear]train_loss: 0.747205, val_loss: 0.745384	time elapsed: 0:00:00
accuracy score:  0.6849804941482445
f score:  0.293253689658215
Running Fold 3 / 5
[LibLinear]train_loss: 0.746614, val_loss: 0.748528	time elapsed: 0:00:00
accuracy score:  0.6833550065019506
f score:  0.29386812125392264
Running Fold 4 / 5
[LibLinear]train_loss: 0.746491, val_loss: 0.748675	time elapsed: 0:00:00
accuracy score:  0.683628678263697
f score:  0.29534517913800823
Running Fold 5 / 5
[LibLinear]train_loss: 0.746282, val_loss: 0.749422	time elapsed: 0:00:00
accuracy score:  0.684604129409852
f score:  0.2942158851542486
train_loss mean: 0.746887, val_loss mean: 0.746999


(LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=10000,
                    multi_class='ovr', n_jobs=-1, penalty='l2',
                    random_state=36883, solver='liblinear', tol=0.0001,
                    verbose=1, warm_start=False),
 [[0.7478401467482448,
   0.7429889607293084,
   0.30171406468256745,
   0.6874187256176854],
  [0.7472050831900721,
   0.745383594213056,
   0.293253689658215,
   0.6849804941482445],
  [0.7466142808701104,
   0.7485278008592905,
   0.29386812125392264,
   0.6833550065019506],
  [0.7464913845162143,
   0.7486749688696462,
   0.29534517913800823,
   0.683628678263697],
  [0.7462818006740871,
   0.7494218763193425,
   0.2942158851542486,
   0.684604129409852]])

In [12]:
train_cv(lr_clf, 'lr',split_data(), preprocess='log')


Running Fold 1 / 5
[LibLinear]train_loss: 0.749131, val_loss: 0.743787	time elapsed: 0:00:00
accuracy score:  0.6911573472041612
f score:  0.3007470447733701
Running Fold 2 / 5
[LibLinear]train_loss: 0.747826, val_loss: 0.748326	time elapsed: 0:00:00
accuracy score:  0.6895318595578673
f score:  0.2964330033562418
Running Fold 3 / 5
[LibLinear]train_loss: 0.747317, val_loss: 0.750478	time elapsed: 0:00:00
accuracy score:  0.688556566970091
f score:  0.2918196748990775
Running Fold 4 / 5
[LibLinear]train_loss: 0.746857, val_loss: 0.752156	time elapsed: 0:00:00
accuracy score:  0.6849292797919038
f score:  0.2939744673641533
Running Fold 5 / 5
[LibLinear]train_loss: 0.748449, val_loss: 0.746282	time elapsed: 0:00:00
accuracy score:  0.6870427572752398
f score:  0.2962542253348985
train_loss mean: 0.747916, val_loss mean: 0.748206


(LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=10000,
                    multi_class='ovr', n_jobs=-1, penalty='l2',
                    random_state=36883, solver='liblinear', tol=0.0001,
                    verbose=1, warm_start=False),
 [[0.749130960960146,
   0.7437866895433026,
   0.3007470447733701,
   0.6911573472041612],
  [0.7478263283649799,
   0.7483261028053857,
   0.2964330033562418,
   0.6895318595578673],
  [0.7473165596219083,
   0.7504779722542395,
   0.2918196748990775,
   0.688556566970091],
  [0.746856943335572,
   0.7521558414603248,
   0.2939744673641533,
   0.6849292797919038],
  [0.7484493266396346,
   0.7462818838215943,
   0.2962542253348985,
   0.6870427572752398]])

In [None]:
lr_clf, lr_cv_scores = train_cv(lr_clf, 'lr',split_data(), preprocess='linear')

## 3. Decision tree

In [13]:
def dt_split_data(x, y):
    return train_test_split(x,y)

def dt_run_model(clf, dtrain, dtest=None):
    if dtest:
#         tree.plot_tree(clf.fit(dtrain[0], dtrain[1]), )
        clf.fit(dtrain[0], dtrain[1])
        y_train_pred, y_test_pred = clf.predict_proba(dtrain[0]), clf.predict_proba(dtest[0])
        y_train_loss, y_test_loss = log_loss(dtrain[1], y_train_pred), log_loss(dtest[1], y_test_pred)
        # print(y_train_pred, y_test_pred)
        
        return clf, y_train_loss, y_test_loss
    else:
#         tree.plot_tree(clf.fit(dtrain[0], dtrain[1]))
        clf.fit(dtrain[0], dtrain[1])
        y_train_pred = clf.predict_proba(dtrain[0])
        y_train_loss = log_loss(dtrain[1], y_train_pred)
        # print(y_train_pred, y_test_pred)
        return clf, y_train_loss

In [14]:
data = pd.read_csv('preprocessed.csv')
test_data = pd.read_csv('preprocessed_test_data.csv')

dt_features = ['price', 'dist_to_city_center','price_per_bedroom', 'price_per_bathroom', 'num_of_features','len_of_description', 'pos_count', 'num_of_photos', 'bedrooms', 'bathrooms']
dt_x = data[dt_features]
dt_y = data['interest_level']

# init decision tree clf
dt_clf = DecisionTreeClassifier(max_depth=4, criterion='entropy', max_features='auto')
dt_clf, dt_cv_scores = train_cv(dt_clf, 'dt', dt_split_data(dt_x, dt_y))


Running Fold 1 / 5


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

# Test model with test.json and build submittion.csv


In [None]:
def testMode(clf, which, features):
    data = pd.read_csv('preprocessed.csv')
    test_data = pd.read_csv('preprocessed_test_data.csv')

    listing_id = test_data['listing_id']
    # test_data = test_data.replace([np.inf, -np.inf], np.nan)
    data, test_data = _preprocess(data, test_data)

    # test_data.info()
    test_data_x = test_data[features]

    predictions = clf.predict_proba(test_data_x)
    # test_data
    predictions
    # res
    submission = pd.DataFrame({'listing_id':listing_id,'low':predictions[:,0], 'medium': predictions[:,1], 'high': predictions[:,2] })
    submission.listing_id = submission.listing_id.astype(int)

    submission.info()
    filename = 'output_' + which + '.csv'
    submission.to_csv(filename,index=False)

In [None]:
testMode(dt_clf, 'dt', dt_features)

In [None]:
testMode(svm_clf, 'svm', svm_features)

In [None]:
testMode(lr_clf, 'lr', lr_features)

# Evaluation models

In [None]:
cols = ['train_loss', 'val_loss', 'f_score', 'accuracy_score']
lr_scores = pd.DataFrame(lr_cv_scores, columns=cols)

In [None]:
lr_scores[cols].plot(title='Logistic Regression log_loss vs f_score')

In [None]:
svm_scores = pd.DataFrame(svm_cv_scores, columns=cols)
svm_scores[cols].plot(title='SVM log_loss vs f_score')

In [None]:
dt_scores = pd.DataFrame(dt_cv_scores, columns=cols)
dt_scores[cols].plot(title='Decision Tree log_loss vs f_score')