# Combine Numeric and Categorical Features

# Files that we need to load

- amenities_test_deps.csv
- amenities_test_indeps.csv
- amenities_train_deps.csv
- amenities_train_indeps.csv

- categorical_test_deps.csv
- categorical_test_indeps.csv
- categorical_train_deps.csv
- categorical_train_indeps.csv

- numeric_test_deps.csv
- numeric_test_indeps.csv
- numeric_train_deps.csv
- numeric_train_indeps.csv



In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
def read_df(filename, valtype):
    df = pd.read_csv(filename, low_memory=False, dtype=valtype)
    return df

In [4]:
from collections import OrderedDict, defaultdict

def create_combined_df(input_dict):
    fdf = pd.DataFrame()
    cols = OrderedDict()
    for k, v in input_dict.items():
        df = read_df('./data/'+k, v)
        colnames = [c for c in df.columns if c not in ['None', 'Unnamed: 0']]
        cols[k] = colnames
        fdf = pd.concat([fdf, df], axis=1)
    
    # fdf = fdf.DataFrame(fdf, columns=cols)
    fdf = fdf.drop(['None', 'Unnamed: 0'], axis=1)
    return fdf, cols

# Load Training Data

In [5]:
train_inpd = {
    'numeric_train_deps.csv': float, 
    'categorical_train_deps.csv': int,
    'amenities_train_deps.csv': int, 
}

X_train, cols = create_combined_df(train_inpd)
y_train = pd.read_csv('./data/numeric_train_indeps.csv', low_memory=False)
y_train = y_train.drop(['Unnamed: 0'], axis=1).astype(float)
y_train.sample()

print X_train.shape
print y_train.shape
X_train.sample()
y_train.sample()

(127302, 195)
(127302, 1)


Unnamed: 0,review_scores_rating
111054,100.0


In [6]:
numeric_cols = cols['numeric_train_deps.csv']
categorical_cols = cols['categorical_train_deps.csv']
amen_cols = cols['amenities_train_deps.csv']

In [7]:
train_colnames = X_train.columns
dependent_variable = y_train.columns[0]

# Load Test Data

In [8]:
test_inpd = {
    'numeric_test_deps.csv': float, 
    'categorical_test_deps.csv': int,
    'amenities_test_deps.csv': int, 
}

X_test, cols = create_combined_df(test_inpd)
y_test = pd.read_csv('./data/numeric_test_indeps.csv', low_memory=False)
y_test = y_test.drop(['Unnamed: 0'], axis=1).astype(float)
y_test.sample()

print X_test.shape
print y_test.shape
X_test.sample()
y_test.sample()

(62702, 195)
(62702, 1)


Unnamed: 0,review_scores_rating
26592,80.0


In [9]:
import numpy as np

def get_rand_idxs(start, end, size, seed=1234):
    np.random.seed(seed)
    return np.random.randint(start, end, size)

In [10]:
def run_prediction(X, y, Xt, yt, model):
    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
    if len(yt.shape) == 1:
        yt = yt.reshape(-1, 1)
    
    res = {'train': {}, 'test': {}}
    
    model.fit(X, y)
    
    train_pred = model.predict(X)
    res['train']['pred'] = train_pred
    
    test_pred = model.predict(Xt)
    res['test']['pred'] = test_pred
    
    
    for name, tup in zip(['train', 'test'], [(y, train_pred), (yt, test_pred)]):
        act, prd = tup[0].ravel(), tup[1].ravel()
        print 'Results for %s' % name
        
        mserr = mean_squared_error(act, prd)
        res[name]['mse'] = mserr
        print '%s Mean Squared Error: %.4f' % (name, mserr)
        
        rsq_score = r2_score(act, prd)
        res[name]['r2'] = rsq_score
        print '%s R-Squared: %.4f' % (name, rsq_score)
        
        plt.scatter(act, prd, alpha=0.5)
        plt.title(name+': predictions vs. actual')
        plt.show()
        plt.scatter(act, act-prd, alpha=0.5)
        plt.title(name+': residuals vs. actual')
        plt.show()
        plt.hist(act-prd, alpha=0.5)
        plt.title(name+': residuals histogram')
        plt.show()
    
    return res

# Takeaways
1. Remove 'bathrooms' >= 5
2. Remove 'bedrooms' > 5
3. Remove 'beds' >= 8
4. Remove 'cleaning_fee' >= 400
5. Remove 'guests_included' > 8
6. Remove 'host_listings_count' > 100
7. Remove 'host_acceptance_rate' == 0.
8. Remove 'host_response_rate' == 0.
9. Remove 'reviews_per_month' > 12.
10. Drop 'host_has_profile_pic'
11. Drop 'host_identity_verified'

# Impute missing values for Train and Test sets
We will use Median for imputing missing values because it is not as affected by outliers as the Mean.
We will train the Imputer on the training data, and use this to fill the values for both Train and Test sets. 
We will not train a new Imputer on test data. This is important because we do not want to look at the test data when imputing the values.

In [11]:
# Impute missing values using median for both independent and dependent variables
# We don't care for host_has_profile_pic, and host_identity_verified.
# We dropped missing values from our dependent variable earlier, 
# so it won't be affected by imputation here.

import numpy as np
from sklearn.preprocessing import Imputer

# We need the 
train_colnames = X_train.columns

# Train data first
# axis 0 means impute along columns
train_imp = Imputer(missing_values=np.nan, strategy='median', axis=0, copy=True)
train_imp.fit(X_train)
X_train_imp = train_imp.transform(X_train)


# Then Test data
test_colnames = X_test.columns
X_test_imp = train_imp.transform(X_test)

print X_train_imp.shape
print X_test_imp.shape

(127302, 195)
(62702, 195)


# Handle Outliers
Drop them from Training data, as discussed in the Takeaways section.

In [12]:
all_columns = list(train_colnames.values) + [dependent_variable]

In [13]:
Train = pd.DataFrame(X_train_imp, columns=train_colnames)
Train = pd.concat([Train, y_train], axis=1)

In [14]:
Test = pd.DataFrame(X_test_imp, columns=test_colnames)
Test = pd.concat([Test, y_test], axis=1)

In [15]:
print Test.shape

(62702, 196)


In [16]:
def drop_outliers(t):
    # Drop outliers from dependent variable
    h = 0
    if 'review_scores_value' == dependent_variable:
        h = 3
    elif 'review_scores_rating' == dependent_variable:
        h = 80
    t = t.loc[t[dependent_variable] >= h]
    
    # Remove reviews_per_month == 0
    t = t.loc[t.reviews_per_month > 0].astype(float)

    # 1. Remove 'bathrooms' >= 5, convert to int
    t = t.loc[t.bathrooms < 5]
    t.bathrooms = t.bathrooms.astype(int)

    # 2. Remove 'bedrooms' > 5, convert to int
    t = t.loc[t.bedrooms <= 5]
    t.bedrooms = t.bedrooms.astype(int)

    # 3. Remove 'beds' >= 8, convert to int
    t = t.loc[t.beds < 8]
    t.beds = t.beds.astype(int)

    # 4. Remove 'cleaning_fee' > 400
    t = t.loc[t.cleaning_fee <= 400]

    # 5. Remove 'guests_included' > 8
    t = t.loc[t.guests_included <= 8]
    t = t.loc[t.guests_included > 0]
    t.guests_included = t.guests_included.astype(int)

    # 6. Remove 'host_listings_count' > 100
    t = t.loc[t.host_listings_count <= 100]
    # Remove'host_listings_count' <= 0
    t = t.loc[t.host_listings_count > 0]
    t.host_listings_count = t.host_listings_count.astype(int)
    
    

    # 7. Remove 'host_acceptance_rate' == 0.
    t = t.loc[t.host_acceptance_rate > 0.]

    # 8. Remove 'host_response_rate' == 0.
    t = t.loc[t.host_acceptance_rate > 0.]

    # 9. Remove 'reviews_per_month' > 12.
    t = t.loc[t.reviews_per_month <= 12]
    
    return t

Train = drop_outliers(Train)
# For Test, we will only drop the unneeded columns
# Test = Test.drop(['host_has_profile_pic', 'host_identity_verified'], axis=1)

Test = drop_outliers(Test)

print Train.shape
print Test.shape

(119615, 196)
(58911, 196)


In [17]:
np.unique(Train.review_scores_rating)

array([ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,
        91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99., 100.])

In [18]:
np.unique(Test.review_scores_rating)

array([ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,
        91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99., 100.])

# Create feature interactions

1. bathrooms x bedrooms
2. bedrooms x beds
3. beds x guests_included
4. bathrooms x guests_included
5. host_acceptance_rate x host_response_rate


In [19]:
def create_interaction(f1, f2, intyp):
    if 'D' == intype:
        return f1 / f2
    if 'A' == intype:
        return f1 + f2
    if 'S' == intype:
        return f1 - f2
    
    # Mulitply the features by default
    return f1 * f2

def get_interacted_dataset(ds, interactions_dict, drop=False):
    for k, v in interactions_dict.items():
        ds[k+'_by_'+v[0]] = ds[k] * ds[v[0]]
        
    if drop:
        for k in interactions_dict.keys():
            ds = ds.drop(k, axis=1)
    
    return ds


inter_dct = {
    'bathrooms': ('bedrooms', 'D'), 
    'beds': ('bedrooms', 'D'), 
    'beds': ('guests_included', 'M'), 
    'bathrooms': ('guests_included', 'D'), 
    'cleaning_fee': ('guests_included', 'D'), 
    'host_acceptance_rate': ('host_response_rate', 'D')
}

Train = get_interacted_dataset(Train, inter_dct, drop=False)
Test = get_interacted_dataset(Test, inter_dct, drop=False)

In [20]:
print Train.shape
print Test.shape

(119615, 200)
(58911, 200)


# Separate the Dependent and Independet Variables

In [21]:
# Train data
y_train = Train[dependent_variable]
print y_train.shape

X_train = Train.drop([dependent_variable], axis=1)
print X_train.shape

# Test data
y_test = Test[dependent_variable]
print y_test.shape

X_test = Test.drop([dependent_variable], axis=1)
print X_test.shape

(119615,)
(119615, 199)
(58911,)
(58911, 199)


# Normalize numeric columns

In [22]:
def normalize_df(df):
    return (df - df.min()) / (df.max() - df.min())

# X_train = (X_train - X_train.min()) / (X_train.max() - X_train.min())

X_train_num = normalize_df(X_train[numeric_cols])
X_test_num = normalize_df(X_test[numeric_cols])

In [23]:
print X_train_num.shape
print X_test_num.shape

(119615, 12)
(58911, 12)


In [24]:
print type(X_train)
print type(X_test)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


# Scale the Numeric columns
We don't want any one variable dominating others in the regression model, so let's start by scaling the dependent variables. As with Imputation, we will train the scaler only on the training data (to learn the mean and standard deviation), and then use it as-is on the Test data. We will assume that all dependent variables are interval.

In [25]:
X_train_num = pd.DataFrame(X_train_num, columns=numeric_cols)
X_test_num = pd.DataFrame(X_test_num, columns=numeric_cols)

# Write back all data to csv

In [26]:
# Write numeric data

In [27]:
y_train = pd.DataFrame(y_train, columns=[dependent_variable])
y_test = pd.DataFrame(y_test, columns=[dependent_variable])

In [28]:
X_train_num.to_csv('./data/fin_num_train_deps.csv', encoding='utf8')
y_train.to_csv('./data/fin_train_indeps.csv', encoding='utf8')
X_test_num.to_csv('./data/fin_num_test_deps.csv', encoding='utf8')
y_test.to_csv('./data/fin_test_indeps.csv', encoding='utf8')

In [29]:
# Write categorical data

In [30]:
X_train_cat = X_train[categorical_cols].astype(int)
X_test_cat = X_test[categorical_cols].astype(int)

X_train_cat.to_csv('./data/fin_cat_train_deps.csv', encoding='utf8')
X_test_cat.to_csv('./data/fin_cat_test_deps.csv', encoding='utf8')

In [31]:
# Write amenities columns

In [32]:
X_train_amen = X_train[amen_cols].astype(int)
X_test_amen = X_test[amen_cols].astype(int)

X_train_amen.to_csv('./data/fin_amen_train_deps.csv', encoding='utf8')
X_test_amen.to_csv('./data/fin_amen_test_deps.csv', encoding='utf8')

In [33]:
print (X_train_num.shape, X_train_cat.shape, X_train_amen.shape)
print (X_test_num.shape, X_test_cat.shape, X_test_amen.shape)

((119615, 12), (119615, 124), (119615, 59))
((58911, 12), (58911, 124), (58911, 59))


In [34]:
# Create a combined Train and Test dataset

In [35]:
num_train = pd.read_csv('./data/fin_num_train_deps.csv')
num_train = num_train.drop(['Unnamed: 0'], axis=1)
num_test = pd.read_csv('./data/fin_num_test_deps.csv')
num_test = num_test.drop(['Unnamed: 0'], axis=1)

cat_train = pd.read_csv('./data/fin_cat_train_deps.csv')
cat_train = cat_train.drop(['Unnamed: 0'], axis=1)
cat_test = pd.read_csv('./data/fin_cat_test_deps.csv')
cat_test = cat_test.drop(['Unnamed: 0'], axis=1)

amen_train = pd.read_csv('./data/fin_amen_train_deps.csv')
amen_train = amen_train.drop(['Unnamed: 0'], axis=1)
amen_test = pd.read_csv('./data/fin_amen_test_deps.csv')
amen_test = amen_test.drop(['Unnamed: 0'], axis=1)

In [36]:
print (num_train.shape, cat_train.shape, amen_train.shape)

((119615, 12), (119615, 124), (119615, 59))


In [37]:
X_train_comb = pd.concat([num_train, cat_train, amen_train], 
                         axis=1)
X_test_comb = pd.concat([num_test, cat_test, amen_test], 
                        axis=1)

In [38]:
print X_train_comb.shape, y_train.shape
print X_test_comb.shape, y_test.shape

(119615, 195) (119615, 1)
(58911, 195) (58911, 1)


In [39]:
X_train_comb.to_csv('./data/fin_comb_train_deps.csv', encoding='utf8')
X_test_comb.to_csv('./data/fin_comb_test_deps.csv', encoding='utf8')