# Process text data

# Files that we need to load

- amenities_test_deps.csv
- amenities_test_indeps.csv
- amenities_train_deps.csv
- amenities_train_indeps.csv

- categorical_test_deps.csv
- categorical_test_indeps.csv
- categorical_train_deps.csv
- categorical_train_indeps.csv

- numeric_test_deps.csv
- numeric_test_indeps.csv
- numeric_train_deps.csv
- numeric_train_indeps.csv



In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
def read_df(filename, valtype):
    df = pd.read_csv(filename, low_memory=False, dtype=valtype)
    return df

In [4]:
from collections import OrderedDict, defaultdict

def create_combined_df(input_dict):
    fdf = pd.DataFrame()
    cols = OrderedDict()
    for k, v in input_dict.items():
        df = read_df('./data/'+k, v)
        colnames = [c for c in df.columns if c not in ['Unnamed: 0']]
        cols[k] = colnames
        fdf = pd.concat([fdf, df], axis=1)
    
    # fdf = fdf.DataFrame(fdf, columns=cols)
    fdf = fdf.drop(['Unnamed: 0'], axis=1)
    return fdf, cols

# Load Training Data

In [16]:
train_inpd = {
    'numeric_train_deps.csv': float, 
    'text_train_deps.csv': str,
}

X_train, cols = create_combined_df(train_inpd)
y_train = pd.read_csv('./data/numeric_train_indeps.csv', low_memory=False)
y_train = y_train.drop(['Unnamed: 0'], axis=1).astype(float)
y_train.sample()

print X_train.shape
print y_train.shape
X_train.sample()
y_train.sample()

(127302, 18)
(127302, 1)


Unnamed: 0,review_scores_rating
36164,80.0


In [17]:
numeric_cols = cols['numeric_train_deps.csv']
text_cols = cols['text_train_deps.csv']

In [18]:
train_colnames = X_train.columns
dependent_variable = y_train.columns[0]

# Load Test Data

In [19]:
test_inpd = {
    'numeric_test_deps.csv': float, 
    'text_train_deps.csv': str,
}

X_test, cols = create_combined_df(test_inpd)
y_test = pd.read_csv('./data/numeric_test_indeps.csv', low_memory=False)
y_test = y_test.drop(['Unnamed: 0'], axis=1).astype(float)
y_test.sample()

print X_test.shape
print y_test.shape
X_test.sample()
y_test.sample()

(127302, 18)
(62702, 1)


Unnamed: 0,review_scores_rating
32922,84.0


In [20]:
import numpy as np

def get_rand_idxs(start, end, size, seed=1234):
    np.random.seed(seed)
    return np.random.randint(start, end, size)

In [21]:
def run_prediction(X, y, Xt, yt, model):
    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
    if len(yt.shape) == 1:
        yt = yt.reshape(-1, 1)
    
    res = {'train': {}, 'test': {}}
    
    model.fit(X, y)
    
    train_pred = model.predict(X)
    res['train']['pred'] = train_pred
    
    test_pred = model.predict(Xt)
    res['test']['pred'] = test_pred
    
    
    for name, tup in zip(['train', 'test'], [(y, train_pred), (yt, test_pred)]):
        act, prd = tup[0].ravel(), tup[1].ravel()
        print 'Results for %s' % name
        
        mserr = mean_squared_error(act, prd)
        res[name]['mse'] = mserr
        print '%s Mean Squared Error: %.4f' % (name, mserr)
        
        rsq_score = r2_score(act, prd)
        res[name]['r2'] = rsq_score
        print '%s R-Squared: %.4f' % (name, rsq_score)
        
        plt.scatter(act, prd, alpha=0.5)
        plt.title(name+': predictions vs. actual')
        plt.show()
        plt.scatter(act, act-prd, alpha=0.5)
        plt.title(name+': residuals vs. actual')
        plt.show()
        plt.hist(act-prd, alpha=0.5)
        plt.title(name+': residuals histogram')
        plt.show()
    
    return res

# Impute missing values for Train and Test sets
We will use Median for imputing missing values because it is not as affected by outliers as the Mean.
We will train the Imputer on the training data, and use this to fill the values for both Train and Test sets. 
We will not train a new Imputer on test data. This is important because we do not want to look at the test data when imputing the values.

In [24]:
X_num_train = X_train[numeric_cols]
X_num_test = X_test[numeric_cols]

In [25]:
print X_num_train.shape
print X_num_test.shape

(127302, 12)
(127302, 12)


In [57]:
# Impute missing values using median for both independent and dependent variables
# We don't care for host_has_profile_pic, and host_identity_verified.
# We dropped missing values from our dependent variable earlier, 
# so it won't be affected by imputation here.

import numpy as np
from sklearn.preprocessing import Imputer

# We need the 
num_train_colnames = X_num_train.columns

# Train data first
# axis 0 means impute along columns
train_imp = Imputer(missing_values=np.nan, strategy='median', axis=0, copy=True)
train_imp.fit(X_num_train)
X_train_imp = train_imp.transform(X_num_train)


# Then Test data
num_test_colnames = X_num_test.columns
X_test_imp = train_imp.transform(X_num_test)

print X_train_imp.shape
print X_test_imp.shape

(127302, 12)
(127302, 12)


In [58]:
print len(num_train_colnames)

12


# Handle Outliers
Drop them from Training data, as discussed in the Takeaways section.

In [59]:
# all_columns = list(train_colnames.values) + [dependent_variable]

In [61]:
Train = pd.DataFrame(X_train_imp, columns=num_train_colnames)
Train = pd.concat([Train, X_train[text_cols]], axis=1)
Train = pd.concat([Train, y_train], axis=1)
print Train.shape

(127302, 19)


In [64]:
Test = pd.DataFrame(X_test_imp, columns=num_test_colnames)
Test = pd.concat([Test, X_test[text_cols]], axis=1)
Test = pd.concat([Test, y_test], axis=1)

In [65]:
print Test.shape

(127302, 19)


In [70]:
Test.loc[Test.reviews_per_month > 0]

Unnamed: 0,bathrooms,bedrooms,beds,cleaning_fee,guests_included,host_listings_count,host_acceptance_rate,host_response_rate,host_has_profile_pic,host_identity_verified,days_delta,reviews_per_month,description,host_about,house_rules,neighborhood_overview,notes,summary,review_scores_rating
0,1.0,1.0,1.0,25.0,1.0,1.0,0.33,1.00,1.0,1.0,757.0,0.53,san francisco station at t park safeway,none,none,none,none,san francisco station at t park safeway,100.0
1,1.0,1.0,1.0,20.0,1.0,1.0,1.00,1.00,1.0,1.0,1603.0,1.62,a private bedroom in a classic victorian one b...,san francisco ca resident midwest born i have ...,we are hoping to encourage families students o...,geographically noe valley really does feel lik...,you will be hard pressed to find a better loca...,a private bedroom in a classic victorian one b...,100.0
2,1.0,1.0,1.0,70.0,1.0,1.0,0.94,0.90,1.0,1.0,1462.0,1.16,str phone number hidden private entrance bathr...,none,we expect quiet after 10 00 pm we have a frien...,the west portal neighborhood has its own charm...,parking is available with advance notice this ...,str phone number hidden private entrance bathr...,89.0
3,1.0,1.0,1.0,70.0,1.0,1.0,1.00,1.00,1.0,1.0,1147.0,0.30,great place to call a home in sf newly remodel...,none,none,none,none,great place to call a home in sf newly remodel...,73.0
4,2.0,1.0,1.0,70.0,1.0,3.0,0.96,0.83,1.0,1.0,480.0,0.56,enjoy the lovely south hill neighborhood of sa...,i love san francisco and all that it has to of...,no parties at any time no loud noise after 10 ...,the neighborhood has a variety of cuisines pre...,this bedroom has twin beds flat screen tv with...,enjoy the lovely south hill neighborhood of sa...,69.0
5,1.0,1.0,1.0,25.0,1.0,1.0,1.00,0.60,1.0,1.0,739.0,1.97,built 1931 the 3br 2 5ba is elegant but family...,i m the proud father of two girls and husband ...,please treat this house like your home,the neighborhood has everything you d expect i...,we are 1 mile from the beach the beach is seve...,built 1931 the 3br 2 5ba is elegant but family...,100.0
6,1.0,1.0,1.0,50.0,1.0,1.0,1.00,1.00,1.0,1.0,1536.0,0.03,hi welcome gorgeous unobstructed views of the ...,my name is maryam and i have been living in sa...,,none,please note for groups of more than two i requ...,hi welcome gorgeous unobstructed views of the ...,100.0
7,1.5,3.0,5.0,150.0,6.0,1.0,1.00,1.00,1.0,1.0,1827.0,5.31,legal sf city approved one bedroom one bathroo...,hello we are a family of three living in beaut...,thank you in advance for respecting our house ...,just a few blocks from golden gate park and uc...,san francisco short term residential rental ce...,legal sf city approved one bedroom one bathroo...,96.0
8,1.0,1.0,1.0,70.0,2.0,1.0,1.00,1.00,1.0,1.0,1625.0,0.04,a modern studio off a secluded garden court wi...,i m an architect in san francisco with my own ...,no smoking please inside or in the courtyard n...,our mission neighborhood is gritty and transit...,community a portion of the income from the mis...,a modern studio off a secluded garden court wi...,100.0
9,1.0,1.0,2.0,75.0,2.0,1.0,1.00,1.00,1.0,1.0,1890.0,0.44,enjoy 2 private bedrooms in a beautiful home l...,i m a california native and have lived in san ...,1 no smoking please 2 no additional pets 2 won...,the marina district has an active and friendly...,i really do love to make everyone s stay in sa...,enjoy 2 private bedrooms in a beautiful home l...,100.0


In [71]:
def drop_outliers(t):
    # Drop outliers from dependent variable
    h = 0
    if 'review_scores_value' == dependent_variable:
        h = 3
    elif 'review_scores_rating' == dependent_variable:
        h = 80
    t = t.loc[t[dependent_variable] >= h]
    
    # Remove reviews_per_month == 0
    t = t.loc[t.reviews_per_month > 0]

    # 1. Remove 'bathrooms' >= 5, convert to int
    t = t.loc[t.bathrooms < 5]
    t.bathrooms = t.bathrooms.astype(int)

    # 2. Remove 'bedrooms' > 5, convert to int
    t = t.loc[t.bedrooms <= 5]
    t.bedrooms = t.bedrooms.astype(int)

    # 3. Remove 'beds' >= 8, convert to int
    t = t.loc[t.beds < 8]
    t.beds = t.beds.astype(int)

    # 4. Remove 'cleaning_fee' > 400
    t = t.loc[t.cleaning_fee <= 400]

    # 5. Remove 'guests_included' > 8
    t = t.loc[t.guests_included <= 8]
    t = t.loc[t.guests_included > 0]
    t.guests_included = t.guests_included.astype(int)

    # 6. Remove 'host_listings_count' > 100
    t = t.loc[t.host_listings_count <= 100]
    # Remove'host_listings_count' <= 0
    t = t.loc[t.host_listings_count > 0]
    t.host_listings_count = t.host_listings_count.astype(int)
    
    

    # 7. Remove 'host_acceptance_rate' == 0.
    t = t.loc[t.host_acceptance_rate > 0.]

    # 8. Remove 'host_response_rate' == 0.
    t = t.loc[t.host_acceptance_rate > 0.]

    # 9. Remove 'reviews_per_month' > 12.
    t = t.loc[t.reviews_per_month <= 12]
    
    return t

In [73]:
Train = drop_outliers(Train)
Test = drop_outliers(Test)

In [74]:
np.unique(Train.review_scores_rating)

array([ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,
        91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99., 100.])

In [75]:
np.unique(Test.review_scores_rating)

array([ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,
        91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99., 100.])

# Separate the Dependent and Independet Variables

In [None]:
# Train data
y_train = Train[dependent_variable]
print y_train.shape

X_train = Train.drop([dependent_variable], axis=1)
print X_train.shape

# Test data
y_test = Test[dependent_variable]
print y_test.shape

X_test = Test.drop([dependent_variable], axis=1)
print X_test.shape

In [None]:
print type(X_train)
print type(X_test)

# Scale the Numeric columns
We don't want any one variable dominating others in the regression model, so let's start by scaling the dependent variables. As with Imputation, we will train the scaler only on the training data (to learn the mean and standard deviation), and then use it as-is on the Test data. We will assume that all dependent variables are interval.

In [None]:
X_train_num = pd.DataFrame(X_train_num, columns=numeric_cols)
X_test_num = pd.DataFrame(X_test_num, columns=numeric_cols)

# Write back all data to csv

In [None]:
# Write numeric data

In [None]:
y_train = pd.DataFrame(y_train, columns=[dependent_variable])
y_test = pd.DataFrame(y_test, columns=[dependent_variable])

In [None]:
X_train_num.to_csv('./data/fin_num_train_deps.csv', encoding='utf8')
y_train.to_csv('./data/fin_train_indeps.csv', encoding='utf8')
X_test_num.to_csv('./data/fin_num_test_deps.csv', encoding='utf8')
y_test.to_csv('./data/fin_test_indeps.csv', encoding='utf8')

In [None]:
# Write categorical data

In [None]:
X_train_cat = X_train[categorical_cols].astype(int)
X_test_cat = X_test[categorical_cols].astype(int)

X_train_cat.to_csv('./data/fin_cat_train_deps.csv', encoding='utf8')
X_test_cat.to_csv('./data/fin_cat_test_deps.csv', encoding='utf8')

In [None]:
# Write amenities columns

In [None]:
X_train_amen = X_train[amen_cols].astype(int)
X_test_amen = X_test[amen_cols].astype(int)

X_train_amen.to_csv('./data/fin_amen_train_deps.csv', encoding='utf8')
X_test_amen.to_csv('./data/fin_amen_test_deps.csv', encoding='utf8')

In [None]:
print (X_train_num.shape, X_train_cat.shape, X_train_amen.shape)
print (X_test_num.shape, X_test_cat.shape, X_test_amen.shape)

In [None]:
# Create a combined Train and Test dataset

In [None]:
num_train = pd.read_csv('./data/fin_num_train_deps.csv')
num_train = num_train.drop(['Unnamed: 0'], axis=1)
num_test = pd.read_csv('./data/fin_num_test_deps.csv')
num_test = num_test.drop(['Unnamed: 0'], axis=1)

cat_train = pd.read_csv('./data/fin_cat_train_deps.csv')
cat_train = cat_train.drop(['Unnamed: 0'], axis=1)
cat_test = pd.read_csv('./data/fin_cat_test_deps.csv')
cat_test = cat_test.drop(['Unnamed: 0'], axis=1)

amen_train = pd.read_csv('./data/fin_amen_train_deps.csv')
amen_train = amen_train.drop(['Unnamed: 0'], axis=1)
amen_test = pd.read_csv('./data/fin_amen_test_deps.csv')
amen_test = amen_test.drop(['Unnamed: 0'], axis=1)

In [None]:
print (num_train.shape, cat_train.shape, amen_train.shape)

In [None]:
X_train_comb = pd.concat([num_train, cat_train, amen_train], 
                         axis=1)
X_test_comb = pd.concat([num_test, cat_test, amen_test], 
                        axis=1)

In [None]:
print X_train_comb.shape, y_train.shape
print X_test_comb.shape, y_test.shape

In [None]:
X_train_comb.to_csv('./data/fin_comb_train_deps.csv', encoding='utf8')
X_test_comb.to_csv('./data/fin_comb_test_deps.csv', encoding='utf8')