In [None]:
%load_ext autoreload
%autoreload 2
import src.data_proc as data_proc

import numpy as np
import pandas as pd
import sys
import os
import gc
import random
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Disable pandas' chained-assignment (SettingWithCopy) check for the whole session.
pd.set_option('mode.chained_assignment', None)

from sklearn.model_selection import train_test_split

from catboost import CatBoostRegressor, Pool

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [None]:
%%time
# Load the 2016 transaction records and the 2016 property table from ./data.
train_2016 = pd.read_csv("data/train_2016_v2.csv")

# These property columns are forced to be read as strings; the three flag
# columns among them are converted to floats further down in this cell.
prop = pd.read_csv(
    "data/properties_2016.csv",
    dtype=dict.fromkeys(
        [
            'propertycountylandusecode',
            'hashottuborspa',
            'propertyzoningdesc',
            'fireplaceflag',
            'taxdelinquencyflag',
        ],
        str,
    ),
)

# Parse the flag attributes
def convert_true_to_float(df, col):
    """Convert a textual flag column of ``df`` to floats, in place.

    Entries equal to ``'true'`` or ``'Y'`` become ``1.0``. Every other entry
    is handed to ``astype(float)`` as-is, so missing values stay NaN, numeric
    text is parsed, and any other non-numeric text raises ``ValueError``.

    Args:
        df: DataFrame to modify; its column ``col`` is overwritten.
        col: Name of the flag column to convert.

    Returns:
        None. The result is written back to ``df[col]``.
    """
    is_set = df[col].isin(['true', 'Y'])
    df[col] = df[col].mask(is_set, '1').astype(float)

# The three flag columns were loaded as text; turn each one into a float column.
for col in (
    'hashottuborspa',
    'fireplaceflag',
    'taxdelinquencyflag',
):
    convert_true_to_float(prop, col)

In [None]:
# Report the size of both tables: row counts first, then the column count.
print("Number of transaction records: {}".format(train_2016.shape[0]))
print("Number of properties: {}".format(prop.shape[0]))
# One column is left out of the feature count (presumably the parcelid key -- confirm).
print("Number of property features: {}".format(prop.shape[1] - 1))

In [None]:
# Rename & retype the feature columns; also unify representations of missing values
# Both helpers come from the project-local src.data_proc module and their return
# values are discarded, so they presumably modify `prop` in place -- confirm in
# data_proc. Renaming runs before retyping; keep that order unless retype_columns
# is known not to depend on the new column names.
data_proc.rename_columns(prop)
data_proc.retype_columns(prop)

In [None]:
# Attach the property attributes to every transaction. The left join keeps all
# transaction rows, including any whose parcelid has no match in `prop`.
# NOTE: this rebinds `train_2016`, so running the cell a second time merges the
# property columns in again.
train_2016 = pd.merge(train_2016, prop, how='left', on='parcelid')

# Report how complete (i.e. free of missing values) each training feature is,
# then preview the first 30 merged rows.
data_proc.print_complete_percentage(train_2016)
train_2016.head(30)

In [None]:
# Summary statistics of the target variable (log-error), followed by a 40-bin
# histogram restricted to |logerror| < 0.6 so the bulk of the distribution is visible.
print(train_2016.logerror.describe())
train_2016.logerror[train_2016.logerror.abs() < 0.6].hist(bins=40)

In [None]:
# Looks like there are some outliers in the training data (very large logerror).
# abs(logerror) > 0.6 seems abnormal, so those rows are counted and then dropped.
threshold = 0.6
print("{} training examples in total".format(len(train_2016)))
print("{} with abs(logerror) > {}".format((train_2016['logerror'].abs() > threshold).sum(), threshold))

# Keep rows with abs(logerror) <= threshold (a missing logerror fails the
# comparison and is dropped too). The explicit .copy() gives later cells, which
# add columns to train_2016, an independent frame instead of a filtered slice
# that pandas would flag with SettingWithCopyWarning.
train_2016 = train_2016[train_2016['logerror'].abs() <= threshold].copy()

Analyze time data, and engineer some features!

In [None]:
# Calendar components of each transaction date, aligned row-for-row with train_2016.
datetime = pd.to_datetime(train_2016.transactiondate).dt
year = datetime.year  # not used in this cell; kept so the name stays defined for other cells
month = datetime.month
quarter = datetime.quarter
day = datetime.day


def _print_median_logerror(period, values):
    """Print one ``value: median logerror`` line per entry of ``values``.

    Args:
        period: Series aligned with ``train_2016`` holding each transaction's
            calendar component (for example its month number).
        values: Component values to report, in print order. A value with no
            transactions is printed with a median of ``nan``.
    """
    # One grouped pass per period instead of re-filtering the frame for every value.
    medians = train_2016.logerror.groupby(period).median().reindex(list(values))
    for value, median in medians.items():
        print("{}: {}".format(value, median))


# Median log-error per month, per quarter and per day of month.
_print_median_logerror(month, range(1, 13))
print()

_print_median_logerror(quarter, range(1, 5))
print()

_print_median_logerror(day, range(1, 32))

In [None]:
# Add the transaction's calendar year, month and quarter as feature columns.
datetime = pd.to_datetime(train_2016['transactiondate']).dt
for part in ('year', 'month', 'quarter'):
    train_2016[part] = getattr(datetime, part)

In [None]:
# Median log-error per year, month and quarter. The columns are selected with a
# list: selecting several GroupBy columns with a bare tuple (`['year', 'logerror']`
# without the inner brackets) is rejected by current pandas.
for period in ('year', 'month', 'quarter'):
    print(train_2016.groupby(period)[[period, 'logerror']].median())

Analyze location and neighborhood data, and see if we can extract some features

In [None]:
# Scatter the coordinates of every property whose regionidneighborhood is 27080.
# Latitude goes on the x-axis and longitude on the y-axis.
temp = prop.loc[prop['regionidneighborhood'] == 27080]
plt.scatter(x=temp['latitude'], y=temp['longitude'], s=1)

In [None]:
# Summary statistics for `temp`, the subset of `prop` selected in the previous cell
# (this cell fails on a fresh kernel unless that cell has been run first).
temp.describe()

In [None]:
# Let's see if there are any feature value outliers -> looks like things are fine, no need to process
# Uses `prop`, the only property table loaded in the cells above; the previous
# name `prop_2017` is not defined anywhere above and raised NameError on a fresh kernel.
prop.describe().loc[['min', 'max', 'mean']].T