In [14]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
from tqdm import tqdm
import gc
import datetime as dt
pd.options.display.max_columns = 999

In [3]:

print('Loading Properties ...')
properties2016 = pd.read_csv("C:\\Users\\Lin\\Desktop\\SMU\\Y3S1\\DM G1\\Project\\Codes\\data\\properties_2016.csv", low_memory = False)
properties2017 = pd.read_csv("C:\\Users\\Lin\\Desktop\\SMU\\Y3S1\\DM G1\\Project\\Codes\\data\\properties_2017.csv", low_memory = False)
print('Loading Train ...')
train = pd.read_csv("C:\\Users\\Lin\\Desktop\\SMU\\Y3S1\\DM G1\\Project\\Codes\\data\\train.csv", parse_dates=['transactiondate'], low_memory=False)
print('Loading Done')

Loading Properties ...
Loading Train ...
Loading Done


In [4]:
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 +df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df
train = add_date_features(train)

In [6]:
sample_submission  = pd.read_csv("C:\\Users\\Lin\\Desktop\\SMU\\Y3S1\\DM G1\\Project\\Codes\\data\\sample_submission.csv", low_memory=False)
sample_submission.columns = ['parcelid','201704','201705','201706','201707','201708','201709']

In [8]:
train = pd.merge(train, properties2017, how='left' ,on = 'parcelid')

In [10]:
train.iloc[:, train.columns.str.startswith('tax')] = np.nan

In [16]:
train_df = train
test_df = pd.merge(sample_submission[['parcelid']], properties2017, how='left',on='parcelid')

In [17]:
del properties2016, properties2017, train
gc.collect();

In [18]:
print('Remove missing data fields ...')

missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print("We exclude: %s" % len(exclude_missing))

Remove missing data fields ...
We exclude: 18


In [19]:
del num_rows, missing_perc_thresh
gc.collect();

In [20]:
print ("Remove features with one unique value !!")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print("We exclude: %s" % len(exclude_unique))

Remove features with one unique value !!
We exclude: 10


In [22]:
print ("Define training features !!")
exclude_other = ['parcelid', 'logerror','propertyzoningdesc','subdate']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
print("We use these for training: %s" % len(train_features))

Define training features !!
We use these for training: 39


In [23]:
print ("Define categorial features !!")
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
       and not 'sqft' in c \
       and not 'cnt' in c \
       and not 'nbr' in c \
       and not 'number' in c:
        cat_feature_inds.append(i)

Define categorial features !!


In [24]:
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt']


In [25]:
print ("Replacing NaN values by -999 !!")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

Replacing NaN values by -999 !!


In [26]:
print ("Training time !!")
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

Training time !!
(113039, 39) (113039,)


In [27]:
test_df['transactiondate'] = pd.Timestamp('2016-12-01') 
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

(2985217, 39)


In [28]:
num_ensembles = 5
y_pred = 0.0
for i in tqdm(range(num_ensembles)):
    model = CatBoostRegressor(
        iterations=630, learning_rate=0.03,
        depth=6, l2_leaf_reg=3,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=i)
    model.fit(
        X_train, y_train,
        cat_features=cat_feature_inds)
    y_pred += model.predict(X_test)
y_pred /= num_ensembles

100%|███████████████████████████████████████████| 5/5 [36:34<00:00, 438.89s/it]


In [29]:
submission = pd.DataFrame({
    'parcelid': test_df['parcelid'],
})

In [31]:
len(y_pred)

2985217

In [32]:
test_dates = {
    '201704': pd.Timestamp('2016-04-30'),
    '201705': pd.Timestamp('2016-05-31'),
    '201706': pd.Timestamp('2016-06-30'),
    '201707': pd.Timestamp('2017-07-30'),
    '201708': pd.Timestamp('2017-08-31'),
    '201709': pd.Timestamp('2017-09-30')
}

In [33]:
test_dates

{'201704': Timestamp('2016-04-30 00:00:00'),
 '201705': Timestamp('2016-05-31 00:00:00'),
 '201706': Timestamp('2016-06-30 00:00:00'),
 '201707': Timestamp('2017-07-30 00:00:00'),
 '201708': Timestamp('2017-08-31 00:00:00'),
 '201709': Timestamp('2017-09-30 00:00:00')}

In [36]:
for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred

Predicting for: 201704 ... 
Predicting for: 201705 ... 
Predicting for: 201706 ... 
Predicting for: 201707 ... 
Predicting for: 201708 ... 
Predicting for: 201709 ... 


In [37]:
submission.to_csv('Only_CatBoost.csv', float_format='%.6f',index=False)

In [40]:
test_path = "C:\\Users\\Lin\\Desktop\\SMU\\Y3S1\\DM G1\\Project\\Codes\\data\\test.csv"
test = pd.read_csv(test_path, low_memory=False)
test_size = len(test)

In [42]:
from math import fabs as absolute
total_ae = 0.0
print("Processing ", test_size, "rows")
for index,row in test.iterrows():
    r = submission.loc[submission['parcelid'] == row[0]]
    
    pred_error = r[str(row[3])]
    total_ae += absolute(pred_error - row[1])
    
    if index % 10000 == 0 and index !=0:
        print("Completed: ", index, "rows")
        
print("===Process Completed===")
print("\n Final score:", total_ae/test_size)

Processing  54849 rows
Completed:  10000 rows
Completed:  20000 rows
Completed:  30000 rows
Completed:  40000 rows
Completed:  50000 rows
===Process Completed===

 Final score: 0.06846661204833332


In [43]:
submission.head()

Unnamed: 0,parcelid,201704,201705,201706,201707,201708,201709
0,10754147,0.020475,0.020475,0.020475,0.020475,0.020475,0.020475
1,10759547,0.004258,0.004258,0.004258,0.004258,0.004258,0.004258
2,10843547,0.031898,0.031898,0.031898,0.031898,0.031898,0.031898
3,10859147,0.036341,0.036341,0.036341,0.036341,0.036341,0.036341
4,10879947,0.008318,0.008318,0.008318,0.008318,0.008318,0.008318
