# Attempt 3.1: Numericalize Equipment Model

## idea: 
* Evolved from [James' idea](https://github.com/LJamesHu/regression-case-study/blob/master/model.py)
* Use "average sales price" and "average sales price per age" as numerical representations of equipment model.
    * Parts to a full "equipment model identifier":
        * `ProductGroup`
        * `fiBaseModel`
        * `ModelID`
    * Group observations by model and take the average, store in dictionaries:
        * average price if only `ProductGroup` is known
        * average price if only `ProductGroup` and `fiBaseModel` are known
        * average price when all three parts are known
* Reference sales price for a test observation is 
    * `(average sales price per age) * (age)` if the test observation has a valid age
    * `(average sales price)` if the test observation does not have a valid age
<br>

## feature matrix:
* `Age`: sales date - year made
* `price_ref`: average sales price found in the reference dictionaries

<br>
##  v3.1:
* rows with invalid `Age` ~= 10% of data...
    * training -- discard them
    * testing -- replace with average age of equipment model

<br>
*(converted into an object)*
<br>
<hr style='	background-color: #fff; border-top: 2px dashed #8c8b8b;'>

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.grid_search import GridSearchCV 

from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

In [3]:
import pandas as pd
import numpy as np
import zipfile

In [4]:
# Load the training and test tables straight out of their zip archives.
# The `with` blocks close each archive and its member stream as soon as the
# CSV has been parsed instead of leaving the file handles open.
with zipfile.ZipFile('data/Train.zip', mode='r') as train_zip:
    with train_zip.open('Train.csv') as train_csv:
        df_train = pd.read_csv(train_csv, low_memory=False)
with zipfile.ZipFile('data/Test.zip', mode='r') as test_zip:
    with test_zip.open('test.csv') as test_csv:
        df_test = pd.read_csv(test_csv, low_memory=False)

<hr style='	background-color: #fff; border-top: 2px dashed #8c8b8b;'>
<br>
# Data Cleaner

In [82]:
def df_cleaner(df, isTrain=False, mute=True):
    """
    Reduce a raw sales table to the columns used by the model and derive `Age`.

    The input frame is not modified; a new frame is returned with the columns
    ProductGroup, state, fiBaseModel, ModelID (cast to str), Age and, when
    `isTrain` is set, SalePrice and PricePerAge.

    `Age` is the sale year (parsed from `saledate`) minus `YearMade`.
    In training mode, rows with `YearMade` before 1940 or with `Age <= 0` are
    dropped and `PricePerAge = SalePrice / (Age + 1)` is added.

    :type df: pandas.DataFrame
    :type isTrain: bool
    :type mute: bool
    :param mute: when False, print the resulting columns and shape
    :rtype: pandas.DataFrame
    """
    info = [
        'ProductGroup',
        'state',
        'fiBaseModel',
        'ModelID',
    ]

    wanted = info + ['YearMade', 'saledate']
    if isTrain:
        wanted.append('SalePrice')

    # Work on a private copy so the caller's frame is never touched.
    df = df[wanted].copy()

    df['ModelID'] = df['ModelID'].astype('str')

    # Slice the year out of strings shaped like `1/10/2003 0:00`; this is faster
    # than `pd.DatetimeIndex(df['saledate']).year`. The slice carries a trailing
    # space, which is stripped before the integer cast.
    sale_year = df['saledate'].str[-9:-4].str.strip().astype(int)
    df['Age'] = sale_year - df['YearMade']

    if isTrain:
        # From inspecting the histogram of YearMade: anything made before 1940
        # is probably fictitious.
        # NOTE(review): Age == 0 rows are discarded here, while
        # get_reference_price treats Age >= 0 as a valid age -- confirm intended.
        keep = (df['YearMade'] >= 1940) & (df['Age'] > 0)

        # Copy the filtered rows so the column assignment below writes to an
        # independent frame rather than to a slice of the previous one.
        df = df[keep].copy()
        df['PricePerAge'] = df['SalePrice'] / (df['Age'] + 1)  # +1 keeps the divisor away from 0

    # TODO: NaNs in the identifier columns are left untouched.

    df = df.drop(['YearMade', 'saledate'], axis=1)

    if not mute:
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(df.columns)
        print(df.shape)

    return df

### Tester

In [83]:
# Smoke test: clean the first 30 training rows in training mode and print the
# resulting columns and shape (mute=False).
df = df_cleaner(df_train[:30], isTrain=True, mute=False)
# %timeit df_cleaner(df_train[:100])

Index([u'ProductGroup', u'state', u'fiBaseModel', u'ModelID', u'SalePrice',
       u'Age', u'PricePerAge'],
      dtype='object')
(28, 7)


<hr style='	background-color: #fff; border-top: 2px dashed #8c8b8b;'>
<br>
# Model

In [84]:
def model_average_dics(df):
    """
    Build lookup tables of average SalePrice, PricePerAge and Age per model.

    One table is built for every prefix of the identifier columns
    (ProductGroup; ProductGroup + state; ...; all four columns). Each returned
    dictionary maps the prefix length (1..4) to a `{group key: mean}` dict.
    Level 1 is keyed by the bare ProductGroup value, deeper levels by tuples.

    Rows containing any NaN are excluded from every average.

    :type df: pandas.DataFrame
    :param df: frame with the identifier columns plus SalePrice, PricePerAge, Age
    :rtype: tuple
    :returns: (Price_dics, PricePerAge_dics, Age_dics)
    """
    info = [
        'ProductGroup',  ## every vehicle has a valid ProductGroup
        'state',
        'fiBaseModel',
        'ModelID',
    ]
    value_cols = ['SalePrice', 'PricePerAge', 'Age']

    # SalePrice and PricePerAge both have no NaN so can dropna together.
    # The result does not depend on the grouping level, so compute it once.
    complete = df.dropna()

    Price_dics, PricePerAge_dics, Age_dics = {}, {}, {}
    for i in range(1, len(info) + 1):
        # Group by a plain column name at level 1 so the keys are scalars.
        keys = info[0] if i == 1 else info[:i]

        # Average only the numeric value columns: the identifier columns that
        # are not part of the key hold strings and cannot be averaged.
        means = complete.groupby(keys)[value_cols].mean()

        Price_dics[i] = means['SalePrice'].to_dict()
        PricePerAge_dics[i] = means['PricePerAge'].to_dict()
        Age_dics[i] = means['Age'].to_dict()

    return (Price_dics, PricePerAge_dics, Age_dics)
    

In [85]:
def get_reference_price(df, price_dics=None, mute=True):
    """
    Compute a reference price for every row of `df` from per-model averages.

    For each row the most specific known model key is tried first (as many
    identifier columns as the row has non-null values), falling back to
    shorter prefixes until an entry is found. The reference price is then

        (Age + 1) * average PricePerAge          if the row has Age >= 0
        (average Age) * average PricePerAge      otherwise

    NOTE(review): the second form multiplies by the average Age without the
    +1 used in the first form -- confirm this asymmetry is intended.

    :type df: pandas.DataFrame
    :param df: needs the identifier columns and Age; SalePrice and PricePerAge
        are also needed when `price_dics` is not supplied
    :type price_dics: tuple or None
    :param price_dics: (Price_dics, PricePerAge_dics, Age_dics) as returned by
        model_average_dics; built from `df` itself when falsy
    :type mute: bool
    :param mute: when False, print the columns of `df`
    :rtype: tuple
    :returns: (numpy array of reference prices, price_dics)
    :raises KeyError: when not even the row's ProductGroup has an entry
    """
    if not mute:
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(df.columns)

    if not price_dics:
        price_dics = model_average_dics(df)

    info = [
        'ProductGroup',
        'state',
        'fiBaseModel',
        'ModelID',
    ]

    feats = df[info + ['Age']].values  # only want info and Age

    # How many identifier columns are filled in for each row.
    completeness = df[info].notnull().sum(axis=1).values
    ages = feats[:, -1]
    hasAge = ages >= 0

    PricePerAge_dics = price_dics[1]
    Age_dics = price_dics[2]

    price_ref = np.zeros(len(df))
    for r, row in enumerate(feats):
        c = completeness[r]

        # Walk from the most specific key towards ProductGroup alone.
        while c > 0:
            # Level-1 tables are keyed by the bare ProductGroup value,
            # deeper levels by tuples of the leading identifier columns.
            t = row[0] if c == 1 else tuple(row[:c])
            if t in PricePerAge_dics[c]:
                break
            c -= 1
        else:
            # No level matched (or the row has no identifier at all).
            raise KeyError('no reference price for index %s: %s'
                           % (df.index[r], list(row[:-1])))

        # PricePerAge was calculated by dividing by Age + 1, hence the +1 here.
        multiplier = ages[r] + 1 if hasAge[r] else Age_dics[c][t]
        price_ref[r] = multiplier * PricePerAge_dics[c][t]

    return (price_ref, price_dics)

In [86]:
def fit(estimator, df, mute=True):
    """
    Clean a raw training table, build the feature matrix and fit `estimator`.

    The feature matrix has two columns: Age (as float) and the reference
    price returned by get_reference_price. The target is SalePrice.

    :param estimator: object exposing `fit(X, y)`
    :type df: pandas.DataFrame
    :param df: raw training rows, passed through df_cleaner with isTrain=True
    :type mute: bool
    :param mute: when False, print the shapes of X and y (also forwarded to
        df_cleaner and get_reference_price)
    :rtype: tuple
    :returns: (whatever `estimator.fit(X, y)` returns, price_dics)
    """
    df = df_cleaner(df, isTrain=True, mute=mute)

    # No price_dics is passed, so the averages are built from this same frame.
    price_ref, price_dics = get_reference_price(df, mute=mute)

    X = np.stack((df['Age'].astype(float).values, price_ref), axis=-1)
    y = df['SalePrice'].values

    if not mute:
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(X.shape)
        print(y.shape)

    return estimator.fit(X, y), price_dics

### Tester

In [87]:
# Smoke test: fit a RandomForestRegressor on the first 30 training rows.
model, price_dics = fit(RandomForestRegressor(n_jobs=-1), df_train[:30])

<hr style='background-color: #fff; border-top: 2px dotted #8c8b8b;'>

In [88]:
def predict(estimator, price_dics, df, mute=True):
    """
    Predict with a fitted estimator on a raw (unlabelled) sales table.

    The rows are cleaned with df_cleaner in non-training mode, so none are
    dropped. The feature matrix pairs each row's Age (as float) with its
    reference price looked up in `price_dics`.

    :param estimator: fitted object exposing `predict(X)`
    :param price_dics: lookup tables returned alongside the model by `fit`
    :type df: pandas.DataFrame
    :type mute: bool
    :returns: whatever `estimator.predict(X)` returns
    """
    cleaned = df_cleaner(df, isTrain=False, mute=mute)
    reference, _ = get_reference_price(cleaned, price_dics=price_dics, mute=mute)

    ages = cleaned['Age'].astype(float).values
    features = np.column_stack((ages, reference))

    return estimator.predict(features)

### Tester

In [89]:
# Smoke test: run the prediction path on the first 30 training rows, reusing
# the current `model` and `price_dics`.
y_pred = predict(model, price_dics, df_train[:30])

<hr style='	background-color: #fff; border-top: 2px dashed #8c8b8b;'>
<br>
# Quick Test -- 2000 Random Observations
***(n.b.: comment out "`rand_i = rand_i[:2000]`" for a full test on `Train.csv` data)***

In [94]:
np.random.seed(42)

# Materialize the row positions as a list: np.random.shuffle permutes its
# argument in place, which a lazy Python 3 `range` object does not support.
rand_i = list(range(len(df_train)))
np.random.shuffle(rand_i)
rand_i = rand_i[:2000]
test_size = int(len(rand_i)*0.2)

# First 20% of the shuffled positions form the hold-out set, the rest the training set.
qt_df_test = df_train.iloc[rand_i[:test_size], :]
qt_df_train = df_train.iloc[rand_i[test_size:], :]

In [95]:
# Random forest with 50 trees; the commented-out lines are alternative
# regressors that can be swapped in.
estimator = RandomForestRegressor(50, n_jobs=-1)
# estimator = GradientBoostingRegressor()
# estimator = AdaBoostRegressor()

# Fit on the training part of the quick-test split.
model, price_dics = fit(estimator, qt_df_train)

In [96]:
y_pred = predict(model, price_dics, qt_df_test)
# `rmsle` comes from the `In [15]` cell, which appears further down in this export.
# Single-argument print() behaves the same on Python 2 and Python 3.
print(rmsle(qt_df_test['SalePrice'].values, y_pred))

0.909088934571


<hr style='	background-color: #fff; border-top: 2px dashed #8c8b8b;'>
<br>

# Moment of Truth... (dum dum dum)

In [15]:
def rmsle(y, y_pred):
    """
    Root mean squared logarithmic error between actual and predicted values.

    Computes sqrt(sum((log(1 + y_pred) - log(1 + y))**2) / len(y)).

    :type y: array-like
    :param y: actual values
    :type y_pred: array-like
    :param y_pred: predicted values, same length as `y`
    :rtype: float
    """
    # Accept plain lists as well as numpy arrays.
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # log1p(x) evaluates log(1 + x) without losing precision for small x.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.sum(log_diff ** 2) / len(actual))

In [21]:
# Table whose SalePrice column is used below to score the predictions for df_test.
test_solution = pd.read_csv('data/do_not_open/test_soln.csv')

In [97]:
# Refit on the entire training table with a 30-tree random forest.
model, price_dics = fit(RandomForestRegressor(30, n_jobs=-1), df_train)

In [98]:
y_pred = predict(model, price_dics, df_test)
# Score against the SalePrice column of `test_solution`.
# Single-argument print() behaves the same on Python 2 and Python 3.
print(rmsle(test_solution['SalePrice'].values, y_pred))

0.651515625128


In [None]:
# Build the result frame column by column so SalesID keeps whatever dtype it
# has in df_test; stacking it next to the predictions with np.c_ coerces both
# columns to one common dtype, which turns integer IDs into floats in the CSV.
result_df = pd.DataFrame({'SalesID': df_test['SalesID'].values,
                          'SalePrice': y_pred},
                         columns=['SalesID', 'SalePrice'])
result_df.to_csv('result_individual2.csv', index=False)