This is an example of a (not very interesting) submission.

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Binarizer, Imputer, OneHotEncoder

import numpy as np
import pandas as pd

In [2]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls a single column out of a DataFrame.

    Parameters
    ----------
    column : label
        Name of the column that ``transform`` selects.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return column ``self.column`` of ``X`` as a 2-D array with one column."""
        # Indexing with a one-element list keeps the selection two-dimensional.
        wanted = [self.column]
        single_column_frame = X[wanted]
        return single_column_frame.values

class CategoricalExtractor(BaseEstimator, TransformerMixin):
    """Encode one categorical DataFrame column as integer codes.

    ``fit`` assigns every value found in the column a positive integer code,
    numbered from 1 in the order returned by ``value_counts()``.  ``transform``
    replaces each entry with its code and uses 0 for any entry that has no
    code, returning the result as a single-column 2-D array.

    Parameters
    ----------
    column : label
        Name of the DataFrame column to encode.
    """

    def __init__(self, column):
        self.column = column
        # Value -> code lookup; stays None until fit() has been called.
        self.values = None

    def _create_values(self, indices):
        """Return a dict mapping each item of ``indices`` to its 1-based position."""
        return {ind: i + 1 for i, ind in enumerate(indices)}

    def _apply_values(self, row_val):
        """Return the code learned for ``row_val``, or 0 if it has none."""
        return self.values.get(row_val, 0)

    def fit(self, X, y=None):
        """Learn the value -> code lookup from ``X[self.column]`` and return self."""
        self.values = self._create_values(X[self.column].value_counts().index)
        return self

    def transform(self, X, y=None):
        """Return the codes for ``X[self.column]`` as an array of shape (n_rows, 1).

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        # An empty dict is a valid (if useless) fitted state, so test for None
        # explicitly rather than for falsiness.
        if self.values is None:
            # Imported locally so the class needs no extra module-level import.
            # NotFittedError derives from both ValueError and AttributeError,
            # so callers that caught the old AttributeError keep working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This CategoricalExtractor instance is not fitted yet; "
                "call 'fit' before 'transform'."
            )
        col = X[self.column].apply(self._apply_values)
        # reshape(-1, 1) turns the 1-D codes into a single-column 2-D array.
        return col.values.reshape(-1, 1)

In [3]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('datasets/train.csv')
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,Normal,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,,,,0,1,2010,WD,Abnorml,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,138500


In [4]:
# Check dtypes and non-null counts of the three columns used as model features below.
df[['Lot Frontage', 'Lot Area', 'Street']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 3 columns):
Lot Frontage    1721 non-null float64
Lot Area        2051 non-null int64
Street          2051 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 48.1+ KB


In [5]:
# Target: SalePrice, kept as a one-column DataFrame.
y = df.loc[:, ['SalePrice']].copy()
# Features: every column except the target and 'Sale Condition'.
X = df.loc[:, ~df.columns.isin(['SalePrice', 'Sale Condition'])].copy()

In [6]:
# Fix the seed so the train/test split -- and therefore the score computed from
# it -- is identical on every Restart & Run All instead of changing per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
# Numeric feature: take 'Lot Frontage' and replace its missing entries with the
# column mean.
# NOTE(review): `Imputer` is not available in newer scikit-learn releases
# (replaced by sklearn.impute.SimpleImputer) -- confirm the target version.
lot_frontage_pipe = Pipeline([
    ('extract', FeatureExtractor('Lot Frontage')),
    ('impute_average', Imputer(strategy='mean')), # this may not be an appropriate transformation. Why?
])

# Categorical feature: turn 'Street' into integer codes, then one-hot encode
# the codes as a dense array; codes not seen during fit are ignored.
street_pipe = Pipeline([
    ('extract', CategoricalExtractor('Street')),
    ('dummify', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Combine the outputs of the three feature branches into one feature matrix;
# 'Lot Area' is passed through as-is.
feature_union = FeatureUnion([
    ('lot_frontage', lot_frontage_pipe),
    ('street', street_pipe),
    ('lot_area', FeatureExtractor('Lot Area'))
])

# Full model: feature extraction followed by ordinary linear regression.
model = Pipeline([
    ('extract_features', feature_union),
    ('linear_reg', LinearRegression())
])

# Fit on the training split and print the score on the held-out split.
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.0976084901718
