In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
from importlib import reload

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

from sklearn.linear_model import LinearRegression

from _lib.preprocess import preprocess_missing as prep
from _lib.preprocess import get_instruction as info
from _lib.preprocess_test import preprocess_missing as prep_test

# Load the training table (first CSV column is used as the index) and pass it
# straight through the project's missing-value preprocessing.
df = prep(pd.read_csv("_database/Input/train.csv", index_col=0))

# Separate the target column from the predictors.
y = df["SalePrice"]
X = df.drop(columns="SalePrice")



(1460, 80)
(1460, 80)


In [None]:
# Disabled hold-out split, kept for reference. While it stays commented out,
# this cell creates no X_train / X_test / y_train / y_test variables.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,
#                                                   random_state = 10)

### Pipelines

In [2]:
# Function to generate polynomial features of numerical variables
def gen_poly(data, degree):
    '''
    Build polynomial features by raising every column to the powers 1..degree.

    Parameters
    ----------
    data : array-like
        A 2-D matrix, or a 1-D vector which is treated as a single column.
    degree : int
        Highest power to generate; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Matrix with ``n_columns * degree`` columns: every column to the
        power 1, then every column to the power 2, and so on up to ``degree``.

    Raises
    ------
    ValueError
        If ``degree`` is smaller than 1.
    '''
    if degree < 1:
        raise ValueError("degree must be at least 1, got {}".format(degree))

    # A 1-D vector has no axis 1 to concatenate along, so promote it to a
    # single-column matrix. 2-D input is passed through untouched.
    if np.ndim(data) == 1:
        data = np.asarray(data).reshape(-1, 1)

    result = np.concatenate([np.power(data, d) for d in np.arange(1, degree + 1)], axis = 1)
    return result

# Tabulate every predictor column next to the string name of its dtype
df_type = pd.DataFrame({"Columns" : X.columns, "Type" : [str(kind) for kind in X.dtypes]})

# int64 / float64 columns count as numerical, every other dtype as categorical
is_numeric = df_type["Type"].isin(["int64", "float64"])
num_columns = df_type.loc[is_numeric, "Columns"]
cat_columns = df_type.loc[~is_numeric, "Columns"]

# Categories in categorical features, looked up column by column through info()
list_categories = [np.array(info(name)) for name in cat_columns.values]
# Positions 14 and 15 of the categorical list are overridden with the integer levels 1..10
list_categories[14], list_categories[15] = np.arange(1, 11), np.arange(1, 11)

# Polynomial degree
poly_degree = 1


def _select_numerical(frame):
    '''Pick the columns listed in num_columns and return their underlying values.'''
    return frame[num_columns.values].values


def _select_categorical(frame):
    '''Pick the columns listed in cat_columns (no .values conversion).'''
    return frame[cat_columns.values]


def _expand_polynomial(matrix):
    '''Apply gen_poly with the module-level poly_degree, read at call time.'''
    return gen_poly(matrix, poly_degree)


# validate = False so the inputs reach the helpers without sklearn's array check
get_numerical = FunctionTransformer(_select_numerical, validate = False)
get_category = FunctionTransformer(_select_categorical, validate = False)
generate_poly = FunctionTransformer(_expand_polynomial, validate = False)

# Numerical branch: column selection followed by polynomial expansion
pipeline_num_prep = Pipeline(steps = [
    ('selector', get_numerical),
    ('poly', generate_poly),
])

# Categorical branch: column selection followed by one-hot encoding with the
# explicit category lists, dropping the first level of each feature
pipeline_cat_prep = Pipeline(steps = [
    ('selector', get_category),
    ('Dummy', OneHotEncoder(categories = list_categories,
                            drop = 'first',
                            sparse = False)),
])

# Both branches are concatenated side by side by the union
transformers = [('Numerical', pipeline_num_prep), ('Categorical', pipeline_cat_prep)]
preprocess_union = FeatureUnion(transformers)

pl = Pipeline([('union', preprocess_union)])

### Linear Regression

In [3]:
# Fit the numerical/categorical feature union on the training predictors and
# produce the transformed feature matrix in one step.
X_prep = preprocess_union.fit_transform(X)

In [4]:
# Linear regression estimator created with scikit-learn's default arguments.
linreg = LinearRegression()

In [5]:
# Fit on the complete preprocessed training matrix; no hold-out split is applied here.
linreg.fit(X_prep, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Predict on the test set loaded from CSV

In [6]:
# Load the test set; the first CSV column becomes the index.
test = pd.read_csv("_database/Input/test.csv", index_col = 0)

In [7]:
# Apply the test-specific missing-value preprocessing imported from _lib.preprocess_test.
X_test = prep_test(test)

In [8]:
# Re-use the union already fitted on the training data (transform only, no re-fit).
# NOTE(review): this rebinds X_test from prep_test's output to the transformed
# result, so re-running this cell without first re-running the previous one
# feeds already-transformed data back into the union. Consider a separate name.
X_test = preprocess_union.transform(X_test)

In [11]:
# Predict SalePrice for the transformed test features with the fitted model.
y_pred = linreg.predict(X_test)

In [17]:
# Assemble the submission table: one SalePrice prediction per row of the test index.
# NOTE: nothing is written to disk in this cell; the CSV export is not performed here.
result = pd.DataFrame({"SalePrice": y_pred}, index = test.index)
result

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,115606.061790
1462,162037.410468
1463,191196.297657
1464,196578.742849
1465,208611.067091
...,...
2915,85092.360143
2916,78062.891973
2917,178319.168491
2918,116931.862866
