In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from _lib.preprocess import preprocess_missing as prep
from _lib.preprocess import get_instruction as info

In [2]:
plt.style.use("seaborn")

In [3]:
df = pd.read_csv("_database/Input/train.csv", index_col = 0)

df = prep(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [4]:
df["LogSalePrice"] = np.log(df["SalePrice"])

In [5]:
y = df["SalePrice"]

In [6]:
X = df.drop(["LogSalePrice", "SalePrice"], axis = 1)

### Split the training data and test data

In [7]:
seed = 500

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = seed)

### Feature Engineering

In [9]:
df_type = pd.DataFrame({"Columns" : X.columns, "Type" : [str(X[col].dtype) for col in X.columns]})
num_columns = df_type.loc[(df_type["Type"] == "int64") | (df_type["Type"] == "float64")]["Columns"]
cat_columns = df_type.loc[(df_type["Type"] != "int64") & (df_type["Type"] != "float64")]["Columns"]

In [10]:
X_num = X_train[num_columns.values].values
X_cat = X_train[cat_columns.values]

In [11]:
list_categories = [np.array(info(col)) for col in cat_columns.values]

In [12]:
enc = OneHotEncoder(drop = 'first', sparse = False, categories = list_categories) # get dummy variables

In [13]:
X_cat = enc.fit_transform(X_cat)

ValueError: Found unknown categories ['1.5Fin', '2.5Unf', '1.5Unf', '2.5Fin'] in column 13 during fit

In [14]:
err_name = '2fmCon'
for i in range(len(cat_columns)):
    if err_name in df[cat_columns.values[i]].unique():
        err_col = cat_columns.values[i]
        print(err_col)
        break

In [None]:
df[err_col].value_counts()

In [None]:
print(X_num.shape)

In [None]:
print(X_cat.shape)

In [None]:
X_train = np.concatenate([X_num, X_cat], axis = 1)

In [None]:
print(X_train.shape, y_train.shape)

### Fit Linear Regression model

In [None]:
X_num = X_test[num_columns.values].values
X_cat = X_test[cat_columns.values]

In [None]:
X_cat = enc.transform(X_cat)

In [None]:
X_test = np.concatenate([X_num, X_cat], axis = 1)

In [None]:
X_test.shape

In [None]:
linreg = LinearRegression()

In [None]:
linreg.fit(X_train, y_train)

In [None]:
print(X_train.shape, X_test.shape)

In [None]:
y_pred = linreg.predict(X_test)

In [None]:
# Compute the MSE
mean_squared_error(y_test, y_pred)

In [None]:
linreg.score(X_test, y_test)

We got 75% score

### Validate the model using 10-fold cross validation

In [None]:
scores = cross_val_score(linreg, X, y, cv = 10)

In [None]:
print(scores)

In [None]:
print(np.mean(scores))

Scores are varies from low to high -> bad

### Feature Engineering 2 (Polynomial)

In [None]:
def gen_poly(data, degree):
    '''
    Input : Vector or matrix 
    
    Return matrix of polynomial for each polynomial degree from 1 to degree calculated on each column
    '''
    
    result = np.concatenate([np.power(data, d) for d in np.arange(1, degree + 1)], axis = 1)
    return result

### Pipelines

In [None]:
d = 1

In [None]:
prep_func = FunctionTransformer(lambda x : prep(x), validate = False)

In [None]:
get_numerical = FunctionTransformer(lambda x : x[num_columns.values].values,
                                    validate = False)
get_category = FunctionTransformer(lambda x : x[cat_columns.values], validate = False)
generate_poly = FunctionTransformer(lambda x : gen_poly(x, d), validate = False)

In [None]:
pipeline_num_prep = Pipeline([('selector', get_numerical),
                              ('poly', generate_poly)])

pipeline_cat_prep = Pipeline([('selector', get_category),
                              ('Dummy', OneHotEncoder(drop = 'first', sparse = False))])

In [None]:
transformers = [ ('Numerical', pipeline_num_prep), ('Categorical', pipeline_cat_prep) ]

In [None]:
preprocess_union = FeatureUnion(transformer_list = transformers)

In [None]:
pl = Pipeline([
    ('preprocessing', prep_func),
    ('union', preprocess_union),
    ('linreg', LinearRegression())
])

### Predict the test sample from csv files

In [None]:
test = pd.read_csv("_database/Input/test.csv")