In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from _lib.preprocess import preprocess_missing as prep
from _lib.preprocess import get_instruction as info

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn style sheets with a "seaborn-v0_8"
# prefix and later removed the old names, so use whichever one this install has.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [None]:
# Read the training CSV (first column becomes the index) and pass it straight
# through the project's preprocessing helper (`preprocess_missing`).
df = prep(pd.read_csv("_database/Input/train.csv", index_col = 0))

In [None]:
# Add the natural log of SalePrice as a new column.
df["LogSalePrice"] = np.log(df["SalePrice"])

In [None]:
# Target vector: the raw (untransformed) SalePrice column.
# NOTE(review): the LogSalePrice column created earlier is not used as the
# target anywhere in the visible cells — confirm this is intended.
y = df["SalePrice"]

In [None]:
# Feature frame: every column except the two price columns.
X = df.drop(["LogSalePrice", "SalePrice"], axis = 1)

### Split the data into training and test sets

In [None]:
# Random seed handed to train_test_split so the split is repeatable.
seed = 500

In [None]:
# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = seed)

### Feature Engineering

In [None]:
# Tabulate the dtype name of every feature, then split the column names into
# numerical (int64 / float64) and categorical (anything else).
df_type = pd.DataFrame({"Columns" : X.columns, "Type" : X.dtypes.astype(str).values})
is_numeric = df_type["Type"].isin(["int64", "float64"])
num_columns = df_type.loc[is_numeric, "Columns"]
cat_columns = df_type.loc[~is_numeric, "Columns"]

In [None]:
# Numerical training features as a NumPy array; the categorical ones stay a
# DataFrame until they are encoded.
X_num = X_train[num_columns.values].values
X_cat = X_train[cat_columns.values]

In [None]:
# The category levels for each categorical column are taken from the data
# description.txt (through `info`) and passed to OneHotEncoder as `categories`.
list_categories = [np.array(info(col)) for col in cat_columns.values]
# Positions 14 and 15 are overridden with the integer levels 1..10.
# NOTE(review): these are positional indices into cat_columns — presumably the
# two 1-10 rating columns; verify they still line up if the column order changes.
list_categories[14] = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
list_categories[15] = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [None]:
# Dense one-hot encoder with fixed category lists; the first level of every
# feature is dropped. scikit-learn 1.2 renamed `sparse` to `sparse_output`
# (the old name was removed in 1.4), so try the new keyword first and fall
# back to the old one on releases that reject it.
try:
    enc = OneHotEncoder(drop = 'first', sparse_output = False, categories = list_categories) # get dummy variables
except TypeError:
    enc = OneHotEncoder(drop = 'first', sparse = False, categories = list_categories) # get dummy variables

In [None]:
# Ad-hoc check of the dtype of the OverallQual column.
df["OverallQual"].dtype

In [None]:
# Display the per-column category arrays that are handed to the encoder.
list_categories

In [None]:
# Fit the encoder on the training categoricals and encode them.
# X_cat is rebound from the DataFrame to the encoder output here, so running
# this cell a second time would feed the encoded output back into the encoder.
X_cat = enc.fit_transform(X_cat)

In [None]:
# Load the source of the preprocessing module as a list of lines; the list is
# kept in `lines` so later cells can edit and rewrite it.
with open("_lib/preprocess.py") as pre:
    lines = list(pre)

In [None]:
def line_insertion(lines, col_name, before, after, index = 39):
    '''
    Insert a value-correction statement into the source lines held in `lines`.

    Three lines are inserted in place, in this order, starting at `index`:
    a comment, a `df.loc[...] = ...` replacement statement, and a blank
    indented line.

    Input :
        lines    - list of source lines (mutated in place)
        col_name - name of the column whose value is corrected
        before   - value currently found in the data
        after    - value it should be replaced with
        index    - position in `lines` where the three lines go
                   (default 39, the position that used to be hard-coded)

    Return None
    '''
    templates = [
        '    # {col} value {bef} change to {af}\n',
        # The column name has to be quoted inside df[...]; without the quotes the
        # generated statement refers to an undefined variable, not to the column.
        '    df.loc[df["{col}"] == "{bef}", "{col}"] = "{af}"\n',
        '    \n',
    ]
    # Slice assignment keeps the three lines in order even when `lines` is
    # shorter than `index` (repeated insert() calls would reverse them there).
    lines[index:index] = [t.format(col = col_name, bef = before, af = after) for t in templates]

In [None]:
# Write the in-memory source lines back over the preprocessing module.
# NOTE(review): in top-to-bottom order this cell runs before the cell that calls
# line_insertion, so a fresh run writes the file back unchanged — confirm the
# intended execution order.
with open("_lib/preprocess.py", "w") as pre:
    pre.write("".join(lines))

In [None]:
# Show every source line together with its index
# (presumably used to choose the insertion position — verify).
list(enumerate(lines))

In [None]:
# Find category values the encoder rejects and queue a correction for each one
# in the in-memory copy of _lib/preprocess.py (`lines`).
regex = re.compile(r"(\[.*\])")
regex2 = re.compile(r"'.*'")
try:
    # Fit on the raw categorical training columns. X_cat cannot be used here:
    # an earlier cell already replaced it with the encoded matrix.
    enc.fit_transform(X_train[cat_columns.values])
except ValueError as err:
    print("Working on {}\n".format(err))
    match = regex.search(str(err))
    if match is None:
        # The message carries no "[...]" list of offending values, so there is
        # nothing to parse; re-raise instead of failing on None.group().
        raise
    unknown_values = re.findall("'(.*?)'", match.group(1))

    infor = True  # info() is called only for the first value that is located

    for e in unknown_values:

        # Stop at the first categorical column that contains this value.
        for col in cat_columns.values:
            if e in df[col].unique():

                err_col = col

                if infor:
                    _ = info(err_col, output = False)
                    infor = False

                print("{} is not found\n".format(e))

                # Ask for the replacement; typing "info" prints the column's
                # value counts and asks again.
                while True:
                    correct_val = input("Enter the correct value : ")
                    if correct_val == "info":
                        print(df[err_col].value_counts())
                        continue
                    break

                line_insertion(lines, err_col, e, correct_val)

                print("{bf} was successfully change to {af}".format(bf = e, af = correct_val))
                break


In [None]:
# Call the project's `info` helper for the column flagged by the cell above.
# NOTE(review): before this point err_col is only assigned when that cell found
# an offending value; on a clean top-to-bottom run this raises NameError.
info(err_col, output = False)

In [None]:
# Matches one whitespace-delimited token made of letters, digits, '.', ',' or '*'.
# Written as a raw string so "\s" reaches the regex engine instead of being an
# invalid string escape.
cats = re.compile(r"\s+([A-Za-z0-9.,*]+)\s+")

In [None]:
# Call `info` for the flagged column with its default arguments.
info(err_col)

In [None]:
# Try the token pattern on a sample description line. The tab separator is
# spelled "\t" so it stays visible and survives editors that expand tabs.
cats.search("1.5Fin\tOne and one-half story: 2nd level finished").group(1)

In [None]:
# Frequency of each value in the flagged column.
df[err_col].value_counts()

In [None]:
# Find the first categorical column that contains the stray value, remember it
# in err_col and print its name.
err_name = '2fmCon'
for candidate in cat_columns.values:
    if err_name not in df[candidate].unique():
        continue
    err_col = candidate
    print(err_col)
    break

In [None]:
# Frequency of each value in the column found by the lookup above.
df[err_col].value_counts()

In [None]:
# Shape of the numerical training block.
print(X_num.shape)

In [None]:
# Shape of the encoded categorical training block.
print(X_cat.shape)

In [None]:
# Stack the numerical and encoded categorical blocks column-wise.
# X_train is rebound here from the DataFrame split to this array.
X_train = np.concatenate([X_num, X_cat], axis = 1)

In [None]:
# Check that features and target have matching row counts.
print(X_train.shape, y_train.shape)

### Fit Linear Regression model

In [None]:
# Same column selection for the test split; X_num and X_cat are reused names
# and now refer to test data.
X_num = X_test[num_columns.values].values
X_cat = X_test[cat_columns.values]

In [None]:
# Encode the test categoricals with the already-fitted encoder (no re-fit).
X_cat = enc.transform(X_cat)

In [None]:
# Stack the test blocks column-wise; X_test is rebound from the DataFrame split
# to this array.
X_test = np.concatenate([X_num, X_cat], axis = 1)

In [None]:
# Shape of the assembled test matrix.
X_test.shape

In [None]:
# Linear regression estimator with default settings.
linreg = LinearRegression()

In [None]:
# Fit on the assembled training matrix and the raw SalePrice target.
linreg.fit(X_train, y_train)

In [None]:
# Train and test matrices must have the same number of columns.
print(X_train.shape, X_test.shape)

In [None]:
# Predictions for the held-out test rows.
y_pred = linreg.predict(X_test)

In [None]:
# Mean squared error between the test targets and the predictions.
mean_squared_error(y_test, y_pred)

In [None]:
# Score of the fitted model on the test set (R^2 for scikit-learn regressors).
linreg.score(X_test, y_test)

The linear model reaches a score (R², as returned by `linreg.score`) of about 75% on the test set.

### Validate the model using 10-fold cross validation

In [None]:
# Cross-validate on an encoded copy of the full feature set. The raw frame X
# still holds the categorical (non-numeric) columns, which LinearRegression
# cannot fit. `enc` uses fixed category lists, so it can encode every row.
X_encoded = np.concatenate([X[num_columns.values].values,
                            enc.transform(X[cat_columns.values])], axis = 1)
scores = cross_val_score(linreg, X_encoded, y, cv = 10)

In [None]:
# One score per fold.
print(scores)

In [None]:
# Average score across the folds.
print(np.mean(scores))

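The scores vary widely from fold to fold (from low to high), which is a bad sign: the model is not stable across different subsets of the data.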

### Feature Engineering 2 (Polynomial)

In [None]:
def gen_poly(data, degree):
    '''
    Input : Vector or matrix, and the highest polynomial degree (integer >= 1)
    
    Return matrix of polynomial for each polynomial degree from 1 to degree calculated on each column.
    The blocks for degree 1, 2, ..., degree are placed side by side, so a matrix
    with k columns gives k * degree columns. A vector is treated as a single column.
    
    Raise ValueError if degree is smaller than 1
    '''
    
    if degree < 1:
        raise ValueError("degree must be at least 1, got {}".format(degree))
    
    values = np.asarray(data)
    if values.ndim == 1:
        # A plain vector has no axis 1 to concatenate along; make it a column.
        values = values.reshape(-1, 1)
    
    result = np.concatenate([np.power(values, d) for d in np.arange(1, degree + 1)], axis = 1)
    return result

### Pipelines

In [None]:
# Polynomial degree read by the generate_poly transformer defined below.
d = 1

In [None]:
# Wrap the project's preprocessing helper as a pipeline step. Passing `prep`
# itself instead of a lambda around it does the same work and keeps the
# transformer picklable.
prep_func = FunctionTransformer(prep, validate = False)

In [None]:
def _select_numerical(frame):
    '''Return the numerical columns of `frame` as a NumPy array.'''
    return frame[num_columns.values].values

def _select_categorical(frame):
    '''Return the categorical columns of `frame` (still a DataFrame).'''
    return frame[cat_columns.values]

def _poly_features(values):
    '''Expand `values` with gen_poly; the degree is read from the global `d` at call time.'''
    return gen_poly(values, d)

# Column selectors and the polynomial expansion wrapped as pipeline steps.
get_numerical = FunctionTransformer(_select_numerical, validate = False)
get_category = FunctionTransformer(_select_categorical, validate = False)
generate_poly = FunctionTransformer(_poly_features, validate = False)

In [None]:
def _dense_one_hot(**kwargs):
    '''
    Build a OneHotEncoder that returns a dense array.

    scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`; the new
    name is tried first and the old one is used on releases that reject it.
    '''
    try:
        return OneHotEncoder(sparse_output = False, **kwargs)
    except TypeError:
        return OneHotEncoder(sparse = False, **kwargs)

# Numerical branch: select the numerical columns, then add polynomial terms.
pipeline_num_prep = Pipeline([('selector', get_numerical),
                              ('poly', generate_poly)])

# Categorical branch: select the categorical columns, then one-hot encode them
# with the same fixed category lists as the stand-alone encoder `enc`, so every
# fit yields the same columns and a level missing from one training subset is
# not treated as unknown later.
pipeline_cat_prep = Pipeline([('selector', get_category),
                              ('Dummy', _dense_one_hot(drop = 'first', categories = list_categories))])

In [None]:
# (name, transformer) pairs for the FeatureUnion built next.
transformers = [ ('Numerical', pipeline_num_prep), ('Categorical', pipeline_cat_prep) ]

In [None]:
# Run both sub-pipelines on the same input and join their outputs side by side.
preprocess_union = FeatureUnion(transformer_list = transformers)

In [None]:
# End-to-end model: project preprocessing, feature union, then linear regression.
pl = Pipeline([
    ('preprocessing', prep_func),
    ('union', preprocess_union),
    ('linreg', LinearRegression())
])

### Predict the test sample from csv files

In [None]:
# Load the test CSV.
# NOTE(review): unlike the training load, no index_col is given here, so the
# first CSV column stays a regular column — confirm that is intended.
test = pd.read_csv("_database/Input/test.csv")