In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
#import seaborn as sns
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import linear_model

def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None
import warnings
# Monkeypatch: from here on, any call that goes through `warnings.warn` hits the
# no-op `warn` defined above, so warnings of every category are dropped.
warnings.warn = warn
# Also register "ignore" filters for these two categories with the regular
# warnings filter machinery.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.decomposition import PCA
from feature_engineering import *
from train_and_pred import *

# Function Definitions

In [2]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened to 1-D before the element-wise comparison.
    Without that, a column-shaped ``y_true`` (for example a single-column
    DataFrame, shape ``(n, 1)``) and a flat ``y_pred`` (shape ``(n,)``)
    broadcast to an ``(n, n)`` matrix, and the mean is taken over every
    true/predicted pair instead of over matching rows.

    Parameters
    ----------
    y_true : array-like
        Observed values. A zero here yields an inf/nan term, because every
        error is divided by the corresponding true value.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def score(y_test, y_pred):
    """Print the RMSE and the MAPE of ``y_pred`` measured against ``y_test``.

    RMSE is the square root of ``mean_squared_error``; MAPE is computed by
    ``mean_absolute_percentage_error``. Both values are only printed; the
    function returns nothing.
    """
    # Root mean squared error.
    print("RMSE score: %f" % sqrt(mean_squared_error(y_test, y_pred)))

    # Mean absolute percentage error: how far, on average, the predictions
    # are from the true values, relative to the true values.
    print("MAPE score: %f" % mean_absolute_percentage_error(y_test, y_pred))

# Read File

In [4]:
# Training data (contains the SalePrice target).
data = pd.read_csv(filepath_or_buffer='train.csv')
# Kaggle test data, used only to produce the submission.
data_test = pd.read_csv(filepath_or_buffer='test.csv')

### Replace values where NaN has meaning

In [5]:
# `replace_NaN_meaning` arrives through one of the star imports at the top of
# the notebook; presumably it rewrites NaNs that encode "feature not present"
# -- confirm against the helper.
# NOTE: both names are rebound to the helper's output, so re-running this cell
# feeds already-processed frames back into the helper.
data = replace_NaN_meaning(data)
data_test = replace_NaN_meaning(data_test)

## Transforming some numerical variables that are really categorical

In [6]:
# `transform_numerical_col_categorical` arrives through one of the star imports;
# presumably it casts numerically-coded categorical columns to a categorical /
# string dtype -- confirm against the helper.
# The same transformation is applied to the training and the Kaggle frame.
data = transform_numerical_col_categorical(data)
data_test = transform_numerical_col_categorical(data_test)

### Split features and target

In [7]:
# The target and the row identifier are not used as features.
lis_drop_num_columns = ['SalePrice', 'Id']
X = data.drop(columns=lis_drop_num_columns)
# Double brackets: the target stays a single-column DataFrame, not a Series.
Y = data[['SalePrice']]

# Kaggle frame: remember the ids for the submission, keep the rest as features.
id_test_kaggle = data_test[['Id']]
X_kaggle = data_test.drop(columns='Id')

# Handle Missing Data

In [8]:
# `handle_missing_data` arrives through one of the star imports. It returns the
# processed frame plus two objects that later cells access via `.columns`
# (presumably the numeric and the categorical sub-frames -- confirm).
# NOTE(review): the helper runs independently on each frame; only the
# training-side `num_columns` / `categ_columns` are used further down, while
# `num_columns_kg` / `categ_columns_kg` are never read in this notebook.
X, num_columns, categ_columns = handle_missing_data(X)
X_kaggle, num_columns_kg, categ_columns_kg = handle_missing_data(X_kaggle)

## Concat Dataframes

In [9]:
# Stack training rows on top of Kaggle rows so both go through identical
# scaling / encoding. Row labels are kept as they are (no ignore_index).
concat_df = pd.concat([X, X_kaggle], axis=0)

## Normalize numerical data

In [10]:
# Standardize every numeric column (sklearn's `preprocessing.scale`: zero mean,
# unit variance per column). The statistics are computed over the concatenated
# training + Kaggle rows, and the columns of `concat_df` are replaced in place.
concat_df[num_columns.columns] = preprocessing.scale(concat_df[num_columns.columns])

## Using One-Hot encoding

In [11]:
# One-hot encode the categorical columns of the combined frame, so training and
# Kaggle rows end up with the same set of indicator columns.
one_hot_encoding_all = pd.get_dummies(data=concat_df[categ_columns.columns])

### Join categorical and numerical columns again

In [12]:
# `split_data_dataKaggle` arrives through one of the star imports; presumably it
# joins numeric columns with the one-hot columns and splits the rows back into
# the training part and the Kaggle part -- confirm against the helper.
# NOTE(review): the scaled numeric values live in `concat_df`, which is not
# passed here; the helper only receives `X`, `X_kaggle`, the one-hot frame and
# `num_columns`. Verify that the scaling step actually reaches `X_final`.
X_final, X_final_kaggle = split_data_dataKaggle(X, X_kaggle, one_hot_encoding_all,num_columns)

## Split train and test

In [13]:
# Hold out part of the labelled rows for evaluation; the fixed seed makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    Y,
    random_state=42,
    shuffle=True,
)

# Train and prediction

#### Random Forest

In [14]:
# Random forest base model. The ensemble, scoring and Kaggle cells further down
# read `reg3` / `y_pred3`, so the result is bound under those names; without
# this they are undefined on a fresh kernel.
# (`train_pred_RandomFlorest` comes from a star import; it is unpacked as
# (model, predictions for X_test).)
reg3, y_pred3 = train_pred_RandomFlorest(X_train, y_train, X_test, grid_search=False)
# Keep the names this cell bound before, for anything that still reads them.
reg, y_pred = reg3, y_pred3

#### Gradient Boosting Regressor

In [15]:
# Gradient boosting base model. The ensemble, scoring and Kaggle cells further
# down read `reg4` / `y_pred4`, so the result is bound under those names;
# without this they are undefined on a fresh kernel.
# (`train_pred_GradientBoostingRegressor` comes from a star import; it is
# unpacked as (model, predictions for X_test).)
reg4, y_pred4 = train_pred_GradientBoostingRegressor(X_train, y_train, X_test, grid_search=False)
# Keep the names this cell bound before, for anything that still reads them.
reg, y_pred = reg4, y_pred4

#### Recursive feature elimination with cross-validation

In [None]:
# Create the RFE object and compute a cross-validated score.
# Seeded so the feature ranking is reproducible between runs.
rf = RandomForestRegressor(random_state=42)
# SalePrice is a continuous target, so the folds must not be stratified by it.
# With an integer `cv` and a regressor, RFECV uses plain (unshuffled) KFold.
# No `scoring` is given, so the estimator's own score is used (R^2 for a
# regressor), not classification accuracy.
rfecv = RFECV(estimator=rf, step=1, cv=2)
# `y_train` is a single-column DataFrame; the forest expects a 1-D target.
reg = rfecv.fit(X_train, y_train.values.ravel())
print("Optimal number of features : %d" % rfecv.n_features_)

### Lasso

In [None]:
# Lasso with its default penalty: fit on the training split, predict the hold-out.
reg7 = linear_model.Lasso()
reg7.fit(X_train, y_train)
y_pred7 = reg7.predict(X_test)

In [None]:
# 10-fold cross-validated search over the Lasso penalty strength. The fitted
# search object replaces `reg7`, so later cells predict with the best alpha.
param_grid = {'alpha': [0.01, 0.1, 1, 10]}
grid_rf = GridSearchCV(
    estimator=linear_model.Lasso(),
    param_grid=param_grid,
    cv=10,
)
reg7 = grid_rf.fit(X_train, y_train)
y_pred7 = reg7.predict(X_test)

## Final Predictor to Combine the predictors

In [None]:
# Stacking: a linear model learns how to combine the three base predictors.

# Train -- one column per base model, built from its training-set predictions.
# np.ravel guarantees 1-D columns even if a model returns an (n, 1) array.
X_comb = pd.DataFrame({
    'rf': np.ravel(reg3.predict(X_train)),
    'gb': np.ravel(reg4.predict(X_train)),
    'lasso': np.ravel(reg7.predict(X_train)),
})
# LinearRegression is taken from the `linear_model` module imported at the top
# of the notebook; the bare name is not imported there.
reg_comb = linear_model.LinearRegression().fit(X_comb, y_train)

# Test -- same column layout, built from the hold-out predictions.
X_comb_test = pd.DataFrame({
    'rf': np.ravel(y_pred3),
    'gb': np.ravel(y_pred4),
    'lasso': np.ravel(y_pred7),
})
y_pred_ens = reg_comb.predict(X_comb_test)

In [None]:
# score() prints its metrics itself and returns None, so wrapping it in print()
# only added a stray "None" line after each result. A label is printed first so
# the three results can be told apart.
print("Random forest")
score(y_test, y_pred3)
print("Gradient boosting")
score(y_test, y_pred4)
print("Lasso")
score(y_test, y_pred7)

## Score

In [None]:
# Hold-out RMSE / MAPE of the combined (stacked) predictor.
score(y_test=y_test, y_pred=y_pred_ens)

# Test Kaggle

In [None]:
# Fill remaining gaps in the numeric Kaggle features with each column's own mean.
# DataFrame.mean() is computed per column and fillna(Series) matches on column
# labels, which gives the same result as the former
# groupby(<columns>, axis=1).transform(...) without the column-axis groupby
# that pandas has deprecated.
X_final_kaggle[num_columns.columns] = X_final_kaggle[num_columns.columns].fillna(
    X_final_kaggle[num_columns.columns].mean()
)

In [None]:
# Kaggle-set predictions of the three base models, in the order used by the
# ensemble: random forest (reg3), gradient boosting (reg4), Lasso (reg7).
Y_kaggle, Y_kaggle2, Y_kaggle3 = (
    model.predict(X_final_kaggle) for model in (reg3, reg4, reg7)
)

### Ensemble Predictor 

In [None]:
# Same column layout the combiner was fitted on, filled with Kaggle predictions.
X_comb_kaggle = pd.DataFrame(
    {
        'rf': Y_kaggle,
        'gb': Y_kaggle2,
        'lasso': Y_kaggle3,
    }
)
Y_kaggle_ens = reg_comb.predict(X_comb_kaggle)

In [None]:
# Submission frame: ensemble prediction as SalePrice, then the Kaggle ids.
df_submission_kaggle = pd.DataFrame(
    Y_kaggle_ens, columns=['SalePrice']
).assign(Id=id_test_kaggle)

In [None]:
# Write the submission without the positional index column.
df_submission_kaggle.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Sanity check: (number of rows, number of columns) of the submission frame.
df_submission_kaggle.shape