In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('train_set.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,826,20,RL,114.0,14803,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,New,Partial,385000
1,949,60,RL,65.0,14006,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Normal,192500
2,189,90,RL,64.0,7018,Pave,,Reg,Bnk,AllPub,...,0,,,,0,6,2009,WD,Alloca,153337
3,1060,50,RL,,11275,Pave,,IR1,HLS,AllPub,...,0,,,,0,3,2007,WD,Normal,220000
4,668,20,RL,65.0,8125,Pave,,Reg,Lvl,AllPub,...,0,,,,0,10,2008,WD,Normal,193500


In [4]:
features = data.drop('SalePrice', axis=1)
housing_labels = data['SalePrice'].copy()

In [5]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1168 non-null   int64  
 1   MSSubClass     1168 non-null   int64  
 2   MSZoning       1168 non-null   object 
 3   LotFrontage    962 non-null    float64
 4   LotArea        1168 non-null   int64  
 5   Street         1168 non-null   object 
 6   Alley          70 non-null     object 
 7   LotShape       1168 non-null   object 
 8   LandContour    1168 non-null   object 
 9   Utilities      1168 non-null   object 
 10  LotConfig      1168 non-null   object 
 11  LandSlope      1168 non-null   object 
 12  Neighborhood   1168 non-null   object 
 13  Condition1     1168 non-null   object 
 14  Condition2     1168 non-null   object 
 15  BldgType       1168 non-null   object 
 16  HouseStyle     1168 non-null   object 
 17  OverallQual    1168 non-null   int64  
 18  OverallC

## FEATURE ENGINEERING

We need to apply feature engineering, since there are a lot of features in the dataset, and some of them could be redundant.

After taking a look to data dictionary we will drop the next columns:

- ID: It is only an identifier and does not give any value
- Condition 2: Since if it is near one important location we will keep only Condition 1
- YearBuilt: We have the same info in YearRemoAdd
- RoofMtl, Extirior1st, Exterior2nd, MasVnrType, ExterQual, ExterCond: For material we will keep only the Overall Material  Column
- BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF: We will keep only the basement area
- Heating: We will keep only the Heating Quality and Condition
- BsmtFullBath, BsmtHalfBath: Only will check if it has basement
- KitchenQual: We will only take into account number of kitchens, as the qulity were alrady consider in the overall quality
- Functional: Take it into account on the quality
- FirplacesQu: Only taking into account the number of fireplaces}
- GarageType, GarageYrBlt, GarageFinish, GarageArea, GarageQual, GarageCond: We will only take into account the GarageCars, as it is the main purpose of garage.
- PoolQC: We would only take into account the PoolArea
- Fence: We will change this to Fence or No Fence
- MiscFeature: We would take MiscValb
- MasVnrArea: We will not use it, since it has to many missing values

### LotFrontage

In [6]:
features['LotFrontage'].describe()

count    962.000000
mean      70.408524
std       25.109095
min       21.000000
25%       60.000000
50%       70.000000
75%       80.000000
max      313.000000
Name: LotFrontage, dtype: float64

For LotFrotage it appears that mean would be a good value for Imputer, so we will use that

### Alley

In [7]:
features['Alley'].value_counts()

Grvl    42
Pave    28
Name: Alley, dtype: int64

According to Data Dictionary, Alley has 3 attributes:

- Grvl	Gravel
- Pave	Paved
- NA 	No alley access

So it appears pandas is thinking that NA is a null value, and this is not the case, so we need to replace NA for another value and convert it to another possible class

### Electrical

In [8]:
features['Electrical'].value_counts()

SBrkr    1065
FuseA      74
FuseF      24
FuseP       3
Mix         1
Name: Electrical, dtype: int64

In this case it is only 1 value with missing data, so we will use most frequent as imputer

### Fence

In [9]:
features['Fence'].value_counts()

MnPrv    132
GdPrv     48
GdWo      44
MnWw      11
Name: Fence, dtype: int64

In this case for Fence we will replace evrything so it is only Fence or No Fence

### PIPELINES

In [10]:
def drop_columns(X):
    columns = ['Id', 'Condition2', 'YearBuilt', 'RoofMatl', 'Exterior1st', 
                'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
                'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating',
                'BsmtFullBath', 'BsmtHalfBath','KitchenQual','Functional',
                'FireplaceQu', 'GarageType', 'PoolQC', 'MiscFeature',
                'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageArea', 
                'GarageQual', 'GarageCond','PoolQC','MiscFeature', 'MasVnrArea',
                'RoofStyle',
               ]
    
    return X.drop(columns, axis=1)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


error = 'neg_mean_absolute_error'
#features = drop_columns(features)
num_features = features.select_dtypes(include=np.number)
cat_features = features.select_dtypes(exclude=np.number)

num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
#             ('std_scaler', StandardScaler()),
        ])

cat_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ])

column_transformer = ColumnTransformer([
            ('num', num_pipeline, list(num_features.columns)),
            ('cat', cat_pipeline, list(cat_features.columns)),
        ])


In [11]:
housing_prep = column_transformer.fit_transform(features)

### SELECTING AND TUNNING THE MODEL

#### RANDOM FOREST

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
        {'n_estimators': [100, 150], 'max_features':[30, 40, 50]},
        {'bootstrap': [False], 'n_estimators': [100, 150], 'max_features':[30, 40, 50]},
    ]




forest_reg = RandomForestRegressor()

grid_search_forest = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring=error,
                           return_train_score=True)

grid_search_forest.fit(housing_prep, housing_labels)

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_sam

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_sam

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_sam

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [40]:
grid_search_forest.best_params_

{'bootstrap': False,
 'max_depth': 80,
 'max_features': 40,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 100}

In [38]:
np.sqrt(abs(grid_search_forest.best_score_))

130.35302900472928

#### SVM

In [15]:
# from sklearn.svm import SVR
# from sklearn.model_selection import GridSearchCV

# param_grid = [
#     {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
# ]

# svr= SVR()
# grid_search_svm = GridSearchCV(svr, param_grid, cv=5,
#                            scoring=error,
#                            return_train_score=True)

# grid_search_svm.fit(housing_prep, housing_labels)

In [16]:
# np.sqrt(abs(grid_search_svm.best_score_))

In [17]:
# grid_search_svm.best_params_

#### LINEAR REGRESSION WITH REGULARIZATION

In [18]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10, 100, 1000]},
]

linear_reg= Ridge()
grid_search_ridge = GridSearchCV(linear_reg, param_grid, cv=5,
                           scoring=error,
                           return_train_score=True)

grid_search_ridge.fit(housing_prep, housing_labels)

GridSearchCV(cv=5, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid=[{'alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                    0.9, 1, 10, 100, 1000]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_absolute_error', verbose=0)

In [19]:
np.sqrt(abs(grid_search_ridge.best_score_))

161.80131745292405

In [20]:
grid_search_ridge.best_params_

{'alpha': 1000}

#### Linear Regression without Regularization

In [21]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(housing_prep, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
from sklearn.metrics import mean_squared_error

np.sqrt(mean_squared_error(housing_labels, linear_reg.predict(housing_prep)))

18694.00147095373

### Saving the model

In [23]:
import joblib

forest_model = grid_search_forest.best_estimator_
linear_model = linear_reg
joblib.dump(forest_model, 'forest_model.pkl')
joblib.dump(linear_model, 'linear_model.pkl')

['linear_model.pkl']

## TEST SET EVALUATION

In [24]:
test_set = pd.read_csv('test_set.csv')
test_features = test_set.iloc[:,:-1]
test_labels = test_set.iloc[:, -1]

In [25]:
#test_features = drop_columns(test_features)

In [26]:
test_prepared = column_transformer.transform(test_features)

In [27]:
pred_forest = forest_model.predict(test_prepared)
pred_linear = linear_model.predict(test_prepared)

In [28]:
from sklearn.metrics import mean_absolute_error

forest_error = np.sqrt(mean_absolute_error(test_labels, pred_forest))
linear_error = np.sqrt(mean_absolute_error(test_labels, pred_linear))

In [29]:
print(f'Forest error: {forest_error}')
print(f'Linear error: {linear_error}')

Forest error: 140.56626673020008
Linear error: 140.00367305346478


We will use FOREST MODEL

### PREDICTIONS FOR COMPETITION

In [30]:
test_compete = pd.read_csv('test.csv')

In [31]:
test_features = test_compete

In [32]:
#test_features = drop_columns(test_features)

In [33]:
test_prepared = column_transformer.transform(test_features)

In [34]:
predictions = forest_model.predict(test_prepared)

In [35]:
import csv

with open('Predictions.csv', 'w', newline='') as csvfile: #crea y escribe un archivo csv
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['Id', 'SalePrice'])
    for i  in zip(test_compete['Id'], predictions):
        writer.writerow(i)