In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

In [2]:
# Read the training frame, the test frame and the sample submission (in that order).
X, X_test, sub = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Housing_dataset_train.csv',
        'Housing_dataset_test.csv',
        'Sample_submission.csv',
    )
)

In [3]:
# Pull the target column out of the training frame, then remove it from the features.
y = X['price']
X.drop(columns=['price'], inplace=True)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set; random_state pins the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Left-join on the column axis: X_train's columns are the reference, and the
# validation and test frames are reindexed to match them.
X_train, X_valid = X_train.align(other=X_valid, axis=1, join='left')
X_train, X_test = X_train.align(other=X_test, axis=1, join='left')

In [9]:
# Partition X_train's columns by dtype: non-object columns are treated as
# numerical, object columns as categorical.
numerical_cols = [name for name in X_train.select_dtypes(exclude=['object']).columns]
categorical_cols = [name for name in X_train.select_dtypes(include=['object']).columns]

In [10]:
import numpy as np
# The experimental flag must be imported before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit the imputer on the training split only, then apply the same fitted
# imputer to the training, validation and test numerical columns.
# NOTE(review): the feature table shown further down lists `ID` among the
# numerical columns, so the identifier takes part in the imputation — confirm
# this is intended.
imputer = IterativeImputer(max_iter=10, random_state=0).fit(X_train[numerical_cols].values)
X_train_mat, X_valid_mat, X_test_mat = (
    imputer.transform(frame[numerical_cols].values)
    for frame in (X_train, X_valid, X_test)
)

In [11]:
# Fill missing categorical values with the most frequent category of the
# training split. The imputer is fitted on a plain ndarray (`.values`), the same
# kind of input it later transforms: fitting on a DataFrame and then
# transforming ndarrays makes scikit-learn warn that "X does not have valid
# feature names, but SimpleImputer was fitted with feature names".
categorical_transformer = SimpleImputer(strategy='most_frequent')
categorical_transformer.fit(X_train[categorical_cols].values)
X_train_cat = categorical_transformer.transform(X_train[categorical_cols].values)
X_valid_cat = categorical_transformer.transform(X_valid[categorical_cols].values)
X_test_cat = categorical_transformer.transform(X_test[categorical_cols].values)



In [12]:
# Put each split's imputed numerical block and imputed categorical block side
# by side (numerical columns first).
train_X, valid_X, test_X = (
    np.hstack(blocks)
    for blocks in (
        (X_train_mat, X_train_cat),
        (X_valid_mat, X_valid_cat),
        (X_test_mat, X_test_cat),
    )
)

In [13]:
# Wrap the stacked matrices back into DataFrames, reusing each split's original
# row index and the numerical-then-categorical column order used when stacking.
#
# Stacking a float matrix with a string matrix produces an object array, so
# every column would otherwise be stored with dtype `object`. The numerical
# columns are cast back to float64 so they are genuinely numeric features.
feature_cols = numerical_cols + categorical_cols
numeric_dtypes = {col: 'float64' for col in numerical_cols}
train_X_df = pd.DataFrame(train_X, columns=feature_cols, index=X_train.index).astype(numeric_dtypes)
valid_X_df = pd.DataFrame(valid_X, columns=feature_cols, index=X_valid.index).astype(numeric_dtypes)
test_X_df = pd.DataFrame(test_X, columns=feature_cols, index=X_test.index).astype(numeric_dtypes)
train_X_df.head()

Unnamed: 0,ID,bedroom,bathroom,parking_space,loc,title
5341,11717.0,5.03574,3.0,3.0,Kano,Terrace duplex
8028,1438.0,4.0,2.715874,4.0,Kaduna,Penthouse
9440,1998.0,9.0,2.0,5.0,Ekiti,Detached duplex
10399,1984.0,3.0,1.0,1.0,Adamawa,Flat
6518,8291.0,1.0,6.0,1.0,Delta,Penthouse


In [14]:
# Store every categorical column as strings in all three feature frames; these
# columns are later passed to the model as cat_features.
for frame in (train_X_df, valid_X_df, test_X_df):
    for cat_col in categorical_cols:
        frame[cat_col] = frame[cat_col].astype(str)

In [15]:
# Import CatBoost and configure the regressor.
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    loss_function='RMSE',
    logging_level='Silent',  # no per-iteration log output
)

In [16]:
# Train on the training split; the validation split is supplied as eval_set and
# the training curves are plotted live.
model.fit(
    train_X_df,
    y_train,
    cat_features=categorical_cols,
    eval_set=(valid_X_df, y_valid),
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x19e83d6ea60>

In [17]:
from sklearn.metrics import mean_absolute_error

# Score the model on the validation split. scikit-learn's signature is
# mean_absolute_error(y_true, y_pred), so the ground truth is passed first
# (the value is the same either way, but the call now matches the API).
predictions = model.predict(valid_X_df)
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 275017.00004736666


In [24]:
# Rebuild the numerical / categorical column lists, this time from X_test's
# dtypes (this overwrites the lists derived earlier from X_train).
numerical_cols = [name for name in X_test.select_dtypes(exclude=['object']).columns]
categorical_cols = [name for name in X_test.select_dtypes(include=['object']).columns]

In [25]:
# Store X_test's categorical columns as strings.
X_test = X_test.astype({cat_col: str for cat_col in categorical_cols})

In [18]:
# Predict the target (price) for the imputed test features in test_X_df.
preds_test = model.predict(test_X_df)

In [28]:
# Sanity check: how many test predictions were produced.
preds_test.shape

(6000,)

In [27]:
# Inspect the test frame's columns, dtypes and non-null counts.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             6000 non-null   int64 
 1   loc            6000 non-null   object
 2   title          6000 non-null   object
 3   bedroom        6000 non-null   int64 
 4   bathroom       6000 non-null   int64 
 5   parking_space  6000 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 281.4+ KB


In [None]:
# Display the frame loaded from 'Sample_submission.csv'.
sub

In [29]:
# Write the submission file: IDs are taken from the sample submission, prices
# from the model's test predictions.
# NOTE(review): this pairs sub's rows with preds_test by position — confirm the
# sample submission lists IDs in the same order as the test set.
output = pd.DataFrame(data={'ID': sub['ID'], 'price': preds_test})
output.to_csv('submission_new.csv', index=False)