In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the training data from the working directory and preview the first rows.
data_frame = pd.read_csv("./train.csv")
data_frame.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Summary statistics (count, mean, std, quartiles, extremes) for the frame's columns.
data_frame.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [4]:
# (rows, columns) of the raw training frame.
data_frame.shape

(1460, 81)

## Data preprocessing

**Fill the missing values in each column by sampling at random from that column's observed value distribution.**

In [5]:
def fillNa(data_frame, random_state=None):
    """Fill missing values in place by sampling from each column's observed values.

    For every column that contains at least one null, the relative frequency
    of each non-null value is computed and the nulls are replaced with values
    drawn at random according to those frequencies.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to modify in place.
    random_state : int or None, optional
        Seed for a dedicated ``numpy.random.RandomState``. When ``None``
        (the default) NumPy's global random state is used.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    Columns that contain only nulls are left untouched, because there is no
    observed distribution to sample from.
    """
    # Use the global NumPy RNG unless the caller asked for a seeded one.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for column in data_frame.columns:
        missing_mask = data_frame[column].isnull()
        missing_count = int(missing_mask.sum())
        if missing_count == 0:
            continue

        value_distribution = data_frame[column].dropna().value_counts(normalize=True)
        if value_distribution.empty:
            # Entirely-null column: choice() would raise on an empty population.
            continue

        new_values = rng.choice(
            value_distribution.index.to_numpy(),
            size=missing_count,
            p=value_distribution.values,
        )
        data_frame.loc[missing_mask, column] = new_values

In [6]:
# Seed NumPy's global RNG so the random imputation gives the same result on
# every Restart & Run All.
np.random.seed(42)
fillNa(data_frame)

In [7]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

**Encoding categorical values as numeric codes**

In [8]:
def encode(data_frame, encoders=None):
    """Replace every object-dtype column with ordinal codes, in place.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to modify in place. Only columns whose dtype is ``object`` are
        touched.
    encoders : dict or None, optional
        Mapping of column name to a fitted ``OrdinalEncoder``. When given,
        a column that already has an entry is transformed with that encoder
        instead of being refitted, and every newly fitted encoder is stored
        in the dict. Pass the same dict for the training frame and then for
        the test frame so both receive identical category codes. With the
        default ``None`` each column is fitted on ``data_frame`` itself.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        Raised by ``OrdinalEncoder.transform`` when a reused encoder meets a
        category it was not fitted on (scikit-learn's default behaviour).
    """
    categorical_columns = [col for col in data_frame.columns if data_frame[col].dtype == 'object']

    for column in categorical_columns:
        encoder = encoders.get(column) if encoders is not None else None
        if encoder is None:
            # One encoder per column so the fitted mapping can be kept and reused.
            encoder = OrdinalEncoder()
            encoder.fit(data_frame[[column]])
            if encoders is not None:
                encoders[column] = encoder
        # transform() returns an (n, 1) array; flatten it to a plain column.
        data_frame[column] = encoder.transform(data_frame[[column]]).ravel()

In [9]:
# Replace every object-dtype column of data_frame with ordinal codes (modifies it in place).
encode(data_frame)

In [10]:
# Display the imputed and encoded frame (pandas truncates the rendering).
data_frame

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3.0,65.0,8450,1.0,1.0,3.0,3.0,0.0,...,0,1.0,2.0,2.0,0,2,2008,8.0,4.0,208500
1,2,20,3.0,80.0,9600,1.0,0.0,3.0,3.0,0.0,...,0,2.0,2.0,2.0,0,5,2007,8.0,4.0,181500
2,3,60,3.0,68.0,11250,1.0,0.0,0.0,3.0,0.0,...,0,2.0,0.0,2.0,0,9,2008,8.0,4.0,223500
3,4,70,3.0,60.0,9550,1.0,0.0,0.0,3.0,0.0,...,0,2.0,2.0,2.0,0,2,2006,8.0,0.0,140000
4,5,60,3.0,84.0,14260,1.0,0.0,0.0,3.0,0.0,...,0,0.0,2.0,2.0,0,12,2008,8.0,4.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3.0,62.0,7917,1.0,0.0,3.0,3.0,0.0,...,0,2.0,2.0,2.0,0,8,2007,8.0,4.0,175000
1456,1457,20,3.0,85.0,13175,1.0,1.0,3.0,3.0,0.0,...,0,1.0,2.0,2.0,0,2,2010,8.0,4.0,210000
1457,1458,70,3.0,66.0,9042,1.0,0.0,3.0,3.0,0.0,...,0,0.0,0.0,2.0,2500,5,2010,8.0,4.0,266500
1458,1459,20,3.0,68.0,9717,1.0,0.0,3.0,3.0,0.0,...,0,0.0,0.0,2.0,0,4,2010,8.0,4.0,142125


**Removing outliers**

In [11]:
from sklearn.ensemble import IsolationForest

In [12]:
def remove_outliers(X, y, contamination=0.1, random_state=42):
    """Drop the rows that an IsolationForest flags as outliers.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix; converted to a NumPy array before fitting.
    y : array-like
        Targets aligned row-for-row with ``X``.
    contamination : float, optional
        Proportion of rows the forest is asked to flag as outliers
        (default 0.1).
    random_state : int or None, optional
        Seed passed to ``IsolationForest`` (default 42).

    Returns
    -------
    tuple of (pandas.DataFrame, numpy.ndarray)
        The inlier rows of ``X`` as a new frame with a fresh 0..n-1 index, and
        the matching entries of ``y``. The column labels of ``X`` are kept
        when ``X`` is a DataFrame; otherwise default integer labels are used.
    """
    # Remember the labels before the conversion to a bare array discards them.
    column_labels = getattr(X, 'columns', None)

    X = np.array(X)
    y = np.array(y)

    clf = IsolationForest(contamination=contamination, random_state=random_state)
    clf.fit(X)

    # predict() labels inliers with 1 and outliers with -1.
    inlier_mask = clf.predict(X) == 1

    return pd.DataFrame(X[inlier_mask], columns=column_labels), y[inlier_mask]

# Model training

In [13]:
# Target: the sale price. Features: every other column except the row identifier.
Y = data_frame['SalePrice']
X = data_frame.drop(columns=['SalePrice', 'Id'])

In [14]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split, and
# therefore the score reported below, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, random_state=42)
# Remember the feature column labels so they can be re-applied after outlier removal.
cx = X_train.columns

In [15]:
# Filter outliers out of the training split only; the held-out split is left untouched.
# NOTE: X_train / y_train are overwritten, so re-running this cell removes a
# further share of rows from the already-filtered data.
X_train, y_train = remove_outliers(X_train, y_train)
X_train.columns = cx

In [16]:
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

***Searching for the best RandomForest hyperparameters with 3-fold cross-validation on the training split***

In [17]:
def train(X, y, param_grid=None, cv=3, random_state=None):
    """Grid-search a RandomForestRegressor and return the best fitted model.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Training features.
    y : array-like
        Training targets.
    param_grid : dict or None, optional
        Hyperparameter grid handed to ``GridSearchCV``. Defaults to the
        ``n_estimators`` / ``max_depth`` grid defined below.
    cv : int, optional
        Number of cross-validation folds (default 3).
    random_state : int or None, optional
        Seed for the forest. ``None`` (the default) leaves it unseeded.

    Returns
    -------
    RandomForestRegressor
        The estimator with the best cross-validated score, already refitted
        on all of ``X`` and ``y``.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [10, 20, 50, 100, 200, 300],
            'max_depth': [3, 5, 15, 20, 50, 150]
        }

    model = RandomForestRegressor(random_state=random_state)

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
    grid_search.fit(X, y)

    # GridSearchCV defaults to refit=True, so best_estimator_ has already been
    # retrained on the whole of X, y with the winning parameters; fitting it
    # again would only repeat that work.
    return grid_search.best_estimator_

In [18]:
# Grid-search a RandomForestRegressor on the outlier-filtered training split.
model = train(X_train, y_train)

In [19]:
def test(y_true, y_pred):
    """Print the R² score of the predictions ``y_pred`` against ``y_true``."""
    score = r2_score(y_true, y_pred)
    print(score)

In [20]:
# Print the R² score of the model's predictions on the held-out split.
test(y_test, model.predict(X_test))

0.8582971816624156


***An R² score of about 0.86 on the held-out split is good enough, so we retrain on the full dataset***

In [21]:
# Retrain on every labelled row. The filtered data gets its own names so that
# X and Y stay intact and re-running this cell does not strip outliers from
# data that has already been filtered.
X_full, Y_full = remove_outliers(X, Y)
X_full.columns = X.columns
model = train(X_full, Y_full)

In [23]:
# Load the submission data and set the identifiers aside; they are not a feature.
test_set = pd.read_csv('./test.csv')
id1 = test_set['Id']
test_set = test_set.drop(columns='Id')

# Apply the same preprocessing steps that were applied to the training frame.
fillNa(test_set)
# NOTE(review): encode() fits fresh encoders on test_set, so a category's
# integer code here matches the training code only if both files contain the
# same set of categories for that column. Verify this, or reuse the encoders
# fitted on the training data.
encode(test_set)
test_set


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,2.0,80.0,11622,1.0,1.0,3.0,3.0,0.0,4.0,...,120,0,0.0,2.0,2.0,0,6,2010,8.0,4.0
1,20,3.0,81.0,14267,1.0,1.0,0.0,3.0,0.0,0.0,...,0,0,0.0,1.0,0.0,12500,6,2010,8.0,4.0
2,60,3.0,74.0,13830,1.0,0.0,0.0,3.0,0.0,4.0,...,0,0,0.0,2.0,1.0,0,3,2010,8.0,4.0
3,60,3.0,78.0,9978,1.0,1.0,0.0,3.0,0.0,4.0,...,0,0,0.0,2.0,2.0,0,6,2010,8.0,4.0
4,120,3.0,43.0,5005,1.0,0.0,0.0,1.0,0.0,4.0,...,144,0,0.0,2.0,2.0,0,1,2010,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,4.0,21.0,1936,1.0,0.0,3.0,3.0,0.0,4.0,...,0,0,0.0,1.0,2.0,0,6,2006,8.0,4.0
1455,160,4.0,21.0,1894,1.0,0.0,3.0,3.0,0.0,4.0,...,0,0,1.0,1.0,2.0,0,4,2006,8.0,0.0
1456,20,3.0,160.0,20000,1.0,0.0,3.0,3.0,0.0,4.0,...,0,0,1.0,1.0,2.0,0,9,2006,8.0,0.0
1457,85,3.0,62.0,10441,1.0,0.0,3.0,3.0,0.0,4.0,...,0,0,0.0,2.0,2.0,700,7,2006,8.0,4.0


In [24]:
# Predict a sale price for every test row and write the result, keyed by Id, to answer.csv.
predict = model.predict(test_set)
answer = pd.DataFrame({'Id': id1, 'SalePrice': predict})
answer = answer.set_index('Id')
answer.to_csv('answer.csv')

In [25]:
# Show the predictions that were written to answer.csv.
answer

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,128954.111265
1462,157361.645990
1463,185747.376459
1464,184297.656783
1465,200941.366703
...,...
2915,84526.282459
2916,86637.678452
2917,152536.166667
2918,113869.192092
