In [137]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import seaborn as sns


train = pd.read_csv('train.csv')
test_X = pd.read_csv('test.csv')

test_ID = test_X['Id']

train_X = train.drop(['SalePrice'], axis=1)
train_Y = train['SalePrice']

#print(train_X)
train_test_data = [train_X, test_X]

#Drop columns which have >50% value as NULL
drop_columns = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id']

for dataset in train_test_data:
    dataset.drop(drop_columns, axis=1, inplace=True)

#Handle missing values
for dataset in train_test_data:
    dataset['LotFrontage'].fillna(dataset['LotFrontage'].mean(), inplace=True)
    dataset['BsmtQual'].fillna(dataset['BsmtQual'].mode()[0], inplace=True)
    dataset['FireplaceQu'].fillna(dataset['FireplaceQu'].mode()[0], inplace=True)
    dataset['GarageType'].fillna(dataset['GarageType'].mode()[0], inplace=True)
    dataset['GarageYrBlt'].fillna(dataset['GarageYrBlt'].mean(), inplace=True)
    dataset['GarageFinish'].fillna(dataset['GarageFinish'].mode()[0], inplace=True)
    dataset['GarageQual'].fillna(dataset['GarageQual'].mode()[0], inplace=True)
    dataset['GarageCond'].fillna(dataset['GarageCond'].mode()[0], inplace=True)
    dataset['MasVnrType'].fillna(dataset['MasVnrType'].mode()[0], inplace=True)
    dataset['MasVnrArea'].fillna(dataset['MasVnrArea'].mean(), inplace=True)
    dataset['BsmtFinSF1'].fillna(dataset['BsmtFinSF1'].mean(), inplace=True)
    dataset['BsmtFinSF2'].fillna(dataset['BsmtFinSF2'].mean(), inplace=True)
    dataset['BsmtUnfSF'].fillna(dataset['BsmtUnfSF'].mean(), inplace=True)
    dataset['TotalBsmtSF'].fillna(dataset['TotalBsmtSF'].mean(), inplace=True)


test_X['MSZoning'].fillna(test_X['MSZoning'].mode()[0], inplace=True)
test_X['Utilities'].fillna(test_X['Utilities'].mode()[0], inplace=True)
test_X['Exterior1st'].fillna(test_X['Exterior1st'].mode()[0], inplace=True)
test_X['Exterior2nd'].fillna(test_X['Exterior2nd'].mode()[0], inplace=True)
test_X['BsmtFullBath'].fillna(test_X['BsmtFullBath'].mode()[0], inplace=True)
test_X['BsmtHalfBath'].fillna(test_X['BsmtHalfBath'].mode()[0], inplace=True)
test_X['KitchenQual'].fillna(test_X['KitchenQual'].mode()[0], inplace=True)
test_X['Functional'].fillna(test_X['Functional'].mode()[0], inplace=True)
test_X['GarageCars'].fillna(test_X['GarageCars'].mode()[0], inplace=True)
test_X['GarageArea'].fillna(test_X['GarageArea'].mean(), inplace=True)
test_X['SaleType'].fillna(test_X['SaleType'].mode()[0], inplace=True)

#Handling categorical data

categorical_columns=['MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood',
         'Condition2','BldgType','Condition1','HouseStyle','SaleType',
        'SaleCondition','ExterCond',
         'ExterQual','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
        'RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','Heating','HeatingQC',
         'CentralAir',
         'Electrical','KitchenQual','Functional',
         'FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive']

combined_data = pd.concat([train_X, test_X], axis=0)

#print(combined_data.shape)
label_encoder = LabelEncoder()

for col in categorical_columns:
    combined_data[col] = label_encoder.fit_transform(combined_data[col].astype(str))

train_X = combined_data[:1460]
test_X = combined_data[1460:]


#test_X.columns[test_X.isnull().any()]
#train_X.columns[test_X.isnull().any()]



In [139]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = DecisionTreeRegressor()
scoring = 'accuracy'
score = cross_val_score(clf, train_X, train_Y, cv=k_fold, n_jobs = 1, scoring= scoring)
#print(score)
round(np.mean(score)*100, 2)

0.89

In [140]:
clf.fit(train_X, train_Y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [141]:
y_pred = clf.predict(test_X)



In [144]:
print(test_X.shape)

(1459, 75)


In [145]:
submission = pd.DataFrame({'Id' : test_ID, 
                           'SalePrice' : y_pred
                          })
submission.to_csv('submission.csv', index=False)

submission_file = pd.read_csv('submission.csv')

        Id  SalePrice
0     1461   129000.0
1     1462   155000.0
2     1463   192000.0
3     1464   177500.0
4     1465   213500.0
5     1466   130500.0
6     1467   138800.0
7     1468   177500.0
8     1469   189000.0
9     1470   129500.0
10    1471   206300.0
11    1472    88000.0
12    1473    88000.0
13    1474   148500.0
14    1475   139400.0
15    1476   446261.0
16    1477   232000.0
17    1478   315000.0
18    1479   232000.0
19    1480   385000.0
20    1481   285000.0
21    1482   202500.0
22    1483   176432.0
23    1484   213490.0
24    1485   184000.0
25    1486   224900.0
26    1487   410000.0
27    1488   147000.0
28    1489   198900.0
29    1490   193500.0
...    ...        ...
1429  2890   105900.0
1430  2891   117000.0
1431  2892    34900.0
1432  2893    89500.0
1433  2894    34900.0
1434  2895   372402.0
1435  2896   235000.0
1436  2897   197500.0
1437  2898   150900.0
1438  2899   225000.0
1439  2900   152000.0
1440  2901   168000.0
1441  2902   175000.0
1442  2903