In [1]:
#Importing required packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
#Importing given datasets.
dataset = pd.read_csv("Data.csv")
test = pd.read_csv('test.csv')

In [3]:
dataset.columns[dataset.isnull().any()].tolist()

['CRIM', 'ZN', 'INDUS', 'CHAS', 'AGE', 'LSTAT']

In [4]:
test.columns[test.isnull().any()].tolist()

[]

In [5]:
test.head()

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
1,6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1


In [2]:
#Imputation for missing values

In [6]:
dataset["CRIM"] = dataset["CRIM"].fillna(dataset["CRIM"].mean())
dataset["AGE"] = dataset["AGE"].fillna(dataset["AGE"].mean())
dataset["LSTAT"] = dataset["LSTAT"].fillna(dataset["LSTAT"].mean())

dataset["ZN"] = dataset["ZN"].fillna(dataset["ZN"].mode()[0])
dataset["CHAS"] = dataset["CHAS"].fillna(dataset["CHAS"].mode()[0])
dataset["INDUS"] = dataset["INDUS"].fillna(dataset["INDUS"].mode()[0])

In [None]:
#Feature Engineering

In [7]:
Age_band = pd.DataFrame()
dataset['Age_band'] = 0
dataset.loc[(dataset['AGE']>20) & (dataset['AGE']<=40), 'Age_band'] = 1
dataset.loc[(dataset['AGE']>40) & (dataset['AGE']<=60), 'Age_band'] = 2
dataset.loc[(dataset['AGE']>60) & (dataset['AGE']<=80), 'Age_band'] = 3
dataset.loc[(dataset['AGE']>80) & (dataset['AGE']<=100), 'Age_band'] = 4

In [8]:
age_band = pd.DataFrame()
test['age_band'] = 0
test.loc[(test['age']>20) & (test['age']<=40), 'age_band'] = 1
test.loc[(test['age']>40) & (test['age']<=60), 'age_band'] = 2
test.loc[(test['age']>60) & (test['age']<=80), 'age_band'] = 3
test.loc[(test['age']>80) & (test['age']<=100), 'age_band'] = 4

In [9]:
dummies_train = pd.DataFrame(pd.get_dummies(dataset['Age_band'], prefix = 'ab', drop_first = True))
dummies_test = pd.DataFrame(pd.get_dummies(test['age_band'], prefix = 'ab', drop_first = True))

In [10]:
dataset = pd.concat([dataset, dummies_train], axis = 1)
test = pd.concat([test, dummies_test], axis = 1)

In [11]:
dataset.drop(['AGE','Age_band'], axis = 1, inplace = True)
test.drop(['age','age_band'], axis = 1, inplace= True)

In [13]:
#Feature Scaling
sc_train = StandardScaler()
sc_test = StandardScaler()
sc_train.fit(dataset)
sc_test.fit(test)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
#Splitting the dataset into two sets.
#indep - Set containing independent features.
#dep - set containing dependent features.
indep = dataset.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]]
dep = dataset["MEDV"]

In [None]:
#Dimensionality Reduction for both training and test set

In [15]:
pca_train = PCA()
pca_train.fit(indep)
pca_test = PCA()
pca_test.fit(test)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [16]:
#Splitting given dataset into training and validation(test) sets.
(indep_train, indep_test, dep_train, dep_test) = train_test_split(indep,dep)

In [17]:
rfr = RandomForestRegressor()

In [18]:
rfr.fit(indep_train, dep_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
dep_pred = rfr.predict(indep_test)

In [21]:
mse = mean_squared_error(dep_test, dep_pred)

In [22]:
mse

10.786507086614174

In [24]:
score = r2_score(dep_test, dep_pred)

In [25]:
score

0.8630563767116932

In [26]:
test_set = pd.DataFrame(test)

In [27]:
test_set.head()

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,dis,rad,tax,ptratio,black,lstat,ab_1,ab_2,ab_3,ab_4
0,3,0.02729,0.0,7.07,0,0.469,7.185,4.9671,2,242,17.8,392.83,4.03,0,0,1,0
1,6,0.02985,0.0,2.18,0,0.458,6.43,6.0622,3,222,18.7,394.12,5.21,0,1,0,0
2,8,0.14455,12.5,7.87,0,0.524,6.172,5.9505,5,311,15.2,396.9,19.15,0,0,0,1
3,9,0.21124,12.5,7.87,0,0.524,5.631,6.0821,5,311,15.2,386.63,29.93,0,0,0,1
4,10,0.17004,12.5,7.87,0,0.524,6.004,6.5921,5,311,15.2,386.71,17.1,0,0,0,1


In [28]:
test_set.drop('ID',axis = 1, inplace = True)

In [29]:
test_set.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,dis,rad,tax,ptratio,black,lstat,ab_1,ab_2,ab_3,ab_4
0,0.02729,0.0,7.07,0,0.469,7.185,4.9671,2,242,17.8,392.83,4.03,0,0,1,0
1,0.02985,0.0,2.18,0,0.458,6.43,6.0622,3,222,18.7,394.12,5.21,0,1,0,0
2,0.14455,12.5,7.87,0,0.524,6.172,5.9505,5,311,15.2,396.9,19.15,0,0,0,1
3,0.21124,12.5,7.87,0,0.524,5.631,6.0821,5,311,15.2,386.63,29.93,0,0,0,1
4,0.17004,12.5,7.87,0,0.524,6.004,6.5921,5,311,15.2,386.71,17.1,0,0,0,1


In [31]:
MEDV = pd.Series(rfr.predict(test_set)) 

In [32]:
predictions = pd.concat([test['ID'], MEDV], axis = 1)

In [33]:
predictions = predictions.rename(columns = {0:"MEDV"})

In [34]:
predictions.head()

Unnamed: 0,ID,MEDV
0,3,34.42
1,6,26.4
2,8,23.61
3,9,15.93
4,10,19.72


In [35]:
pd.options.display.float_format = '{:,.2f}'.format

In [36]:
predictions.head(5)

Unnamed: 0,ID,MEDV
0,3,34.42
1,6,26.4
2,8,23.61
3,9,15.93
4,10,19.72


In [37]:
predictions.to_csv("Results.csv",index=False)