In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Location of the housing CSV, relative to the notebook's working directory.
file_path = '../input/melbourne-housing-snapshot/melb_data.csv'

# Load the whole file into a DataFrame; later cells derive the target and features from `data`.
data = pd.read_csv(file_path)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlate the numeric columns only: since pandas 2.0 `DataFrame.corr()` no
# longer drops non-numeric columns silently and raises if the frame contains
# string columns.
corrmat = data.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(10, 8))
# Draw on the axes created above; vmax caps the colour scale at 0.8.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);


In [None]:
# Pairwise correlations over the numeric columns only; `DataFrame.corr()` on a
# frame with string columns raises on pandas >= 2.0.
cor = data.select_dtypes(include='number').corr()
# Keep the columns whose absolute correlation with Price exceeds 0.1
# (Price itself is included, since it correlates perfectly with itself).
highest_corr_features = cor.index[cor["Price"].abs() > 0.1]
plt.figure(figsize=(10, 10))
# annot=True writes each coefficient inside its heatmap cell.
a = sns.heatmap(data[highest_corr_features].corr(), annot=True)

In [None]:
# Tally missing entries per column, then report only the columns that have any.
missing_value_count_by_column = data.isnull().sum()
has_missing_values = missing_value_count_by_column > 0
print('missing value count by column : ')
print(missing_value_count_by_column.loc[has_missing_values])

In [None]:
# Target is the Price column; every other column is a candidate predictor.
y = data.Price
a = data.drop(['Price'], axis=1)
# Keep only the non-object columns for the first experiments.
x = a.select_dtypes(exclude=['object'])
# 80/20 split with a fixed seed so that a fresh "Restart & Run All" reproduces
# the same rows in each split and therefore the same scores.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Names of the columns that contain at least one missing value in the training split.
missing_all_col = x_train.columns[x_train.isnull().any()].tolist()
# Drop those columns from both splits so the two frames keep identical columns.
reduced_x_train = x_train.drop(columns=missing_all_col)
reduced_x_valid = x_valid.drop(columns=missing_all_col)

In [None]:
from sklearn.impute import SimpleImputer

# Fit the imputer (default settings) on the training split only, then reuse the
# fitted values to fill the validation split.
my_imputer = SimpleImputer()
# The imputer returns plain arrays, so the column labels are restored here.
impute_x_train = pd.DataFrame(my_imputer.fit_transform(x_train), columns=x_train.columns)
impute_x_valid = pd.DataFrame(my_imputer.transform(x_valid), columns=x_valid.columns)

In [None]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a random forest with the given leaf limit and return its validation MAE.

    Parameters:
        max_leaf_nodes: value forwarded to ``RandomForestRegressor(max_leaf_nodes=...)``.
        train_X, train_y: features and target the forest is fitted on.
        val_X, val_y: features and target the forest is scored on.

    Returns:
        The mean absolute error between ``val_y`` and the predictions for ``val_X``.
    """
    # random_state is pinned so repeated calls with the same inputs agree.
    forest = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

In [None]:
# Candidate leaf limits to try on the frames with the missing-value columns dropped.
max_leaf_nodes = [25, 100, 250, 500, 700, 1100]
# Lowest validation MAE seen so far; starts at infinity so the first candidate always wins.
result = float("inf")
# A separate loop variable keeps the candidate list above intact.
for leaf_nodes in max_leaf_nodes:
    my_mae = get_mae(leaf_nodes, reduced_x_train, reduced_x_valid, y_train, y_valid)
    if my_mae < result:
        best_tree_size = leaf_nodes
        result = my_mae
    print("Max leaf nodes: %d  \t\t Mean absulate Error:  %d" %(leaf_nodes, my_mae))
# Leaf limit with the lowest validation MAE.
print( best_tree_size)

In [None]:
# Candidate leaf limits to try on the imputed frames.
max_leaf_nodes = [5, 25, 50, 100, 250, 500, 700, 120, 1100]
# Lowest validation MAE seen so far; starts at infinity so the first candidate always wins.
result = float("inf")
# A separate loop variable keeps the candidate list above intact.
for leaf_nodes in max_leaf_nodes:
    my_mae = get_mae(leaf_nodes, impute_x_train, impute_x_valid, y_train, y_valid)
    if my_mae < result:
        impute_best_tree_size = leaf_nodes
        result = my_mae
    print("Max leaf nodes: %d  \t\t Mean absulate Error:  %d" %(leaf_nodes, my_mae))
# Leaf limit with the lowest validation MAE on the imputed data.
print( impute_best_tree_size)

In [None]:
def score_dataset(x_train, x_valid, y_train, y_valid):
    """Score a random forest that uses the tuned leaf limit ``best_tree_size``.

    ``best_tree_size`` is read from the notebook's global namespace; it is the
    ``max_leaf_nodes`` value selected by the tuning loop, so it is passed as
    ``max_leaf_nodes`` (not as the number of trees).

    Parameters:
        x_train, y_train: features and target the forest is fitted on.
        x_valid, y_valid: features and target the forest is scored on.

    Returns:
        The mean absolute error of the predictions for ``x_valid``.
    """
    model = RandomForestRegressor(max_leaf_nodes=best_tree_size, random_state=0)
    model.fit(x_train, y_train)
    pre = model.predict(x_valid)
    return mean_absolute_error(y_valid, pre)


In [None]:
# Validation MAE for the frames that had their missing-value columns dropped.
print(score_dataset(reduced_x_train,reduced_x_valid,y_train,y_valid))

In [None]:
def score_datasets(x_train, x_valid, y_train, y_valid):
    """Score a random forest that uses the tuned leaf limit ``impute_best_tree_size``.

    ``impute_best_tree_size`` is read from the notebook's global namespace; it
    is the ``max_leaf_nodes`` value selected by the tuning loop on the imputed
    data, so it is passed as ``max_leaf_nodes`` (not as the number of trees).

    Parameters:
        x_train, y_train: features and target the forest is fitted on.
        x_valid, y_valid: features and target the forest is scored on.

    Returns:
        The mean absolute error of the predictions for ``x_valid``.
    """
    model = RandomForestRegressor(max_leaf_nodes=impute_best_tree_size, random_state=0)
    model.fit(x_train, y_train)
    pre = model.predict(x_valid)
    return mean_absolute_error(y_valid, pre)

In [None]:
# Validation MAE for the frames whose missing values were imputed.
print(score_datasets(impute_x_train,impute_x_valid,y_train,y_valid))

In [None]:
# Columns of the full predictor frame (object columns included) that contain any missing value.
missing_all_col = a.columns[a.isnull().any()].tolist()
# Predictor frame without those columns.
reduced_a = a.drop(columns=missing_all_col)

In [None]:
# Re-split on the reduced frame (this rebinds y_train / y_valid). The fixed
# seed makes the split, and every score computed from it, reproducible.
a_train, a_valid, y_train, y_valid = train_test_split(
    reduced_a, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Inspect the dtypes and non-null counts of the new training split.
a_train.info()

In [None]:
# Dtype of every column in the reduced predictor frame.
reduced_a_dtypes = reduced_a.dtypes
# Object columns with fewer than 10 distinct values.
low_cardinality_col = [
    name for name, dtype in reduced_a_dtypes.items()
    if dtype == "object" and reduced_a[name].nunique() < 10
]
# Columns stored as int64 or float64.
numeric_col = [
    name for name, dtype in reduced_a_dtypes.items()
    if dtype in ['int64', 'float64']
]
my_col = low_cardinality_col + numeric_col
# Independent copies, so later column assignments do not touch a_train / a_valid.
label_a_train = a_train[my_col].copy()
label_a_valid = a_valid[my_col].copy()



In [None]:
# Inspect the selected columns before the object columns are encoded.
label_a_train.info()

In [None]:
# Boolean flag per column: True where the column has object dtype.
s = label_a_train.dtypes == 'object'
object_cols = [name for name, is_object in s.items() if is_object]

print("Categorical variables:")
print(object_cols)

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder object is reused and refitted for each column in turn.
label_encoder = LabelEncoder()
for categorical_col in low_cardinality_col:
    # Learn the label -> integer mapping from the training split only...
    label_encoder.fit(a_train[categorical_col])
    # ...and apply that same mapping to both splits.
    label_a_train[categorical_col] = label_encoder.transform(a_train[categorical_col])
    label_a_valid[categorical_col] = label_encoder.transform(a_valid[categorical_col])
    

In [None]:
 # Re-inspect the training frame after the encoding loop wrote to its object columns.
 # NOTE(review): this cell starts with a stray leading space; remove it if the
 # notebook is ever exported to a plain .py script.
 label_a_train.info()

In [None]:
# Candidate leaf limits to try on the label-encoded frames.
max_leaf_nodes = [5, 25, 50, 100, 250, 500, 700, 120, 1100]
# Lowest validation MAE seen so far; starts at infinity so the first candidate always wins.
result = float("inf")
# A separate loop variable keeps the candidate list above intact.
for leaf_nodes in max_leaf_nodes:
    my_mae = get_mae(leaf_nodes, label_a_train, label_a_valid, y_train, y_valid)
    if my_mae < result:
        encoder_best_tree_size = leaf_nodes
        result = my_mae
    print("Max leaf nodes: %d  \t\t Mean absulate Error:  %d" %(leaf_nodes, my_mae))
# Leaf limit with the lowest validation MAE on the encoded data.
print( encoder_best_tree_size)

In [None]:
# encoder_best_tree_size is the tuned max_leaf_nodes value, so it is passed as
# max_leaf_nodes (not as the number of trees).
model = RandomForestRegressor(max_leaf_nodes=encoder_best_tree_size, random_state=0)
model.fit(label_a_train, y_train)
pre = model.predict(label_a_valid)
# Validation MAE of the label-encoded approach.
b = mean_absolute_error(y_valid, pre)

In [None]:
# Show the validation MAE computed for the label-encoded frames.
print(b)

In [None]:
# Re-split on the full predictor frame (this rebinds y_train / y_valid). The
# fixed seed makes the split, and every score computed from it, reproducible.
b_train, b_valid, y_train, y_valid = train_test_split(
    a, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Dtype of every column in the full predictor frame.
a_dtypes = a.dtypes
# Object columns with fewer than 10 distinct values.
cardinality_col = [
    name for name, dtype in a_dtypes.items()
    if dtype == "object" and a[name].nunique() < 10
]
# Columns stored as int64 or float64.
numeric = [
    name for name, dtype in a_dtypes.items()
    if dtype in ['int64', 'float64']
]
my = cardinality_col + numeric
# Independent copies, so later column assignments do not touch b_train / b_valid.
label_b_train = b_train[my].copy()
label_b_valid = b_valid[my].copy()

In [None]:
# One encoder object is reused and refitted for each column in turn.
encoder = LabelEncoder()
for categorical_col in cardinality_col:
    # Learn the label -> integer mapping from the training split only...
    encoder.fit(b_train[categorical_col])
    # ...and apply that same mapping to both splits.
    label_b_train[categorical_col] = encoder.transform(b_train[categorical_col])
    label_b_valid[categorical_col] = encoder.transform(b_valid[categorical_col])

In [None]:
# Fit the imputer on the encoded training split only.
imputer = SimpleImputer()
impute_b_train = pd.DataFrame(imputer.fit_transform(label_b_train),
                              columns=label_b_train.columns)
# The validation frame must come from the validation split; transforming the
# training split here would yield a frame whose rows do not match y_valid.
impute_b_valid = pd.DataFrame(imputer.transform(label_b_valid),
                              columns=label_b_valid.columns)

In [None]:
# Candidate leaf limits to try on the encoded-and-imputed frames.
max_leaf_nodes = [50, 100, 250, 500, 1100]
# Lowest validation MAE seen so far; starts at infinity so the first candidate always wins.
result = float("inf")
# A separate loop variable keeps the candidate list above intact.
for leaf_nodes in max_leaf_nodes:
    my_mae = get_mae(leaf_nodes, impute_b_train, impute_b_valid, y_train, y_valid)
    if my_mae < result:
        encoder_best_tree = leaf_nodes
        result = my_mae
    print("Max leaf nodes: %d  \t\t Mean absulate Error:  %d" %(leaf_nodes, my_mae))
# Report the variable this loop actually assigns.
print(encoder_best_tree)

In [None]:
# encoder_best_tree is the tuned max_leaf_nodes value, so it is passed as
# max_leaf_nodes (not as the number of trees).
model = RandomForestRegressor(max_leaf_nodes=encoder_best_tree, random_state=0)
# Fit and score on the imputed frames: they are the ones the leaf limit was
# tuned on, and they contain no missing values.
model.fit(impute_b_train, y_train)
pre = model.predict(impute_b_valid)
# Validation MAE of the encoded-and-imputed approach.
b1 = mean_absolute_error(y_valid, pre)
print(b1)