## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Importing Data

In [None]:
# Read the labelled training set and the test set from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('training_obl.csv', 'test_obl.csv'))

## Exploring the Data

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Build an automated profiling report for the training frame.
pandas_profiling.ProfileReport(train)

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Column dtypes and non-null counts.
train.info()

<i>All columns hold integer values (see the `train.info()` output above).</i>

In [None]:
# List the column names of the training frame.
train.columns

In [None]:
%matplotlib inline
sns.distplot(train.Price)
print(train.Price.skew())
print(train.Price.kurt())

In [None]:
# Rows where roof area plus lawn area exceeds the total area.
area_left_over = train['Area(total)'] - train['Roof(Area)'] - train['Lawn(Area)']
train[area_left_over < 0]

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Summary statistics of the roof-area column.
train['Roof(Area)'].describe()

In [None]:
from scipy import stats
from scipy.stats import norm, skew #for some statistics


def plot_normality_diagnostics(values, title, suptitle):
    """Draw a histogram with a fitted normal curve beside a normal probability plot.

    Parameters
    ----------
    values : array-like
        Sample to inspect (a DataFrame column in this notebook).
    title : str
        Title of the left-hand (histogram) panel.
    suptitle : str
        Title placed above both panels.

    Returns
    -------
    tuple
        ``(mu, sigma)`` as estimated by ``scipy.stats.norm.fit``.
    """
    plt.figure(figsize=(15,5))
    plt.subplot(1,2,1)
    sns.distplot(values, fit=norm)
    mu, sigma = norm.fit(values)
    print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
    # Raw string: '\m' and '\s' are not valid Python escape sequences, so the
    # mathtext backslashes must not go through escape processing.
    plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
               loc='best')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.subplot(1,2,2)
    stats.probplot(values, plot=plt)
    plt.suptitle(suptitle)
    return mu, sigma


# Plot histogram and probability
mu, sigma = plot_normality_diagnostics(train['Price'], 'Price distribution',
                                       'Before transformation')

# Apply transformation
train['SalePrice'] = np.log1p(train.Price )

# Plot histogram and probability after transformation
mu, sigma = plot_normality_diagnostics(train['SalePrice'], 'SalePrice distribution',
                                       'After transformation')

In [None]:
# Column names after the log-transformed 'SalePrice' column was added.
train.columns

In [None]:
# Discard the log-transformed copy of the target; the models below are fitted on the raw Price.
# errors='ignore' makes the cell safe to re-run once the column is already gone.
train=train.drop(columns=['SalePrice'], errors='ignore')

In [None]:
# Every column name except the last one.
train.columns[:-1]

In [None]:
# Scatter every non-target column against Price, `figures_per_time` panels per figure.
# The target is excluded by name rather than by position, and the builtin `vars` is no
# longer shadowed.
feature_names = train.columns.drop('Price')
figures_per_time = 3
y = train.Price
for count, var in enumerate(feature_names):
    # Panels sharing the same figure number land in the same row of subplots.
    plt.figure(count // figures_per_time, figsize=(25,5))
    plt.subplot(1, figures_per_time, count % figures_per_time + 1)
    plt.scatter(train[var], y);
    plt.title('f model: T= {}'.format(var))
    plt.xlabel(var)
    plt.ylabel('Price')

## Training

### Error metrics

In [None]:
from sklearn.metrics import mean_squared_error
def rmse(y_actual,y_predicted):
    """Return the root-mean-squared error between actual and predicted values.

    The mean squared error is computed by ``mean_squared_error`` and its
    square root is returned.
    """
    squared_error = mean_squared_error(y_actual, y_predicted)
    return np.sqrt(squared_error)

### Train Test Split

In [None]:
# Column names with the first and the last one left out.
train.columns[1:-1]

In [None]:
# Remove the identifier column before modelling.
# errors='ignore' keeps the cell re-runnable after the column has been dropped.
train=train.drop(columns=['id'], errors='ignore')

In [None]:
# Separate the target from the features, then hold out 15% of the rows for evaluation.
target_column = 'Price'
y = train[target_column]
train = train.drop(columns=[target_column])
x = train
split_parts = train_test_split(x, y, test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Light GBM

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# Hyper-parameters are collected in one mapping so they can be inspected or tuned in one place.
lgbm_params = dict(
    objective='regression',
    num_leaves=4,              # was 3
    learning_rate=0.01,
    n_estimators=8000,
    max_bin=100,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
lightgbm = LGBMRegressor(**lgbm_params)

In [None]:
# Fit the LightGBM regressor on the training split.
lightgbm.fit(X_train,y_train)

In [None]:
# Predict prices for the hold-out split.
predictions=lightgbm.predict(X_test)

In [None]:
# Root-mean-squared error on the hold-out split.
print("Error=",rmse(y_test,predictions))

In [None]:
# Importance scores of the fitted model, one per feature column.
lightgbm.feature_importances_

In [None]:
# Feature names, in the same order as the importance scores above.
X_test.columns

In [None]:
# Bar chart of the LightGBM importances, titled and labelled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=X_test.columns, y=lightgbm.feature_importances_, ax=ax)
ax.set(title='LightGBM feature importances', xlabel='Feature', ylabel='Importance');

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# random_state pins the bootstrap/feature sampling so the reported error is reproducible
# (same seed value as the train/test split).
model = RandomForestRegressor(max_depth=10,n_estimators=1000,random_state=42)

In [None]:
# Fit the random forest on the training split.
model.fit(X_train,y_train)

In [None]:
# Predict prices for the hold-out split.
predictions=model.predict(X_test)

In [None]:
# Root-mean-squared error on the hold-out split.
print("Error=",rmse(y_test,predictions))

In [None]:
%matplotlib inline
plt.figure(figsize=(10,10))
sns.barplot(x=X_test.columns,y=model.feature_importances_)

### Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# random_state fixes the estimator's internal randomness so repeated runs give the same error.
GBR = GradientBoostingRegressor(n_estimators=100, max_depth=4, random_state=42)

In [None]:
# Fit the gradient-boosting regressor on the training split.
GBR.fit(X_train,y_train)

In [None]:
# Predict prices for the hold-out split.
predictions=GBR.predict(X_test)

In [None]:
# Root-mean-squared error on the hold-out split.
print("Error=",rmse(y_test,predictions))

In [None]:
# Bar chart of the gradient-boosting importances, titled and labelled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=X_test.columns, y=GBR.feature_importances_, ax=ax)
ax.set(title='Gradient boosting feature importances', xlabel='Feature', ylabel='Importance');

### XGBoost

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost hyper-parameters; colsample_bytree, gamma and min_child_weight are not set here
# and therefore stay at the library defaults.
xgb_params = dict(
    learning_rate=0.01,
    max_depth=4,
    n_estimators=2000,
    reg_alpha=0.75,
    reg_lambda=0.25,
    subsample=1,
    seed=42,
)
model = XGBRegressor(**xgb_params)

In [None]:
model.fit(X_train.as_matrix(),y_train.as_matrix())

In [None]:
predictions=model.predict(X_test.as_matrix())

In [None]:
# Root-mean-squared error of the XGBoost predictions on the hold-out split.
print("Error=",rmse(y_test,predictions))

In [None]:
# Bar chart of the XGBoost importances, titled and labelled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=X_test.columns, y=model.feature_importances_, ax=ax)
ax.set(title='XGBoost feature importances', xlabel='Feature', ylabel='Importance');

In [None]:
import xgboost
#from matplotlib.pylab import rcParams

In [None]:
#rcParams['figure.figsize']=10,10
# Render a tree of the fitted XGBoost model as a graphviz diagram.
xgboost.to_graphviz(model)

In [None]:
# Summary statistics of the training features (id and Price were dropped earlier).
train.describe()

In [None]:
# Summary statistics of the test frame.
test.describe()

In [None]:
# Empty accumulator for the sweep predictions and the grid of area values 200..400 (inclusive).
pred = list()
Area = list(range(200, 401))

In [None]:
X_test.as_matrix()

In [None]:
X_test.as_matrix()[0]

In [None]:
from copy import deepcopy

In [None]:
# Vary feature column 0 over `Area` while every other feature stays at the first hold-out row.
# NOTE(review): assumes column 0 of X_test is the total area — confirm against X_test.columns.
baseline_row = X_test.values[0]  # .values replaces as_matrix(), removed in pandas 1.0
pred = []  # reset here so re-running this cell never appends to stale results
for area_value in Area:
    probe = baseline_row.copy()  # leave the baseline row (and the target `y`) untouched
    probe[0] = area_value
    pred.append(model.predict(probe.reshape(1, -1)))

pred

In [None]:
# Predicted price for each swept area value.
fig, ax = plt.subplots()
ax.set_title('Model values on increasing areas')
ax.set_xlabel('Area')
ax.set_ylabel('Price')
ax.scatter(x=Area, y=pred)

In [None]:
# Empty accumulator for the sweep predictions and the grid of bedroom counts 2..7 (inclusive).
pred = list()
bed = list(range(2, 8))

In [None]:
# Vary feature column 1 over `bed` while every other feature stays at the first hold-out row.
# NOTE(review): assumes column 1 of X_test is the bedroom count — confirm against X_test.columns.
baseline_row = X_test.values[0]  # .values replaces as_matrix(), removed in pandas 1.0
pred = []  # reset here so re-running this cell never appends to stale results
for bedroom_count in bed:
    probe = baseline_row.copy()  # leave the baseline row (and the target `y`) untouched
    probe[1] = bedroom_count
    pred.append(model.predict(probe.reshape(1, -1)))

pred

In [None]:
# Predicted price for each swept bedroom count.
fig, ax = plt.subplots()
ax.set_title('Model values on increasing bedroom')
ax.set_xlabel('Bedroom')
ax.set_ylabel('Price')
ax.scatter(x=bed, y=pred)

In [None]:
# Summary statistics of the hold-out features.
X_test.describe()

In [None]:
# Vary feature column 2 over 1..7 while every other feature stays at the first hold-out row.
# NOTE(review): assumes column 2 of X_test is the washroom count — confirm against X_test.columns.
washroom = list(range(1, 8))
baseline_row = X_test.values[0]  # .values replaces as_matrix(), removed in pandas 1.0
pred = []
for washroom_count in washroom:
    probe = baseline_row.copy()  # leave the baseline row (and the target `y`) untouched
    probe[2] = washroom_count
    pred.append(model.predict(probe.reshape(1, -1)))
plt.title('Model values on increasing washroom')
plt.xlabel('Washroom')
plt.ylabel('Price')
plt.scatter(x=washroom,y=pred)

In [None]:
# Vary feature column 3 over 50..150 while every other feature stays at the first hold-out row.
# NOTE(review): assumes column 3 of X_test is the roof feature plotted here — confirm against X_test.columns.
roof = list(range(50, 151))
baseline_row = X_test.values[0]  # .values replaces as_matrix(), removed in pandas 1.0
pred = []
for roof_value in roof:
    probe = baseline_row.copy()  # leave the baseline row (and the target `y`) untouched
    probe[3] = roof_value
    pred.append(model.predict(probe.reshape(1, -1)))
plt.title('Model values on increasing roof area')
plt.xlabel('roof')
plt.ylabel('Price')
plt.scatter(x=roof,y=pred)

In [None]:
# Vary feature column 4 over 51..100 while every other feature stays at the first hold-out row.
# NOTE(review): assumes column 4 of X_test is the lawn area — confirm against X_test.columns.
lawn = list(range(51, 101))
baseline_row = X_test.values[0]  # .values replaces as_matrix(), removed in pandas 1.0
pred = []
for lawn_value in lawn:
    probe = baseline_row.copy()  # leave the baseline row (and the target `y`) untouched
    probe[4] = lawn_value
    pred.append(model.predict(probe.reshape(1, -1)))
plt.title('Model values on increasing lawn area')
plt.xlabel('lawn')
plt.ylabel('Price')
plt.scatter(x=lawn,y=pred)

In [None]:
# Vary feature column 5 over 1..5 while every other feature stays at the first hold-out row.
# NOTE(review): assumes column 5 of X_test is the floor count — confirm against X_test.columns.
floors = list(range(1, 6))
baseline_row = X_test.values[0]  # .values replaces as_matrix(), removed in pandas 1.0
pred = []
for floor_count in floors:
    probe = baseline_row.copy()  # leave the baseline row (and the target `y`) untouched
    probe[5] = floor_count
    pred.append(model.predict(probe.reshape(1, -1)))
plt.title('Model values on increasing floors')
plt.xlabel('Floors')
plt.ylabel('Price')
plt.scatter(x=floors,y=pred)

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# random_state fixes the resampling performed during boosting so the reported error is
# reproducible (same seed value as the train/test split).
model=AdaBoostRegressor(learning_rate=0.01,
                 n_estimators=1000,
                 loss='linear',
                 random_state=42)

In [None]:
# Fit the AdaBoost regressor on the training split.
model.fit(X_train,y_train)

In [None]:
# Predict prices for the hold-out split.
predictions=model.predict(X_test)

In [None]:
# Root-mean-squared error on the hold-out split.
print("Error=",rmse(y_test,predictions))

In [None]:
# Bar chart of the AdaBoost importances, titled and labelled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=X_test.columns, y=model.feature_importances_, ax=ax)
ax.set(title='AdaBoost feature importances', xlabel='Feature', ylabel='Importance');

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# random_state fixes the estimator's internal randomness so repeated runs give the same tree.
# NOTE(review): max_features=9 presumes X_train has at least nine feature columns — confirm.
model=DecisionTreeRegressor(max_depth=5,max_features=9,random_state=42)

In [None]:
# Fit the decision tree on the training split.
model.fit(X_train,y_train)

In [None]:
# Predict prices for the hold-out split.
predictions=model.predict(X_test)

In [None]:
# Root-mean-squared error on the hold-out split.
print("Error=",rmse(y_test,predictions))

In [None]:
# Bar chart of the decision-tree importances, titled and labelled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=X_test.columns, y=model.feature_importances_, ax=ax)
ax.set(title='Decision tree feature importances', xlabel='Feature', ylabel='Importance');

### Linear Regression

In [None]:
from sklearn import linear_model

In [None]:
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2, so passing it
# raises TypeError on current releases. For an unpenalised least-squares fit it only rescaled
# the regressors internally, so dropping it leaves the predictions unchanged up to rounding.
model = linear_model.LinearRegression()

In [None]:
# Fit the linear regression on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict prices for the hold-out split.
predictions=model.predict(X_test)

In [None]:
# Root-mean-squared error on the hold-out split.
print("Error=",rmse(y_test,predictions))

## Test Data

In [None]:
# Peek at the sample file (presumably the expected submission layout — confirm).
pd.read_csv('sample.csv').head()

In [None]:
# Column names of the training features.
train.columns

In [None]:
# Column names of the test frame, for comparison with the training features.
test.columns

In [None]:
# Switch two test column names to their plural spelling
# (presumably to match the training frame's names — confirm).
plural_names = {"Nbedroom": "Nbedrooms", "Nwashroom": "Nwashrooms"}
test = test.rename(columns=plural_names)

In [None]:
# Check the renamed columns on the first rows of the test frame.
test.head()

In [None]:
# Use every column except the first one (assumed to be the id — confirm) as model input.
test_data = test.iloc[:, 1:]

In [None]:
# Predict prices for the test rows with whatever estimator `model` currently names.
# NOTE(review): on a top-to-bottom run `model` is the LinearRegression fitted most recently
# (the name was rebound several times after the XGBoost section), yet the predictions are
# later written to 'xgb_est.csv' — confirm which estimator is meant for the submission.
ans=model.predict(test_data)

In [None]:
# Pair every test id with its predicted price.
df=pd.DataFrame({"id":test.id,"Price":ans})

In [None]:
# Write the predictions to CSV without the index column.
df.to_csv('xgb_est.csv',index=False)

In [None]:
# Display the frame that was written.
df