In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

from lightgbm import LGBMRegressor



from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.preprocessing import LabelEncoder,RobustScaler
from sklearn.model_selection import KFold, cross_val_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the raw train/test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Set the row identifiers aside and remove them from the feature tables;
# `pop` returns the column and deletes it from the frame in one step.
train_id = train.pop('Id')
test_id = test.pop('Id')

# Names of the numeric columns of the training table (this list still
# contains the 'SalePrice' target). Selecting by numeric dtype instead of
# "dtype != object" keeps text columns out even when pandas stores them with a
# dedicated string dtype rather than `object`.
num_c = train.select_dtypes(include='number').columns.tolist()

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [3]:
# Missing-value handling for the numeric columns.
# In the columns listed here a missing value is replaced with 0.
zero_fill_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BsmtUnfSF', 'TotalBsmtSF',
]
for frame in (train, test):
    for col in zero_fill_cols:
        frame[col] = frame[col].fillna(0)

# LotFrontage gaps in both tables are filled with the median of the training
# column. The median is taken once: filling gaps with the median itself does
# not move it, so the value used for `test` is the same either way.
lot_frontage_median = train['LotFrontage'].median()
train['LotFrontage'] = train['LotFrontage'].fillna(lot_frontage_median)
test['LotFrontage'] = test['LotFrontage'].fillna(lot_frontage_median)

In [4]:
# Chained-assignment warnings stay disabled because other cells assign columns
# into frames that were selected from `train` / `test`.
pd.options.mode.chained_assignment = None

# Training table: the numeric columns plus a 0/1 flag that is 1 where
# CentralAir == 'Y' and 0 otherwise (missing values therefore map to 0).
# `assign` returns a new frame, so the flag is never written into a selection
# of `train` and this step does not depend on the warning being silenced.
newTrain = train[num_c].assign(
    CentralAir=(train['CentralAir'] == 'Y').astype('int64')
)

In [5]:
# Log-transform the target (log1p) to reduce its skewness.
# The transform reads the raw prices still held in `train` rather than the
# column being overwritten, so re-running this cell does not apply the log twice.
newTrain['SalePrice'] = np.log1p(train['SalePrice'])

In [6]:
# Test-set feature table: the numeric columns without the target, followed by
# the same 0/1 CentralAir flag (1 where the value is 'Y', otherwise 0).
# `assign` builds a new frame instead of writing a column into a selection of
# `test`, so no chained-assignment warning can be triggered here.
feature_cols = [c for c in num_c if c != 'SalePrice']
X_test = test[feature_cols].assign(
    CentralAir=(test['CentralAir'] == 'Y').astype('int64')
)

In [7]:
# Feature matrix: every column of `newTrain` except the target, as a NumPy array.
X_train = newTrain.drop(columns=['SalePrice']).to_numpy()

# Log-scale target as a 1-D NumPy array (`to_numpy` replaces the deprecated
# `Series.ravel`).
log_y_train = newTrain['SalePrice'].to_numpy()

# Ordinary least-squares baseline. The `normalize` argument is no longer
# passed: it was deprecated in scikit-learn 1.0 and removed in 1.2, where it
# raises a TypeError. For an unregularised least-squares fit, rescaling the
# features does not change the fitted predictions.
model = LinearRegression()

In [8]:
# Fit the linear model on the training features against the log-scale target.
model.fit(X=X_train, y=log_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [9]:
# In-sample predictions of the linear model (still on the log scale).
log_y_pred = model.predict(X=X_train)

In [10]:
# Test-set predictions of the linear model (log scale).
# The model was fitted on a plain array, so the test features are passed as an
# array too; this avoids the feature-name mismatch warning that scikit-learn
# emits when predicting on a DataFrame after fitting without column names.
log_y_test = model.predict(np.asarray(X_test))

In [11]:
# Undo the log1p transform so that predictions and target are back on the
# original price scale.
y_pred, y_test, y_train = (
    np.expm1(log_values)
    for log_values in (log_y_pred, log_y_test, log_y_train)
)


In [12]:
# Error metrics of the linear model, computed on the training rows and on the
# original price scale.
linear_mae = mean_absolute_error(y_train, y_pred)
linear_mse = mean_squared_error(y_train, y_pred)
linear_rmse = np.sqrt(linear_mse)
linear_r2 = r2_score(y_train, y_pred)

print("Mean Absolute Error: %.2f" % linear_mae)
print("Mean Squared Error: %.2f" % linear_mse)
print("Root Mean Squared Error: %.2f" % linear_rmse)
print("Coefficient of Determination: %.2f" % linear_r2)

Mean Absolute Error: 17691.27
Mean Squared Error: 1444591820.41
Root Mean Squared Error: 38007.79
Coefficient of Determination: 0.77


In [13]:
# Gradient-boosted tree regressor trained on the same features and log-scale
# target as the linear model.
lightgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
lgb_model = lightgbm.fit(X_train, log_y_train)

# Predictions on the log scale. The model was fitted on a plain array, so the
# test features are converted to an array as well to keep the input type used
# for prediction consistent with the one used for fitting.
log_prediction_train = lgb_model.predict(X_train)
log_prediction_test = lgb_model.predict(np.asarray(X_test))

# Back to the original price scale.
prediction_train = np.expm1(log_prediction_train)
prediction_test = np.expm1(log_prediction_test)

# Error metrics on the training rows (original price scale); the MSE is
# computed once and reused for the RMSE.
lgb_mse = mean_squared_error(y_train, prediction_train)
print("Mean Absolute Error: %.2f" % mean_absolute_error(y_train, prediction_train))
print("Mean Squared Error: %.2f" % lgb_mse)
print("Root Mean Squared Error: %.2f" % np.sqrt(lgb_mse))
print("Coefficient of Determination: %.2f" % r2_score(y_train, prediction_train))

Mean Absolute Error: 11214.00
Mean Squared Error: 304660974.13
Root Mean Squared Error: 17454.54
Coefficient of Determination: 0.95


In [14]:

# Submission table: one row per test Id with the LightGBM price prediction,
# written to submission.csv without the index column.
subDf = pd.DataFrame({
    'Id': test_id.values,
    'SalePrice': prediction_test,
})
subDf.to_csv("submission.csv", index=False)