In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import numpy as np

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

#import lightgbm as lgb
import xgboost as xgb
from pandas import DataFrame
import math
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


In [2]:
#cutdown_rows = 1000
cutdown_rows = 0

LABEL = 'Price'

floats = ['location.latitude', 'location.longitude', 'bedrooms', 'bathrooms',
          'nearestStation', 'nearestTram', 'nearestUnderground', 'nearestOverground',
          ]

categories = ['tenure.tenureType',
              'analyticsProperty.soldSTC',
              'analyticsProperty.preOwned',
              #'sharedOwnership.sharedOwnership',
              #
              'analyticsProperty.propertyType',  # 'propertyType',
              #'analyticsProperty.propertySubType',
              'borough',
              ]
custom = [
    'sharedOwnership.sharedOwnership',
    'analyticsProperty.priceQualifier',
    'keyFeatures'
]
#categories = []

features = floats.copy()
features.extend(categories)
features.extend(custom)
features.insert(0, LABEL)
features

['Price',
 'location.latitude',
 'location.longitude',
 'bedrooms',
 'bathrooms',
 'nearestStation',
 'nearestTram',
 'nearestUnderground',
 'nearestOverground',
 'tenure.tenureType',
 'analyticsProperty.soldSTC',
 'analyticsProperty.preOwned',
 'analyticsProperty.propertyType',
 'borough',
 'sharedOwnership.sharedOwnership',
 'analyticsProperty.priceQualifier',
 'keyFeatures']

In [3]:
def get_source_dataframe(rows=cutdown_rows):
    try:
        df = pd.read_csv('../data/source/df_listings.csv', on_bad_lines='skip', index_col=0)
    except:
        df = pd.read_csv('https://raw.githubusercontent.com/jayportfolio/capstone_streamlit/main/data/final/df_listings.csv', on_bad_lines='skip', index_col=0)

    df = df[features]

    if rows and rows > 0:
        df = df[:rows]
    return df


def create_train_test_data(df_orig, return_index=False, drop_nulls=True):
    df = df_orig.copy()

    if drop_nulls:
        df.dropna(inplace=True)

    if return_index:
        df.reset_index(inplace=True)

    for column in categories:
        df = pd.concat([df, pd.get_dummies(df[column], prefix=column)], axis=1)
        df.drop([column], axis=1, inplace=True)  # now drop the original column (you don't need it anymore),

    ins = df.pop('index')
    df.insert(1, 'index2', ins)
    df.insert(0, 'index', ins)

    #features = df[df.columns[1:]].values
    features = df[df.columns[2:]].values
    #labels = df[LABEL].values
    labels = df.iloc[:, 0:2].values

    if not return_index:
        return train_test_split(features, labels, train_size=0.9, random_state=RANDOM_STATE)
    else:
        X_train1, X_test1, y_train1, y_test1 = train_test_split(features, labels, train_size=0.9, random_state=RANDOM_STATE)
        X_train_index = X_train1[:, 0].reshape(-1, 1)
        y_train_index = y_train1[:, 0].reshape(-1, 1)
        X_test_index = X_test1[:, 0].reshape(-1, 1)
        y_test_index = y_test1[:, 0].reshape(-1, 1)
        #X_train1 = X_train1[:,3:]
        X_train1 = X_train1[:, 1:]
        y_train1 = y_train1[:, 1].reshape(-1, 1)
        #X_test1 = X_test1[:,3:]
        X_test1 = X_test1[:, 1:]
        y_test1 = y_test1[:, 1].reshape(-1, 1)

        # X_train_index = pd.to_numeric(X_train_index, 'coerce').astype(int)
        # y_train_index = pd.to_numeric(y_train_index, 'coerce').astype(int)
        # X_test_index = pd.to_numeric(X_test_index, 'coerce').astype(int)
        # y_test_index = pd.to_numeric(y_test_index, 'coerce').astype(int)

        return X_train1, X_test1, y_train1, y_test1, X_train_index, X_test_index, y_train_index, y_test_index


X_train, X_test, y_train, y_test, X_train_index, X_test_index, y_train_index, y_test_index = create_train_test_data(
    get_source_dataframe(), return_index=True, drop_nulls=False)
X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_train_index.shape, X_test_index.shape, y_train_index.shape, y_test_index.shape,
X_train[0]

NameError: name 'RANDOM_STATE' is not defined

In [None]:
df = get_source_dataframe()
df_orig = df.copy()
print(df.shape)
df[:5]

In [None]:
if 'nearestTram' in features:
    df['nearestTram'] = df['nearestTram'].fillna(99)
    df['nearestOverground'] = df['nearestOverground'].fillna(99)
    df['nearestUnderground'] = df['nearestUnderground'].fillna(99)
    df['nearestStation'] = df['nearestStation'].fillna(99)

    imputer = SimpleImputer(strategy='constant', fill_value=99)
    imputer.fit(df['nearestTram'].values.reshape(-1, 1))
    df['nearestTram'] = imputer.transform(df['nearestTram'].values.reshape(-1, 1))
    df['nearestUnderground'] = imputer.transform(df['nearestUnderground'].values.reshape(-1, 1))
    df['nearestOverground'] = imputer.transform(df['nearestOverground'].values.reshape(-1, 1))
    df['nearestStation'] = imputer.transform(df['nearestStation'].values.reshape(-1, 1))

if 'keyFeatures' in features:
    df['keyFeatures'] = df['keyFeatures'].str.lower()

if 'analyticsProperty.priceQualifier' in features:

    if 'keyFeatures' in features:

        # df[df['keyFeatures'].str.contains('shared ownership')]

        df['sharedOwnership'] = (
                (df['sharedOwnership.sharedOwnership'] == True) |
                (df['analyticsProperty.priceQualifier'] == 'Shared ownership') |
                (df['keyFeatures'].str.contains('shared ownership'))
        )

        df.drop(['keyFeatures'], axis=1, inplace=True)
    else:
        df['sharedOwnership'] = (
                (df['sharedOwnership.sharedOwnership'] == True) |
                (df['analyticsProperty.priceQualifier'] == 'Shared ownership')
        )

    df['sharedOwnership'] = pd.to_numeric(df['sharedOwnership'], 'coerce').dropna().astype(int)
    df.drop(['sharedOwnership.sharedOwnership'], axis=1, inplace=True)

    if 'analyticsProperty.priceQualifier' not in categories:
        df.drop(['analyticsProperty.priceQualifier'], axis=1, inplace=True)

    #df.drop(['shared_ownership'], axis=1, inplace=True)

df

In [None]:
#df[df['keyFeatures'].str.contains('shared ownership')]
#df['keyFeatures'] = df['keyFeatures'].apply(lambda x: x.astype(str).str.upper())
#df

In [None]:

df

In [None]:
df.info()
df.describe()

In [None]:
X_train, X_test, y_train, y_test, X_train_index, X_test_index, y_train_index, y_test_index = create_train_test_data(df,
                                                                                                                    return_index=True,
                                                                                                                    drop_nulls=True)
#X_train[:5]


print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_train_index.shape, X_test_index.shape,
      y_train_index.shape, y_test_index.shape)
#X_train_index
print(type(X_train))
X_train[0]

In [None]:
#imputer = SimpleImputer(strategy='mean')
#imputer.fit(X_train[6])
#X_train[6] = imputer.transform(X_train[6])

In [None]:
md = [6, 10, 50]
lr = [0.01, 0.02, 0.05]
ne = [100, 500, 1000]
cb = [0.7, 1.0]
params = { 'max_depth': md,  #Maximum tree depth for base learners
           'learning_rate': lr,  #Boosting learning rate
           'n_estimators': ne,  #Number of gradient boosted trees. Equivalent to number of boosting rounds
           'colsample_bytree': cb} #Subsample ratio of columns when constructing each tree

params = { 'max_depth': [6], #Maximum tree depth for base learners
           'learning_rate': [0.01], #Boosting learning rate
           'n_estimators': [100], #Number of gradient boosted trees. Equivalent to number of boosting rounds
           'colsample_bytree': [0.7]} #Subsample ratio of columns when constructing each tree

# params = { 'max_depth': [50], #Maximum tree depth for base learners
#            'learning_rate': [0.05], #Boosting learning rate
#            'n_estimators': [500], #Number of gradient boosted trees. Equivalent to number of boosting rounds
#            'colsample_bytree': [0.7]} #Subsample ratio of columns when constructing each tree

# params = { 'max_depth': md.index(0), #Maximum tree depth for base learners
#            'learning_rate': lr.index(0), #Boosting learning rate
#            'n_estimators': ne.index(0), #Number of gradient boosted trees. Equivalent to number of boosting rounds
#            'colsample_bytree': cb.index(0)} #Subsample ratio of columns when constructing each tree

xgb_reg = xgb.XGBRegressor(seed=20)

xgb_grid = GridSearchCV(estimator=xgb_reg, param_grid=params, cv=2)
#xgb_grid = RandomizedSearchCV(estimator=xgb_reg, param_grid=params, cv=2)

xgb_grid.fit(X_train,y_train)

model = xgb_grid

In [None]:
result = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

result = xgb_grid.predict(X_test)
R2 = r2_score(y_test,result)
MSE = mean_squared_error(y_test,result)
RMSE = math.sqrt(MSE)
print('-'*10+'XGB'+'-'*10)
print('R square Accuracy: ',R2)
print('Mean Squared Error Accuracy: ',MSE)
print('Root Mean Squared Error: ',RMSE)

In [None]:
print(result)

In [None]:
result = result.reshape((-1, 1))

print(y_test_index.reshape((-1, 1)).shape)
print(result.reshape((-1, 1)).shape)
print(y_test.shape)

print(y_test_index.shape)
print(result.shape)
print(y_test.shape)

In [None]:
compare = np.hstack((y_test_index, y_test, result))
#compare[0:4]

In [None]:
compare_df = DataFrame(compare, columns=['reference', 'actual', 'predicted'])
compare_df['difference'] = abs(compare_df['actual'] - compare_df['predicted'])
compare_df['diff 1 %'] = abs((compare_df['actual'] - compare_df['predicted']) / compare_df['actual'] * 100)
compare_df['diff 2 %'] = abs((compare_df['actual'] - compare_df['predicted']) / compare_df['predicted']) * 100
compare_df['reference'] = compare_df['reference'].astype(int)
compare_df.set_index('reference', inplace=True)
compare_df

In [None]:
compare_df.join(df_orig)
# 85514838
# 115470422

In [None]:
model.score(X_test, y_test)