In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt

In [None]:
# Loading data: parse the training CSV once and derive every array from the
# same frame, instead of re-reading the file for each variable.
FEATURE_COLUMNS = ['built_year', 'num_beds', 'num_baths', 'lat', 'lng', 'size_sqft',
                   'tenure_group', 'subzone_per_price_encoded',
                   'property_type_ordinal',
                   # mrt
                   'dist_to_nearest_important_mrt_rounded',
                   # schools
                   'number_of_nearby_primary_schools',
                   'number_of_nearby_secondary_schools',
                   # shopping mall
                   'number_of_nearby_shopping_malls',
                   # CR
                   # 'name_of_nearest_IHL_ordinal',
                   'name_of_nearest_BN_ordinal',
                   'name_of_nearest_CR_ordinal']

df_final = pd.read_csv('train_final_complete.csv')

# Feature matrix, one column per entry of FEATURE_COLUMNS (same order).
X = df_final[FEATURE_COLUMNS].to_numpy()

# Targets are kept as (n_samples, 1) column arrays, exactly as before.
Y_per_price = df_final[['per_price']].to_numpy()
Y_price = df_final[['price']].to_numpy()

In [None]:
def gen_seg(minthre,maxthre,gt):
    """Return the index labels of the rows of ``gt`` in one price band.

    A row is selected when its ``per_price`` value is strictly greater than
    ``minthre`` and less than or equal to ``maxthre``.

    Parameters
    ----------
    minthre : exclusive lower bound on ``per_price``.
    maxthre : inclusive upper bound on ``per_price``.
    gt : DataFrame exposing a ``per_price`` column.

    Returns
    -------
    pandas.Index holding the labels of the selected rows.
    """
    prices = gt.per_price
    in_band = (prices > minthre) & (prices <= maxthre)
    return gt[in_band].index

def segment():
    """Fit a default SVR on the training set and plot its RMSE per price band.

    The features in ``X`` are standardised, an ``SVR`` with default
    hyper-parameters is fitted against ``Y_per_price``, and the rows of
    ``df_final`` are split into six ``per_price`` bands. For every band the
    root-mean-squared error between the true and predicted ``per_price`` is
    computed and the six values are drawn as a line plot.

    Note that the error is measured on the same rows the model was fitted
    on, so it is an in-sample error. A band that contains no rows is plotted
    as NaN (a gap in the line) instead of raising.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    thres = [[0, 600], [600, 1200], [1200, 1800], [1800, 2400], [2400, 3000], [3000, df_final.per_price.max() + 100]]
    rmses = []

    ss_x = StandardScaler()
    train_x = ss_x.fit_transform(X)

    svr = SVR()
    # Y_per_price is an (n, 1) column array; flatten it so the estimator gets
    # the 1-D target it expects and does not emit a DataConversionWarning.
    svr.fit(train_x, np.ravel(Y_per_price))

    for thre in thres:
        idxs = gen_seg(thre[0], thre[1], df_final)
        if len(idxs) == 0:
            # Nothing falls in this band: predict() and mean_squared_error()
            # both reject empty input, so record a gap instead.
            rmses.append(np.nan)
            continue
        # gen_seg returns index labels; they are used positionally here, which
        # relies on df_final keeping the default 0..n-1 index from read_csv.
        gt = df_final.iloc[idxs].per_price
        pred = svr.predict(train_x[idxs])
        # The y-axis is labelled RMSE, so plot the root of the MSE.
        rmses.append(np.sqrt(mean_squared_error(gt, pred)))

    labels = ['segment%d' % (i + 1) for i in range(len(thres))]
    plt.plot(labels, rmses, marker='o')
    plt.xlabel('Segment')  # x-axis label
    plt.ylabel('RMSE')  # y-axis label
    plt.show()

In [None]:
def svr_test_per_price():
    """Tune an SVR on ``per_price`` and write price predictions for the test set.

    Steps:
      1. Standardise the training features ``X``.
      2. Grid-search an ``SVR`` over kernel and gamma with 5-fold CV, scored
         by negated mean squared error, against ``Y_per_price``.
      3. Load ``test_final_complete_cleaned.csv``, apply the same scaler and
         predict ``per_price`` with the refitted best estimator.
      4. Multiply each prediction by the row's ``size_sqft`` and write the
         result to ``folder/out_pp.csv``.

    The chosen parameters and the resulting frame are printed.

    Returns
    -------
    None.
    """
    import os  # kept function-local, as in the original cell

    feature_columns = ['built_year', 'num_beds', 'num_baths', 'lat', 'lng', 'size_sqft',
                       'tenure_group', 'subzone_per_price_encoded',
                       'property_type_ordinal',
                       # mrt
                       'dist_to_nearest_important_mrt_rounded',
                       # schools
                       'number_of_nearby_primary_schools',
                       'number_of_nearby_secondary_schools',
                       # shopping mall
                       'number_of_nearby_shopping_malls',
                       # CR
                       # 'name_of_nearest_IHL_ordinal',
                       'name_of_nearest_BN_ordinal',
                       'name_of_nearest_CR_ordinal']

    ss_x = StandardScaler()
    train_x = ss_x.fit_transform(X)

    scorer = make_scorer(mean_squared_error, greater_is_better=False)
    # gamma 1/15 is 1 / n_features for the 15 columns listed above. Only C=1
    # is searched; larger values (10 ... 10000) were left out of the grid.
    parameters = [{'kernel': ['rbf', 'poly', 'sigmoid'], 'gamma': [1e-3, 0.01, 1/15, ], 'C': [1]}]
    print("Tuning hyper-parameters")
    search = GridSearchCV(SVR(), parameters, cv=5, scoring=scorer)
    # Y_per_price is an (n, 1) column array; flatten it so the estimator gets
    # the 1-D target it expects and does not emit a DataConversionWarning.
    search.fit(train_x, np.ravel(Y_per_price))
    print(search.best_params_)

    test_df = pd.read_csv('test_final_complete_cleaned.csv')
    # transform() returns a new array, so no defensive copy is needed.
    test_X = ss_x.transform(test_df[feature_columns].to_numpy())

    predict_Y = search.predict(test_X)
    # Select size_sqft by name rather than by position 5, so the conversion
    # stays correct if the feature list is ever reordered.
    result = predict_Y * test_df['size_sqft'].to_numpy()

    df_predict = pd.DataFrame(result)
    print(df_predict)

    os.makedirs('folder', exist_ok=True)
    df_predict.to_csv('folder/out_pp.csv')

In [None]:
segment()

In [None]:
svr_test_per_price()