In [44]:
import numpy as np
import matplotlib.pylab as plt
plt.style.use('ggplot')
#from feature_engineering import add_retning
from sklearn.model_selection import train_test_split
import pandas as pd
import sys
sys.path.append('../data')
from sklearn import svm
#from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
#from sklearn.svm import SVC
from sklearn.preprocessing import scale

from display_data import import_data

In [45]:
def add_radius(df):
    #adds radius column to dataframe
    df['radius'] = np.sqrt((df['latitude']-55.75)**2 + (df['longitude']-37.55)**2)
    return df

def group_by_feature_and_price(df, feature, data_train = pd.DataFrame()):
    #makes a new column with the mean price for each group to all rows
    if data_train.empty:
        grouped = df.groupby([feature])

        mean_price = grouped['price'].mean()

        df_merged = pd.merge(df,mean_price, on=feature, how='left')

        column_name = str(feature + '_price')

        df[column_name] = df_merged['price_y']
    else:
        grouped = data_train.groupby([feature])

        mean_price = grouped['price'].mean()

        df_merged = pd.merge(df,mean_price, on=feature, how='left')

        column_name = str(feature + '_price')

        df[column_name] = df_merged['price']
    return df

def one_hot_encode(df, column_to_encode):
    """one hots encode for one singel column"""
    encoded_df = pd.get_dummies(df[[column_to_encode]].astype(str))
    df = pd.concat([df,encoded_df],axis=1)
    return df

def one_hot_encode_multiple(df, list_of_columns):
    """takes in multiple columns and runs one hot encode for each column"""
    for column_to_encode in list_of_columns:
        #print(column_to_encode)
        df = one_hot_encode(df, column_to_encode)
    return df

def add_direction(df):
    """adds direction to dataframe, can be one of eight categories (N,S,W,E)"""
    #straight up (north)
    normal_vector = np.array([0,1])
    #normal_vector = np.tile(normal_vector,(df.shape[0],1))
    #normal_vector = normal_vector.reshape((2,-df.shape[0]))
    temp = pd.DataFrame()
    temp['latitude'] = df['latitude']-55.75
    temp['longitude'] = df['longitude']-37.56


    apartment_vector = temp[['latitude','longitude']].to_numpy()


    #print(np.shape(apartment_vector), np.shape(normal_vector))
    angles = []
    for vector in apartment_vector:
        if vector[0] < 0:
            temp_angle = -angle_between(vector,normal_vector)
        else:
            temp_angle = angle_between(vector,normal_vector)
        angles.append(temp_angle)

    angles = [element * 10 for element in angles]


    angles_series = pd.Series(np.array(angles))
    #angles_series.plot.hist()

    df['direction'] = angles_series
    max = df.direction.max().round()
    min = df.direction.min().round()
    bins = [min,min*7/8,min*5/8,min*3/8,min/8,max/8,max*3/8,max*5/8,max*7/8,max]
    rounded_bins = [element.round() for element in bins]
    #print(rounded_bins)
    direction = pd.cut(df.direction, bins= rounded_bins,labels=['S','SW','W','NW','N','NE','E','SE','S'],ordered=False)
    df['direction'] = direction
    return df

def unit_vector(vector):
    """returns the unit vector if the vector"""
    return vector/np.linalg.norm(vector)

def angle_between(v1,v2):
    """returns angle between two vectors in radian"""
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    return np.arccos(np.clip(np.dot(v1_u,v2_u),-1,1))

def fix_radius(data_test):
    """should just be called on test data"""
    #fix all radius issues
    data_test._set_value(23,'radius',0.203899)
    data_test._set_value(90,'radius',0.203899)
    data_test._set_value(2511,'radius',0.218159)
    data_test._set_value(5090,'radius',0.218159)
    data_test._set_value(6959,'radius',0.218159)
    data_test._set_value(8596,'radius',0.218159)
    data_test._set_value(4719, 'radius',0.19580)
    data_test._set_value(9547, 'radius',0.19520)
    data_test._set_value(2529, 'radius', np.sqrt((37.464994-37.55)**2+(55.627666-55.75)**2))

    return data_test

def drop_n_largest(data_train):
    """drops 4 largest values, should only be called on training set"""
    #drop 4 largest from training data, maybe not samrt, but we will see
    data_train.drop([3217,21414,15840,13938])
    return data_train

def clean_data(all_data):
    """cleans the data with all the knowledge we have so far"""


    #fix ceiling issues
    all_data.loc[all_data['ceiling'] > 50,'ceiling']*=0.01
    all_data.loc[all_data['ceiling'] > 25, 'ceiling']*=0.1
    all_data.loc[all_data['ceiling'] < 0.5,'ceiling'] = float('NaN')
    """"
    #fix area_kitchen and area_living issues
    all_data['living'] = all_data.area_living/all_data.area_total
    all_data['kitchen'] = all_data.area_total/all_data.area_kitchen

    all_data.loc[all_data['living'] > 1,'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[all_data['living'] > 1,'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100

    all_data.loc[all_data.area_living/all_data.area_total > 1, 'area_living'] = float('NaN')

    all_data['living'] = all_data.area_living/all_data.area_total
    all_data['kitchen'] = all_data.area_kitchen/all_data.area_total

    all_data['sum_area'] = all_data.area_living + all_data.area_kitchen
    all_data.loc[all_data['sum_area'] == 100, 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[all_data['sum_area'] == 100, 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100

    #this fixed some few rows.
    #when printing we see some other very clear "precentage situations, fixing these
    all_data.loc[all_data['sum_area'] == 38.5, 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[all_data['sum_area'] == 38.5, 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[all_data['sum_area'] == 52.7, 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[all_data['sum_area'] == 52.7, 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[all_data['sum_area'] == 71.6, 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[all_data['sum_area'] == 71.6, 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 80) & (all_data['area_kitchen'] == 20), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 80) & (all_data['area_kitchen'] == 20), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 80) & (all_data['area_kitchen'] == 15), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 80) & (all_data['area_kitchen'] == 15), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 80) & (all_data['area_kitchen'] == 10), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 80) & (all_data['area_kitchen'] == 10), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 80) & (all_data['area_kitchen'] == 30), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 80) & (all_data['area_kitchen'] == 30), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 10), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 10), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 15), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 15), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 20), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 20), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 25), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 25), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 30), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 30), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 50), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 90) & (all_data['area_kitchen'] == 50), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 60) & (all_data['area_total'] > 120), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] == 60) & (all_data['area_total'] > 120), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] < 70) & (all_data['area_total'] > 120)&(all_data['rooms'] < 3), 'area_living'] = all_data['area_living'] * all_data['area_total']/100
    all_data.loc[(all_data['sum_area'] < 70) & (all_data['area_total'] > 120)&(all_data['rooms'] < 3), 'area_kitchen'] = all_data['area_kitchen'] * all_data['area_total']/100
    """
    return all_data

def add_high_up(df):
    """add exponetital function to determine how high up a building is"""
    high_up = df.floor/df.stories
    high_up_exp = np.exp(high_up) - 1
    euler = np.exp(1)

    df['high_up'] = high_up_exp
    df['high_up'].where(df['high_up'] > euler, euler)


    return df

PREPROCESSING

In [46]:
data_SVM, data_test_SVM = import_data()

# Missin data 1
#imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
#imp.fit(train)
#Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
#train_imp = imp.transform(train)

Y = data_SVM.price
test_id = data_test_SVM.id

radius = True
# Add radius
if(radius):
    data_SVM = add_radius(data_SVM)
    data_test = add_radius(data_test_SVM)
    #coordinates = ['latitude', 'longitude']
    #data = data.drop(columns = coordinates)
    #data_test = data_test.drop(columns = coordinates)


data_SVM = data_SVM.drop(columns = ['area_living', 'area_kitchen'])
data_test_SVM = data_test_SVM.drop(columns = ['area_living', 'area_kitchen'])

categorical_data = ['seller','layout', 'windows_court', 'windows_street', 'condition', 'building_id','new','district','street',
                    'address', 'material', 'elevator_without', 'elevator_passenger', 'elevator_service', 'parking','garbage_chute', 'heating']
cleaning = True

if(cleaning):
    data_test_SVM = fix_radius(data_test_SVM)
    data_SVM = clean_data(data_SVM)
    data_test_SVM = clean_data(data_test_SVM)

data_columns_SVM = list(data_SVM.columns)
numerical_data = list(set(data_columns_SVM)-set(categorical_data))
print(numerical_data)
for column in numerical_data:
    mean = data_SVM[column].mean()
    data_SVM[column] = data_SVM[column].replace(np.NaN, mean)
    if column != 'price':
        mean_test = data_test_SVM[column].mean()
        data_test_SVM[column] = data_test_SVM[column].replace(np.NAN,mean)

#Features
radius = True
district_mean_price = False #her er det nan
ohe = True
direction = True
high_up = True
#add high up


if(high_up):
    data_SVM = add_high_up(data_SVM)
    data_test_SVM = add_high_up(data_test_SVM)

if(district_mean_price):
    data_SVM = group_by_feature_and_price(data_SVM,'district')
    data_test_SVM = group_by_feature_and_price(data_test_SVM,'district',data_train=data_SVM)
    data_SVM = data_SVM.drop(columns=['price', 'id'])


if(direction):
    data_SVM = add_direction(data_SVM)
    data_test_SVM = add_direction(data_test_SVM)
    data_SVM = one_hot_encode(data_SVM, 'direction')
    data_test_SVM = one_hot_encode(data_test_SVM, 'direction')
    data_SVM = data_SVM.drop(columns = 'direction')
    data_test_SVM = data_test_SVM.drop(columns = 'direction')


#One hot encode data
if(ohe):
    encode_categorical_data_SVM = ['seller','layout', 'windows_court', 'windows_street', 'condition','new','district',
                     'material', 'elevator_without', 'elevator_passenger', 'elevator_service', 'parking','garbage_chute', 'heating']
    data_SVM = one_hot_encode_multiple(data_SVM,encode_categorical_data_SVM)
    data_test_SVM = one_hot_encode_multiple(data_test_SVM,encode_categorical_data_SVM)
#####HER MÅ DET FIKSES 
# skal man logtransformere`?
#data_SVM['price'] = np.log(data_SVM['price'])/np.log(15)

scaler = MinMaxScaler() # mapper alt til mellom 0 og 1, default
data_SVM[numerical_data] = scaler.fit_transform(data_SVM[numerical_data])
#Y = scale(Y) ##FIKS her
#Y = data_SVM['price']
data_SVM = data_SVM.drop(columns=['price', 'id'])

#Drop cat_data
data_SVM = data_SVM.drop(columns = categorical_data) # har one-hot encoda lengre oppe
data_test_SVM = data_test_SVM.drop(columns=['id'])
data_test_SVM = data_test_SVM.drop(columns = categorical_data) #må huske testdataen
 
## Rot under her

nan_values = data_SVM.isna().any()
nan_columns = nan_values.any()


columns_with_nan = data_SVM.columns[nan_columns].tolist()
#print(nan_values)
#print(columns_with_nan)
for column in data_SVM.columns:
    if data_SVM[column].isna().any():
        print(column)

print(Y)
if (Y.values < 0).any():
    print('Gunnar')

['area_total', 'phones', 'constructed', 'bathrooms_shared', 'latitude', 'ceiling', 'stories', 'price', 'longitude', 'rooms', 'bathrooms_private', 'radius', 'loggias', 'id', 'floor', 'balconies']
0         7139520.0
1        10500000.0
2         9019650.0
3        10500000.0
4        13900000.0
            ...    
23280    13300000.0
23281    15854300.0
23282    19800000.0
23283    29999000.0
23284    10950000.0
Name: price, Length: 23285, dtype: float64


In [47]:
X_train_SVM, X_test_SVM, y_train_SVM, y_test_SVM = train_test_split(data_SVM, Y, test_size=0.2, random_state=42)
def root_mean_squared_log_error(y_true, y_pred):
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    assert (y_true >= 0).all()
    assert (y_pred >= 0).all()
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

#data.info()
#print(len(Y))
#print(data.corr())
kaggle = False
if(kaggle):
    SVM_model = svm.SVR()
    Y = np.log(Y)/np.log(15)
    SVM_model.fit(data_SVM,Y)
    result_SVM = SVM_model.predict(data_test)

    result_SVM = 15**result_SVM
    submission = pd.DataFrame()
    submission['id'] = test_id
    submission['price_prediction'] = result_SVM
    submission.to_csv('submission_SVM.csv', index= False)


    

else:
    #y_train = np.log(y_train_SVM)/np.log(15)
    SVM_model = svm.SVR()
    y_train_SVM = np.log(y_train_SVM)/np.log(15)
    SVM_model.fit(X_train_SVM,y_train_SVM)
    prediction = SVM_model.predict(X_test_SVM)
    prediction = 15**prediction
    print(type(prediction))
    print(len(prediction))
    cunt = 0
    meanp = prediction.mean()
    for i, p in enumerate(prediction):
        if p < 0:
            cunt += 1
            prediction[i] = meanp
    print(cunt)


    #prediction = 15**prediction
    rmsle = root_mean_squared_log_error(y_test_SVM,prediction)
    print("first run", rmsle)


<class 'numpy.ndarray'>
4657
0
first run 0.2690268990590189


Forbedringer: 
Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data. For example, scale each attribute on the input vector X to [0,1] or [-1,+1], or standardize it to have mean 0 and variance 1. Note that the same scaling must be applied to the test vector to obtain meaningful results. This can be done easily by using a Pipeline: