In [3]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, Ridge
from sklearn import svm

import numpy.polynomial.polynomial as poly

In [2]:
def get_clean_data(file):
    """Load the sales CSV and return an all-integer frame for boroughs 3 and 4.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``BOROUGH`` is 3 or 4, without the descriptive columns
        dropped below, and with every remaining column cast to 64-bit
        integers. ``SALE DATE`` holds the integer form of the parsed datetime.

    Raises
    ------
    KeyError
        If ``SALE DATE``, ``BOROUGH`` or one of the columns to drop is missing.
    ValueError
        If a remaining column cannot be converted to an integer.
    """
    # Read the pandas dataframe
    df = pd.read_csv(file)
    # Convert datetime to a number
    df['SALE DATE'] = pd.to_datetime(df['SALE DATE']).astype(np.int64)
    # Drop columns that are not used as numeric features
    data = df.drop(columns=[
        "NEIGHBORHOOD",
        "APARTMENT NUMBER",
        "BUILDING CLASS CATEGORY",
        "TAX CLASS AT PRESENT",
        "BUILDING CLASS AT PRESENT",
        "EASE-MENT",
        "ADDRESS",
        "ZIP CODE",
        "BUILDING CLASS AT TIME OF SALE",
        "TOTAL UNITS",
    ])
    # Keep only boroughs 3 and 4
    data = data[data['BOROUGH'].isin([3, 4])]
    # Cast with an explicit 64-bit type: plain `int` can resolve to a 32-bit
    # integer on some platforms (e.g. Windows with NumPy < 2), which is too
    # small for the SALE DATE timestamps.
    data = data.astype(np.int64)
    return data

# SVM

In [4]:
def preprocess_normalize_data(df, random_state=None):
    """Scale the feature columns and return the training part of a 20/80 split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ten feature columns listed below plus ``BOROUGH``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split on every run; ``None`` keeps the split random.

    Returns
    -------
    X_train : numpy.ndarray
        Scaled feature rows of the training split (20% of the rows).
    y_train : numpy.ndarray
        Matching ``BOROUGH`` labels as a column vector of shape ``(n, 1)``,
        because they are taken from a one-column frame.
    """
    feature_columns = [
        "BLOCK",
        "LOT",
        "RESIDENTIAL UNITS",
        "COMMERCIAL UNITS",
        "LAND SQUARE FEET",
        "GROSS SQUARE FEET",
        "YEAR BUILT",
        "TAX CLASS AT TIME OF SALE",
        "SALE PRICE",
        "SALE DATE",
    ]
    df_x_prescale = df[feature_columns]
    df_y = df[["BOROUGH"]]

    # preprocessing.scale standardizes every feature column, SALE DATE
    # included, so no separate per-column normalization is needed.
    x_scaled_array = preprocessing.scale(df_x_prescale)
    y_array = np.array(df_y)

    # Only the training portion is returned; the 80% test portion is discarded.
    X_train, _, y_train, _ = train_test_split(
        x_scaled_array,
        y_array,
        test_size=0.8,
        train_size=0.2,
        random_state=random_state,
    )
    return X_train, y_train

In [None]:
def train_SVM(X_train, y_train, C=100000000):
    """Fit a linear-kernel SVC and report its parameters and training score.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Class labels; a column vector is flattened before fitting.
    C : float, default 100000000
        Inverse regularization strength passed to ``svm.SVC``. The very large
        default means the fit is almost unregularized.

    Returns
    -------
    tuple
        ``(coef_, intercept_, score)`` where ``score`` is evaluated on the
        same data the model was fitted on.
    """
    # SVC expects 1-D labels; passing a column vector makes scikit-learn emit
    # a DataConversionWarning on every fit.
    labels = np.ravel(y_train)
    svm_model = svm.SVC(probability = False, kernel = 'linear', C = C)
    svm_model.fit(X_train, labels)
    return svm_model.coef_, svm_model.intercept_, svm_model.score(X_train, labels)

In [None]:
# Load the cleaned sales data and build the scaled training split
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)

# Fit the linear SVM and report its parameters and score on the training split
coef, inter, score = train_SVM(X_train, y_train)
print("Coefficients", coef)
print("Intercept", inter)
print("Score", score)

  return f(*args, **kwargs)
Coefficients [[ 120.13902545 -143.80097198   55.70782465 1246.46281734  284.78840327
    65.74696255  119.98168755 -107.56617546 -194.44481325    3.00286102]]
Intercept [-130.82863892]
Score 0.7437810945273632


# Degree 2 Polynomial Feature Transformation

In [None]:
def poly_SVM(X_train, y_train, degree=2, C=100000000):
    """Fit a polynomial-kernel SVC and return its score on the training data.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Class labels; a column vector is flattened before fitting.
    degree : int, default 2
        Degree of the polynomial kernel.
    C : float, default 100000000
        Inverse regularization strength passed to ``svm.SVC``. The very large
        default means the fit is almost unregularized.

    Returns
    -------
    float
        ``SVC.score`` evaluated on the same data the model was fitted on.
    """
    # SVC expects 1-D labels; passing a column vector makes scikit-learn emit
    # a DataConversionWarning on every fit.
    labels = np.ravel(y_train)
    svm_model = svm.SVC(probability = False, kernel = 'poly', degree=degree, C = C)
    svm_model.fit(X_train, labels)
    return svm_model.score(X_train, labels)

In [None]:
# Reload the cleaned sales data and rebuild the scaled training split
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)

# Fit the degree-2 polynomial-kernel SVM; the score is computed on the training split
score = poly_SVM(X_train, y_train)
print("Score", score)

  return f(*args, **kwargs)
Score 0.838691159586682


# RBF Feature Transformation

In [None]:
def rbf_SVM(X_train, y_train, C=2.8, gamma=.0073):
    """Fit an RBF-kernel SVC and return its score on the training data.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Class labels; a column vector is flattened before fitting.
    C : float, default 2.8
        Inverse regularization strength passed to ``svm.SVC``.
    gamma : float, default 0.0073
        RBF kernel coefficient passed to ``svm.SVC``.

    Returns
    -------
    float
        ``SVC.score`` evaluated on the same data the model was fitted on.
    """
    # SVC expects 1-D labels; passing a column vector makes scikit-learn emit
    # a DataConversionWarning on every fit.
    labels = np.ravel(y_train)
    # verbose is left enabled, so libsvm writes its progress to the cell output.
    svcrbf = svm.SVC(probability=False, kernel="rbf", C=C, gamma=gamma, verbose=10)
    svcrbf.fit(X_train, labels)
    return svcrbf.score(X_train, labels)

In [None]:
# Reload the cleaned sales data and rebuild the scaled training split
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)

# Fit the RBF-kernel SVM; the score is computed on the training split
score = rbf_SVM(X_train, y_train)
print("Score", score)

  return f(*args, **kwargs)
[LibSVM]Score 0.7682740145426713


# Ridge Regularization

In [None]:
def ridge_regularization(X_train, y_train, alpha):
    """Fit a polynomial-kernel SVC regularized by ``alpha`` and return its training score.

    The regularization strength ``alpha`` is converted to the SVC penalty
    parameter as ``C = 1 / alpha``, so a larger ``alpha`` means stronger
    regularization. The kernel uses scikit-learn's default polynomial degree.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Class labels; a column vector is flattened before fitting.
    alpha : float
        Regularization strength; must be strictly positive.

    Returns
    -------
    float
        ``SVC.score`` evaluated on the same data the model was fitted on.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly positive, since ``1 / alpha`` would be
        undefined or a non-positive ``C``.
    """
    # `not alpha > 0` also rejects NaN, which would otherwise slip through.
    if not alpha > 0:
        raise ValueError(f"alpha must be strictly positive, got {alpha!r}")
    c = 1 / alpha
    # SVC expects 1-D labels; passing a column vector makes scikit-learn emit
    # a DataConversionWarning on every fit.
    labels = np.ravel(y_train)
    svm_model = svm.SVC(probability = False, kernel = 'poly', C = c)
    svm_model.fit(X_train, labels)
    return svm_model.score(X_train, labels)

In [None]:
# Reload the cleaned sales data and rebuild the scaled training split
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)
# alpha = 0.5 is turned into C = 1 / alpha inside ridge_regularization
score = ridge_regularization(X_train, y_train, 0.5)
print("Score", score)

  return f(*args, **kwargs)
Score 0.7644469957902794


In [6]:
def ridge_regularization_linear(X_train, y_train, alpha):
    """Fit a linear-kernel SVC regularized by ``alpha`` and return its training score.

    The regularization strength ``alpha`` is converted to the SVC penalty
    parameter as ``C = 1 / alpha``, so a larger ``alpha`` means stronger
    regularization.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Class labels; a column vector is flattened before fitting.
    alpha : float
        Regularization strength; must be strictly positive.

    Returns
    -------
    float
        ``SVC.score`` evaluated on the same data the model was fitted on.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly positive, since ``1 / alpha`` would be
        undefined or a non-positive ``C``.
    """
    # `not alpha > 0` also rejects NaN, which would otherwise slip through.
    if not alpha > 0:
        raise ValueError(f"alpha must be strictly positive, got {alpha!r}")
    c = 1 / alpha
    # SVC expects 1-D labels; passing a column vector makes scikit-learn emit
    # a DataConversionWarning on every fit.
    labels = np.ravel(y_train)
    svm_model = svm.SVC(probability = False, kernel = 'linear', C = c)
    svm_model.fit(X_train, labels)
    return svm_model.score(X_train, labels)

In [None]:
# Reload the cleaned sales data and rebuild the scaled training split
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)
# alpha = 0.5 is turned into C = 1 / alpha inside ridge_regularization_linear
score = ridge_regularization_linear(X_train, y_train, 0.5)
print("Score", score)

  return f(*args, **kwargs)
Score 0.7552621507845388


In [5]:
# Reload the cleaned sales data and rebuild the scaled training split
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)
# Grid 0.1, 0.2, ..., 1.0. The leading alpha = 0 of the linspace is sliced off
# because the model uses C = 1 / alpha, which is undefined there.
ridge_alpha_space = np.linspace(0, 1, 11)[1:]
for i in ridge_alpha_space:
    score = ridge_regularization_linear(X_train, y_train, i)
    print("Alpha", i, "Score", score)

  
  return f(*args, **kwargs)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=90fab6ac-739b-4c97-b68c-ff3c09208031' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>