In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression, Ridge

import numpy.polynomial.polynomial as poly

In [None]:
def get_clean_data(file):
    """Load the sales CSV and return an all-int64 frame restricted to boroughs 3 and 4.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``BOROUGH`` is 3 or 4, without the descriptive columns
        dropped below, with every remaining column cast to ``int64``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the CSV.
    ValueError
        If a remaining column cannot be converted to an integer.
    """
    # Read the pandas dataframe
    df = pd.read_csv(file)
    # Convert the sale date to an integer timestamp so it can be used as a
    # numeric feature.
    df['SALE DATE'] = pd.to_datetime(df['SALE DATE']).astype(np.int64)
    # Drop the columns that are not used as features
    unused_columns = [
        "NEIGHBORHOOD",
        "APARTMENT NUMBER",
        "BUILDING CLASS CATEGORY",
        "TAX CLASS AT PRESENT",
        "BUILDING CLASS AT PRESENT",
        "EASE-MENT",
        "ADDRESS",
        "ZIP CODE",
        "BUILDING CLASS AT TIME OF SALE",
        "TOTAL UNITS",
    ]
    data = df.drop(columns=unused_columns)
    # Keep only boroughs 3 and 4
    data = data[data['BOROUGH'].isin([3, 4])]
    # Cast everything to a fixed 64-bit integer. A bare ``int`` is 32-bit on
    # some platforms, which would overflow the integer sale-date timestamps.
    return data.astype(np.int64)

In [None]:
def preprocess_normalize_data(df, random_state=None):
    """Standardise the feature columns and return a 20% training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns listed below plus ``BOROUGH``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an integer for a reproducible split.

    Returns
    -------
    X_train : numpy.ndarray
        Standardised features for the training rows.
    y_train : numpy.ndarray
        ``BOROUGH`` labels for the same rows, as a 2-D single-column array
        (it is built from the one-column frame ``df[["BOROUGH"]]``).
    """
    feature_columns = [
        "BLOCK",
        "LOT",
        "RESIDENTIAL UNITS",
        "COMMERCIAL UNITS",
        "LAND SQUARE FEET",
        "GROSS SQUARE FEET",
        "YEAR BUILT",
        "TAX CLASS AT TIME OF SALE",
        "SALE PRICE",
        "SALE DATE",
    ]
    df_x_prescale = df[feature_columns]
    df_y = df[["BOROUGH"]]

    # ``preprocessing.scale`` standardises every feature column, SALE DATE
    # included, so no separate per-column normalisation is needed.
    x_scaled_array = preprocessing.scale(df_x_prescale)
    y_array = np.array(df_y)

    # Only the training part is returned; the 80% test part is discarded.
    X_train, _, y_train, _ = train_test_split(
        x_scaled_array,
        y_array,
        test_size=0.8,
        train_size=0.2,
        random_state=random_state,
    )
    return X_train, y_train

# Regular Logistic Regression

In [None]:
def train_logistic(X_train, y_train, max_iter=1000):
    """Fit an effectively unregularised logistic regression and score it on its own training data.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like
        Training labels; a single-column 2-D array is accepted and flattened.
    max_iter : int, optional
        Iteration cap passed to the solver. It is raised above scikit-learn's
        default because the solver reports hitting the iteration limit on the
        polynomial feature set.

    Returns
    -------
    tuple
        ``(coef_, intercept_, score)`` where ``score`` is the value of
        ``LogisticRegression.score`` on ``(X_train, y_train)``.
    """
    # scikit-learn expects a 1-D label vector; a column vector triggers a
    # conversion warning on every fit.
    targets = np.ravel(y_train)
    # A very large C makes the penalty negligible, i.e. no regularization in practice.
    logreg = LogisticRegression(C=100000000, max_iter=max_iter)
    logreg.fit(X_train, targets)
    return logreg.coef_, logreg.intercept_, logreg.score(X_train, targets)

In [None]:
# Baseline run: load the cleaned sales data, standardise the features, and fit
# the logistic regression from train_logistic.
data = get_clean_data("cleaned_data.csv")

# Alternative preprocessing calls kept for reference. The helpers they name
# are not defined in the visible part of this notebook.
# X_train, y_train = no_feature_scaling(data)
# X_train, y_train = preprocess_scale_data(data)
X_train, y_train = preprocess_normalize_data(data)
# X_train, y_train = preprocess_standard_scalar(data)

# The reported score is computed by train_logistic on the same rows it was
# fitted on, so it is a training score rather than a held-out estimate.
coef, inter, score = train_logistic(X_train, y_train)
print("Coefficients", coef)
print("Intercept", inter)
print("Score", score)

Coefficients [[ 9.50880458e-01 -1.44402694e+00  4.85175815e-01  5.11669711e+00
   4.05724729e-01 -8.82870072e-01  2.20630365e+00 -5.51493602e-01
   8.38500332e-02  4.26622461e-03]]
Intercept [-1.33855231]
Score 0.736318407960199
  return f(*args, **kwargs)


# Polynomial Feature Transformation

In [None]:
def polynomial_regression(X_train, y_train, degree):
    """Expand ``X_train`` into polynomial features up to ``degree``.

    Despite its name this function fits no model: it only returns the
    transformed feature matrix, which the caller then passes to a classifier.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Features to expand.
    y_train : array-like
        Unused; kept so existing call sites keep working.
    degree : int
        Maximum polynomial degree passed to ``PolynomialFeatures``.

    Returns
    -------
    numpy.ndarray
        The expanded feature matrix, without a bias column
        (``include_bias=False``).
    """
    # The previous version printed the whole input matrix here; that debugging
    # dump is removed so the cell output stays readable.
    poly_features = preprocessing.PolynomialFeatures(degree=degree, include_bias=False)
    return poly_features.fit_transform(X_train)

In [None]:
# Degree-2 polynomial expansion of the standardised features, followed by the
# same logistic regression as the baseline.
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)
X_poly = polynomial_regression(X_train, y_train, 2)
# Alternative helper kept for reference; it is not defined in the visible part
# of this notebook.
# X_poly = polynomial_regression2(X_train, y_train)
# The score is measured on the expanded training rows themselves.
coef, inter, score = train_logistic(X_poly, y_train)
print("Coefficients", coef)
print("Intercept", inter)
print("Score", score)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[[ 1.02405612e+00 -4.27306904e-01 -6.94127163e-02 ... -5.82562547e-01
  -1.29400880e-01  7.37536948e-01]
 [ 4.67694342e-01 -3.72052178e-01 -6.94127163e-02 ... -5.82562547e-01
   2.35039097e-01  1.11991542e+00]
 [ 7.44740826e-01  5.67723518e+00 -1.44045808e-01 ...  3.53036575e+00
  -2.30684167e-01  1.11991542e+00]
 ...
 [ 2.88458628e-01 -5.02453331e-01  5.22037544e-03 ... -5.82562547e-01
  -1.22674880e-02  5.88151611e-02]
 [ 3.73412827e-01  5.89014444e-03 -1.44045808e-01 ...  3.53036575e+00
   4.37380848e-01  3.36039553e-01]
 [-8.73926576e-01  1.72099683e+00 -6.94127163e-02 ...  7.88413554e-01
   8.55310251e-02  7.47096410e-01]

# Ridge Regularization

In [None]:
def ridge_regularization(X_train, y_train, alpha):
    """Fit an L2-penalised (ridge) logistic regression and score it on its training data.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like
        Training labels; a single-column 2-D array is accepted and flattened.
    alpha : float
        Penalty strength, must be >= 0. scikit-learn is parameterised by the
        inverse strength, so ``C = 1 / alpha``. ``alpha == 0`` means "no
        penalty" and is mapped to the same very large ``C`` that
        ``train_logistic`` uses instead of dividing by zero.

    Returns
    -------
    tuple
        ``(coef_, intercept_, score)`` where ``score`` is the value of
        ``LogisticRegression.score`` on ``(X_train, y_train)``.

    Raises
    ------
    ValueError
        If ``alpha`` is negative or NaN.
    """
    # Written as ``not alpha >= 0`` so that NaN is rejected as well.
    if not alpha >= 0:
        raise ValueError("alpha must be a non-negative number, got %r" % (alpha,))
    c = 1 / alpha if alpha > 0 else 100000000
    # scikit-learn expects a 1-D label vector.
    targets = np.ravel(y_train)
    logreg = LogisticRegression(C=c)  # L2 is LogisticRegression's default penalty
    logreg.fit(X_train, targets)
    return logreg.coef_, logreg.intercept_, logreg.score(X_train, targets)

In [None]:
# Single ridge (L2) fit at alpha = 0.5; only the training score is reported.
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)
coef, inter, score = ridge_regularization(X_train, y_train, 0.5)
# Uncomment to inspect the fitted parameters as well:
# print("Coefficients", coef)
# print("Intercept", inter)
print("Score", score)

Score 0.7403367776502104
  return f(*args, **kwargs)


In [None]:
# Sweep the ridge penalty strength over 11 evenly spaced values in [0, 1] and
# print the training score for each.
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)
# NOTE(review): the grid includes alpha = 0 (no penalty); how that endpoint is
# treated is decided inside ridge_regularization.
ridge_alpha_space = np.linspace(0,1,11)
# ``i`` is the alpha value itself, not an index.
for i in ridge_alpha_space:
    coef, inter, score = ridge_regularization(X_train, y_train, i)
    # Uncomment to inspect the fitted parameters as well:
    # print("Coefficients", coef)
    # print("Intercept", inter)
    print("Alpha", i, "Score", score)

  
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
Alpha 0.0 Score 0.7491389207807119
Alpha 0.1 Score 0.7502870264064294
Alpha 0.2 Score 0.7502870264064294
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
Alpha 0.30000000000000004 Score 0.7500956754688098
Alpha 0.4 Score 0.7499043245311902
Alpha 0.5 Score 0.7499043245311902
  return f(*args, **kwargs)
  return f(*args, **kwargs)
Alpha 0.6000000000000001 Score 0.7493302717183314
Alpha 0.7000000000000001 Score 0.7481821660926139
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
Alpha 0.8 Score 0.7477994642173746
Alpha 0.9 Score 0.7479908151549942
Alpha 1.0 Score 0.7479908151549942


# Lasso Regularization

In [None]:
def lasso_regularization(X_train, y_train, alpha):
    """Fit an L1-penalised (lasso) logistic regression and score it on its training data.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like
        Training labels; a single-column 2-D array is accepted and flattened.
    alpha : float
        Penalty strength, must be >= 0. scikit-learn is parameterised by the
        inverse strength, so ``C = 1 / alpha``. ``alpha == 0`` means "no
        penalty" and is mapped to the same very large ``C`` that
        ``train_logistic`` uses instead of dividing by zero.

    Returns
    -------
    tuple
        ``(coef_, intercept_, score)`` where ``score`` is the value of
        ``LogisticRegression.score`` on ``(X_train, y_train)``.

    Raises
    ------
    ValueError
        If ``alpha`` is negative or NaN.
    """
    # Written as ``not alpha >= 0`` so that NaN is rejected as well.
    if not alpha >= 0:
        raise ValueError("alpha must be a non-negative number, got %r" % (alpha,))
    c = 1 / alpha if alpha > 0 else 100000000
    # scikit-learn expects a 1-D label vector.
    targets = np.ravel(y_train)
    # liblinear is selected explicitly because it supports the L1 penalty.
    logreg = LogisticRegression(C=c, penalty='l1', solver='liblinear')
    logreg.fit(X_train, targets)
    return logreg.coef_, logreg.intercept_, logreg.score(X_train, targets)

In [None]:
# Single lasso (L1) fit at alpha = 0.5; only the training score is reported.
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)
coef, inter, score = lasso_regularization(X_train, y_train, 0.5)
# Uncomment to inspect the fitted parameters as well:
# print("Coefficients", coef)
# print("Intercept", inter)
print("Score", score)

Score 0.7455032529659396
  return f(*args, **kwargs)


In [None]:
# Sweep the lasso (L1) penalty strength over 11 evenly spaced values in [0, 1]
# and print the training score for each.
data = get_clean_data("cleaned_data.csv")
X_train, y_train = preprocess_normalize_data(data)
# Named for the model it drives; the grid was previously called
# ``ridge_alpha_space`` although this cell fits the lasso model.
lasso_alpha_space = np.linspace(0, 1, 11)
for alpha in lasso_alpha_space:
    coef, inter, score = lasso_regularization(X_train, y_train, alpha)
    # Uncomment to inspect the fitted parameters as well:
    # print("Coefficients", coef)
    # print("Intercept", inter)
    print("Alpha", alpha, "Score", score)

  
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
Alpha 0.0 Score 0.58017604286261
Alpha 0.1 Score 0.7298124760811328
Alpha 0.2 Score 0.7296211251435132
Alpha 0.30000000000000004 Score 0.7290470723306545
Alpha 0.4 Score 0.7288557213930348
Alpha 0.5 Score 0.7288557213930348
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
Alpha 0.6000000000000001 Score 0.7286643704554152
Alpha 0.7000000000000001 Score 0.7282816685801761
Alpha 0.8 Score 0.7280903176425565
Alpha 0.9 Score 0.7277076157673172
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
Alpha 1.0 Score 0.7277076157673172


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=90fab6ac-739b-4c97-b68c-ff3c09208031' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>