In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as pp
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet # linear regression package
from sklearn.model_selection import train_test_split # split dataset
from sklearn.metrics import mean_squared_error as mse # Measurement metric

ModuleNotFoundError: No module named 'pandas_profiling'

In [2]:
# helper functions
def clean_null_values(df, column):
  """Coerce ``df[column]`` to numeric in place, replacing unparseable values with 0.

  Args:
    df: DataFrame to modify; the column is overwritten on this object.
    column: Label of the column to clean.

  Returns:
    None. ``df`` is mutated.
  """
  # errors='coerce' turns anything that is not a number into NaN.
  numeric = pd.to_numeric(df[column], errors='coerce')
  # Assign the filled result back in a single statement. The previous
  # `df[column].fillna(0, inplace=True)` filled a temporary Series, which
  # pandas warns about and which does not update `df` under Copy-on-Write.
  df[column] = numeric.fillna(0)

def convert_to_string(df, column):
  """Overwrite ``df[column]`` with the ``str()`` form of each of its values.

  Args:
    df: DataFrame to modify; the column is replaced on this object.
    column: Label of the column to convert.

  Returns:
    None. ``df`` is mutated.
  """
  as_text = df[column].map(str)
  df[column] = as_text

In [3]:
# Load the sales export; the path is relative to the notebook's working directory.
df = pd.read_csv('nyc-rolling-sales.csv')

# Coerce the numeric columns; clean_null_values maps unparseable entries to 0,
# so the thresholds below also drop rows whose value could not be parsed.
clean_null_values(df, 'SALE PRICE')
clean_null_values(df, 'LAND SQUARE FEET')
clean_null_values(df, 'GROSS SQUARE FEET')

# Keep only rows that pass every threshold.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments performed in later cells do not raise SettingWithCopyWarning.
df = df[((df['SALE PRICE'] > 10000) &
         (df['LAND SQUARE FEET'] > 2) &
         (df['GROSS SQUARE FEET'] > 0) &
         (df['YEAR BUILT'] > 0))].copy()

FileNotFoundError: [Errno 2] File b'nyc-rolling-sales.csv' does not exist: b'nyc-rolling-sales.csv'

In [None]:
from datetime import datetime

# we had an idea to convert the dates to a number
# df['SALE DATE'] = pd.to_datetime(df['SALE DATE'])
# df['SALE DATE'] = df['SALE DATE'].apply(lambda x: x.timestamp())

# Convert YEAR BUILT to epoch seconds (midnight on 1 January of that year).
# Assumes YEAR BUILT holds four-digit calendar years -- TODO confirm against the data.
# The year has to be parsed with an explicit format: handing bare integers to
# pd.to_datetime interprets them as nanosecond offsets from 1970-01-01, which
# squashes every year into the first few microseconds after the epoch.
# errors='coerce' leaves NaN for years that cannot be represented as a timestamp.
year_built = pd.to_datetime(df['YEAR BUILT'].astype(int).astype(str),
                            format='%Y', errors='coerce')
df['YEAR BUILT'] = (year_built - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

# we also had an idea to make the sale month a column
# SALE DATE is still a '-'-separated string here; the second token is taken as the month.
df['SALE MONTH'] = df['SALE DATE'].str.split('-').str[1].astype(int)
df.head()

In [None]:
# Columns taken directly from df: three predictors plus the SALE PRICE target
all_cols = ['RESIDENTIAL UNITS', 'TOTAL UNITS', 'SALE MONTH', 'SALE PRICE']

# Categorical columns: cast to str first so get_dummies one-hot encodes them
# ('TAX CLASS AT PRESENT' was considered as well but is left out)
categorical_cols = ['BOROUGH',
                    'TAX CLASS AT TIME OF SALE',
                    'BUILDING CLASS CATEGORY']
for categorical_col in categorical_cols:
    convert_to_string(df, categorical_col)

dummy_df = pd.get_dummies(df[categorical_cols])

# Design matrix plus target, side by side
all_df = pd.concat([df[all_cols], dummy_df], axis=1)

# Every column except the target is a training feature
train_cols = [name for name in all_df.columns if name != 'SALE PRICE']

y = all_df['SALE PRICE']
X = all_df[train_cols]

# Hold out 40% of the rows for evaluation; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Plain least-squares regression. Alternatives that were tried:
# from sklearn import tree
# model = tree.DecisionTreeClassifier()
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(n_estimators=10)
# model = ElasticNet()
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out rows: R^2 via score(), RMSE via sqrt of the MSE
y_predicted = model.predict(X_test)
model_score = model.score(X_test, y_test)
model_rsme = np.sqrt(mse(y_test, y_predicted))

print(model_rsme)

# Fitted parameters, kept for the cells that follow
intercept = model.intercept_
coefficient = model.coef_

In [None]:
# One row per training feature, paired with its fitted coefficient (multiplied by 1000)
coefficients_pd = pd.DataFrame({'Features': train_cols,
                                'Coefficients': model.coef_ * 1000})

# Uncomment to inspect the table behind the chart
#coefficients_pd

# Contribution chart: horizontal bars ordered from smallest to largest coefficient
(coefficients_pd
 .sort_values('Coefficients')
 .set_index('Features')
 .plot.barh(color='orange'))