# Import Statements

In [3]:
import pickle
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests
from collections import defaultdict

import re
from datetime import datetime
import glob

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

# Necessary imports
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge #ordinary linear regression + w/ ridge regularization
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

pd.set_option('display.max_columns', 500)

# Functions

In [4]:
def load_pickle(file_location):
    with open(file_location, "rb") as picklefile:
        df = pickle.load(picklefile)
    return(df)

# Clean Data

In [5]:
df = load_pickle("all_midtown_data.pkl")

In [6]:
# Remove Building Type column. They are all Condos
df = df.drop("Building Type", axis=1)

# Sort values by date sold
df = df.sort_values(by = 'sold_date_dt', ascending = False)

# Drop duplicate rows
#df_no_dup = df.drop_duplicates(subset = 'full_name', keep = 'first').reset_index(drop = True)
#print("1. Duplicates Removed")

# Add season
df['sold_season'] = (df['sold_date_dt']
                     .dt
                     .month
                     .map({1 : 'Winter', 
                           2 : 'Winter',
                           3 : 'Spring',
                           4 : 'Spring',
                           5 : 'Spring',
                           6 : 'Summer',
                           7 : 'Summer',
                           8 : 'Summer',
                           9 : 'Fall',
                           10 : 'Fall',
                           11 : 'Fall',
                           12 : 'Winter'}))
print("2. Season Added")

# Add days on market column, and convert datetime timedelta to float
df['days_on_market_1'] = df['sold_date_dt'] - df['Listed']
df['days_on_market_1'] = (df['days_on_market_1']/np.timedelta64(1, 'D'))
print("3. Days on Market Added")

# Fill to indicate Studio
df['beds'] = df['beds'].fillna(0)
print("4. Filled Beds to indicate Studio")

### Fixing rows where there is no condo ###
# Find where apt = NaN
df_apt_na = df[df['apt_floor'].isna()]

# Fill Apt nulls, move bldg address to correct place, and delete bldg name
for index, row in df_apt_na.iterrows():
    #print(row['apt_floor'])
    #print(np.isnan(row['apt_floor']))
    try:
        if np.isnan(row['apt_floor']):
            df.loc[index, 'apt_floor'] = int(re.findall('\d+', row['bldg_addr'])[0]) # assign value in full dataframe
            df.loc[index, 'bldg_addr'] = df.loc[index]['bldg_name'] # move building address over to correct column
            df.loc[index, 'bldg_name'] = np.nan # clears building name
            #print(index)
        else:
            continue
    except:
        continue
print("5. Apt Nulls Filled")

# Rename Columns
df = df.rename(columns = {"Listed": "listed", 
                 "Days on Market": "days_on_market", 
                 "Neighborhood": "neighborhood", 
                 "Monthly Common Charges": "monthly_common_charges", 
                 "Monthly Real Estate Taxes": "monthly_real_estate_taxes", 
                 "Minimum Down Payment": "minimum_down_payment", 
                 "Doorman": "doorman", 
                 "Last Price Change": "last_price_change"})
print("6. Renamed Columns")

2. Season Added
3. Days on Market Added
4. Filled Beds to indicate Studio
5. Apt Nulls Filled
6. Renamed Columns


In [7]:
#df_clean['year'] = df_clean['sold_date_dt'].apply(lambda x: x.strftime('%Y'))
df['year'] = df['sold_date_dt'].apply(lambda x: x.strftime('%Y'))

# Want only 2 bedrooms or less
df = df[df['beds'] < 2.5]

# Convert days on market to int
df['days_on_market'] = pd.to_numeric(df['days_on_market'])

In [8]:
df.shape

(22937, 25)

In [9]:
# 22945 values initially
# 15008 values once duplicates are dropped
df_drop = df.drop_duplicates(subset=['full_name', 'sold_date']).reset_index(drop = True)

In [10]:
df_drop.describe()
# Want apartments with only 0, 1, 2 bedrooms

Unnamed: 0,apt_floor,bldg_rating,sold_price,price_per_sqft,square_feet,beds,baths,days_on_market,monthly_common_charges,monthly_real_estate_taxes,minimum_down_payment,days_on_market_1
count,14978.0,14392.0,15003.0,6445.0,6445.0,15003.0,14975.0,7083.0,7006.0,6707.0,2856.0,7083.0
mean,159.778408,75.730753,1315659.0,1447.735764,951.232894,1.174498,1.459065,1789.0096,1306.234,937.680781,0.109436,47.166173
std,630.924406,11.08555,1224740.0,550.755976,384.575335,0.688777,0.565285,1052.018098,17071.24,1114.621562,0.055149,835.310532
min,1.0,44.0,100000.0,190.0,285.0,0.0,1.0,2.0,206.0,101.0,0.1,-6004.0
25%,7.0,68.0,689000.0,1099.0,668.0,1.0,1.0,895.5,649.0,523.0,0.1,48.0
50%,16.0,78.0,985000.0,1338.0,878.0,1.0,1.0,1689.0,885.0,773.0,0.1,126.0
75%,32.0,84.0,1540000.0,1650.0,1176.0,2.0,2.0,2562.0,1301.75,1152.0,0.1,237.0
max,6504.0,99.0,22000000.0,6705.0,3895.0,2.0,5.0,4517.0,1426830.0,75980.0,0.9,3706.0


# Geolocation

In [45]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="specify_your_app_name_here", timeout = 3)
location = geolocator.geocode(df_drop['bldg_addr'][0] + " NYC")

In [13]:
target = ['sold_price']
bldg = ['full_name', 'bldg_name', 'bldg_addr'] #'bldg_name', 'bldg_addr']
cont = ['bldg_rating', 
        'apt_floor',
        'square_feet', 
        'beds', 
        'baths', 
        'monthly_common_charges', 
        'monthly_real_estate_taxes']
cat = ['sold_season', 'neighborhood', 'year']
model_columns = bldg + cont + cat + target

In [14]:
df_graph = df_drop[model_columns].dropna().reset_index(drop = True)
df_graph = df_graph[df_graph['sold_price'] < 8000000]

In [23]:
df_graph['bldg_addr'][0]

'145 East 48th Street'

In [40]:
test = geolocator.geocode(df_graph['bldg_addr'][0] + ' NYC')
print(test.latitude)
print(test.longitude)
#test.raw['address']['postcode']


40.75515
-73.9723602987829


In [39]:
test

Location(Cosmopolitan, 145, East 48th Street, Turtle Bay, Manhattan Community Board 5, Manhattan, New York County, NYC, New York, 10017, USA, (40.75515, -73.9723602987829, 0.0))

In [48]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="specify_your_app_name_here")

from geopy.extra.rate_limiter import RateLimiter
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
#df['location'] = df['name'].apply(geocode)

#df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)

In [49]:
print(geocode(df_graph['bldg_addr'] + " NYC"))

RateLimiter caught an error, retrying (0/2 tries). Called with (*(0           145 East 48th Street NYC
1           350 West 42nd Street NYC
2             211 Madison Avenue NYC
3           350 West 57th Street NYC
4           500 West 43rd Street NYC
                    ...             
4387        150 West 51st Street NYC
4388        224 East 52nd Street NYC
4389        240 East 47th Street NYC
4390    845 United Nations Plaza NYC
4391    845 United Nations Plaza NYC
Name: bldg_addr, Length: 4375, dtype: object,), **{}).
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/geopy/geocoders/base.py", line 355, in _call_geocoder
    page = requester(req, timeout=timeout, **kwargs)
  File "/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/anaconda3/lib/python3.6/urllib/request.p

None


In [47]:
addr_list = []
lat_list = []
long_list = []

c = 0
bad_c = 0

for row, value in df_graph['bldg_addr'].items():
    location = geolocator.geocode(value + " NYC")
    
    try:
        lat_list.append(location.latitude)
        long_list.append(location.longitude)
        addr_list.append(value)
        
        c = c + 1
        print("count: ", c)
        print(value)
    except:
        bad_c = bad_c + 1
        print("did not work: ", value)
        print("bad count: ", bad_c)
        continue
    
    time.sleep(2)
        
        

GeocoderQuotaExceeded: HTTP Error 429: Too Many Requests

In [None]:
for row, value in df_graph['bldg_addr'].items():
    print(value)

In [None]:
addr_list = []
lat_list = []
long_list = []

geolocator = Nominatim(user_agent="specify_your_app_name_here")

for row, value in df_graph['bldg_addr'].items():
    try:
        location = geolocator.geocode(value + " NYC")
        lat_list.append(location.latitutde)
        long_list.append(location.longitude)
        addr_list.append(value)
        print("good: ", value)
    except:
        lat_list.append(np.nan)
        long_list.append(np.nan)
        addr_list.append(value)
        print("not good: ", value)

In [None]:
# import the library
import folium
import pandas as pd
 
# Make a data frame with dots to show on the map
data = pd.DataFrame({
   'lat':[-58, 2, 145, 30.32, -4.03, -73.57, 36.82, -38.5],
   'lon':[-34, 49, -38, 59.93, 5.33, 45.52, -1.29, -12.97],
   'name':['Buenos Aires', 'Paris', 'melbourne', 'St Petersbourg', 'Abidjan', 'Montreal', 'Nairobi', 'Salvador'],
   'value':[10,12,40,70,23,43,100,43]
})
data
 
# Make an empty map
m = folium.Map(location=[20,0], tiles="Mapbox Bright", zoom_start=2)
 
# I can add marker one by one on the map
for i in range(0,len(data)):
   folium.Circle(
      location=[data.iloc[i]['lon'], data.iloc[i]['lat']],
      popup=data.iloc[i]['name'],
      radius=data.iloc[i]['value']*10,
      color='crimson',
      fill=True,
      fill_color='crimson'
   ).add_to(m)
 
# Save it as html
m.save('mymap.html')


In [None]:
print(location.raw)
{'place_id': '9167009604', 'type': 'attraction', ...}

# Overarching graphs

In [None]:
df_final = df_drop[df_drop['sold_price'] < 8000000]
df_final.head()

In [None]:
plt.bar(df_final['sold_season'].value_counts())

# Investigating Days on Market -- won't include

In [None]:
pd.options.display.max_colwidth = 200

In [None]:
df_drop[['full_name', 'sold_date_dt', 'listed', 'days_on_market', 'days_on_market_1', 'url']].describe()

In [None]:
df_drop[df_drop['days_on_market_1'] < df_drop['days_on_market']]

In [None]:
df_drop[df_drop['days_on_market_1'] > 1000]

In [None]:
df_drop[['full_name', 'sold_date_dt', 'listed', 'days_on_market', 'url']].info()

# EDA

In [None]:
target = ['sold_price']
bldg = ['full_name'] #'bldg_name', 'bldg_addr']
cont = ['bldg_rating', 
        'apt_floor',
        'square_feet', 
        'beds', 
        'baths', 
        'monthly_common_charges', 
        'monthly_real_estate_taxes']
cat = ['sold_season', 'neighborhood', 'year']
model_columns = bldg + cont + cat + target

In [None]:
smaller_df = df_drop.loc[:, model_columns]
#smaller_df.shape
smaller_df.info()

In [None]:
# Drop NAs
smaller_df = smaller_df.dropna().reset_index(drop = True)
# smaller_df.head()

In [None]:
smaller_df.info()

In [None]:
smaller_df.describe()

In [None]:
sns.pairplot(smaller_df, diag_kind='kde')

In [None]:
smaller_df.corr()

In [None]:
# Remove outliers. 21 outliers greater than 8 Million
smaller_df_no_out = smaller_df[smaller_df['sold_price'] < 6000000]

In [None]:
sns.distplot(smaller_df['sold_price'] )

In [None]:
smaller_df_no_out.head()

In [None]:
# Get Dummies
df_model = pd.get_dummies(smaller_df_no_out.drop('full_name', axis = 1), columns = ['sold_season', 'neighborhood'])

# Remove large outliers from sold

df_model.head()

In [None]:
len(df_model)

In [None]:
# Define X values and y values
X = df_model.drop('sold_price', axis = 1)
y = df_model['sold_price']

In [None]:
df_model.head()

In [None]:
# weird = 10, 14
# normal = 11, 12, 13
rs = 10

In [None]:
# hold out 20% of the data for final testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state= rs)

In [None]:
X_train.shape, y_train.shape

In [None]:
X_test.shape, y_test.shape

# Looking at different random states

In [None]:
sns.distplot(X['apt_floor'][X['apt_floor'] < 100])

In [None]:
plt.scatter(smaller_df_no_out['apt_floor'][smaller_df_no_out['apt_floor'] < 100], smaller_df_no_out['sold_price'][smaller_df_no_out['apt_floor'] < 100])

In [None]:
rs_10 = 10 #weird
rs_14 = 14 #weird
rs_11 = 11 #normal
X_train10, X_test10, y_train10, y_test10 = train_test_split(X, y, test_size=.2, random_state= rs_10)
X_train14, X_test14, y_train14, y_test14 = train_test_split(X, y, test_size=.2, random_state= rs_14)
X_train11, X_test11, y_train11, y_test11 = train_test_split(X, y, test_size=.2, random_state= rs_11)

In [None]:
smaller_df_no_out.columns

In [None]:
# 14 is weird
smaller_df_no_out.loc[list(X_train14.index), :]['baths'].value_counts()
#smaller_df_no_out.loc[list(X_train14.index), :]['monthly_common_charges'].mean()

In [None]:
smaller_df_no_out['neighborhood'].value_counts()

In [None]:
smaller_df_no_out['sold_season'].value_counts()

In [None]:
smaller_df_no_out['year'].value_counts()

In [None]:
smaller_df_no_out.loc[list(X_test14.index), :]['baths'].value_counts()
#smaller_df_no_out.loc[list(X_test14.index), :]['monthly_common_charges'].mean()

In [None]:
#smaller_df_no_out.loc[list(X_test14.index), :]['year'].value_counts()

In [None]:
#smaller_df_no_out.loc[list(X_train14.index), :]['sold_season'].value_counts()

In [None]:
smaller_df_no_out.loc[list(X_train11.index), :]['beds'].value_counts()
#smaller_df_no_out.loc[list(X_train11.index), :]['monthly_common_charges'].mean()

In [None]:
smaller_df_no_out.loc[list(X_test11.index), :]['beds'].value_counts()

# Simple Model - 1 Variable

In [None]:
#sns.distplot(df_model['sold_price'][df_model['sold_price'] <10000000])
#plt.scatter(df_model['sold_price'])

In [None]:
column_1 = ['square_feet','monthly_common_charges', 'monthly_real_estate_taxes', 'beds', 'baths', 'apt_floor'] # 'beds', 'baths', 
one_col_model = LinearRegression()
one_col_model.fit(X_train.loc[:, column_1], y_train)
one_col_model.score(X_train.loc[:, column_1], y_train)
#one_col_model.coef_, one_col_model.intercept_

In [None]:
one_col_model.score(X_test.loc[:, column_1],y_test)

In [None]:
cross_val_score(X_train.loc[:, column_1], y_train, cv = 5, scoring = 'r2')

# Comparing Simple Models

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
# Models

lm = LinearRegression()
lm_reg = Ridge(alpha = 10)

kf = KFold(n_splits=20, shuffle=True, random_state = 1001)

In [None]:
# Linear Regression
np.mean(cross_val_score(lm, X_train, y_train, # estimator, features, target
                cv=kf, # number of folds 
                scoring='r2')) # scoring metric
print(cross_val_score(lm, X_train, y_train, # estimator, features, target
                cv=kf, # number of folds 
                scoring='r2'))

In [None]:
# Ridge Regression
np.mean(cross_val_score(lm_reg, X_train, y_train, # estimator, features, target
                cv=kf, # number of folds 
                scoring='r2')) # scoring metric
print(cross_val_score(lm_reg, X_train, y_train, # estimator, features, target
                cv=kf, # number of folds 
                scoring='r2'))

In [None]:
# Linear Regression. Compare Train and Test
lm.fit(X_train, y_train)
print("Train Score: ", lm.score(X_train, y_train))
print("Test Score: ", lm.score(X_test, y_test))

In [None]:
# Ridge Regression. Compare Train and Test
lm_reg.fit(X_train, y_train)
print("Train Score: ", lm_reg.score(X_train, y_train))
print("Test Score: ", lm_reg.score(X_test, y_test))

# Ridge Regression with Standard Scaler

In [None]:
X_train.head()

In [None]:
# Scale Training data and Transform testing data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Run Cross Validation on Ridge Regression with scaled features. It is similar to simple models
np.mean(cross_val_score(lm_reg, X_train_scaled, y_train, cv=kf, scoring = 'r2'))

In [None]:
# Ridge Regression
lm_reg = Ridge(alpha=1)
lm_reg.fit(X_train_scaled, y_train)
print(lm_reg.score(X_train_scaled, y_train))
print(lm_reg.score(X_test_scaled, y_test))

In [None]:
from sklearn.linear_model import LassoCV
from sklearn import linear_model
#lasso_model = LassoCV()
lasso_model_2 = linear_model.Lasso(alpha = 100)
#lasso_model_3 = linear_model.Lasso(alpha = 1000)
lasso_model_2.fit(X_train_scaled, y_train)
print("Trained Score: ", lasso_model_2.score(X_train_scaled, y_train))
print("Test Score: ", lasso_model_2.score(X_test_scaled, y_test))

In [None]:
lasso_model_2.fit(X_train_scaled,y_train)
pred = lasso_model_2.predict(X_test_scaled)

In [None]:
# Plot residuals
res = y_test - pred
plt.scatter(pred, res)
plt.title("Residual plot")
plt.xlabel("prediction")
plt.ylabel("residuals")

In [None]:
np.mean(cross_val_score(lasso_model_2, X_train_scaled, y_train, cv=kf, scoring = 'r2'))
# score is 0.51

In [None]:
lm_poly = PolynomialFeatures(degree=2) 
lm_poly.fit

# Feature Interactions / Polynomial

In [None]:
X_train.head()

In [None]:
poly = PolynomialFeatures(degree=2) 

# Poly Transform adds new values to the tran, test, and validation sets
#X_train_poly = poly.fit_transform(X_train.values) # fits and transforms the data in one spot
X_train_poly = poly.fit_transform(X_train)
# scale is part of the model. So, need to fit to the training data. And test this fit on the 
# test data.

#X_test_poly = poly.transform(X_test.values)
X_test_poly = poly.transform(X_test)

In [None]:
# So we know what the columns are for polynomial-ized variables
poly_train_df = pd.DataFrame(poly.fit_transform(X_train), columns = poly.get_feature_names(input_features = X_train.columns))
poly_col = poly_train_df.columns

In [None]:
Poly_train_df = pd.DataFrame(poly.fit_transform(X_train), columns = poly.get_feature_names(input_features = X_train.columns))

In [None]:
# Values show that we need to scale. Model is over fit
lm_poly = LinearRegression()

kf = KFold(n_splits=5, shuffle=True, random_state = 71)
print("Mean value fom cv: ", np.mean(cross_val_score(lm_poly, X_train_poly, y_train, cv=kf, scoring='r2')))
cross_val_score(lm_poly, X_train_poly, y_train, cv=kf, scoring='r2')

In [None]:
# Run linear Regression
lm_poly = LinearRegression()

lm_poly.fit(X_train_poly, y_train)
print("Train Score: ", lm_poly.score(X_train_poly, y_train))
print("Test Score: ", lm_poly.score(X_test_poly, y_test))

# Standardize Features, and run Regularization Models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit standard scalar to the X train values to get mean and std. dev.
std = StandardScaler()
std.fit(X_train_poly)

In [None]:
# Applies scalar to training set. Subtracts mean and divides by st. dev. for every value
X_tr = std.transform(X_train_poly)
X_te = std.transform(X_test_poly)

In [None]:
lasso_model = LassoCV()
lasso_model.fit(X_tr, y_train)

In [None]:
lasso_model.alpha_

In [None]:
np.mean(cross_val_score(lasso_model, X_tr, y_train, # estimator, features, target
                cv=kf, # number of folds 
                scoring='r2')) # scoring metric

In [None]:
lasso_model.score(X_tr, y_train)

In [None]:
lasso_model.score(X_te, y_test)

In [None]:
feature_list = list(zip(poly_col, lasso_model.coef_))

In [None]:
sorted(feature_list, key=lambda tup: tup[1])[:10]

In [None]:
sorted(feature_list, key=lambda tup: tup[1], reverse = True)[:10]

In [None]:
std = StandardScaler()
std.fit(X_train_poly)

X_tr = std.transform(X_train_poly)
X_te = std.transform(X_test_poly)

In [None]:
np.mean(cross_val_score(lm_reg, X_tr, y_train, cv = kf, scoring = 'r2'))
#cross_val_score(lm_reg, X_tr, y_train, cv = kf, scoring = 'r2')

In [None]:
lm_reg = Ridge(alpha=1)
lm_reg.fit(X_tr, y_train)
print(lm_reg.score(X_tr, y_train))
print(lm_reg.score(X_te, y_test))

In [None]:
feature_list = list(zip(poly_col, lm_reg.coef_))
sorted(feature_list, key=lambda tup: tup[1])[:10]

In [None]:
sorted(feature_list, key=lambda tup: tup[1], reverse = True)[:10]

# Understanding Model Performance

In [None]:
X = df_model.drop('sold_price', axis = 1)
y = df_model['sold_price']

In [None]:
X.shape

In [None]:
y.shape

In [None]:
plt.figure(figsize=(20,5))

rgr = LinearRegression()
rgr.fit(X,y)
pred = rgr.predict(X)

In [None]:
lm_reg = Ridge(alpha=1)
lm_reg.fit(X_tr, y_train)
pred = lm_reg.predict(X_te)

In [None]:
X_te.shape

In [None]:
y_test.shape

In [None]:
# Plot residuals
res = y_test - pred
plt.scatter(pred, res)
plt.title("Residual plot")
plt.xlabel("prediction")
plt.ylabel("residuals")

# res is NEGATIVE when prediction is LARGER than actual (over predicting)
# res is POSITIVE when prediction is SMALLER than actual (under predicting)

In [None]:
import scipy.stats as stats
# Plot Q-Q plot
#plt.subplot(1, 3, 3)
#Generates a probability plot of sample data against the quantiles of a 
# specified theoretical distribution 
stats.probplot(res, dist="norm", plot=plt)
plt.title("Normal Q-Q plot")

# Has problems predicting really large and reall small values
# Comparing the residuals to a normal distribution
# This is HEAVY TAILED

In [None]:
lm_reg = Ridge(alpha=1)
lm_reg.fit(X_tr, y_train)
pred = lm_reg.predict(X_te)

In [None]:
# Plot your predicted values on the x-axis, and your residuals on the y-axis
data = pd.DataFrame()
data['predict']= lm_reg.predict(X_te)
data['resid']= y_test - data['predict']
with sns.axes_style('white'):
    plot=data.plot(kind='scatter',
                  x='predict',y='resid',alpha=0.2,figsize=(10,6))

# Heteroskedasticity is shown here. Residuals look like a "tornado"

In [None]:
# Higher prediction, the more negative the residuals, meaning 
# the model is over predicting.

# Lower prediction, the more positive the residuals, meaning
# the model is under predicting.

In [None]:
# inspect histogram
#y_test[y_test < 10000000].hist(bins=25)
#plt.dist('Histogram of Dependent Variable (User Counts)');

#y_test[y_test < 10000000].hist(bins=25)
plt.dist(y_test[y_test < 10000000]);

In [None]:
len(y[y < 10000000])

In [None]:
len(y)

In [None]:
# diagnose/inspect residual normality using qqplot:
stats.probplot(data['resid'], dist="norm", plot=plt)
plt.title("Normal Q-Q plot")
plt.show()

# Investigating Target Variable

In [None]:
df_clean.head()

In [None]:
plt.scatter(df_clean['sold_date_dt'], df_clean['sold_price'])

In [None]:
sns.distplot(df_clean['sold_price'])

In [None]:
len(df_clean) - len([df_clean['sold_price'] < 8000000])

In [None]:
sns.distplot(df_model['sold_price'][df_model['sold_price'] < 10000000])