In [13]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

from sklearn import linear_model, metrics, svm, neighbors, gaussian_process, ensemble, neural_network
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

from PIL import Image
from tqdm import tqdm
from pandas_profiling import ProfileReport

import keras, re, string
from keras import applications
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical

from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [34]:
# Load the raw dataset from a CSV located one directory above the working directory.
# Every later cell rebinds `df`, so re-run this cell first to start from the raw data again.
df = pd.read_csv("../data.csv")

In [35]:
# Keep only rows whose numeric fields fall inside the accepted ranges.
# Each entry maps a column to its (lower, upper) bound; both ends are inclusive.
inclusive_bounds = {
    "totPurchaseAmt": (50000, 5000000),
    "yearBuilt": (1940, 2020),
    "zipcode": (55001, 56763),
    "livingArea": (1000, 30000),
}
for column, (low, high) in inclusive_bounds.items():
    df = df[df[column].between(low, high)]

# One row per distinct image link; the first occurrence is the one retained.
df = df.drop_duplicates("mediumImageLink", keep="first")

# Target, numeric predictors and the free-text description used for sentiment scoring.
kept_columns = [
    "totPurchaseAmt", "bathrooms", "latitude", "longitude",
    "bedrooms", "yearBuilt", "livingArea",
    "averageSchoolRating", "zipcode", "description",
]
df = df[kept_columns]

# Disabled experiment: one-hot encode the discrete columns instead of treating them as numbers.
# df = pd.get_dummies(data=df, columns=["bathrooms", "bedrooms", "yearBuilt", "zipcode"])

# Discard any row that is missing a value in one of the kept columns.
df = df.dropna()

In [36]:
# profile = ProfileReport(df, title='Pandas Profiling Report', minimal=True, html={'style':{'full_width':True}})
# profile.to_widgets()

In [37]:
def remove_mypunct(corpus):
    """Normalise a collection of texts down to lowercase words separated by single spaces.

    For every string in ``corpus`` the following steps are applied in order:

    1. drop hashtags, @-mentions and anything starting with ``http`` (up to the
       next whitespace character);
    2. turn tabs, newlines, vertical tabs, ellipses and curly quotes into a
       space, so the words on either side are not glued together;
    3. lowercase the text and blank out every token that contains a digit;
    4. replace ASCII punctuation with spaces;
    5. collapse every remaining run of non-word characters (emojis, repeated
       whitespace, underscores) into a single space.

    Parameters
    ----------
    corpus : iterable of str
        The texts to clean.

    Returns
    -------
    list of str
        The cleaned texts, in the same order as the input. Leading or trailing
        spaces are not stripped.
    """
    # Compile once per call rather than once per text and per step.
    tag_re = re.compile(r"(#|@|http)\S+")
    separator_re = re.compile(r"(\t|\n|\v|…|“|”)")
    digit_token_re = re.compile(r"\w*\d\w*")
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))
    # Raw string: "\W" inside a plain string literal is an invalid escape sequence.
    non_word_re = re.compile(r"[\W_]+")

    cleaned = []
    for text in corpus:
        text = tag_re.sub("", text)
        # A space (not an empty string) keeps "great\nhouse" as two words.
        text = separator_re.sub(" ", text)
        text = digit_token_re.sub(" ", text.lower())
        text = punc_re.sub(" ", text)
        text = non_word_re.sub(" ", text)
        cleaned.append(text)

    return cleaned

sid = SentimentIntensityAnalyzer()
# Never populated in this cell; retained only so existing references to these names keep working.
score_tuples = []
score = []


def _compound_sentiment(text):
    """Return the VADER compound score (as a float) of one description after cleaning it."""
    cleaned_text = remove_mypunct([text])[0]
    return float(sid.polarity_scores(cleaned_text)['compound'])


# NOTE(review): VADER is documented to use punctuation and capitalisation as intensity
# cues, and remove_mypunct lowercases the text and strips punctuation before scoring —
# confirm that this loss of signal is intended.
#
# Score every description in a single pass and swap the raw text for its score.
# Building a new frame avoids writing cell-by-cell into a frame produced by earlier filtering.
df = (
    df.assign(sentiment=df["description"].map(_compound_sentiment))
      .drop("description", axis=1)
)

In [38]:
# Drop sparsely represented zipcodes: a row survives only when its zipcode
# has more than 50 non-null entries in the frame.
grouped = df.groupby('zipcode')
entries_per_zipcode = grouped['zipcode'].transform('count')
df = df[entries_per_zipcode > 50]

In [39]:
# Preview the first rows of the frame after cleaning, sentiment scoring and zipcode filtering.
df.head()

Unnamed: 0,totPurchaseAmt,bathrooms,latitude,longitude,bedrooms,yearBuilt,livingArea,averageSchoolRating,zipcode,sentiment
0,152000.0,1.0,45.108779,-93.257808,2.0,1984,1300.0,4.0,55432,0.0516
1,198000.0,2.0,45.237189,-93.409535,4.0,1996,1716.0,5.333333,55303,0.4939
3,415000.0,2.0,45.278217,-93.407533,3.0,1985,3108.0,6.333333,55303,0.2617
5,262000.0,2.0,45.164166,-93.297836,3.0,1985,2158.0,4.666667,55433,0.9136
6,280000.0,2.0,45.2837,-93.332023,3.0,1976,1993.0,7.333333,55304,0.9428


In [40]:
# Separate the target column from the predictors, then hold out 20% of the rows
# for testing (fixed seed so the split is repeatable).
target_column = "totPurchaseAmt"
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [41]:
# Standardise the features. The scaling statistics are learned from the training
# split only, and that same fitted transformation is then applied to the test
# split so no test information leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [42]:
# Fully connected regression network. Despite the `cnn_` prefix, only Dense layers
# are added: the hidden widths grow from 16 to 256 and shrink back to 16, all with
# ReLU, followed by a single linear output unit.
n_cols = X_train.shape[1]
hidden_units = [16, 32, 64, 128, 256, 256, 128, 64, 32, 16]

cnn_model = Sequential()
for position, units in enumerate(hidden_units):
    if position == 0:
        # Only the first layer declares the input width (one entry per feature column).
        cnn_model.add(Dense(units, activation='relu', input_shape=(n_cols,)))
    else:
        cnn_model.add(Dense(units, activation='relu'))
cnn_model.add(Dense(1))

# Mean absolute error is both the training loss and the reported metric.
cnn_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=["mae"])

In [43]:
# Seed passed to the tree, SGD, forest and boosting estimators below so that repeated
# runs of this comparison give the same numbers. The Keras network is built in another
# cell and is not seeded here.
RANDOM_STATE = 42

# Display name -> estimator. Every entry is fitted on the training split and
# scored on the held-out test split.
models = {
    "Neural Network" : cnn_model,
    "Linear Regression" : linear_model.LinearRegression(),
    "Support Vector Machine" : svm.SVR(),
    "Decision Tree" : DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Ridge Regression" : linear_model.Ridge(alpha=.5),
    "Least Angle Regression" : linear_model.LassoLars(alpha=.1),
    "Bayesian Ridge Regression" : linear_model.BayesianRidge(),
    "SGD Regressor" : linear_model.SGDRegressor(random_state=RANDOM_STATE),
    "Nearest Neighbors Regression" : neighbors.KNeighborsRegressor(),
    "Gaussian Process" : gaussian_process.GaussianProcessRegressor(),
    "Random Forest" : ensemble.RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting Regressor' : ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE)
}

for model_name, model in models.items():
    if model_name == "Neural Network":
        # The Keras model needs its training schedule spelled out; 10% of the
        # training rows are held back for validation during fitting.
        model.fit(X_train, y_train, validation_split=0.1, epochs=100, verbose=0)
    else:
        model.fit(X_train, y_train)

    # The network predicts with shape (n_samples, 1); flatten so every model
    # yields the same 1-D shape as y_test.
    y_pred = np.ravel(model.predict(X_test))

    # Compute each metric once and reuse it for the derived RMSE.
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)

    print ("=== " + model_name + " ===")
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print ('\n')

=== Neural Network ===
Mean Absolute Error: 43976.7384117077
Mean Squared Error: 5035399992.825651
Root Mean Squared Error: 70960.55237119882


=== Linear Regression ===
Mean Absolute Error: 51607.98075011635
Mean Squared Error: 5812126589.324943
Root Mean Squared Error: 76237.3044468713


=== Support Vector Machine ===
Mean Absolute Error: 88614.29571089693
Mean Squared Error: 17085552354.757128
Root Mean Squared Error: 130711.71468065564


=== Decision Tree ===
Mean Absolute Error: 63427.79605107159
Mean Squared Error: 10593577560.066628
Root Mean Squared Error: 102925.10655844194


=== Ridge Regression ===
Mean Absolute Error: 51607.766441197506
Mean Squared Error: 5812123660.181337
Root Mean Squared Error: 76237.28523617127


=== Least Angle Regression ===
Mean Absolute Error: 51605.48812942028
Mean Squared Error: 5812007282.749407
Root Mean Squared Error: 76236.5219743753


=== Bayesian Ridge Regression ===
Mean Absolute Error: 51603.24412544775
Mean Squared Error: 5812075156.1968

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate forest sizes: 200, 400, ..., 2000 trees.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Features considered at each split. 1.0 means "all features", which is what the
# 'auto' option meant for regressors; 'auto' itself is rejected by recent
# scikit-learn releases, while the float form works on old and new versions alike.
max_features = [1.0, 'sqrt']

# Candidate tree depths: 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10]

# Minimum number of samples required at a leaf.
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample or on the full training set.
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Seed the forest itself: the search's random_state only fixes which parameter
# combinations are sampled, not the randomness inside each fitted forest.
rf = ensemble.RandomForestRegressor(random_state=42)

# 100 sampled combinations x 3-fold cross-validation, using all available cores.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X_train, y_train)

In [None]:
# Score the tuned random forest on the held-out test split.
y_pred = rf_random.predict(X_test)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print ('\n')