In [83]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

from sklearn import linear_model, metrics, svm, neighbors, gaussian_process, ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

from PIL import Image
from tqdm import tqdm
from pandas_profiling import ProfileReport

import keras, re, string
from keras import applications
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical

from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [84]:
# Load the raw listings table from the CSV one directory above the notebook.
df = pd.read_csv(filepath_or_buffer="../data.csv")

In [85]:
# Keep only rows whose numeric fields fall inside the accepted (inclusive)
# ranges. All four range checks must hold at once, so they are combined
# into a single boolean mask.
within_bounds = (
    df["totPurchaseAmt"].between(50000, 5000000)
    & df["yearBuilt"].between(1940, 2020)
    & df["zipcode"].between(55001, 56763)
    & df["livingArea"].between(1000, 30000)
)

# Rows sharing an image link are treated as the same listing; the first
# occurrence wins.
df = df[within_bounds].drop_duplicates("mediumImageLink", keep="first")

# Narrow the frame to the target column plus the model inputs, then drop
# every row that still has a missing value in any of them.
# (One-hot encoding via pd.get_dummies is not applied; these columns stay
# as plain numbers.)
model_columns = [
    "totPurchaseAmt",
    "bathrooms",
    "bedrooms",
    "yearBuilt",
    "livingArea",
    "averageSchoolRating",
    "zipcode",
    "description",
]
df = df[model_columns].dropna()

In [None]:
# Build a minimal profiling report for the cleaned frame and render it as
# notebook widgets.
profile = ProfileReport(
    df,
    title='Pandas Profiling Report',
    minimal=True,
    html={'style': {'full_width': True}},
)
profile.to_widgets()

In [86]:
def remove_mypunct(corpus):
    """Normalize a collection of raw text strings.

    Each string goes through the following steps, in order:

    1. Tokens starting with ``#``, ``@`` or ``http`` are removed.
    2. Tabs, newlines, vertical tabs, ellipses and typographic quotes are
       replaced with a space.
    3. The text is lower-cased and every token containing a digit is
       replaced with a space.
    4. ASCII punctuation is replaced with spaces.
    5. Every run of non-word characters or underscores (which also covers
       emojis and repeated whitespace) collapses to a single space.

    Parameters
    ----------
    corpus : iterable of str
        The texts to clean. Pass a list even for a single text; a bare
        string would be iterated character by character.

    Returns
    -------
    list of str
        One cleaned string per input element, in the same order. Leading or
        trailing single spaces may remain.
    """
    punc_re = re.compile('[%s]' % re.escape(string.punctuation))

    def _clean(text):
        # Hashtags, mentions and links.
        text = re.sub(r"(#|@|http)\S+", "", text)
        # Substitute a space (not the empty string) so that words separated
        # only by a newline/tab/quote are not fused into one token.
        text = re.sub(r"(\t|\n|\v|…|“|”)", " ", text)
        # Any token that contains a digit.
        text = re.sub(r"\w*\d\w*", ' ', text.lower())
        # ASCII punctuation.
        text = punc_re.sub(' ', text)
        # Raw string: "\W" is not a valid escape in a normal string literal.
        return re.sub(r'[\W_]+', " ", text)

    return [_clean(text) for text in corpus]

# Score every listing description with VADER and store the compound score
# in a new "sentiment" column. Two side lists are kept as well:
#   score_tuples -> (compound score, original description, row label)
#   score        -> compound scores only, in row order
sid = SentimentIntensityAnalyzer()
score_tuples = []
score = []

for row_label, raw_text in zip(df.index.values, df["description"].values):
    # remove_mypunct works on a collection, so wrap the single text in a list.
    cleaned_text = remove_mypunct([raw_text])[0]
    compound = float(sid.polarity_scores(cleaned_text)['compound'])

    score_tuples.append((compound, raw_text, row_label))
    score.append(compound)
    df.at[row_label, 'sentiment'] = compound

# The free text has been reduced to a number; drop the original column.
df = df.drop(columns="description")

In [87]:
# The purchase amount is the regression target; every other column is a
# feature.
y = df["totPurchaseAmt"]
X = df.drop(columns=["totPurchaseAmt"])

# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [88]:
# Standardize the features. The scaler learns its statistics from the
# training split only and then applies that same transformation to the
# test split, so nothing from the test rows influences the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [89]:
# Fully connected regression network. Despite the variable name, it contains
# only Dense layers: five ReLU layers of shrinking width followed by a
# single linear output unit.
n_cols = X_train.shape[1]

cnn_model = Sequential()
cnn_model.add(Dense(256, activation='relu', input_shape=(n_cols,)))
for width in (128, 64, 32, 16):
    cnn_model.add(Dense(width, activation='relu'))
cnn_model.add(Dense(1))

# Mean absolute error is used both as the training loss and as the
# reported metric.
cnn_model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=["mae"],
)

In [90]:
# Candidate regressors, keyed by the label printed in the report below.
# The scikit-learn estimators that use internal randomness (decision tree,
# SGD, random forest) are seeded with the same value as the train/test
# split so their metrics are repeatable between runs. The Keras network is
# not seeded here.
models = {
    "Neural Network" : cnn_model,
    "Linear Regression" : linear_model.LinearRegression(),
    "Support Vector Machine" : svm.SVR(),
    "Decision Tree" : DecisionTreeRegressor(random_state=42),
    "Ridge Regression" : linear_model.Ridge(alpha=.5),
    "Least Angle Regression" : linear_model.LassoLars(alpha=.1),
    "Bayesian Ridge Regression" : linear_model.BayesianRidge(),
    "SGD Regressor" : linear_model.SGDRegressor(random_state=42),
    "Nearest Neighbors Regression" : neighbors.KNeighborsRegressor(),
    "Gaussian Process" : gaussian_process.GaussianProcessRegressor(),
    "Random Forest" : ensemble.RandomForestRegressor(random_state=42)
}

# Fit each model on the training split and report its errors on the
# held-out test split.
for model_name, model in models.items():
    if model_name == "Neural Network":
        # The Keras model takes training options the sklearn estimators
        # do not accept.
        model.fit(X_train, y_train, validation_split=0.1, epochs=100, verbose=0)
    else:
        model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Computed once and reused for both the MSE and the RMSE lines.
    mse = metrics.mean_squared_error(y_test, y_pred)

    print ("=== " + model_name + " ===")
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print ('\n')

=== Linear Regression ===
Mean Absolute Error: 62581.172128688726
Mean Squared Error: 12537097189.124357
Root Mean Squared Error: 111969.17963941843


=== Support Vector Machine ===
Mean Absolute Error: 98794.486181673
Mean Squared Error: 29167052399.978455
Root Mean Squared Error: 170783.64207376086


=== Decision Tree ===
Mean Absolute Error: 79771.48318039055
Mean Squared Error: 26148040546.066387
Root Mean Squared Error: 161703.55761722248


=== Ridge Regression ===
Mean Absolute Error: 62581.1558601933
Mean Squared Error: 12537312892.243763
Root Mean Squared Error: 111970.14286069194


=== Least Angle Regression ===
Mean Absolute Error: 62576.743598568944
Mean Squared Error: 12537572465.819353
Root Mean Squared Error: 111971.30197429765


=== Bayesian Ridge Regression ===
Mean Absolute Error: 62580.848213142424
Mean Squared Error: 12541101117.906061
Root Mean Squared Error: 111987.05781431201


=== SGD Regressor ===
Mean Absolute Error: 62854.62039002184
Mean Squared Error: 125670

In [91]:
from sklearn.model_selection import RandomizedSearchCV
# Hyperparameter search space for the random-forest randomized search.

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' selected for regressors;
# 'auto' itself is deprecated in scikit-learn 1.1 and rejected from 1.3 on.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample of the rows
bootstrap = [True, False]
# Assemble the grid the randomized search samples from
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [93]:
# Randomized hyperparameter search for the random forest.
# 100 parameter combinations are sampled from random_grid and each one is
# scored with 3-fold cross validation (300 fits in total), running on all
# available cores.
rf = ensemble.RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split.
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  5.2min


KeyboardInterrupt: 