In [1]:
import pandas as pd
import random

total_rows = 998403
rows_desired = 10000
rows_to_skip = total_rows - rows_desired

skips = sorted(random.sample(range(total_rows)[1:], rows_to_skip))
dta = pd.read_csv('./training.csv', skiprows = skips)

In [2]:
dta.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,Id
0,,,,0.0,0.0,4.0,,,,,...,,52011.0,1196269.0,2016.0,1144258.0,14592.4,,,,11751547
1,,,,1.0,0.0,,,1.0,,,...,,271309.0,387288.0,2016.0,115979.0,4920.64,,,,11780147
2,,,,3.0,3.0,,,3.0,,,...,,229699.0,536878.0,2016.0,307179.0,5621.28,,,,14737547
3,,,,0.0,0.0,,,,,,...,,,406100.0,2016.0,406100.0,4847.65,,,,167656147
4,,,,2.5,4.0,,,2.5,,993.0,...,,221047.0,442094.0,2016.0,221047.0,4652.66,,,61110010000000.0,17093678


In [3]:
#clean data - drop rows w/o values in taxamount column, also drop NA values
import numpy as np
from sklearn import preprocessing

dta_noNA = dta
dta_noNA = dta_noNA.dropna(axis=0, subset=["taxamount"])

dta_noNA = dta_noNA.dropna(axis=1, how="any")

x = dta_noNA.drop(["parcelid", "structuretaxvaluedollarcnt", "taxvaluedollarcnt", "assessmentyear", "landtaxvaluedollarcnt",
                  "taxamount", "taxdelinquencyflag", "hashottuborspa", "propertycountylandusecode", 
                   "propertyzoningdesc"], axis = 1, errors="ignore")

print(dta_noNA.drop(["parcelid", "structuretaxvaluedollarcnt", "taxvaluedollarcnt", "assessmentyear", "landtaxvaluedollarcnt",
                  "taxamount", "taxdelinquencyflag", "hashottuborspa", "propertycountylandusecode", 
                     "propertyzoningdesc"], axis = 1, errors="ignore").columns)

y = dta_noNA["taxamount"]

scalingModel = preprocessing.StandardScaler().fit(x)
x_scaled = scalingModel.transform(x)

Index(['bathroomcnt', 'bedroomcnt', 'fips', 'latitude', 'longitude',
       'propertylandusetypeid', 'rawcensustractandblock', 'regionidcounty',
       'roomcnt', 'Id'],
      dtype='object')


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.linear_model import HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import make_regression

print("Random Forest Regression:")
RFR = RandomForestRegressor(n_estimators=10)
print(min(cross_val_score(RFR, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

print("Linear Regression:")
LR = LinearRegression()
print(min(cross_val_score(LR, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

print("OMP:")
OMP = OrthogonalMatchingPursuit()
print(min(cross_val_score(OMP, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

print("LASSO:")
LASSO = linear_model.Lasso(max_iter=1000000)
print(min(cross_val_score(LASSO, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

print("RIDGE:")
ridgeReg = Ridge(max_iter=10000)
print(min(cross_val_score(ridgeReg, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

print("SVR:")
SVR = LinearSVR(tol=1e-5, max_iter=1000000)
print(min(cross_val_score(SVR, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

print("Huber:")
HUBER = HuberRegressor()
print(min(cross_val_score(HUBER, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

print("Single Tree:")
TREE = DecisionTreeRegressor(random_state=1693)
print(min(cross_val_score(TREE, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

print("Adaboost Regressor:")
AR = AdaBoostRegressor(base_estimator=None, n_estimators=5, learning_rate=1.0, loss='linear', random_state=None)
print(min(cross_val_score(AR, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

Random Forest Regression:
-2657.0170885469224
Linear Regression:
-3068.3442950298045
OMP:
-3064.7392207849757
LASSO:
-3065.8151747593806
RIDGE:
-3066.2106976700347
SVR:
-3190.590595893886
Huber:
-2853.101903627652
Single Tree:
-3308.356084762866
Adaboost Regressor:
-3201.346801558401


In [6]:
from sklearn.ensemble import VotingRegressor

ensemble = VotingRegressor(estimators=[('RFR', RFR),
                                      ('LR', LR),
                                      ('OMP', OMP),
                                       ('LASSO', LASSO),
                                       ('RIDGE', ridgeReg),
                                       ('SVR', SVR),
                                       ('HUBER', HUBER),
                                       ('TREE', TREE),
                                      ('AR', AR),])

print("Ensemble Model Accuracy:")
print(min(cross_val_score(ensemble, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

Ensemble Model Accuracy:
-2690.7315391874586


In [7]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor


print("HistGradient:")
hg = HistGradientBoostingRegressor(loss='least_squares', learning_rate=0.1, max_iter=150, max_leaf_nodes=35, max_depth=None, min_samples_leaf=20, l2_regularization=0.0, max_bins=255, scoring=None, validation_fraction=0.1, n_iter_no_change=None, tol=1e-07, verbose=0, random_state=None)
print(min(cross_val_score(hg, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

HistGradient:
-2640.4066315685927


In [8]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingRegressor

print("Gradient:")
gbc = GradientBoostingRegressor(n_estimators=150, learning_rate=1.0, max_depth=2, random_state=0).fit(x_scaled, y)
print(min(cross_val_score(gbc, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

Gradient:
-2791.5434218498344


In [9]:
from sklearn.ensemble import VotingRegressor

new = VotingRegressor(estimators=[('RFR', RFR),
                                      ('hg', hg),
                                      ('gbc', gbc)])

print("Ensemble Model Accuracy:")
print(min(cross_val_score(new, x_scaled, y, cv=5, scoring='neg_mean_absolute_error')))

Ensemble Model Accuracy:
-2581.621069115841


In [10]:
test_dta = pd.read_csv('./test.csv')
test_dta.columns

x_test = test_dta[['bathroomcnt', 'bedroomcnt', 'fips', 'latitude', 'longitude', 'propertylandusetypeid', 
                  'rawcensustractandblock', 'regionidcounty', 'roomcnt', 'Id']]

x_test_scaled = scalingModel.transform(x_test)

#replace missing data w the mean of that column
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
x_test_scaled = imputer.fit_transform(x_test_scaled)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
new.fit(x_scaled, y)
new.predict(x_test_scaled)

array([17686.95492858,  9669.08811485,  6327.85851794, ...,
        5600.98864594, 23866.4045299 , 10741.65552478])

In [15]:
output_predictions = test_dta[["Id"]]
output_predictions["Predicted"] = new.predict(x_test_scaled)
output_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Id,Predicted
0,11404347,17686.954929
1,11781947,9669.088115
2,12144747,6327.858518
3,13117147,3166.767041
4,13117747,1362.017019
...,...,...
99102,12380956,4051.691416
99103,17192487,4077.514720
99104,13027001,5600.988646
99105,14358678,23866.404530


In [16]:
output_predictions.to_csv("video_estimates.csv", index=False)