In [1]:
#importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

from sklearn.metrics import confusion_matrix

In [2]:
#retrieve the data

reader = open("data.txt","r")

columnNames = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX", "PTRATIO","B","LSTAT","MEDV"]

df = pd.DataFrame(columns = columnNames).astype(float)

allArrays = []
for i in reader:
    splitArr = i.strip().split(" ")
    splitArr = [float(x) for x in splitArr if len(x)>0] #remove blank whitespaces from .split()
    #print(splitArr)
    allArrays.append(splitArr)
reader.close()
    
for i in range(0,len(allArrays),2):
    allArrays[i].extend(allArrays[i+1]) #add every 2nd list to first list (format of document)
    newRowSeries = pd.Series(allArrays[i], index=df.columns) #convert to series
    df = df.append(newRowSeries,ignore_index = True) #add to dataframe

del allArrays #to free up space

In [3]:
#start looking at data
all_Y = df["MEDV"]
all_X = df.drop("MEDV",axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(all_X,all_Y,random_state=42)

#variables that I predict will have an impact
crimes = all_X["CRIM"]
roomNum = all_X["RM"]
age = all_X["AGE"]
dis = all_X["DIS"]
b = all_X["B"]
ptratio = all_X["PTRATIO"]
#plt.hist(roomNum) #bell curve centered around 6, slightly slightly right skewed
#plt.hist(crimes) #strongly skewed right, 0-10 makes up a majority of data, 10-20, 20-30, others
#plt.hist(age) #skewed left: most vals from 80-100, 0-80 is approx uniform
#plt.hist(dis) #very strongly skewed to the right, almost like a staircase
#plt.hist(b) #very strongly skewed to the left, most of values in 350-400
#plt.hist(ptratio) #bell curve around 18 with spikes at 22 and 15

In [4]:
#use feature importance to narrow down the important variables

#train forest classifier on training data
importance_regressor = RandomForestRegressor()
importance_regressor.fit(X_train,Y_train) #fit the data on the training data
importance_regressor.score(X_test,Y_test) #score the data based on testing data

#add the feature importance in
feature_importances = pd.DataFrame(importance_regressor.feature_importances_,
                                   index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)

#after seing the dataframe, the most important columns are below
columns = ["RM","LSTAT"]

#split between training and testing data
X = all_X[columns]
X_train,X_test,Y_train,Y_test = train_test_split(X,all_Y,test_size=0.2,random_state=42)

feature_importances

Unnamed: 0,importance
LSTAT,0.416247
RM,0.405662
DIS,0.060696
CRIM,0.039723
PTRATIO,0.015936
AGE,0.013412
TAX,0.01257
B,0.01141
NOX,0.011075
INDUS,0.006157


In [5]:
#calls for the regression models

#svm
svm_model = svm.SVR(C=1.0,epsilon=0.2)
svm_model.fit(X_train,Y_train)
svm_predictions = svm_model.predict(X_test)

#random forest
rf_model = RandomForestRegressor(max_depth = 5, random_state=0)
rf_model.fit(X_train,Y_train)
rf_predictions = rf_model.predict(X_test)

#linear regressor
lin_model = LinearRegression()
lin_model.fit(X_train,Y_train)
lin_predictions = lin_model.predict(X_test)

#ridge regression
ridge_model = linear_model.Ridge(alpha=0.5)
ridge_model.fit(X_train,Y_train)
ridge_predictions = ridge_model.predict(X_test)

#bayesian regression
bay_model = linear_model.BayesianRidge()
bay_model.fit(X_train,Y_train)
bay_predictions = bay_model.predict(X_test)

In [6]:
#look at the accuracy

#R squared values
print("svm r squared: ",r2_score(Y_test,svm_predictions))
print("random forest r squared: ",r2_score(Y_test,rf_predictions))
print("linear regression r squared: ",r2_score(Y_test,lin_predictions))
print("ridge r squared: ",r2_score(Y_test,ridge_predictions))
print("bayes r squared: ",r2_score(Y_test,bay_predictions),"\n")

#Mean absolute errors (MAE)
print("svm MAE: ",mean_absolute_error(Y_test,svm_predictions))
print("random forest MAE ",mean_absolute_error(Y_test,rf_predictions))
print("linear regression MAE ",mean_absolute_error(Y_test,lin_predictions))
print("ridge MAE ",mean_absolute_error(Y_test,ridge_predictions))
print("bay MAE ",mean_absolute_error(Y_test,bay_predictions))

svm r squared:  0.6368684392843291
random forest r squared:  0.7300307981564056
linear regression r squared:  0.5739577415025857
ridge r squared:  0.5742858090094847
bayes r squared:  0.5752747722458944 

svm MAE:  3.364327058962326
random forest MAE  2.9508983305568717
linear regression MAE  3.8987597213823584
ridge MAE  3.8985289312589892
bay MAE  3.8978173233194697


In [7]:
#make sure no overfit is happening by using cross validation
svm_cvmodel = svm.SVR(C=1.0,epsilon=0.2)
rf_cvmodel = RandomForestRegressor(max_depth = 5, random_state=0)
lin_cvmodel = LinearRegression() #bayes and ridge regression are basically this anyways, will just use this to check CV

print("cross validations: \n")
scores = cross_val_score(svm_cvmodel,X,all_Y,cv=10,scoring="r2") #gives 10 different scores
print("svm: ",np.mean(scores))
scores = cross_val_score(rf_cvmodel,X,all_Y,cv=10,scoring="r2") #gives 10 different scores
print("random forest: ",np.mean(scores))
scores = cross_val_score(lin_cvmodel,X,all_Y,cv=10,scoring="r2") #gives 10 different scores
print("lin regression: ",np.mean(scores))

#there does seem to be overfit...

cross validations: 

svm:  0.2571762079516185
random forest:  0.42891299656816634
lin regression:  -0.03303612768229912
