In [1]:
# import packages that will be used for plotting and analyzing the data

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
from scipy import stats
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the household table from disk and preview its first rows.
# low_memory=False makes pandas read the file in a single pass rather than in chunks.
household_csv = 'Data_Files/df_household_clean.csv'
df = pd.read_csv(household_csv, low_memory=False)
df.head()

Unnamed: 0,CONTROL,total_rooms,housing_cost,monthly_rent,garage,num_dining,num_laundry,gut_rehab,is_condo,stairs_broken,...,fuse_blow,home_better_than_last,nh_better_than_last,manager_onsite,adequacy,hud_subsidized,number_upkeep_probs,num_bathrooms,household_income,family_income
0,11000001,7,1250 to 1499,,Yes,1,0,No,No,,...,No fuses / breakers blown in the last 3 months,,,,Adequate,,Less than 3 upkeep problems,2.5 bathrooms,113000.0,113000.0
1,11000002,7,1500 to 1999,,Yes,0,0,,No,No,...,No fuses / breakers blown in the last 3 months,,,,Adequate,,Less than 3 upkeep problems,2.5 bathrooms,29000.0,29000.0
2,11000005,8,700 to 799,,Yes,1,2,No,No,,...,No fuses / breakers blown in the last 3 months,,,,Moderately Inadequate,,3 or 4 upkeep problems,3 bathrooms,69900.0,69900.0
3,11000007,8,1500 to 1999,,Yes,1,1,Yes,No,,...,No fuses / breakers blown in the last 3 months,,,,Moderately Inadequate,,3 or 4 upkeep problems,2.5 bathrooms,162700.0,162700.0
4,11000010,5,450 to 499,370.0,No,0,0,,No,,...,No fuses / breakers blown in the last 3 months,,,,Adequate,Public housing tenants and tenants in privatel...,Less than 3 upkeep problems,1.5 bathrooms,21000.0,21000.0


In [3]:
# Binarise the two rating columns: a rating of 5 or lower maps to 0, anything else to 1.
# A missing rating (NaN) fails the `<= 5` comparison and therefore also maps to 1.
rating_house_bin = np.where(df['rating_house'] <= 5, 0, 1).tolist()
rating_nh_bin = np.where(df['rating_neighborhood'] <= 5, 0, 1).tolist()

# Attach both binary targets to the frame as new columns.
df['rating_house_bin'] = rating_house_bin
df['rating_nh_bin'] = rating_nh_bin


In [4]:
# Feature matrix: everything except the raw ratings and the binary targets derived from them.
rating_columns = ['rating_house', 'rating_neighborhood', 'rating_house_bin', 'rating_nh_bin']
X = df.drop(rating_columns, axis=1)

# Target: the binarised house rating.
y = df['rating_house_bin']

# Numeric-only view of the features.
X2 = X.select_dtypes(include='number')
X2.head()


Unnamed: 0,CONTROL,total_rooms,monthly_rent,num_dining,num_laundry,hh_age,year_moved_in,num_people,year_built,stories,unit_floors,num_bedrooms,num_kitchens,household_income,family_income
0,11000001,7,,1,0,49.0,2000.0,3.0,2000,2,2.0,3,1,113000.0,113000.0
1,11000002,7,,0,0,77.0,2005.0,2.0,2000,2,2.0,3,1,29000.0,29000.0
2,11000005,8,,1,2,69.0,1995.0,3.0,1970,1,1.0,4,1,69900.0,69900.0
3,11000007,8,,1,1,49.0,1997.0,4.0,1960,3,3.0,4,1,162700.0,162700.0
4,11000010,5,370.0,0,0,71.0,2005.0,1.0,1970,1,1.0,3,1,21000.0,21000.0


In [26]:
# Fill missing numeric values with each column's mean.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22; `sklearn.impute.SimpleImputer` is its replacement. Prefer the new class and
# fall back to the old one only on installations that predate it.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(strategy='mean')  # missing_values defaults to NaN
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Reuse the frame's own column labels instead of a hand-typed list, so the cell keeps
# working if the set of numeric columns changes upstream.
numeric_columns = X2.columns

# NOTE(review): the column means are learned from every row, i.e. before the
# train/test split performed later in the notebook.
X2 = pd.DataFrame(imp.fit_transform(X2), columns=numeric_columns)
X2.head()

Unnamed: 0,CONTROL,total_rooms,monthly_rent,num_dining,num_laundry,hh_age,year_moved_in,num_people,year_built,stories,unit_floors,num_bedrooms,num_kitchens,household_income,family_income
0,11000001.0,7.0,958.508025,1.0,0.0,49.0,2000.0,3.0,2000.0,2.0,2.0,3.0,1.0,113000.0,113000.0
1,11000002.0,7.0,958.508025,0.0,0.0,77.0,2005.0,2.0,2000.0,2.0,2.0,3.0,1.0,29000.0,29000.0
2,11000005.0,8.0,958.508025,1.0,2.0,69.0,1995.0,3.0,1970.0,1.0,1.0,4.0,1.0,69900.0,69900.0
3,11000007.0,8.0,958.508025,1.0,1.0,49.0,1997.0,4.0,1960.0,3.0,3.0,4.0,1.0,162700.0,162700.0
4,11000010.0,5.0,370.0,0.0,0.0,71.0,2005.0,1.0,1970.0,1.0,1.0,3.0,1.0,21000.0,21000.0


In [6]:
# Object-dtype (text) columns of the feature matrix.
X3 = X.select_dtypes(include='object')
X3.head()


Unnamed: 0,housing_cost,garage,gut_rehab,is_condo,stairs_broken,stairs,musty,petty_crime,near_transit,risk_of_flood,...,roach,sewerbreakdowns,fuse_blow,home_better_than_last,nh_better_than_last,manager_onsite,adequacy,hud_subsidized,number_upkeep_probs,num_bathrooms
0,1250 to 1499,Yes,No,No,,No,Never,Disagree,Disagree,Disagree,...,No signs in the last 12 months,No breakdowns in the last 3 months,No fuses / breakers blown in the last 3 months,,,,Adequate,,Less than 3 upkeep problems,2.5 bathrooms
1,1500 to 1999,Yes,,No,No,Yes,Never,Disagree,Agree,Disagree,...,No signs in the last 12 months,No breakdowns in the last 3 months,No fuses / breakers blown in the last 3 months,,,,Adequate,,Less than 3 upkeep problems,2.5 bathrooms
2,700 to 799,Yes,No,No,,,,Disagree,Disagree,Disagree,...,No signs in the last 12 months,No breakdowns in the last 3 months,No fuses / breakers blown in the last 3 months,,,,Moderately Inadequate,,3 or 4 upkeep problems,3 bathrooms
3,1500 to 1999,Yes,Yes,No,,,,Agree,Disagree,Disagree,...,No signs in the last 12 months,No breakdowns in the last 3 months,No fuses / breakers blown in the last 3 months,,,,Moderately Inadequate,,3 or 4 upkeep problems,2.5 bathrooms
4,450 to 499,No,,No,,,,Agree,Disagree,Disagree,...,No signs in the last 12 months,No breakdowns in the last 3 months,No fuses / breakers blown in the last 3 months,,,,Adequate,Public housing tenants and tenants in privatel...,Less than 3 upkeep problems,1.5 bathrooms


In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One encoder object is re-fitted on each column in turn; only the integer codes are kept.
le = LabelEncoder()

# Every value is cast to text first, so a missing entry becomes the literal string
# 'nan' and is encoded as a category of its own.
# NOTE(review): X3 is rebound to its encoded form, so running this cell a second time
# re-encodes the integer codes (as strings) rather than the original labels.
X3 = X3.astype(str).apply(lambda column: le.fit_transform(column))
X3.head()

Unnamed: 0,housing_cost,garage,gut_rehab,is_condo,stairs_broken,stairs,musty,petty_crime,near_transit,risk_of_flood,...,roach,sewerbreakdowns,fuse_blow,home_better_than_last,nh_better_than_last,manager_onsite,adequacy,hud_subsidized,number_upkeep_probs,num_bathrooms
0,3,1,0,0,2,0,3,1,1,1,...,0,1,5,3,4,4,0,3,2,2
1,4,1,2,0,0,1,3,1,0,1,...,0,1,5,3,4,4,0,3,2,2
2,15,1,0,0,2,2,5,1,1,1,...,0,1,5,3,4,4,1,3,0,3
3,4,1,1,0,2,2,5,0,1,1,...,0,1,5,3,4,4,1,3,0,2
4,12,0,2,0,2,2,5,0,1,1,...,0,1,5,3,4,4,0,1,2,0


In [8]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so none of it
# runs; the cell only evaluates to (and displays) that string.
# The quoted code would apply most-frequent-value imputation to X3 via
# sklearn.preprocessing.Imputer.
'''from sklearn.preprocessing import Imputer

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imp.fit(X3)
X3 = imp.transform(X3)'''

"from sklearn.preprocessing import Imputer\n\nimp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)\nimp.fit(X3)\nX3 = imp.transform(X3)"

In [9]:
# Expand the integer-coded categorical columns into one indicator column per category.
encoder = OneHotEncoder()

# fit() returns the encoder itself, so fitting and transforming can be chained; the
# sparse result is densified and wrapped in a DataFrame with default integer labels.
X3 = pd.DataFrame(encoder.fit(X3).transform(X3).toarray())
X3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,262,263,264,265,266,267,268,269,270,271
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Hold out 30% of the rows (categorical-only features) for testing; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X3,
    y,
    random_state=42,
    test_size=0.3,
)


In [11]:
# LogisticRegression and GridSearchCV are already imported in the notebook's first cell.

# Candidate values for C: 15 points spaced evenly on a log scale from 1e-5 to 1e8.
c_space = np.logspace(start=-5, stop=8, num=15)
param_grid = dict(C=c_space)

# 5-fold cross-validated search over C, using the estimator's default scoring.
logreg = LogisticRegression()
logreg_cv = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)
logreg_cv.fit(X_train, y_train)

# Report the winning C and its mean cross-validated score.
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameter: {'C': 0.051794746792312128}
Tuned Logistic Regression Accuracy: 0.9191914235949268


In [12]:
# Refit a logistic regression on the training split with the C chosen by the grid
# search (`logreg_cv`). Reading the value from `best_params_` instead of pasting it
# from the printed output keeps this cell correct whenever the search is re-run on
# different data or with a different grid.
best_C = logreg_cv.best_params_['C']

logreg = LogisticRegression(C=best_C)
logreg.fit(X_train, y_train)

LogisticRegression(C=0.05179474679231213, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [13]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict on the held-out rows, then print the confusion matrix followed by the
# per-class precision / recall / F1 report.
y_pred = logreg.predict(X_test)

for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred))

[[  259  1190]
 [  153 15868]]
             precision    recall  f1-score   support

          0       0.63      0.18      0.28      1449
          1       0.93      0.99      0.96     16021

avg / total       0.91      0.92      0.90     17470



In [30]:
# Combine the imputed numeric features (X2) with the one-hot categorical features (X3),
# aligning rows on the index.
#
# X2 carries string column names while the one-hot frame is labelled with integers.
# scikit-learn >= 1.2 raises a TypeError when an estimator is fitted on a DataFrame
# whose column names mix types, so the one-hot labels are converted to strings before
# joining. `rename` returns a new frame; X3 itself is left untouched.
X4 = X2.join(X3.rename(columns=str), how='left')
X4.head()

Unnamed: 0,CONTROL,total_rooms,monthly_rent,num_dining,num_laundry,hh_age,year_moved_in,num_people,year_built,stories,...,262,263,264,265,266,267,268,269,270,271
0,11000001.0,7.0,958.508025,1.0,0.0,49.0,2000.0,3.0,2000.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11000002.0,7.0,958.508025,0.0,0.0,77.0,2005.0,2.0,2000.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11000005.0,8.0,958.508025,1.0,2.0,69.0,1995.0,3.0,1970.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11000007.0,8.0,958.508025,1.0,1.0,49.0,1997.0,4.0,1960.0,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11000010.0,5.0,370.0,0.0,0.0,71.0,2005.0,1.0,1970.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Hold out 30% of the rows (numeric + categorical features) for testing; fixed seed for
# repeatability. This rebinds the split variables created for the categorical-only model.
X_train, X_test, y_train, y_test = train_test_split(
    X4,
    y,
    random_state=42,
    test_size=0.3,
)

In [34]:
# Candidate values for C: 15 points spaced evenly on a log scale from 1e-5 to 1e8.
c_space = np.logspace(start=-5, stop=8, num=15)
param_grid = dict(C=c_space)

# 5-fold cross-validated search over C on the combined feature set, using the
# estimator's default scoring.
# NOTE(review): in the recorded run this search picked C=1e-05, the smallest value in
# the grid, and the model refit with that C predicted class 1 for every test row.
logreg2 = LogisticRegression()
logreg_cv2 = GridSearchCV(estimator=logreg2, param_grid=param_grid, cv=5)
logreg_cv2.fit(X_train, y_train)

# Report the winning C and its mean cross-validated score.
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv2.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv2.best_score_))

Tuned Logistic Regression Parameter: {'C': 1.0000000000000001e-05}
Tuned Logistic Regression Accuracy: 0.9141132890120943


In [35]:
# Refit a logistic regression on the combined-feature training split with the C chosen
# by the second grid search (`logreg_cv2`). Reading the value from `best_params_`
# instead of pasting it from the printed output keeps this cell correct whenever the
# search is re-run on different data or with a different grid.
best_C = logreg_cv2.best_params_['C']

logreg = LogisticRegression(C=best_C)
logreg.fit(X_train, y_train)

LogisticRegression(C=1e-05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict on the held-out rows of the combined feature set, then print the confusion
# matrix followed by the per-class precision / recall / F1 report.
y_pred2 = logreg.predict(X_test)

for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred2))

[[    0  1449]
 [    0 16021]]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1449
          1       0.92      1.00      0.96     16021

avg / total       0.84      0.92      0.88     17470



  'precision', 'predicted', average, warn_for)


In [None]:
#from sklearn.decomposition import PCA
#pca = PCA()
#pca.fit(X_train)


In [None]:
#features = range(pca.n_components_)
#plt.bar(features, pca.explained_variance_)

In [None]:
#pca = PCA(n_components=0)
#pca.fit(X_train)
#X_train = pca.transform(X_train)


In [None]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so none of it
# runs; evaluating the cell only yields that string.
# The quoted code would repeat the cross-validated grid search over `param_grid` on
# the current X_train / y_train and print the best parameter and score.
'''#Instantiate a logistic regression classifier: logreg
logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

logreg_cv.fit(X_train, y_train)

print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))'''

In [None]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so none of it
# runs; evaluating the cell only yields that string.
# The quoted code would fit an L1-penalised logistic regression with C=1e-5 on
# X_train / y_train.
'''logreg = LogisticRegression(penalty='l1', C=.00001)
logreg.fit(X_train, y_train)'''



In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict on the held-out rows, then print the confusion matrix followed by the
# per-class precision / recall / F1 report.
# NOTE(review): the two cells directly above are string literals and execute nothing,
# so `logreg` here is whichever estimator an earlier cell last bound to that name.
y_pred = logreg.predict(X_test)

for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred))