# Classifying

This notebook tests a number of classification algorithms to determine if it is possible to classify census tracts into their respective clusters (representing gentrification status) using the census' current demographic state. 

In [1]:
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import sys
sys.path.append('../')
import Classifier_helper_functions as helper
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [146]:
# import the csv that has the clusters assigned to each tract
df = pd.read_csv('../clusters.csv',index_col=0)

In [147]:
# subset tracts and clusters
df1=df.iloc[:,0:2]

In [148]:
# import the data from the 2000 census 
df2 = pd.read_csv('../Final_2000_data.csv',index_col=0)
df3 = pd.read_csv('../Final_2000_data_sample.csv',index_col=0)

In [149]:
# join the dataframes
df = df1.merge(df2,how='inner',on='tractid')

df_final = df.merge(df3,how='inner',on='tractid')
df_final.set_index('tractid',inplace=True)

In [150]:
df_final['cluster'].value_counts()

2    82
1    77
0    31
Name: cluster, dtype: int64

In [151]:
df_final.columns

Index(['cluster', 'state', 'county', 'tract', 'POP00', 'percent_white00',
       'percent_black00', 'percent_asian00', 'percent_hispanic00',
       'percent_indian00', 'percent_chinese00', 'percent_filip00',
       'percent_japan00', 'percent_korean00', 'percent_viet00',
       'percent_mex00', 'percent_pr00', 'percent_cuban00',
       'percent_vacant_housing00', 'percent_occupied_housing00',
       'percent_under18_00', 'percent_60andup_00', 'percent_75andup_00',
       'percent_owneroccupied_00', 'percent_renteroccupied_00',
       'percent_non-white00', 'INCPC00', 'HINC00', 'MHMVAL00', 'MRENT00',
       'percent_foreign_born', 'percent_naturalized',
       'percent_recent_immigrants(10)', 'percent_other_languages',
       'percent_hs_degree_orless', 'percent_4yrcollege_degree_ormore',
       'percent_married', 'percent_unemployed', 'percent_employed',
       'percent_professional', 'percent_manufacturing',
       'percent_self_employed', 'percent_poverty', 'percent_houses_30yrsago',

In [152]:
# split into X and Y
X = df_final[['percent_white00', 'percent_black00',
       'percent_asian00', 'percent_hispanic00', 'percent_indian00',
       'percent_chinese00', 'percent_filip00', 'percent_japan00',
       'percent_korean00', 'percent_viet00', 'percent_mex00', 'percent_pr00',
       'percent_cuban00', 'percent_vacant_housing00',
       'percent_occupied_housing00', 'percent_under18_00',
       'percent_60andup_00', 'percent_75andup_00', 'percent_owneroccupied_00',
       'percent_renteroccupied_00', 'percent_non-white00', 'INCPC00', 'HINC00',
       'MHMVAL00', 'MRENT00', 'percent_foreign_born', 'percent_naturalized',
       'percent_recent_immigrants(10)', 'percent_other_languages',
       'percent_hs_degree_orless', 'percent_4yrcollege_degree_ormore',
       'percent_married', 'percent_unemployed', 'percent_employed',
       'percent_professional', 'percent_manufacturing',
       'percent_self_employed', 'percent_poverty', 'percent_houses_30yrsago',
       'percent_multiunit_houses']]
Y = df_final['cluster']
Y=Y.astype('str')

In [162]:
# train test split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=.3)

## SMOTE & scale

In [163]:
from imblearn.over_sampling import SMOTE

In [164]:
print("Before OverSampling, counts of label '2': {} \n".format(sum(y_train=='2')))
print("Before OverSampling, counts of label '1': {}".format(sum(y_train=='1')))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train=='0')))

sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '2': {}".format(sum(y_train_res=='2')))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res=='1')))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res=='0')))

Before OverSampling, counts of label '2': 60 

Before OverSampling, counts of label '1': 51
Before OverSampling, counts of label '0': 22 

After OverSampling, the shape of train_X: (180, 40)
After OverSampling, the shape of train_y: (180,) 

After OverSampling, counts of label '2': 60
After OverSampling, counts of label '1': 60
After OverSampling, counts of label '0': 60


In [165]:
X_train_smoted = pd.DataFrame(X_train_res, columns=X_train.columns)
y_train_smoted = pd.DataFrame(y_train_res)

In [166]:
## scale the data
from sklearn import preprocessing

x = X_train_smoted.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
X_train_scaled_smoted = pd.DataFrame(x_scaled, columns = X_train_smoted.columns)

x = X_test.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
X_test_scaled = pd.DataFrame(x_scaled, columns = X_train_smoted.columns)

### Dummy Classifier

In [167]:
# create instance of the class
dummy_clf = DummyClassifier('most_frequent')
dummy_clf.fit(X_train_scaled_smoted,y_train_smoted)
y_hat_train = dummy_clf.predict(X_train_scaled_smoted)
y_hat_test = dummy_clf.predict(X_test_scaled)

In [178]:
print(accuracy_score(y_hat_train,y_train_smoted))
print(accuracy_score(y_hat_test,y_test))

0.3333333333333333
0.15789473684210525


### KNN

In [180]:
knn = KNeighborsClassifier()
parameters={'n_neighbors':[10,12,13,14,15,16,17,18,19,20]}
clf_knn_GS = GridSearchCV(knn,parameters,cv=5)
clf_knn_GS.fit(X_train_scaled_smoted,y_train_smoted)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [10, 12, 13, 14, 15, 16, 17, 18, 19,
                                         20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [193]:
clf_knn.best_params_

{'n_neighbors': 10}

In [194]:
knn = KNeighborsClassifier(n_neighbors=10)
clf_knn.fit(X_train_scaled_smoted,y_train_smoted)
y_hat_train_knn = clf_knn.predict(X_train_scaled_smoted)
y_hat_test_knn= clf_knn.predict(X_test_scaled)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


In [195]:
print(accuracy_score(y_hat_train_knn,y_train_smoted))
print(accuracy_score(y_hat_test_knn,y_test))

0.6888888888888889
0.45614035087719296


### Decision Tree

In [185]:
dt = DecisionTreeClassifier()
parameters={'criterion':['gini','entropy'],
            'max_depth':[5,10,15],
            'min_samples_split':[2,3,4,5,6,7],
            'min_samples_leaf':[1,2,3,4,5,6,7],
            'max_features':[1,2,3,4,5,6,7]   
}
clf_dt = GridSearchCV(dt,parameters,cv=5)
clf_dt.fit(X_train_scaled_smoted,y_train_smoted)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15],
                         '

In [198]:
clf_dt.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 6}

In [203]:
dt = DecisionTreeClassifier(criterion='gini',max_depth=10,max_features=2,min_samples_leaf=1,min_samples_split=6)
dt.fit(X_train_scaled_smoted,y_train_smoted)
y_hat_train_dt = dt.predict(X_train_scaled_smoted)
y_hat_test_dt= dt.predict(X_test_scaled)
print(accuracy_score(y_hat_train_dt,y_train_smoted))
print(accuracy_score(y_hat_test_dt,y_test))

0.9111111111111111
0.5263157894736842


### Random Forest

In [201]:
rf = RandomForestClassifier()
parameters={'criterion':['gini','entropy'],
            'max_depth':[10,15,20],
            'min_samples_split':[2,3,4,5,6,7],
            'min_samples_leaf':[1,2,3,4,5,6,7],
            'max_features':[1,2,3,4,5,6,7]
    }
clf_rf = GridSearchCV(rf,parameters,cv=5,n_jobs=-1)
clf_rf.fit(X_train_scaled_smoted,y_train_smoted)

  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [204]:
clf_rf.best_params_

{'criterion': 'entropy',
 'max_depth': 15,
 'max_features': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 7}

In [205]:
rf = RandomForestClassifier(criterion='entropy',max_depth=15,max_features=5,min_samples_leaf=2,min_samples_split=7)
rf.fit(X_train_scaled_smoted,y_train_smoted)
y_hat_train_rf = clf_rf.predict(X_train_scaled_smoted)
y_hat_test_rf= clf_rf.predict(X_test_scaled)
print(accuracy_score(y_hat_train_rf,y_train_smoted))
print(accuracy_score(y_hat_test_rf,y_test))

0.9166666666666666
0.5789473684210527


  


### XGBoost

In [207]:
clf_xgb = XGBClassifier()
param_grid = {
    "learning_rate": [0.1,0.2,0.3,0.4],
    'max_depth': [2,3,4,5,6],
    'min_child_weight': [10,12,13,14,15],
    'subsample': [0.7],
    'n_estimators': [5, 30, 100, 250],
}

In [208]:
grid_clf = GridSearchCV(clf_xgb, param_grid, scoring='accuracy', cv=5, n_jobs=1)
grid_clf.fit(X_train_scaled_smoted,y_train_smoted)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [211]:
grid_clf.best_params_

{'learning_rate': 0.2,
 'max_depth': 2,
 'min_child_weight': 10,
 'n_estimators': 100,
 'subsample': 0.7}

In [212]:
xgb = XGBClassifier(learning_rate=.2,max_depth=2,min_child_weight=10,subsample=.7,n_estimators=100)
xgb.fit(X_train_scaled_smoted,y_train_smoted)
y_hat_train_xgb = xgb.predict(X_train_scaled_smoted)
y_hat_test_xgb= xgb.predict(X_test_scaled)
print(accuracy_score(y_hat_train_xgb,y_train_smoted))
print(accuracy_score(y_hat_test_xgb,y_test))

0.9444444444444444
0.47368421052631576


## Predict using 2010 data

In [None]:
# import the data from the 2000 census 
df2 = pd.read_csv('CSV_files/Final_2010_data.csv',index_col=0)
df3 = pd.read_csv('CSV_files/Final_2010_data_sample.csv',index_col=0)

In [None]:
# join the dataframes
df = df3.merge(df2,how='inner',on='tractid')