# Imports

### 3rd Party Python

In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, Imputer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier

In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

### Custom Classes

The following classes were created in the Data Cleaning notebook, they are used to assist in the creation of a pipeline that cleans the data and prepares it for models.

The basis for FeatureExtractor classes where provided by Richard, upon which I expanded. CategoricalExtractor was given by Richard as is.

In [2]:
class MultipleFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols 
        
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        return X[self.cols].values 

In [3]:
class MultipleFeaturesMap(BaseEstimator, TransformerMixin):
    def __init__(self, cols, vals):
        self.cols = cols
        self.vals = vals
        
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        df = X[self.cols].copy()
        for col in self.cols:
            df[col] = X[col].map(self.vals)
            
        return df.values

In [4]:
class FeatureMap(BaseEstimator, TransformerMixin):
    def __init__(self, column, vals):
        self.column = column
        self.vals = vals
        
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        return X[self.column].map(self.vals).values.reshape(-1,1)

In [5]:
class CategoricalExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
        self.values = None
        
    def _create_values(self, indices):
        return {ind: i+1 for i, ind in enumerate(indices)}
    
    def _apply_values(self, row_val):
        return self.values.get(row_val, 0)
        
    def fit(self, X, y=None):
        self.values = self._create_values(X[self.column].value_counts().index)
        return self 
    
    def transform(self, X, y=None):
        col = X[self.column].apply(self._apply_values)
        return col.values.reshape(-1, 1)

### Pipeline Function

The following function was created in the Data Cleaning notebook, it is used to assist in the creation of a pipeline that cleans the data and prepares it for models, returning the data as a FeatureUnion object. It contains predefined lists, that tell it how to operate in the ways I determined best during EDA. That it takes the columns of the data as input is not necessary, I simply found it preferable to hard coding yet another list of columns inside the function.

In [6]:
def createFU(cols):
    '''
    With the columns of the Ames data, create a Feature Union in a more automated way to clean the data.
    Returns a FeatureUnion that should have clean and expanded data that can then be used in a pipeline with a model
    '''
    #remove columns I don't want to deal with, deciding because too many null during EDA
    cols = [col for col in cols if col not in ['Id','PID','Alley','Misc Feature','Fireplace Qu','Pool QC','Fence']]
    #create lists of cols gotten from EDA
    ord_cols_qual = ['Exter Qual','Exter Cond','Bsmt Qual','Bsmt Cond',
                 'Heating QC','Kitchen Qual','Garage Qual','Garage Cond']
    ord_cols_type = ['BsmtFin Type 1', 'BsmtFin Type 2']
    ord_cols_unique = [('Lot Shape', {'IR1': 2, 'IR2': 1, 'IR3': 0, 'Reg': 3}),
                         ('Utilities', {'AllPub': 3, 'ELO': 0, 'NoSeWa': 1, 'NoSewr': 2}),
                         ('Land Slope', {'Gtl': 2, 'Mod': 1, 'Sev': 0}),
                         ('Bsmt Exposure', {'Av': 3, 'Gd': 4, 'Mn': 2, 'NA': 0, 'No': 1}),
                         ('Central Air', {'N': 0, 'Y': 1}),
                         ('Electrical', {'FuseA': 3, 'FuseF': 2, 'FuseP': 1, 'Mix': 0, 'SBrkr': 4}),
                         ('Functional',
                          {'Maj1': 3,'Maj2': 2,'Min1': 6,'Min2': 5,'Mod': 4,'Sal': 0,'Sev': 1,'Typ': 7}),
                         ('Garage Finish', {'Fin': 3, 'NA': 0, 'RFn': 2, 'Unf': 1}),
                         ('Paved Drive', {'N': 0, 'P': 1, 'Y': 2})]
    nominal_cols = ['MS Zoning','Street','Land Contour','Lot Config','Neighborhood',
                    'Condition 1','Condition 2','Bldg Type','House Style','Roof Style',
                    'Roof Matl','Exterior 1st','Exterior 2nd','Mas Vnr Type','Foundation',
                    'Heating','Garage Type','Sale Type']
    
    #get all the columns that are not objects/not doing anything special with
    obj_cols = (ord_cols_qual+ord_cols_type+[y[0] for y in ord_cols_unique]+nominal_cols)
    num_cols = [col for col in cols if col not in obj_cols]
    
    #create list to hold pipelines
    pipes = []
    
    #create pipe for qualities
    qual_pipe = make_pipeline(
                    MultipleFeaturesMap(ord_cols_qual, 
                                       {'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}),
                    Imputer()
                    )
    pipes.append(('qual_pipe', qual_pipe))
    
    #create pipe for type
    type_pipe = make_pipeline(
                    MultipleFeaturesMap(ord_cols_type, 
                                       {'NA':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6}),
                    Imputer()
                    )
    pipes.append(('type_pipe', type_pipe))
    
    #just get all the data from numerical columns
    num_pipe = make_pipeline(
                    MultipleFeaturesExtractor(num_cols),
                    Imputer()
                    )
    pipes.append(('num_pipe', num_pipe))
    
    #iterate over unique columns adding new pipelines for each    
    for col, vals in ord_cols_unique:
        tmp_pipe = make_pipeline(
                        FeatureMap(col, vals),
                        Imputer()
                        )
        pipes.append(('{}_pipe'.format(col), tmp_pipe))
        
    #turn nominal columns into dummies and add pipelines for each
    for col in nominal_cols:
        tmp_pipe = make_pipeline(
                        CategoricalExtractor(col),
                        Imputer(strategy='median'),
                        OneHotEncoder(sparse=False, handle_unknown='ignore')
                        )
        pipes.append(('{}_pipe'.format(col), tmp_pipe))
    
    return FeatureUnion(pipes)

# Load Data

Here I will load the data, both the training and test files. The purpose of loading both is my intention to brute force the kaggle competition with many models which I will save predictions for along the way in case of any crashes, which means I will need the test data loaded.

In [7]:
#training data
train_df = pd.read_csv('datasets/train.csv')

#save the training X without either target variable, then save the encoded classifier target
train_X = train_df.drop(['Sale Condition', 'SalePrice'], axis=1)
train_y = train_df['Sale Condition'].apply(lambda x: 1 if x=='Abnorml' else 0)

#test data requires no changes
test_df = pd.read_csv('datasets/test.csv')

In [44]:
#for graphing purposes I will also split train test
Xtr, Xte, ytr, yte = train_test_split(train_X, train_y, random_state=42, stratify=train_y)

Initialize the FeatureUnion that will contain the pipelines for cleaning the data. The intention is that this serves as the basis for modeling pipelines, and should remain unchanged throughout the notebook.

In [8]:
data_fu = createFU(train_X.columns)

# Modeling

Within this section I will use the pipelines and gridsearch to fit many models. I reiterate that the intent is to succeed through "directed" brute force. Every model shall have it's own section where I use gridsearch to fit it to the training data, predict the test data, and save each to a respective csv.

Afterwards I create a single instance of the estimator, using the best paramters, to create a confusion matrix. The reason for doing this with a new version of the model is to create a static version of the model so there is no need to rerun the GridSearches again to refind the model upon closing the notebook.

### Test Baseline

Just going to guess all 0s

In [57]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = 0

#to csv
pred.to_csv('ClassSubs/Zeros.csv', index=False)

In [58]:
preds = Xte['Id'].apply(lambda x: 0)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.93567251462


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,480,0
is_abnormal,33,0


### Logistic Regression

In [None]:
#create the structure of the pipeline so it can be easily gridsearched
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', None),
    #('selectkbest', SelectKBest(score_func=f_classif, k=5)),
    ('model', LogisticRegression())
])

#create parameters
params = {
    'scaling': [None, StandardScaler(), RobustScaler()],
    #'selectkbest__k':[2, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, 150, 'all'],
    'model__fit_intercept':[False, True],
    'model__penalty':['l1', 'l2'],
    'model__C': np.linspace(0.01, 100, 15)
}

gs1 = GridSearchCV(modeling_pipe, params, verbose=1, cv=5, n_jobs=-1)
gs1.fit(train_X, train_y)

In [10]:
print(gs1.best_score_)
print(gs1.best_params_)

0.940516821063
{'model__C': 0.01, 'model__fit_intercept': True, 'model__penalty': 'l2', 'scaling': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [11]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = gs1.best_estimator_.predict(test_df)

#to csv
pred.to_csv('ClassSubs/LogReg.csv', index=False)

In [48]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', StandardScaler()),
    ('model', LogisticRegression(C=.01, penalty='l2'))
])
modeling_pipe.fit(Xtr, ytr)
preds = modeling_pipe.predict(Xte)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.937621832359


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,475,5
is_abnormal,27,6


### KNN Regression

In [None]:
#create the structure of the pipeline so it can be easily gridsearched
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', None),
    ('selectkbest', SelectKBest(score_func=f_classif, k=5)),
    ('model', KNeighborsClassifier())
])

#create parameters
params = {
    'scaling': [None, StandardScaler(), RobustScaler()],
    'selectkbest__k':[5, 10, 25, 50, 75, 100, 150, 'all'],
    'model__n_neighbors':[2, 4, 5, 10, 25, 50],
    'model__weights': ['uniform', 'distance']
}

gs2 = GridSearchCV(modeling_pipe, params, verbose=1, cv=5, n_jobs=-1)
gs2.fit(train_X, train_y)

In [13]:
print(gs2.best_score_)
print(gs2.best_params_)

0.938078985861
{'model__n_neighbors': 4, 'model__weights': 'uniform', 'scaling': StandardScaler(copy=True, with_mean=True, with_std=True), 'selectkbest__k': 50}


In [14]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = gs2.best_estimator_.predict(test_df)

#to csv
pred.to_csv('ClassSubs/KNNClass.csv', index=False)

In [49]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', StandardScaler()),
    ('selectkbest', SelectKBest(score_func=f_classif, k=50)),
    ('model', KNeighborsClassifier(n_neighbors=4))
])
modeling_pipe.fit(Xtr, ytr)
preds = modeling_pipe.predict(Xte)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.927875243665


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,475,5
is_abnormal,32,1


### SVC

In [50]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', StandardScaler()),
    ('model', SVC())
])
modeling_pipe.fit(Xtr, ytr)
preds = modeling_pipe.predict(Xte)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.933723196881


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,479,1
is_abnormal,33,0


In [21]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = modeling_pipe.predict(test_df)

#to csv
pred.to_csv('ClassSubs/SVClass.csv', index=False)

### Decision Tree Classifier

In [None]:
#create the structure of the pipeline so it can be easily gridsearched
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', None),
    ('model', DecisionTreeClassifier())
])

#create parameters
params = {
    'scaling': [None, StandardScaler(), RobustScaler()],
    'model__max_depth':[2, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, 150, None],
    'model__max_features':['auto', 'sqrt', 'log2']
}

gs4 = GridSearchCV(modeling_pipe, params, verbose=1, cv=5, n_jobs=-1)
gs4.fit(train_X, train_y)

In [23]:
print(gs4.best_score_)
print(gs4.best_params_)

0.936616284739
{'model__max_depth': 2, 'model__max_features': 'sqrt', 'scaling': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [24]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = gs4.best_estimator_.predict(test_df)

#to csv
pred.to_csv('ClassSubs/DTClass.csv', index=False)

In [51]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', StandardScaler()),
    ('model', DecisionTreeClassifier(max_depth=2, max_features='sqrt'))
])
modeling_pipe.fit(Xtr, ytr)
preds = modeling_pipe.predict(Xte)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.93567251462


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,480,0
is_abnormal,33,0


### Random Forest Classifier

In [None]:
#create the structure of the pipeline so it can be easily gridsearched
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', None),
    ('model', RandomForestClassifier())
])

#create parameters
params = {
    'scaling': [None, StandardScaler(), RobustScaler()],
    'model__max_depth':[2, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, 150, None],
    'model__max_features':['auto', 'sqrt', 'log2'],
    'model__n_estimators': [int(x) for x in np.logspace(1, 3, 10)]
}

gs5 = GridSearchCV(modeling_pipe, params, verbose=1, cv=5, n_jobs=-1)
gs5.fit(train_X, train_y)

In [26]:
print(gs5.best_score_)
print(gs5.best_params_)

0.93759141882
{'model__max_depth': 30, 'model__max_features': 'sqrt', 'model__n_estimators': 10, 'scaling': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)}


In [27]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = gs5.best_estimator_.predict(test_df)

#to csv
pred.to_csv('ClassSubs/RFClass.csv', index=False)

In [52]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', RobustScaler()),
    ('model', RandomForestClassifier(max_depth=30, max_features='sqrt', n_estimators=10))
])
modeling_pipe.fit(Xtr, ytr)
preds = modeling_pipe.predict(Xte)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.929824561404


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,477,3
is_abnormal,33,0


**Attempt to Account for Imbalanced Data**

Without importing any special libraries, I am going to hack together a dataframe that has more abnormal data

In [82]:
#training data
fdf = pd.read_csv('datasets/train.csv')
fdf['Sale Condition'] = fdf['Sale Condition'].apply(lambda x: 1 if x=='Abnorml' else 0)
abnormls = fdf[fdf['Sale Condition']==1].copy()
for i in range(0,4):
    fdf = pd.concat((fdf, abnormls))

# df now has 5x more abnormals
#save the training X without either target variable, then save the encoded classifier target
fX = fdf.drop(['Sale Condition', 'SalePrice'], axis=1)
fX = fX.reindex()
fy = fdf['Sale Condition']

#train test split
fXtr, fXte, fytr, fyte = train_test_split(fX, fy, random_state=42, stratify=fy)

Fit to the monstrosity of data

In [85]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', StandardScaler()),
    ('model', RandomForestClassifier(max_depth=30, max_features='sqrt', n_estimators=10))
])
modeling_pipe.fit(fXtr, fytr)
fpreds = modeling_pipe.predict(fXte)
preds = modeling_pipe.predict(Xte)
print('Duplicated Data Accuracy', accuracy_score(yte, preds))
print('Real Train Data Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Duplicated Data Accuracy 0.998050682261
Real Train Data Accuracy 0.998050682261


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,479,1
is_abnormal,0,33


It seems like this methodology has merit, try a submission

In [86]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = modeling_pipe.predict(test_df)

#to csv
pred.to_csv('ClassSubs/fRFClass.csv', index=False)

### AdaBoost Classifier

In [None]:
#create the structure of the pipeline so it can be easily gridsearched
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', None),
    #('selectkbest', SelectKBest(score_func=f_classif, k=5)),
    ('model', AdaBoostClassifier())
])

#create parameters
params = {
    'scaling': [None, StandardScaler(), RobustScaler()],
    #'selectkbest__k':[2, 5, 10, 25, 50, 75, 100, 150, 'all'],
    #'model__base_estimator':[gs1.best_estimator_,gs2.best_estimator_, gs3.best_estimator_, 
    #                         gs4.best_estimator_],
    'model__base_estimator': [LogisticRegression(), DecisionTreeClassifier()],
    'model__n_estimators': [int(x) for x in np.logspace(1, 3, 10)]
}

gs7 = GridSearchCV(modeling_pipe, params, verbose=1, cv=5, n_jobs=-1)
gs7.fit(train_X, train_y)

In [34]:
print(gs7.best_score_)
print(gs7.best_params_)

0.939054119941
{'model__base_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'model__n_estimators': 10, 'scaling': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [35]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = gs7.best_estimator_.predict(test_df)

#to csv
pred.to_csv('ClassSubs/ABClass.csv', index=False)

In [53]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', StandardScaler()),
    ('model', AdaBoostClassifier(base_estimator=LogisticRegression(), n_estimators=10))
])
modeling_pipe.fit(Xtr, ytr)
preds = modeling_pipe.predict(Xte)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.929824561404


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,473,7
is_abnormal,29,4


### Gradient Boosting Classifier

In [None]:
#create the structure of the pipeline so it can be easily gridsearched
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', None),
    ('model', GradientBoostingClassifier())
])

#create parameters
params = {
    'scaling': [None, StandardScaler(), RobustScaler()],
    'model__max_depth':[2, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100, 150, None],
    'model__max_features':['auto', 'sqrt', 'log2'],
    'model__n_estimators': [int(x) for x in np.linespace(1, 3, 10)]
}

gs8 = GridSearchCV(modeling_pipe, params, verbose=1, cv=5, n_jobs=-1)
gs8.fit(train_X, train_y)

In [37]:
print(gs8.best_score_)
print(gs8.best_params_)

0.939054119941
{'model__max_depth': 2, 'model__max_features': 'log2', 'model__n_estimators': 359, 'scaling': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [38]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = gs8.best_estimator_.predict(test_df)

#to csv
pred.to_csv('ClassSubs/GBClass.csv', index=False)

In [54]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', StandardScaler()),
    ('model', GradientBoostingClassifier(max_depth=2, max_features='log2', n_estimators=359))
])
modeling_pipe.fit(Xtr, ytr)
preds = modeling_pipe.predict(Xte)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.931773879142


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,476,4
is_abnormal,31,2


**Attempt to Account for Imbalanced Data**

Without importing any special libraries, I am going to hack together a dataframe that has more abnormal data

In [154]:
#training data
fdf = pd.read_csv('datasets/train.csv')
fdf['Sale Condition'] = fdf['Sale Condition'].apply(lambda x: 1 if x=='Abnorml' else 0)
abnormls = fdf[fdf['Sale Condition']==1].copy()
for i in range(0,1):
    fdf = pd.concat((fdf, abnormls))

# df now has 5x more abnormals
#save the training X without either target variable, then save the encoded classifier target
fX = fdf.drop(['Sale Condition', 'SalePrice'], axis=1)
fX = fX.reindex()
fy = fdf['Sale Condition']

#train test split
fXtr, fXte, fytr, fyte = train_test_split(fX, fy, random_state=42, stratify=fy)

In [99]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', StandardScaler()),
    ('model', GradientBoostingClassifier(max_depth=5, max_features='log2', n_estimators=300, subsample=.7))
])
modeling_pipe.fit(fXtr, fytr)
fpreds = modeling_pipe.predict(fXte)
preds = modeling_pipe.predict(Xte)
print('Duplicated Data Accuracy', accuracy_score(yte, preds))
print('Real Train Data Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Duplicated Data Accuracy 0.988304093567
Real Train Data Accuracy 0.988304093567


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,476,4
is_abnormal,2,31


Seems promising, may as well try it out.

In [153]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id

pred['Sale Condition'] = m_preds = [0 if nabn >= .45 else 1 for nabn, abn in modeling_pipe.predict_proba(test_df)]

#to csv
pred.to_csv('ClassSubs/fGBClass.csv', index=False)
pred['Sale Condition'].value_counts()

0    874
1      5
Name: Sale Condition, dtype: int64

### Voting Classifier

In [None]:
#create the structure of the pipeline so it can be easily gridsearched
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', None),
    #('selectkbest', SelectKBest(score_func=f_classif, k=5)),
    ('model', VotingClassifier(estimators=[('logR',LogisticRegression()), ('DTree',DecisionTreeClassifier())]))
    #('model', VotingClassifier(estimators=[gs1.best_estimator_,gs2.best_estimator_,
    #                                        gs3.best_estimator_, gs4.best_estimator_]))
])

#create parameters
params = {
    'scaling': [None, StandardScaler(), RobustScaler()],
    #'selectkbest__k':[5, 10, 25, 50, 75, 100, 150, 'all'],
    'model__voting': ['hard', 'soft']
}

gs9 = GridSearchCV(modeling_pipe, params, verbose=1, cv=5, n_jobs=-1)
gs9.fit(train_X, train_y)

In [42]:
print(gs9.best_score_)
print(gs9.best_params_)

0.936616284739
{'model__voting': 'hard', 'scaling': None}


In [43]:
#create predictions dataframe
pred = pd.DataFrame()
pred['Id'] = test_df.Id
pred['Sale Condition'] = gs9.best_estimator_.predict(test_df)

#to csv
pred.to_csv('ClassSubs/VoteClass.csv', index=False)

In [55]:
modeling_pipe = Pipeline([
    ('data', data_fu),
    ('scaling', None),
    ('model', VotingClassifier(estimators=[('logR',LogisticRegression()), ('DTree',DecisionTreeClassifier())], voting='hard'))
])
modeling_pipe.fit(Xtr, ytr)
preds = modeling_pipe.predict(Xte)
print('Accuracy', accuracy_score(yte, preds))

conmat = np.array(confusion_matrix(yte, preds, labels=[0,1]))

confusion = pd.DataFrame(conmat, index=['not_abnormal', 'is_abnormal'],
                         columns=['predicted_not_abnormal', 'predicted_abnormal'])
confusion

Accuracy 0.933723196881


Unnamed: 0,predicted_not_abnormal,predicted_abnormal
not_abnormal,477,3
is_abnormal,31,2
