In [150]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
%matplotlib inline
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

### Directions

Engineer your features, then create three models. Each model will be run on a training set and a test-set (or multiple test-sets, if you take a folds approach). The models should be:

+ Vanilla logistic regression
+ Ridge logistic regression
+ Lasso logistic regression

If you're stuck on how to begin combining your two new modeling skills, here's a hint: the SKlearn LogisticRegression method has a "penalty" argument that takes either 'l1' or 'l2' as a value.

In your report, evaluate all three models and decide on your best. Be clear about the decisions you made that led to these models (feature selection, regularization parameter selection, model evaluation criteria) and why you think that particular model is the best of the three. Also reflect on the strengths and limitations of regression as a modeling approach. Were there things you couldn't do but you wish you could have done?

In [151]:
df = pd.read_csv("train.csv")

In [152]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [153]:
df.shape[0]

1460

In [154]:
trainsize = int(df.shape[0]/2)

In [155]:
trainsize

730

In [156]:
df.select_dtypes(include=["object"]).head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [157]:
for col in df.select_dtypes(include=["object"]):
    print(col)
    print(df[col].nunique())

MSZoning
5
Street
2
Alley
2
LotShape
4
LandContour
4
Utilities
2
LotConfig
5
LandSlope
3
Neighborhood
25
Condition1
9
Condition2
8
BldgType
5
HouseStyle
8
RoofStyle
6
RoofMatl
8
Exterior1st
15
Exterior2nd
16
MasVnrType
4
ExterQual
4
ExterCond
5
Foundation
6
BsmtQual
4
BsmtCond
4
BsmtExposure
4
BsmtFinType1
6
BsmtFinType2
6
Heating
6
HeatingQC
5
CentralAir
2
Electrical
5
KitchenQual
4
Functional
7
FireplaceQu
5
GarageType
6
GarageFinish
3
GarageQual
5
GarageCond
5
PavedDrive
3
PoolQC
3
Fence
4
MiscFeature
4
SaleType
9
SaleCondition
6


In [158]:
df.select_dtypes(exclude=["object"]).columns

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [159]:
df["YearBuilt"].unique()

array([2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 1939, 1965,
       2005, 1962, 2006, 1960, 1929, 1970, 1967, 1958, 1930, 2002, 1968,
       2007, 1951, 1957, 1927, 1920, 1966, 1959, 1994, 1954, 1953, 1955,
       1983, 1975, 1997, 1934, 1963, 1981, 1964, 1999, 1972, 1921, 1945,
       1982, 1998, 1956, 1948, 1910, 1995, 1991, 2009, 1950, 1961, 1977,
       1985, 1979, 1885, 1919, 1990, 1969, 1935, 1988, 1971, 1952, 1936,
       1923, 1924, 1984, 1926, 1940, 1941, 1987, 1986, 2008, 1908, 1892,
       1916, 1932, 1918, 1912, 1947, 1925, 1900, 1980, 1989, 1992, 1949,
       1880, 1928, 1978, 1922, 1996, 2010, 1946, 1913, 1937, 1942, 1938,
       1974, 1893, 1914, 1906, 1890, 1898, 1904, 1882, 1875, 1911, 1917,
       1872, 1905], dtype=int64)

In [160]:
df["BuildOldHouse"] = np.where(df["YearBuilt"] < 1930, 1, 0)

In [161]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,BuildOldHouse
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,1
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,0


In [162]:
df["BuildSemiOldHouse"] = np.where((df["YearBuilt"] > 1930) & (df["YearBuilt"] < 1965), 1, 0)

In [163]:
df["BuiltNotOldHouse"] = np.where((df["YearBuilt"] > 1965) & (df["YearBuilt"] < 1990), 1, 0)

In [164]:
df["BuildContemporaryHouse"] = np.where(df["YearBuilt"] > 1990, 1, 0)

In [165]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,BuildOldHouse,BuildSemiOldHouse,BuiltNotOldHouse,BuildContemporaryHouse
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,2,2008,WD,Normal,208500,0,0,0,1
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,5,2007,WD,Normal,181500,0,0,1,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,9,2008,WD,Normal,223500,0,0,0,1
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,2,2006,WD,Abnorml,140000,1,0,0,0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,12,2008,WD,Normal,250000,0,0,0,1


In [166]:
df["YrSold"].unique()

array([2008, 2007, 2006, 2009, 2010], dtype=int64)

In [167]:
df = pd.get_dummies(df)

In [168]:
df.shape

(1460, 294)

In [169]:
for col in df.columns:
    print(col)
    print(df[col].isna().sum())

Id
0
MSSubClass
0
LotFrontage
259
LotArea
0
OverallQual
0
OverallCond
0
YearBuilt
0
YearRemodAdd
0
MasVnrArea
8
BsmtFinSF1
0
BsmtFinSF2
0
BsmtUnfSF
0
TotalBsmtSF
0
1stFlrSF
0
2ndFlrSF
0
LowQualFinSF
0
GrLivArea
0
BsmtFullBath
0
BsmtHalfBath
0
FullBath
0
HalfBath
0
BedroomAbvGr
0
KitchenAbvGr
0
TotRmsAbvGrd
0
Fireplaces
0
GarageYrBlt
81
GarageCars
0
GarageArea
0
WoodDeckSF
0
OpenPorchSF
0
EnclosedPorch
0
3SsnPorch
0
ScreenPorch
0
PoolArea
0
MiscVal
0
MoSold
0
YrSold
0
SalePrice
0
BuildOldHouse
0
BuildSemiOldHouse
0
BuiltNotOldHouse
0
BuildContemporaryHouse
0
MSZoning_C (all)
0
MSZoning_FV
0
MSZoning_RH
0
MSZoning_RL
0
MSZoning_RM
0
Street_Grvl
0
Street_Pave
0
Alley_Grvl
0
Alley_Pave
0
LotShape_IR1
0
LotShape_IR2
0
LotShape_IR3
0
LotShape_Reg
0
LandContour_Bnk
0
LandContour_HLS
0
LandContour_Low
0
LandContour_Lvl
0
Utilities_AllPub
0
Utilities_NoSeWa
0
LotConfig_Corner
0
LotConfig_CulDSac
0
LotConfig_FR2
0
LotConfig_FR3
0
LotConfig_Inside
0
LandSlope_Gtl
0
LandSlope_Mod
0
LandSlope_Sev
0

In [170]:
df = df.dropna()

df.shape

(1121, 294)

In [171]:
df["Affordable"] = np.where(df["SalePrice"] < 165000, 1, 0)

In [172]:
df["SalePrice"].describe()

count      1121.000000
mean     185506.152542
std       82999.159004
min       35311.000000
25%      131000.000000
50%      164900.000000
75%      219500.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [173]:
del df["SalePrice"]

In [174]:
df["Affordable"][10:30]

11    0
13    0
15    1
17    1
18    1
19    1
20    0
21    1
22    0
23    1
25    0
26    1
27    0
28    0
29    1
30    1
32    0
33    0
34    0
35    0
Name: Affordable, dtype: int32

In [175]:
df_test = df.iloc[trainsize:, :-1]
df_train = df.iloc[:trainsize, :-1]
outcome_test = df["Affordable"][trainsize:].ravel()
outcome_train = df["Affordable"][:trainsize].ravel()

### Logistic Regression

In [176]:
lr = LogisticRegression()
X = df_train
y = outcome_train

model_1 = lr.fit(X,y)

In [177]:
print("coefficients for vanilla logistic regression:")
print(model_1.coef_)
print(model_1.intercept_)


coefficients for vanilla logistic regression:
[[-6.90067098e-05  1.76785158e-02 -1.67898953e-02 -9.02572270e-05
  -1.10151397e+00 -4.75882400e-01 -4.16196070e-02 -2.06823624e-03
  -2.91274447e-03 -1.39974949e-03 -1.22412943e-03 -2.48130955e-04
  -2.87200988e-03 -9.60742047e-04 -9.22847527e-04 -3.32187653e-03
  -5.20546609e-03  1.07927369e-01 -3.75041221e-01 -1.08905562e+00
  -5.30701061e-01 -1.04675820e-01  3.79235872e-01  7.05424576e-02
  -8.66110907e-02 -3.06890439e-03 -2.09118079e-01 -2.54879663e-03
  -4.58382402e-04 -6.58171507e-03 -3.01810723e-03 -2.70118493e-03
  -8.88671713e-03 -1.64063539e-02  5.38906640e-03 -2.73721950e-02
   6.06421771e-02 -3.14652109e-01 -8.89645172e-03  2.69337667e-01
  -1.18506211e-01  6.91171477e-02 -3.22878986e-01  5.33820561e-02
  -7.11703354e-01  9.12795033e-01  2.21257097e-04  4.90639477e-04
   5.38674183e-02 -1.75651569e-01 -5.16631259e-02  3.09323841e-03
  -2.52561703e-06  4.92843096e-02  2.64884847e-01 -9.46519067e-03
   2.70845776e-01 -5.25553536e

In [178]:
model_1_pred_y = model_1.predict(X)

In [179]:
print("Number of mislabeled points out of a total of {} points: {}".format(df_train.shape[0], (outcome_train != model_1_pred_y).sum()))

Number of mislabeled points out of a total of 730 points: 17


In [180]:
print("Accuracy of vanilla logistic regression on training set: ", 
      ((outcome_train == model_1_pred_y).sum()) / df_train.shape[0])

Accuracy of vanilla logistic regression on training set:  0.9767123287671233


In [181]:
lr = LogisticRegression()
X = df_test
y = outcome_test

model_1 = lr.fit(X,y)

model_1_pred_y = model_1.predict(X)

In [182]:
print("Accuracy of vanilla logistic regression on test set: ", 
      ((outcome_test == model_1_pred_y).sum()) / df_test.shape[0])

Accuracy of vanilla logistic regression on test set:  0.9616368286445013


In [183]:
lrtrain = LogisticRegression()
lrtrain.fit(df_train, outcome_train)
lrtest = LogisticRegression()
lrtest.fit(df_test, outcome_test)

print("Difference in r-squared between train and test: ", lrtrain.score(df_train, outcome_train) - lrtest.score(df_test, outcome_test))


Difference in r-squared between train and test:  0.015075500122622043


### Ridge Regression

In [184]:
ridgereg = linear_model.Ridge(alpha=.5, fit_intercept=False)
ridgereg.fit(df_train, outcome_train)
ridgereg.score(df_train, outcome_train)

0.8211530876605193

In [185]:
for lam in lambdas:
    ridgereg = linear_model.Ridge(alpha=lam, fit_intercept=False)
    ridgereg.fit(df_train, outcome_train)
    print(lam, ridgereg.score(df_train, outcome_train))

0 0.8262016893187637
0.02 0.8251688580623975
0.05 0.8248867566839628
0.1 0.8244413282375768
0.15 0.8239768233204712
0.25 0.8230769646510642
0.35 0.822253384517958
0.5 0.8211530876605193
0.55 0.8208164905983636
0.65 0.8201806640428854
0.75 0.8195877285397908
0.95 0.8185052122278296


In [186]:
ridgereg = linear_model.Ridge(alpha=.02, fit_intercept=False)
ridgereg.fit(df_train, outcome_train)
print("Accuracy of ridge regression on training set: ", 
      ridgereg.score(df_train, outcome_train))

Accuracy of ridge regression on training set:  0.8251688580623975


In [187]:
ridgereg = linear_model.Ridge(alpha=.02, fit_intercept=False)
ridgereg.fit(df_test, outcome_test)
print("Accuracy of ridge regression on test set: ", 
      ridgereg.score(df_test, outcome_test))

Accuracy of ridge regression on test set:  0.8779099383025759


In [188]:
ridgeregtrain = linear_model.Ridge(alpha=.02, fit_intercept=False)
ridgeregtrain.fit(df_train, outcome_train)
ridgeregtest = linear_model.Ridge(alpha=.02, fit_intercept=False)
ridgeregtest.fit(df_test, outcome_test)

print("Difference in r-squared between train and test: ", ridgeregtrain.score(df_train, outcome_train) - ridgeregtest.score(df_test, outcome_test))


Difference in r-squared between train and test:  -0.05274108024017843


### Lasso Regression

In [189]:
lass = linear_model.Lasso()
lass.fit(df_train, outcome_train)
print("Accuracy of ridge regression on training set: ", 
      lass.score(df_train, outcome_train))

Accuracy of ridge regression on training set:  0.6121343353425919


In [190]:
lambdas = [0, .02, .05, .10, .15, .25, .35, .5, .55, .65, .75, .95 ]

for lam in lambdas:
    lass = linear_model.Lasso(alpha=lam)
    lass.fit(df_train, outcome_train)
    print(lam, lass.score(df_train, outcome_train))

  """
  positive)


0 0.8264233672238277
0.02 0.6832363168368452
0.05 0.639346761176592
0.1 0.6344286768838217
0.15 0.6339192506793174
0.25 0.6322826366056876
0.35 0.6298408387646803
0.5 0.6262549845681622
0.55 0.6248961559666126
0.65 0.6226404480674184
0.75 0.6200926855332368
0.95 0.6139078032905236


In [191]:
lass = linear_model.Lasso(alpha=.02)
lass.fit(df_train, outcome_train)
print("Accuracy of lasso regression on training set: ", 
      lass.score(df_train, outcome_train))

Accuracy of lasso regression on training set:  0.6832363168368452


In [192]:
lass = linear_model.Lasso(alpha=.02)
lass.fit(df_test, outcome_test)
print("Accuracy of lasso regression on test set: ", 
      lass.score(df_test, outcome_test))

Accuracy of lasso regression on test set:  0.6459236067557461


In [193]:
lasstrain = linear_model.Lasso(alpha=.02)
lasstrain.fit(df_train, outcome_train)
lasstest = linear_model.Lasso(alpha=.02)
lasstest.fit(df_test, outcome_test)
print("Difference in r-squared between train and test: ", lasstrain.score(df_train, outcome_train) - lasstest.score(df_test, outcome_test))


Difference in r-squared between train and test:  0.03731271008109904


### Report

The feature engineering I did before running these models included creating four different features that focused on the year the house was built. Additionally, I created a feature about whether or not the house was "affordable" ($165000 or less), and this became the outcome I was looking to predict based on the other features in the dataset. 

When evaluating the models, I focused on the r-squared score and the difference between the r-squared score for the training set versus the test set. 

The strongest model of the three was the vanilla logistic regression model. This model had the highest accuracy and the smallest difference between the test and train r-squared scores. 

There are many strengths of regression as a modeling approach because the different features allow the model to build different algorithms that create the line that best fits the data.