In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
def check_var(var_string, df=None):
    """Print a quick profile of one column: completeness, dtype, and a summary.

    Parameters
    ----------
    var_string : str
        Name of the column to profile.
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``train`` frame is
        used, so existing ``check_var('Col')`` calls behave as before. Pass
        ``test`` (or any other frame) to profile that frame instead.

    Prints
    ------
    - the percentage of non-missing values, rounded to 2 decimals
    - the column dtype
    - ``value_counts()`` for object columns, otherwise ``describe()`` rounded
      to 2 decimals

    Raises
    ------
    KeyError
        If ``var_string`` is not a column of the frame.
    """
    if df is None:
        # Resolved at call time, so the function can be defined in a cell that
        # runs before the data-loading cell.
        df = train
    column = df[var_string]

    # Share of rows that are not NaN, expressed as a percentage.
    complete = 1 - (column.isna().sum() / len(column))
    complete = complete * 100
    print(f'{var_string}: {round(complete, 2)}% complete\n')

    var_type = column.dtype
    print(f'type: {var_type}\n')

    if var_type == object:
        # Categorical/text column: show the frequency of each level.
        print(f'{column.value_counts()}\n\n')
    else:
        # Summarise only this column rather than describing the whole frame
        # and then discarding every other column's statistics.
        print(f'{round(column.describe(), 2)}\n\n')

In [3]:
# Read the training and test splits from the local datasets directory.
train_path = './datasets/train.csv'
test_path = './datasets/test.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

### ID

- Key Variable; Exclude from X
- Action Needed: None

Id: 100.0% complete

type: int64

count    2051.00
mean     1474.03
std       843.98
min         1.00
25%       753.50
50%      1486.00
75%      2198.00
max      2930.00

### PID

- Parcel ID, ID for another database; Exclude from X
- Action Needed: None

PID: 100.0% complete

type: int64

count    2.051000e+03
mean     7.135900e+08
std      1.886918e+08
min      5.263011e+08
25%      5.284581e+08
50%      5.354532e+08
75%      9.071801e+08
max      9.241520e+08

### MS SubClass

- Codes for building class/type
- Action Needed: convert to string, dummify

MS SubClass: 100.0% complete

type: int64

count    2051.00
mean       57.01
std        42.82
min        20.00
25%        20.00
50%        50.00
75%        70.00
max       190.00

### MS Zoning

- Codes for zoning classification
- Action Needed: dummify

MS Zoning: 100.0% complete

type: object

RL         1598
RM          316
FV          101
C (all)      19
RH           14
A (agr)       2
I (all)       1

### Lot Frontage

- Feet of street connected to property
- Action Needed: impute missing values using sqrt of Lot Area

Lot Frontage: 83.91028766455388% complete

type: float64

count    1721.00
mean       69.06
std        23.26
min        21.00
25%        58.00
50%        68.00
75%        80.00
max       313.00

### Lot Area

- Lot size in square feet
- Action Needed: None

Lot Area: 100.0% complete

type: int64

count      2051.00
mean      10065.21
std        6742.49
min        1300.00
25%        7500.00
50%        9430.00
75%       11513.50
max      159000.00

### Street

- Type of access to property
- Action Needed: dummify

Street: 100.0% complete

type: object

Pave    2044
Grvl       7

### Alley

- Type of alley access, if there is an alley
- Action Needed: fill missing values with zeros, convert to boolean, dummify

Alley: 6.825938566552903% complete

type: object

Grvl    85
Pave    55

### Lot Shape

- Shape of lot
- Action Needed: dummify

Lot Shape: 100.0% complete

type: object

Reg    1295
IR1     692
IR2      55
IR3       9

### Land Contour

- Flatness of property
- Action Needed: dummify

Land Contour: 100.0% complete

type: object

Lvl    1843
HLS      85
Bnk      80
Low      43

### Utilities

- Utility access
- Action Needed: dummify

Utilities: 100.0% complete

type: object

AllPub    2049
NoSewr       1
NoSeWa       1

### Lot Config

- Lot configuration
- Action Needed: dummify

Lot Config: 100.0% complete

type: object

Inside     1503
Corner      348
CulDSac     131
FR2          60
FR3           9

### Land Slope

- Land Slope of property
- Action Needed: dummify

Land Slope: 100.0% complete

type: object

Gtl    1953
Mod      88
Sev      10

### Neighborhood

- Name of neighborhood
- Action Needed: dummify

Neighborhood: 100.0% complete

type: object

NAmes      310
CollgCr    180
OldTown    163
Edwards    143
Somerst    130
NridgHt    122
Gilbert    116
Sawyer     111
NWAmes      87
SawyerW     87
Mitchel     82
BrkSide     76
Crawfor     71
IDOTRR      69
Timber      48
NoRidge     48
StoneBr     38
SWISU       32
ClearCr     27
MeadowV     24
Blmngtn     22
BrDale      19
Veenker     17
NPkVill     17
Blueste      6
Greens       3
GrnHill      2
Landmrk      1

### Condition 1

- Proximity to main road or railroad
- Action Needed: dummify

Condition 1: 100.0% complete

type: object

Norm      1767
Feedr      109
Artery      70
RRAn        36
PosN        27
RRAe        21
PosA        12
RRNn         6
RRNe         3

### Condition 2

- Proximity to second main road or railroad, if one is present
- Action Needed: dummify

Condition 2: 100.0% complete

type: object

Norm      2025
Feedr       11
Artery       5
PosN         3
PosA         3
RRNn         2
RRAn         1
RRAe         1

### Bldg Type

- House type
- Action Needed: dummify

Bldg Type: 100.0% complete

type: object

1Fam      1700
TwnhsE     161
Duplex      75
Twnhs       69
2fmCon      46

### House Style

- House style
- Action Needed: dummify

House Style: 100.0% complete

type: object

1Story    1059
2Story     598
1.5Fin     218
SLvl        94
SFoyer      50
2.5Unf      14
1.5Unf      12
2.5Fin       6

### Overall Qual

- Overall quality, ordinal var
- Action Needed: None

Overall Qual: 100.0% complete

type: int64

count    2051.00
mean        6.11
std         1.43
min         1.00
25%         5.00
50%         6.00
75%         7.00
max        10.00

### Overall Cond

- Overall condition, ordinal var
- Action Needed: None

Overall Cond: 100.0% complete

type: int64

count    2051.00
mean        5.56
std         1.10
min         1.00
25%         5.00
50%         5.00
75%         6.00
max         9.00

### Year Built

- Year built, ordinal var
- Action Needed: None

Year Built: 100.0% complete

type: int64

count    2051.00
mean     1971.71
std        30.18
min      1872.00
25%      1953.50
50%      1974.00
75%      2001.00
max      2010.00

### Year Remod/Add

- Year of most recent remodel; if never remodeled, same as year built
- Action Needed: None

Year Remod/Add: 100.0% complete

type: int64

count    2051.00
mean     1984.19
std        21.04
min      1950.00
25%      1964.50
50%      1993.00
75%      2004.00
max      2010.00

### Roof Style

- Type/shape of roof
- Action Needed: dummify

Roof Style: 100.0% complete

type: object

Gable      1619
Hip         397
Flat         13
Gambrel      12
Mansard       7
Shed          3

### Roof Matl

- Material roof is made of
- Action Needed: dummify

Roof Matl: 100.0% complete

type: object

CompShg    2025
Tar&Grv      15
WdShngl       5
WdShake       4
Membran       1
ClyTile       1

### Exterior 1st

- Primary material house is made of
- Action Needed: dummify

Exterior 1st: 100.0% complete

type: object

VinylSd    724
MetalSd    331
HdBoard    300
Wd Sdng    276
Plywood    152
CemntBd     90
BrkFace     64
WdShing     45
AsbShng     33
Stucco      27
BrkComm      3
CBlock       2
Stone        2
AsphShn      1
ImStucc      1

### Exterior 2nd

- Secondary material house is made of, if more than one material
- Action Needed: dummify

Exterior 2nd: 100.0% complete

type: object

VinylSd    721
MetalSd    324
HdBoard    275
Wd Sdng    262
Plywood    185
CmentBd     90
Wd Shng     63
BrkFace     34
Stucco      30
AsbShng     28
Brk Cmn     17
ImStucc     11
Stone        6
AsphShn      3
CBlock       2

### Mas Vnr Type

- Masonry veneer type
- Action Needed: fill missing values with None and dummify

Mas Vnr Type: 98.93% complete

type: object

None       1218
BrkFace     630
Stone       168
BrkCmn       13

### Mas Vnr Area

- Masonry veneer area
- Action Needed: fill missing values with zeros (numeric column, so no dummies needed)

Mas Vnr Area: 98.93% complete

type: float64

count    2029.00
mean       99.70
std       174.96
min         0.00
25%         0.00
50%         0.00
75%       161.00
max      1600.00

### Exter Qual

- Quality of exterior of house
- Action Needed: ordinal

Exter Qual: 100.0% complete

type: object

TA    1247
Gd     697
Ex      81
Fa      26

### Exter Cond

- Condition of exterior of house
- Action Needed: ordinal

Exter Cond: 100.0% complete

type: object

TA    1778
Gd     215
Fa      49
Ex       7
Po       2

### Foundation

- Type of foundation
- Action Needed: dummify

Foundation: 100.0% complete

type: object

PConc     926
CBlock    863
BrkTil    221
Slab       34
Stone       5
Wood        2

### Bsmt Qual

- Quality of basement
- Action Needed: NaNs converted to None, changed to ordinal

Bsmt Qual: 97.24448897795591% complete

type: object

TA    887
Gd    864
Ex    184
Fa     60
Po      1

### Bsmt Cond

- Condition of basement
- Action Needed: NaNs converted to None, changed to ordinal

Bsmt Cond: 97.24448897795591% complete

type: object

TA    1834
Gd      89
Fa      65
Po       5
Ex       3

### Bsmt Exposure

- Basement Exposure, whether or not you can get outside from basement?
- Action Needed: impute missing values and dummify, or drop

Bsmt Exposure: 97.0898143502258% complete

type: object

No    1339
Av     288
Gd     203
Mn     163

### BsmtFin Type 1

- Quality of finished basement area
- Action Needed: Add None, convert to categorical

BsmtFin Type 1: 97.24448897795591% complete

type: object

GLQ    615
Unf    603
ALQ    293
BLQ    200
Rec    183
LwQ    102

### BsmtFin SF 1

- SF of finished basement space
- Action Needed: impute missing values (numeric column, so no dummies needed), or drop

BsmtFin SF 1: 99.95121951219512% complete

type: float64

count    2050.00
mean      442.30
std       461.20
min         0.00
25%         0.00
50%       368.00
75%       733.75
max      5644.00

### BsmtFin Type 2

- Quality of second finished area?
- Action Needed: drop

BsmtFin Type 2: 97.19298245614036% complete

type: object

Unf    1749
Rec      80
LwQ      60
BLQ      48
ALQ      35
GLQ      23

### BsmtFin SF 2

- SF of second type finished area
- Action Needed: fill missing values with zeros

BsmtFin SF 2: 99.95121951219512% complete

type: float64

count    2050.00
mean       47.96
std       165.00
min         0.00
25%         0.00
50%         0.00
75%         0.00
max      1474.00

### Bsmt Unf SF

- SF of unfinished basement area
- Action Needed: fill missing values with zeros

Bsmt Unf SF: 99.95121951219512% complete

type: float64

count    2050.00
mean      567.73
std       444.95
min         0.00
25%       220.00
50%       474.50
75%       811.00
max      2336.00

### Total Bsmt SF

- Total basement SF
- Action Needed: fill missing values with zeros

Total Bsmt SF: 99.95121951219512% complete

type: float64

count    2050.00
mean     1057.99
std       449.41
min         0.00
25%       793.00
50%       994.50
75%      1318.75
max      6110.00

### Heating

- Type of heating system
- Action Needed: dummify

Heating: 100.0% complete

type: object

GasA    2018
GasW      20
Wall       6
Grav       5
OthW       2

### Heating QC

- Quality of heating system
- Action Needed: ordinal

Heating QC: 100.0% complete

type: object

Ex    1065
TA     597
Gd     319
Fa      67
Po       3

### Central Air

- Whether or not there is central air
- Action Needed: boolean dummify

Central Air: 100.0% complete

type: object

Y    1910
N     141

### Electrical

- Type of electrical system
- Action Needed: ordinal

Electrical: 100.0% complete

type: object

SBrkr    1868
FuseA     140
FuseF      35
FuseP       7
Mix         1

### 1st Flr SF

- SF of 1st floor
- Action Needed: None

1st Flr SF: 100.0% complete

type: int64

count    2051.00
mean     1164.49
std       396.45
min       334.00
25%       879.50
50%      1093.00
75%      1405.00
max      5095.00

### 2nd Flr SF

- SF of 2nd floor
- Action Needed: None

2nd Flr SF: 100.0% complete

type: int64

count    2051.00
mean      329.33
std       425.67
min         0.00
25%         0.00
50%         0.00
75%       692.50
max      1862.00

### Low Qual Fin SF

- SF of low quality finish
- Action Needed: None

Low Qual Fin SF: 100.0% complete

type: int64

count    2051.00
mean        5.51
std        51.07
min         0.00
25%         0.00
50%         0.00
75%         0.00
max      1064.00

### Gr Liv Area

- SF of above ground living area
- Action Needed: None

Gr Liv Area: 100.0% complete

type: int64

count    2051.00
mean     1499.33
std       500.45
min       334.00
25%      1129.00
50%      1444.00
75%      1728.50
max      5642.00

### Bsmt Full Bath

- Number of full bathrooms in basement
- Action Needed: fill missing values with zeros

Bsmt Full Bath: 99.90239141044412% complete

type: float64

count    2049.00
mean        0.43
std         0.52
min         0.00
25%         0.00
50%         0.00
75%         1.00
max         3.00

### Bsmt Half Bath

- Number of half baths in basement
- Action Needed: fill missing values with zeros

Bsmt Half Bath: 99.90239141044412% complete

type: float64

count    2049.00
mean        0.06
std         0.25
min         0.00
25%         0.00
50%         0.00
75%         0.00
max         2.00

### Full Bath

- Number of full baths
- Action Needed: None

Full Bath: 100.0% complete

type: int64

count    2051.00
mean        1.58
std         0.55
min         0.00
25%         1.00
50%         2.00
75%         2.00
max         4.00

### Half Bath

- Number of half baths
- Action Needed: None

Half Bath: 100.0% complete

type: int64

count    2051.00
mean        0.37
std         0.50
min         0.00
25%         0.00
50%         0.00
75%         1.00
max         2.00

### Bedroom AbvGr

- Number of above ground bedrooms
- Action Needed: None

Bedroom AbvGr: 100.0% complete

type: int64

count    2051.00
mean        2.84
std         0.83
min         0.00
25%         2.00
50%         3.00
75%         3.00
max         8.00

### Kitchen AbvGr

- Number of above ground kitchens
- Action Needed: None

Kitchen AbvGr: 100.0% complete

type: int64

count    2051.00
mean        1.04
std         0.21
min         0.00
25%         1.00
50%         1.00
75%         1.00
max         3.00

### Kitchen Qual

- Kitchen quality
- Action Needed: change to ordinal

Kitchen Qual: 100.0% complete

type: object

TA    1047
Gd     806
Ex     151
Fa      47

### TotRms AbvGrd

- Number of rooms above ground
- Action Needed: None

TotRms AbvGrd: 100.0% complete

type: int64

count    2051.00
mean        6.44
std         1.56
min         2.00
25%         5.00
50%         6.00
75%         7.00
max        15.00

### Functional

- Home functionality rating
- Action Needed: convert to ordinal

Functional: 100.0% complete

type: object

Typ     1915
Min2      42
Min1      42
Mod       29
Maj1      12
Maj2       7
Sal        2
Sev        2

### Fireplaces

- Number of fireplaces
- Action Needed: None

Fireplaces: 100.0% complete

type: int64

count    2051.00
mean        0.59
std         0.64
min         0.00
25%         0.00
50%         1.00
75%         1.00
max         4.00

### Fireplace Qu

- Fireplace quality
- Action Needed: fill missing values with None and dummify

Fireplace Qu: 51.24% complete

type: object

Gd    523
TA    407
Fa     59
Po     31
Ex     31

### Garage Type

- Type of garage
- Action Needed: add None and dummify

Garage Type: 94.16924664602682% complete

type: object

Attchd     1213
Detchd      536
BuiltIn     132
Basment      27
2Types       19
CarPort      11

### Garage Yr Blt

- Year garage was built
- Action Needed: fill missing values from the Year Built column

Garage Yr Blt: 94.11461022199276% complete

type: float64

count    1937.00
mean     1978.71
std        25.44
min      1895.00
25%      1961.00
50%      1980.00
75%      2002.00
max      2207.00

### Garage Finish

- Quality of garage finish
- Action Needed: fill missing values with None and change to ordinal

Garage Finish: 94.11461022199276% complete

type: object

Unf    849
RFn    579
Fin    509

### Garage Cars

- Number of cars garage can fit
- Action Needed: fill missing values with zeros

Garage Cars: 99.95121951219512% complete

type: float64

count    2050.00
mean        1.78
std         0.76
min         0.00
25%         1.00
50%         2.00
75%         2.00
max         5.00

### Garage Area

- SF of garage
- Action Needed: fill missing values with zeros

Garage Area: 99.95121951219512% complete

type: float64

count    2050.00
mean      473.67
std       215.93
min         0.00
25%       319.00
50%       480.00
75%       576.00
max      1418.00

### Garage Qual

- Quality of garage
- Action Needed: drop

Garage Qual: 94.11461022199276% complete

type: object

TA    1832
Fa      82
Gd      18
Ex       3
Po       2

### Garage Cond

- Condition of garage
- Action Needed: drop

Garage Cond: 94.11461022199276% complete

type: object

TA    1868
Fa      47
Gd      12
Po       8
Ex       2

### Paved Drive

- Whether or not driveway is paved
- Action Needed: change to boolean

Paved Drive: 100.0% complete

type: object

Y    1861
N     151
P      39

### Wood Deck SF

- SF of wood decks
- Action Needed: None

Wood Deck SF: 100.0% complete

type: int64

count    2051.00
mean       93.83
std       128.55
min         0.00
25%         0.00
50%         0.00
75%       168.00
max      1424.00

### Open Porch SF

- SF of open porches
- Action Needed: None

Open Porch SF: 100.0% complete

type: int64

count    2051.00
mean       47.56
std        66.75
min         0.00
25%         0.00
50%        27.00
75%        70.00
max       547.00

### Enclosed Porch

- SF of enclosed porches
- Action Needed: None

Enclosed Porch: 100.0% complete

type: int64

count    2051.00
mean       22.57
std        59.85
min         0.00
25%         0.00
50%         0.00
75%         0.00
max       432.00

### 3Ssn Porch

- 3 season porch SF
- Action Needed: None

3Ssn Porch: 100.0% complete

type: int64

count    2051.00
mean        2.59
std        25.23
min         0.00
25%         0.00
50%         0.00
75%         0.00
max       508.00

### Screen Porch

- SF of screened porches
- Action Needed: None

Screen Porch: 100.0% complete

type: int64

count    2051.00
mean       16.51
std        57.37
min         0.00
25%         0.00
50%         0.00
75%         0.00
max       490.00

### Pool Area

- SF of pool
- Action Needed: change to boolean

Pool Area: 100.0% complete

type: int64

count    2051.00
mean        2.40
std        37.78
min         0.00
25%         0.00
50%         0.00
75%         0.00
max       800.00

### Pool QC

- Pool Quality
- Action Needed: drop

Pool QC: 0.44% complete

type: object

Gd    4
Fa    2
TA    2
Ex    1

### Fence

- Fence type
- Action Needed: change to boolean

Fence: 19.5% complete

type: object

MnPrv    227
GdPrv     83
GdWo      80
MnWw      10

### Misc Feature

- Miscellaneous features grouped together
- Action Needed: drop

Misc Feature: 3.17% complete

type: object

Shed    56
Gar2     4
Othr     3
Elev     1
TenC     1

### Misc Val

- Value of misc feature
- Action Needed: None

Misc Val: 100.0% complete

type: int64

count     2051.00
mean        51.57
std        573.39
min          0.00
25%          0.00
50%          0.00
75%          0.00
max      17000.00

### Mo Sold

- Month sold in
- Action Needed: make categorical, dummify

Mo Sold: 100.0% complete

type: int64

count    2051.00
mean        6.22
std         2.74
min         1.00
25%         4.00
50%         6.00
75%         8.00
max        12.00

### Yr Sold

- Year sold in
- Action Needed: change to categorical, dummify

Yr Sold: 100.0% complete

type: int64

count    2051.00
mean     2007.78
std         1.31
min      2006.00
25%      2007.00
50%      2008.00
75%      2009.00
max      2010.00

### Sale Type

- Type of sale
- Action Needed: dummify

Sale Type: 100.0% complete

type: object

WD       1781
New       160
COD        63
ConLD      17
CWD        10
ConLI       7
ConLw       5
Con         4
Oth         4

### Sale Price

- Price of sale - **Target Variable**
- Action Needed: None

SalePrice: 100.0% complete

type: int64

count      2051.00
mean     181469.70
std       79258.66
min       12789.00
25%      129825.00
50%      162500.00
75%      214000.00
max      611657.00
