In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

%matplotlib inline

## Load the Data

---

Today's dataset (`train.csv`) is from the [Kaggle DSI-US-6 Project 2 Regression Challenge](https://www.kaggle.com/c/dsi-us-6-project-2-regression-challenge/data).




In [59]:
# Locations of the train/test CSVs, relative to the notebook's working directory.
train_dst = './datasets/train.csv'
test_dst = './datasets/test.csv'

# Read both files (train first, then test) into DataFrames.
traindf, testdf = (pd.read_csv(csv_path) for csv_path in (train_dst, test_dst))


In [60]:
# Preview the first rows of the training frame to sanity-check the load.
traindf.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [61]:
# (rows, columns) of the training frame.
traindf.shape

(2051, 81)

In [62]:
# Boolean Series indexed by column name: True where the column holds at least one NaN.
column_has_nan = traindf.isna().any()

# Keep only the flagged column names, as a plain list.
nancolumns = column_has_nan[column_has_nan].index.tolist()

print(len(nancolumns))
# 26 columns contain NaN values; the next step is to decide which of them to
# remove, since we cannot simply remove all of them.
print(nancolumns)

26
['Lot Frontage', 'Alley', 'Mas Vnr Type', 'Mas Vnr Area', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garage Cond', 'Pool QC', 'Fence', 'Misc Feature']


In [63]:
# Count the NaNs in every flagged column in one pass, then print one
# "<column name> <count>" line per column, in the order given by nancolumns.
nan_counts = traindf[nancolumns].isnull().sum()
for column_name, n_missing in nan_counts.items():
    print(column_name, n_missing)

Lot Frontage 330
Alley 1911
Mas Vnr Type 22
Mas Vnr Area 22
Bsmt Qual 55
Bsmt Cond 55
Bsmt Exposure 58
BsmtFin Type 1 55
BsmtFin SF 1 1
BsmtFin Type 2 56
BsmtFin SF 2 1
Bsmt Unf SF 1
Total Bsmt SF 1
Bsmt Full Bath 2
Bsmt Half Bath 2
Fireplace Qu 1000
Garage Type 113
Garage Yr Blt 114
Garage Finish 114
Garage Cars 1
Garage Area 1
Garage Qual 114
Garage Cond 114
Pool QC 2042
Fence 1651
Misc Feature 1986


In [64]:
# Drop the five columns with the most missing values (between 1000 and 2042
# NaNs each out of 2051 rows, per the counts printed above).
# NOTE(review): NaN in these columns may encode "feature not present" rather
# than missing data -- confirm against the data dictionary before relying on this.
sparse_columns = ['Alley', 'Fireplace Qu', 'Pool QC', 'Fence', 'Misc Feature']

# This cell overwrites traindf, so re-running it would otherwise raise KeyError
# because the labels are already gone; errors='ignore' makes the re-run a no-op.
# `columns=` already selects the axis, so no separate `axis=1` is needed.
traindf = traindf.drop(columns=sparse_columns, errors='ignore')

In [65]:
# Display the training frame in its current state (pandas truncates the rendered rows/columns).
traindf

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,...,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,IR1,Lvl,AllPub,...,44,0,0,0,0,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,...,74,0,0,0,0,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,Reg,Lvl,AllPub,...,52,0,0,0,0,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,Reg,Lvl,AllPub,...,0,0,0,0,0,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,IR1,Lvl,AllPub,...,59,0,0,0,0,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,IR1,HLS,AllPub,...,276,0,0,0,0,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,IR1,Lvl,AllPub,...,0,0,0,0,0,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,Reg,Bnk,AllPub,...,0,0,0,0,0,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,Reg,Lvl,AllPub,...,189,140,0,0,0,0,11,2009,WD,144000


In [68]:
# Preview the frame with every column that still contains a NaN removed.
# NOTE(review): the result is only displayed, not assigned, so traindf itself
# keeps its NaN columns -- confirm whether an assignment was intended.
traindf.dropna(axis='columns')

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,13517,Pave,IR1,Lvl,AllPub,CulDSac,...,44,0,0,0,0,0,3,2010,WD,130500
1,544,531379050,60,RL,11492,Pave,IR1,Lvl,AllPub,CulDSac,...,74,0,0,0,0,0,4,2009,WD,220000
2,153,535304180,20,RL,7922,Pave,Reg,Lvl,AllPub,Inside,...,52,0,0,0,0,0,1,2010,WD,109000
3,318,916386060,60,RL,9802,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,4,2010,WD,174000
4,255,906425045,50,RL,14235,Pave,IR1,Lvl,AllPub,Inside,...,59,0,0,0,0,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,11449,Pave,IR1,HLS,AllPub,Inside,...,276,0,0,0,0,0,1,2008,WD,298751
2047,785,905377130,30,RL,12342,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,0,3,2009,WD,82500
2048,916,909253010,50,RL,7558,Pave,Reg,Bnk,AllPub,Inside,...,0,0,0,0,0,0,3,2009,WD,177000
2049,639,535179160,20,RL,10400,Pave,Reg,Lvl,AllPub,Corner,...,189,140,0,0,0,0,11,2009,WD,144000
