In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 81)

In [42]:
# trainデータを取得、NaN値などの状況を確認
dataset = pd.read_csv('data/train.csv', sep=',')
dataset_header = dataset.columns
print(f'Raw dataset shape: {dataset.shape[0]} * {dataset.shape[1]}')

# NaNチェック
# datasetから一列ずつ取り出して、nullが含まれている場合はそのカラム名を返す
def print_NaN_information(dataset):
    for data in dataset:
        numof_null = dataset[data].isnull().sum()
        if numof_null > 0:
            ratio = (numof_null / dataset.shape[0]) * 100
            print(f'NaN in {data}:{numof_null} \tNaN, ratio is\t{ratio:.2f}%')

print_NaN_information(dataset)
describe = dataset.describe(include='all')

Raw dataset shape: 1460 * 81
NaN in LotFrontage:259 	NaN, ratio is	17.74%
NaN in Alley:1369 	NaN, ratio is	93.77%
NaN in MasVnrType:8 	NaN, ratio is	0.55%
NaN in MasVnrArea:8 	NaN, ratio is	0.55%
NaN in BsmtQual:37 	NaN, ratio is	2.53%
NaN in BsmtCond:37 	NaN, ratio is	2.53%
NaN in BsmtExposure:38 	NaN, ratio is	2.60%
NaN in BsmtFinType1:37 	NaN, ratio is	2.53%
NaN in BsmtFinType2:38 	NaN, ratio is	2.60%
NaN in Electrical:1 	NaN, ratio is	0.07%
NaN in FireplaceQu:690 	NaN, ratio is	47.26%
NaN in GarageType:81 	NaN, ratio is	5.55%
NaN in GarageYrBlt:81 	NaN, ratio is	5.55%
NaN in GarageFinish:81 	NaN, ratio is	5.55%
NaN in GarageQual:81 	NaN, ratio is	5.55%
NaN in GarageCond:81 	NaN, ratio is	5.55%
NaN in PoolQC:1453 	NaN, ratio is	99.52%
NaN in Fence:1179 	NaN, ratio is	80.75%
NaN in MiscFeature:1406 	NaN, ratio is	96.30%


In [27]:
# NaNデータの処理

def NaN_processing(dataset):
    for x in dataset:
        

# Alleyは"No alley access"="NA"のため、NaN処理なし
# BsmtQualは"Basementなし"="NA"のため、NaN処理なし (BsmtCondも同様)
# Bsmt系はBasementがなければNAになるが... なぜBsmtExposureとBsmtFinType2は38で他は37?

# Garage系はそのまま。81 NaNであることは81の住宅でガレージがないことを示す
# GarageYrBltは年のデータでNaNがある。せっかくなので後ほどmedianで置換

# PoolQCは"プールなし"="NA"のため、NaN処理なし
# Fenceは"Fenceなし"="NA"のため、NaN処理なし

dataset = NaN_processing(dataset)

Id
MSSubClass
MSZoning
LotFrontage
LotArea
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
OverallQual
OverallCond
YearBuilt
YearRemodAdd
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
Heating
HeatingQC
CentralAir
Electrical
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
KitchenQual
TotRmsAbvGrd
Functional
Fireplaces
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PavedDrive
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
PoolQC
Fence
MiscVal
MoSold
YrSold
SaleType
SaleCondition
SalePrice


In [11]:
# Data Preprocessing
def data_preprocessing(dataset, mode):
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()

    # dtypeが数値でないものはラベルエンコーディング
    for x in dataset:
        if np.dtype(dataset[x]) == object:
            dataset[x] = label_encoder.fit_transform(dataset[x])

    # GarageYrBltのNaNを補完
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    dataset['GarageYrBlt'] = imputer.fit_transform(np.reshape(dataset['GarageYrBlt'].values, (dataset['GarageYrBlt'].shape[0], 1)))[:,0]

    #独立変数X, 従属変数y
    X = dataset.drop('Id', axis=1)
    if mode == 'train':
        X = dataset.iloc[:, :-1].values
        y = dataset.iloc[:,-1].values   
    else:
        X = dataset.iloc[:, :].values
        y = np.nan
    X = imputer.fit_transform(X)
       
    # ZScoreでスケーリング
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y, dataset

X, y, dataset = data_preprocessing(dataset, mode='train')

# train, testデータへ分割
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']

In [46]:
# (Multiple) Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)
y_train_pred = regressor.predict(X_train)

# trainデータとtestデータそれぞれのR2スコアを表示する関数
def print_r2_score(y_train, y_test, y_train_pred, y_pred):
    from sklearn.metrics import r2_score
    print(f'R2 Score(train) is {r2_score(y_train, y_train_pred):.5f}')
    print(f'R2 Score(test) is {r2_score(y_test, y_pred):.5f}')

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.84956
R2 Score(test) is 0.85430


In [47]:
# Polymial Regression
from sklearn.preprocessing import PolynomialFeatures
polynomial_instance = PolynomialFeatures(degree = 2)
X_poly_train = polynomial_instance.fit_transform(X_train)
X_poly_test = polynomial_instance.fit_transform(X_test)

regressor = LinearRegression()
regressor.fit(X_poly_train, y_train)
y_train_pred = regressor.predict(X_poly_train)
y_pred = regressor.predict(X_poly_test)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 1.00000
R2 Score(test) is 0.44656


In [48]:
# SVR
from sklearn.svm import SVR
svr_regressor = SVR(kernel='linear')
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
y_train_pred = svr_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.13558
R2 Score(test) is 0.16843


In [49]:
# SVR (rbf)
from sklearn.svm import SVR
svr_regressor = SVR(kernel='rbf')
svr_regressor.fit(X_train, y_train)
y_pred = svr_regressor.predict(X_test)
y_train_pred = svr_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is -0.04504
R2 Score(test) is -0.03857


In [50]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
rndm_regressor = RandomForestRegressor(n_estimators = 100, random_state=0)
rndm_regressor.fit(X_train, y_train)
y_pred = rndm_regressor.predict(X_test)
y_train_pred = rndm_regressor.predict(X_train)

print_r2_score(y_train, y_test, y_train_pred, y_pred)

R2 Score(train) is 0.97875
R2 Score(test) is 0.90222


In [53]:
# Predict
test_dataset = pd.read_csv('data/test.csv')
#test_dataset = NaN_check(test_dataset)
test_dataset = NaN_processing(test_dataset)
X_sub, y_sub, test_dataset = data_preprocessing(test_dataset, mode='predict')

y_pred = rndm_regressor.predict(X_sub)

output = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
