In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score
df = pd.read_csv("CaliforniaHousing.csv")
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [2]:
def fit(df):
    X=df[df.columns[:-1]]
    y=df[df.columns[-1]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    coef = pd.Series(model.coef_, index=X.columns)
    print(f"MAPE: {mape:.4f}")
    print(f"R2: {r2:.4f}")
    print(coef)
    return coef
coef=fit(df)

MAPE: 0.3233
R2: 0.5943
MedInc        0.433333
HouseAge      0.009293
AveRooms     -0.098643
AveBedrms     0.593215
Population   -0.000008
AveOccup     -0.004745
Latitude     -0.421449
Longitude    -0.434166
dtype: float64


In [3]:
cond1 = (df["MedHouseVal"] <= 5)
cond2 = (df["AveRooms"] <= df["AveRooms"].quantile(0.99))
cond3 = (df["AveBedrms"] <= df["AveBedrms"].quantile(0.99))
cond4 = (df["Population"] <= df["Population"].quantile(0.99))
cond5 = (df["AveOccup"] <= df["AveOccup"].quantile(0.99))
df_prep = df.loc[cond1 & cond2 & cond3 & cond4 & cond5]
df_prep.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,19055.0,19055.0,19055.0,19055.0,19055.0,19055.0,19055.0,19055.0,19055.0
mean,3.675348,28.662136,5.210592,1.064349,1377.22031,2.912624,35.648,-119.579213,1.931627
std,1.563962,12.421668,1.165471,0.11688,866.285478,0.69546,2.144714,2.005519,0.980969
min,0.4999,1.0,0.846154,0.333333,3.0,0.75,32.54,-124.35,0.14999
25%,2.52545,18.0,4.410857,1.004938,802.0,2.443988,33.93,-121.79,1.163
50%,3.449,29.0,5.169444,1.047521,1175.0,2.832461,34.27,-118.51,1.746
75%,4.5833,37.0,5.940678,1.096614,1720.0,3.289391,37.73,-118.01,2.5
max,15.0001,52.0,10.352941,2.127208,5804.0,5.392954,41.95,-114.55,5.0


In [6]:
coef=fit(df_prep)
#coefが 0に近い houseageと populationgは取り除く

MAPE: 0.2841
R2: 0.6330
MedInc        0.439034
HouseAge      0.010457
AveRooms     -0.118551
AveBedrms     0.992037
Population    0.000061
AveOccup     -0.331091
Latitude     -0.389385
Longitude    -0.384806
dtype: float64


In [7]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = df_prep[df_prep.columns[:-1]]
df_vif = pd.DataFrame()
df_vif.index = X.columns
df_vif["Coefficient"] = coef
df_vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
df_vif
#다중공선성 판별

Unnamed: 0,Coefficient,VIF
MedInc,0.439034,17.594523
HouseAge,0.010457,7.818422
AveRooms,-0.118551,56.868174
AveBedrms,0.992037,125.369512
Population,6.1e-05,4.219317
AveOccup,-0.331091,20.737068
Latitude,-0.389385,623.919518
Longitude,-0.384806,963.073803


In [27]:
df_prep_1=df_prep.drop(columns=['HouseAge','Population'])
df_prep_1

Unnamed: 0,MedInc,AveRooms,AveBedrms,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,6.984127,1.023810,2.555556,37.88,-122.23,4.526
1,8.3014,6.238137,0.971880,2.109842,37.86,-122.22,3.585
2,7.2574,8.288136,1.073446,2.802260,37.85,-122.24,3.521
3,5.6431,5.817352,1.073059,2.547945,37.85,-122.25,3.413
4,3.8462,6.281853,1.081081,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...
20635,1.5603,5.045455,1.133333,2.560606,39.48,-121.09,0.781
20636,2.5568,6.114035,1.315789,3.122807,39.49,-121.21,0.771
20637,1.7000,5.205543,1.120092,2.325635,39.43,-121.22,0.923
20638,1.8672,5.329513,1.171920,2.123209,39.43,-121.32,0.847


In [28]:
coef=fit(df_prep_1)

MAPE: 0.2855
R2: 0.6198
MedInc       0.420678
AveRooms    -0.126211
AveBedrms    0.876953
AveOccup    -0.316804
Latitude    -0.432135
Longitude   -0.432097
dtype: float64


In [29]:
X = df_prep_1[df_prep_1.columns[:-1]]
df_vif = pd.DataFrame()
df_vif.index = X.columns
df_vif["Coefficient"] = coef
df_vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
df_vif
#vifが一番高いLongitude取り除く

Unnamed: 0,Coefficient,VIF
MedInc,0.420678,17.331099
AveRooms,-0.126211,55.668584
AveBedrms,0.876953,123.385238
AveOccup,-0.316804,19.840979
Latitude,-0.432135,615.330232
Longitude,-0.432097,898.067079


In [44]:
df_prep_2=df_prep_1.drop(columns=['Longitude'])
coef=fit(df_prep_2)
#mapeが30％を超えてlongitudeのかわりにlatitudeを取り除く

MAPE: 0.3269
R2: 0.5307
MedInc       0.543892
AveRooms    -0.277085
AveBedrms    1.061226
AveOccup    -0.321731
Latitude    -0.039195
dtype: float64


In [46]:
df_prep_2=df_prep_1.drop(columns=['Latitude'])
coef=fit(df_prep_2)
#mapeが30％を超えてcoefが低くvifが10をこえたAveRoomsを取り除く

MAPE: 0.3351
R2: 0.5230
MedInc       0.568163
AveRooms    -0.312809
AveBedrms    1.182044
AveOccup    -0.287892
Longitude   -0.013597
dtype: float64


In [47]:
df_prep_2=df_prep_1.drop(columns=['AveRooms'])
coef=fit(df_prep_2)

MAPE: 0.2925
R2: 0.6126
MedInc       0.345468
AveBedrms    0.419943
AveOccup    -0.333951
Latitude    -0.484518
Longitude   -0.477206
dtype: float64


In [40]:
X = df_prep_2[df_prep_2.columns[:-1]]
df_vif = pd.DataFrame()
df_vif.index = X.columns
df_vif["Coefficient"] = coef
df_vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
df_vif
#vifが一番高いLongitude取り除く

Unnamed: 0,Coefficient,VIF
MedInc,0.345468,7.041299
AveBedrms,0.419943,87.00881
AveOccup,-0.333951,19.276986
Latitude,-0.484518,539.841695
Longitude,-0.477206,775.460171


In [53]:
df_prep_3=df_prep_2.drop(columns=['Longitude'])
coef=fit(df_prep_3)
#mapeが30％を超えてlongitudeのかわりにlatitudeを取り除く

MAPE: 0.3536
R2: 0.4886
MedInc       0.387497
AveBedrms   -0.036908
AveOccup    -0.366033
Latitude    -0.067695
dtype: float64


In [54]:
df_prep_3=df_prep_2.drop(columns=['Latitude'])
coef=fit(df_prep_3)
#mapeが30％を超えてcoefが低くvifが10をこえたAveOccupを取り除く

MAPE: 0.3735
R2: 0.4662
MedInc       0.395040
AveBedrms   -0.086286
AveOccup    -0.329087
Longitude    0.003375
dtype: float64


In [55]:
df_prep_3=df_prep_2.drop(columns=['AveOccup'])
coef=fit(df_prep_3)
#mapeが30％を超えてcoefが低くvifが10をこえたAveBedrmsを取り除く

MAPE: 0.3072
R2: 0.5554
MedInc       0.357093
AveBedrms    0.670632
Latitude    -0.481907
Longitude   -0.497058
dtype: float64


In [58]:
df_prep_3=df_prep_2.drop(columns=['AveBedrms'])
coef=fit(df_prep_3)

MAPE: 0.2934
R2: 0.6121
MedInc       0.339690
AveOccup    -0.342552
Latitude    -0.475196
Longitude   -0.467506
dtype: float64


In [61]:
X = df_prep_3[df_prep_3.columns[:-1]]
df_vif = pd.DataFrame()
df_vif.index = X.columns
df_vif["Coefficient"] = coef
df_vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
df_vif

Unnamed: 0,Coefficient,VIF
MedInc,0.33969,6.688061
AveOccup,-0.342552,19.084751
Latitude,-0.475196,535.686168
Longitude,-0.467506,624.657294


In [62]:
df_vif.to_csv('PDA_230868_8_2.csv')

In [63]:
#longitudeとlatitudeのvif値はすごく高いのですが、mape値を優先として
#HouseAge,Population,AveRooms,AveBedrms順で取り除きました。