# 重回帰モデル

In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import math
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns


# Load the data.
# The column names are supplied here rather than read from the file: the raw
# file is expected to have no header row (the original code assigned the names
# by hand after loading). Without header=None, pandas would consume the first
# record as column labels and that observation would be silently dropped.
cal_housing = pd.read_csv(
    'datasets/cal_housing/CaliforniaHousing/cal_housing.data',
    sep=',',
    header=None,
    names=[
        'longitude',
        'latitude',
        'housingMedianAge',
        'totalRooms',
        'totalBedrooms',
        'population',
        'households',
        'medianIncome',
        'medianHouseValue',
    ],
)


# Data cleansing
def cleansing(df, col, n):
    """Return the rows of ``df`` whose ``col`` value is within ``n`` sigma.

    The mean and (sample) standard deviation of ``df[col]`` are computed, and
    only rows whose value lies strictly between ``mean - n * std`` and
    ``mean + n * std`` are kept. Rows exactly on either bound are dropped.

    Args:
        df: Input DataFrame; it is not modified.
        col: Name of the column to filter on.
        n: Half-width of the accepted band, in standard deviations.

    Returns:
        A filtered DataFrame that keeps the original index labels.
    """
    values = df[col]
    center = values.mean()
    spread = values.std()

    lower_bound = center - n * spread
    upper_bound = center + n * spread

    # Both comparisons are strict, so boundary values are excluded.
    within_band = (values > lower_bound) & (values < upper_bound)
    return df[within_band]

# Filter each column in turn to values within 2 standard deviations of its
# mean. The order matters: every pass recomputes the mean/std on the frame
# already filtered by the previous passes.
for column_name in (
    'housingMedianAge',
    'totalRooms',
    'totalBedrooms',
    'population',
    'households',
    'medianIncome',
    'medianHouseValue',
):
    cal_housing = cleansing(cal_housing, column_name, 2)


# Basic summary statistics of the cleansed data
summary_statistics = cal_housing.describe()
display(summary_statistics)

def train(X, y, *, test_size=0.5, random_state=None):
    """Fit a linear regression on a random split of the data and report it.

    The data is split into training and test parts, a ``LinearRegression`` is
    fitted on the training part, and the coefficients, intercept, test MSE and
    test R^2 are printed.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.5, the
            value that was previously hard-coded.
        random_state: Forwarded to ``train_test_split``. The default ``None``
            keeps the original unseeded split; pass an int to make the split,
            and therefore the printed metrics, reproducible.

    Returns:
        The fitted ``LinearRegression`` model.
    """
    # Split into training and test data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate. Predict once and reuse the result for both metrics.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print('COEF', model.coef_)
    print('INTERCEPT', model.intercept_)
    print('MSE:', mse)
    print('R2:', r2)

    return model


# Train and evaluate: every column except the target is used as a feature.
feature_columns = [
    'longitude',
    'latitude',
    'housingMedianAge',
    'totalRooms',
    'totalBedrooms',
    'population',
    'households',
    'medianIncome',
]
X = cal_housing[feature_columns]
longitude_model = train(X, cal_housing['medianHouseValue'])



Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,15648.0,15648.0,15648.0,15648.0,15648.0,15648.0,15648.0,15648.0,15648.0
mean,-119.611961,35.729048,29.915836,2000.000383,410.934369,1116.581097,384.378195,3.527576,178709.624041
std,2.004892,2.184998,12.069361,949.868324,181.436156,507.466004,167.153233,1.404762,84774.564866
min,-124.35,32.54,4.0,26.0,14.0,27.0,20.0,0.4999,14999.0
25%,-121.76,33.94,20.0,1324.75,279.0,748.75,264.0,2.4554,110400.0
50%,-118.77,34.42,31.0,1878.0,393.0,1065.0,370.0,3.35815,165300.0
75%,-118.03,37.75,38.0,2584.0,533.0,1454.0,499.0,4.43075,233725.0
max,-114.49,41.95,52.0,6349.0,1047.0,2519.0,779.0,7.7234,405400.0


COEF [-3.41981934e+04 -3.40961332e+04  6.86904905e+02 -1.89341475e+01
  1.67598822e+02 -5.71386826e+01  1.10698213e+02  3.91439365e+04]
INTERCEPT -2862317.153045594
MSE: 2742822915.537325
R2: 0.6257296688234251
