# 선형 회귀분석 실습

## 1. 어떤 공장의 월별 생산 금액과 전기 사용량

In [1]:
from scipy import stats

In [2]:
# Observations for the factory example, one entry per month (12 months).
# x: monthly production amount (the notebook later treats x = 4 as
#    "400 million won"); y: electricity usage for the same month.
x = [
    3.52, 2.58, 3.31, 4.07, 4.62, 3.98,
    4.29, 4.83, 3.71, 4.61, 3.90, 3.20,
]
y = [
    2.48, 2.27, 2.47, 2.77, 2.98, 3.05,
    3.18, 3.46, 3.03, 3.25, 2.67, 2.53,
]

In [4]:
# Fit y = slope * x + intercept by least squares and expose each statistic
# under its own name for the cells below.
fit = stats.linregress(x, y)
slope, intercept = fit.slope, fit.intercept
r_value, p_value, stderr = fit.rvalue, fit.pvalue, fit.stderr

In [5]:
print(stderr) # show the standard error value returned by linregress

0.07901935226531728


In [None]:
# Report the fitted line. Every value uses the fixed-point "f" presentation
# type so the number of printed decimals is explicit.
print("기울기(slope) = {:3.5f}".format(slope))
print("Y-절편(intercept) = {:3.5f}".format(intercept))
print("상관계수(correlation coefficient) = {:3.5f}".format(r_value))
print("p-value = {:1.15f}".format(p_value))
# Prediction of the fitted line at x = 4. The "f" type is required here:
# a bare "{:3.5}" would print five significant digits in general format,
# which is inconsistent with the five-decimal values above.
print("매출이 4억원일 때의 전기 사용량 예측값은 {:3.5f}kWh입니다.".format(4*slope+intercept))

In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy as np
import matplotlib
# Korean-capable font for the title and legend.
# NOTE(review): this is a Windows-specific absolute path; confirm the font
# exists on the machine that runs the notebook.
font_path = 'C:/Windows/Fonts/NanumGothic.ttf'
fontprop = fm.FontProperties(size=10, fname=font_path)

# Values of the fitted line at each observed x (same arithmetic as
# evaluating the degree-1 polynomial [slope, intercept]).
ry = slope * np.asanyarray(x) + intercept

# Blue dots: observed data. Red dotted line: fitted values.
plt.plot(x, y, 'b.')
plt.plot(x, ry, 'r.-')
plt.title('회귀분석 결과', fontproperties=fontprop)
legend_labels = ['실제 데이터', '회귀분석모델을 따르는 데이터']
plt.legend(legend_labels, prop=fontprop)
plt.show()

## 2. 시뮬레이션 데이터 (N = 1000)

In [None]:
import numpy as np

In [None]:
# Simulate points scattered around the line y = 0.1 * x + 0.3.
num_points = 1000
vectors_set = []

for _ in range(num_points):
    # Draw x first, then the noise term, so the random stream is consumed
    # in the same order for every point.
    x1 = np.random.normal(loc=0.0, scale=0.55)
    noise = np.random.normal(loc=0.0, scale=0.03)
    y1 = x1 * 0.1 + 0.3 + noise
    vectors_set.append([x1, y1])

# Split the [x, y] pairs into two parallel lists.
x_data, y_data = (list(column) for column in zip(*vectors_set))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scatter the simulated (x_data, y_data) points using the 'ro' format
# (red circle markers, no connecting line).
plt.plot(x_data, y_data, 'ro')
#plt.legend()
plt.show()

In [None]:
import tensorflow as tf

In [None]:
# Trainable parameters of the linear model W * x + b: W is initialised from
# tf.random_uniform([1], -1.0, 1.0) and b from tf.zeros([1]).
# NOTE(review): tf.random_uniform is a TF 1.x name — confirm the installed
# TensorFlow version supports it.
W = tf.Variable(tf.random_uniform([1], -1.0, 1.0))
b = tf.Variable(tf.zeros([1]))
# Model output for every simulated x.
# NOTE(review): this rebinds `y`, which section 1 used for its observed data.
y = W * x_data + b

In [None]:
# Mean of the squared differences between the model output y and y_data.
loss = tf.reduce_mean(tf.square(y - y_data))

In [None]:
# Gradient-descent optimizer constructed with 0.5 as its first argument
# (presumably the learning rate — confirm against the TF 1.x API docs).
optimizer = tf.train.GradientDescentOptimizer(0.5)
# Op that performs one minimisation step on `loss` each time it is run.
# NOTE(review): `train` is rebound to a DataFrame in section 3 of this notebook.
train = optimizer.minimize(loss)

In [None]:
# Op that initialises the variables created above.
init = tf.global_variables_initializer()

# Open a session and run the initialiser; the session is kept in `sess`
# for the training loop in the next cell and is not closed in this cell.
sess = tf.Session()
sess.run(init)

In [None]:
# Run 20 training steps; after each one, print the current parameters and
# plot the current line over the simulated data.
for step in range(20):
    sess.run(train)
    # Fetch both parameters once and reuse them for printing and plotting.
    w_now, b_now = sess.run([W, b])
    print(step, w_now, b_now)
    fitted = w_now * x_data + b_now
    plt.plot(x_data, y_data, 'ro')
    plt.plot(x_data, fitted)
    plt.show()

## 3. 부동산 가격 데이터

In [None]:
import pandas as pd
import matplotlib
import sklearn

# Load economy.csv with the parsed "date" column as the index, then drop
# every row that contains a missing value.
df = (
    pd.read_csv('economy.csv', index_col="date", parse_dates=["date"])
    .dropna()
)
df.head()

In [None]:
# Split by year: rows with 2006 < year < 2017 are used for training, rows
# with year >= 2017 for testing (rows with year <= 2006 go to neither set).
in_train_years = (df.year > 2006) & (df.year < 2017)
in_test_years = df.year >= 2017
train = df[in_train_years]
test = df[in_test_years]

# Input columns of the regression.
feature_names = [
    'region_cd',
    'year',
    'month',
    'building_type',
    'tradeprice_sido',
    'construction_realized_amount',
    'cd',
    'spirit_deposit_rate',
    'exchange_rate',
    'composite_stock_price_index',
    'economy_growth',
    'exchequer_bond_three',
    'household_loan_all',
    'mortgage_all',
    'numberofnosells',
    'unsalenum_c',
]

# Target column of the regression.
lable_name = "tradeprice_sido_n1"

X_train, Y_train = train[feature_names], train[lable_name]
X_test, Y_test = test[feature_names], test[lable_name]

for frame in (X_train, X_test):
    print(frame.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training years.
# The `normalize` keyword is intentionally not passed: scikit-learn
# deprecated it in 1.0 and removed it in 1.2, where passing it raises
# TypeError. For plain least squares it only rescaled the inputs internally
# and reported coefficients on the original scale, so coefficients and
# predictions are expected to be unchanged.
lm = LinearRegression(fit_intercept=True, n_jobs=None)
lm.fit(X_train, Y_train)
# score() returns the coefficient of determination (R^2) on the test years;
# the variable name and message are kept as they were.
accuracy = lm.score(X_test, Y_test)
print("Linear Regression test file accuracy:"+str(accuracy))

# Displayed as the cell output: one coefficient per feature column.
lm.coef_

In [None]:
# Korean display label for each feature column.
korean_labels = {
    'region_cd': '지역코드(시도)',
    'year': '연도',
    'month': '월',
    'building_type': '부동산타입',
    'tradeprice_sido': '매매가격지수(시도)',
    'construction_realized_amount': '건설기성액(백만원)',
    "cd": "cd(91일물)",
    'spirit_deposit_rate': '정기예금금리',
    'exchange_rate': '환율',
    'composite_stock_price_index': '종합주가지수',
    'economy_growth': '경제성장률',
    'exchequer_bond_three': '국고채3년',
    'household_loan_all': '가계대출액(전국)',
    'mortgage_all': '주택대출액(전국)',
    'numberofnosells': '미분양 가구수(시도)',
    'unsalenum_c': '공사완료후 미분양(민간,시도)',
}
X_Cols = X_train.rename(columns=korean_labels)

print(X_train.columns)

# Pair each relabelled feature with its fitted coefficient.
coefs = pd.DataFrame(
    zip(X_Cols.columns, lm.coef_),
    columns=['features', 'coefficients'],
)

# Show the table ordered by absolute coefficient size, largest first.
by_magnitude = coefs.coefficients.abs().sort_values(ascending=False).index
coefs.reindex(by_magnitude)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Predict the target for the test years and plot actual vs. predicted values.
Y_pred = lm.predict(X_test)

plt.scatter(Y_test, Y_pred)
plt.xlabel("Price Index: $Y_i$")
# Raw strings keep the backslash in the mathtext "\hat" literal; in a normal
# string "\h" is an invalid escape sequence and triggers a warning on
# recent Python versions. The rendered text is unchanged.
plt.ylabel(r"Predicted price Index: $\hat{Y}_i$")
plt.title(r"Prices vs Predicted price Index: $Y_i$ vs $\hat{Y}_i$")