In [None]:
import pandas as pd

# Read the merged dataset (a CSV stored under /content/drive) into `df`
# and display it as the cell output.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/study/합친데이터셋 - 시트3.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df

In [None]:
!apt-get install -y fonts-nanum*
!rm -rf /root/.cache/matplotlib/* # 폰트 캐시 재설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns

path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font_name = mpl.font_manager.FontProperties(fname=path).get_name()
plt.rcParams['font.family'] = font_name

In [None]:
# Columns included in the correlation analysis, in display order.
col = [
    '합계출산율',
    # working-hour bands, "_m" columns
    '1 to 19 hours_m',
    '20 to 29 hours_m',
    '30 to 34 hours_m',
    '35 to 39 hours_m',
    '40 hours or more_m',
    '소득불평등',
    '자살률',
    # working-hour bands, "_w" columns
    '1 to 19 hours_w',
    '20 to 29 hours_w',
    '30 to 34 hours_w',
    '35 to 39 hours_w',
    '40 hours or more_w',
    '남성 고용률',
    '여성 고용률',
    '사회보장기여금',
    '비혼 출산율',
    '결혼율',
    '성별에 따른 임금격차',
    '가족수당관리지출',
    '행복지수',
]

In [None]:
# Pearson correlation matrix of the selected columns, drawn as a
# lower-triangular annotated heatmap.
corr = df[col].corr(method = 'pearson')

# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of columns is shown only once.
mask1 = np.triu(np.ones_like(corr, dtype=bool))

fig = plt.figure(figsize = (20, 16))
ax = fig.gca()
plt.title("Pearson Correlation of Features", size = 15)
sns.heatmap(corr.values,
            annot = True, mask = mask1, fmt = '.2f', annot_kws = {"size" : 15},
            yticklabels = col, xticklabels = col, ax = ax, cmap = "RdYlBu")

plt.tight_layout()  # adjust the margins
plt.show()  # render the figure

In [None]:
from sklearn.model_selection import train_test_split

# Explanatory variables for the regression (every analysed column except
# the target '합계출산율').
col = [
    '1 to 19 hours_m',
    '20 to 29 hours_m',
    '30 to 34 hours_m',
    '35 to 39 hours_m',
    '40 hours or more_m',
    '소득불평등',
    '자살률',
    '1 to 19 hours_w',
    '20 to 29 hours_w',
    '30 to 34 hours_w',
    '35 to 39 hours_w',
    '40 hours or more_w',
    '남성 고용률',
    '여성 고용률',
    '사회보장기여금',
    '비혼 출산율',
    '결혼율',
    '성별에 따른 임금격차',
    '가족수당관리지출',
    '행복지수',
]

# Feature matrix and target vector.
X = df[col]
y = df['합계출산율']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Shapes of the training and test partitions.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Variance inflation factor (VIF) of every training feature.
#
# variance_inflation_factor regresses one column of the design matrix on all
# the others and does not add an intercept itself. Without a constant column
# the auxiliary regressions are forced through the origin, which inflates the
# VIF of any feature whose mean is far from zero. An explicit constant is
# therefore prepended (has_constant='add' guarantees it is column 0) and
# excluded from the reported table.
vif_exog = sm.add_constant(X_train, has_constant='add')

vif = pd.DataFrame()
vif['features'] = X_train.columns
vif["VIF Factor"] = [
    variance_inflation_factor(vif_exog.values, i)
    for i in range(1, vif_exog.shape[1])  # skip index 0, the constant
]
vif.round(1)

In [None]:
import statsmodels.api as sm

# Reduced feature set: the earlier list without '35 to 39 hours_m',
# '40 hours or more_m', '35 to 39 hours_w', '40 hours or more_w' and
# '남성 고용률'.
col = [
    '1 to 19 hours_m',
    '20 to 29 hours_m',
    '30 to 34 hours_m',
    '소득불평등',
    '자살률',
    '1 to 19 hours_w',
    '20 to 29 hours_w',
    '30 to 34 hours_w',
    '여성 고용률',
    '사회보장기여금',
    '비혼 출산율',
    '결혼율',
    '성별에 따른 임금격차',
    '가족수당관리지출',
    '행복지수',
]
X = df[col]
y = df['합계출산율']

# Same split parameters as before, so the same rows land in each partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Recompute the VIFs for the reduced feature set.
#
# variance_inflation_factor does not add an intercept itself; without a
# constant column the auxiliary regressions go through the origin and the
# VIFs are inflated for features with a non-zero mean. A constant is
# prepended (has_constant='add' guarantees it is column 0) and left out of
# the reported table.
vif_exog = sm.add_constant(X_train, has_constant='add')

vif = pd.DataFrame()
vif['features'] = X_train.columns
vif["VIF Factor"] = [
    variance_inflation_factor(vif_exog.values, i)
    for i in range(1, vif_exog.shape[1])  # skip index 0, the constant
]
vif.round(1)

In [None]:
from sklearn import linear_model

# Ordinary least-squares regression fitted on the training partition.
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
model = lr  # fit() returns the estimator itself, so both names share one object

# Predictions for the held-out test partition.
pred_test = lr.predict(X_test)

In [None]:
# Print the fitted coefficients of `lr` as a raw array.
print(lr.coef_)

In [None]:
### Build the "feature - coefficients" DataFrame

# Take the feature names from X_train, the matrix `lr` was fitted on, rather
# than from the globals `df` and `col`. Those globals are reassigned in other
# cells, so they can drift out of sync with the fitted model when cells are
# re-run.
coefs = pd.DataFrame({'feature': X_train.columns, 'coefficients': lr.coef_})
coefs

In [None]:
# Reorder the coefficient table by absolute coefficient size, largest first.
coefs_new = coefs.reindex(
    index=coefs['coefficients'].abs().sort_values(ascending=False).index
)
coefs_new

In [None]:
### Visualise the coefficients.

# Horizontal bar chart (pyplot's barh) on a square 8x8-inch figure: one bar
# per feature, bar length equal to the coefficient.
plt.figure(figsize=(8, 8))
plt.barh(y=coefs_new['feature'], width=coefs_new['coefficients'])

# Axis labels and title.
plt.xlabel('coefficients')
plt.ylabel('features')
plt.title('"feature - coefficient" Graph')

plt.show()

In [None]:
import statsmodels.api as sm

# Fit the same regression with statsmodels and show its summary table.
# sm.OLS does not add an intercept on its own, so a constant column is
# added to the training features first.
X_train2 = sm.add_constant(X_train)
model2 = sm.OLS(endog=y_train, exog=X_train2).fit()
model2.summary()

In [None]:
### 예측 결과 시각화 (test set)
df = pd.DataFrame({'actual': y_test, 'prediction': pred_test})
df = df.sort_values(by='actual').reset_index(drop=True)
df.head()

In [None]:
plt.figure(figsize=(12, 9))
plt.scatter(df.index, df['prediction'], marker='x', color='r')
plt.scatter(df.index, df['actual'], alpha=0.3, marker='o', color='black')
plt.title("Prediction Result in Test Set", fontsize=20)
plt.legend(['prediction', 'actual'], fontsize=12)
plt.show()

In [None]:
## R squared: score of the fitted model on each partition
for split_X, split_y in [(X_train, y_train), (X_test, y_test)]:  # training set first, then test set
    print(model.score(split_X, split_y))

In [None]:
## RMSE (Root Mean Squared Error)
from math import sqrt
from sklearn.metrics import mean_squared_error

# Training set: predict on the rows the model was fitted on.
pred_train = lr.predict(X_train)
print(sqrt(mean_squared_error(y_true=y_train, y_pred=pred_train)))

# Test set: reuse the predictions computed right after fitting.
print(sqrt(mean_squared_error(y_true=y_test, y_pred=pred_test)))
