In [44]:
%matplotlib inline
import numpy as np
import pandas as pd
import statsmodels.api as sm 
import matplotlib.pyplot as plt 
import seaborn as sns

In [45]:
# Read the industry workforce spreadsheet; column 0 of the sheet is used as the row index.
total_industry = pd.read_excel(io='total_industry.xlsx', index_col=0)

In [46]:
# Add ratio columns: 남 / 소계, followed by "<group>(여) / <group>" for each
# degree-level and field-of-study group. Column order matters because a later
# cell slices columns by label up to '비이공계여성비율'.
total_industry = total_industry.assign(
    남성비율=total_industry['남'] / total_industry['소계'],
    **{
        f'{group}여성비율': total_industry[f'{group}(여)'] / total_industry[group]
        for group in ('학사', '석사', '박사', '자연계', '공학계', '비이공계')
    }
)

# Rename '산업별(1)' to '산업별', then keep only the rows whose industry is in the list below.
total_filter = (
    total_industry
    .rename(columns={'산업별(1)': '산업별'})
    .loc[lambda frame: frame['산업별'].isin([
        "기계", "디스플레이", "반도체", "섬유", "자동차", "전자",
        "철강", "화학", "바이오ㆍ헬스", "조선", "소프트웨어", "IT 비즈니스",
    ])]
)

In [55]:
# Build the analysis frame: the 시점 and 산업별 columns first, then every column
# from '지역' through '비이공계여성비율' (label slice, both ends inclusive).
df2 = total_filter.loc[:, ['시점', '산업별']]
df1 = total_filter.loc[:, '지역':'비이공계여성비율']
da = pd.concat((df2, df1), axis='columns')

In [56]:
# Re-base the time column so that the value 2012 maps to 0.
da['시점2'] = da['시점'].sub(2012)

In [57]:
# Additive OLS specification: 학사여성비율 regressed on 지역, 산업별 and the re-based time 시점2.
model = sm.OLS.from_formula(
    formula='학사여성비율 ~ 지역 + 산업별 + 시점2',
    data=da,
)

In [58]:
# Fit the OLS model defined in the previous cell and keep the fitted results in `result`.
result = model.fit()
# The last expression of the cell is displayed: the regression summary table.
result.summary()

0,1,2,3
Dep. Variable:,학사여성비율,R-squared:,0.346
Model:,OLS,Adj. R-squared:,0.321
Method:,Least Squares,F-statistic:,13.79
Date:,"Mon, 22 Aug 2022",Prob (F-statistic):,4.53e-28
Time:,21:08:13,Log-Likelihood:,253.17
No. Observations:,407,AIC:,-474.3
Df Residuals:,391,BIC:,-410.2
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2002,0.027,7.441,0.000,0.147,0.253
지역[T.부산],0.0101,0.018,0.551,0.582,-0.026,0.046
지역[T.서울],0.0711,0.019,3.784,0.000,0.034,0.108
지역[T.울산],-0.0315,0.018,-1.712,0.088,-0.068,0.005
산업별[T.기계],-0.2062,0.031,-6.554,0.000,-0.268,-0.144
산업별[T.디스플레이],-0.0925,0.033,-2.828,0.005,-0.157,-0.028
산업별[T.바이오ㆍ헬스],-0.1153,0.031,-3.666,0.000,-0.177,-0.053
산업별[T.반도체],-0.1202,0.033,-3.637,0.000,-0.185,-0.055
산업별[T.섬유],0.0775,0.031,2.463,0.014,0.016,0.139

0,1,2,3
Omnibus:,237.712,Durbin-Watson:,1.687
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2278.763
Skew:,2.343,Prob(JB):,0.0
Kurtosis:,13.603,Cond. No.,59.5


In [67]:
# Same response without the time term, with the reference (baseline) levels set
# explicitly: "서울" for 지역 and "기계" for 산업별.
# The adjacent string literals concatenate to one formula string.
model2 = sm.OLS.from_formula(
    formula=(
        '학사여성비율 ~ '
        'C(지역, Treatment(reference="서울"))'
        ' + '
        'C(산업별, Treatment(reference="기계"))'
    ),
    data=da,
)
result2 = model2.fit()
result2.summary()

0,1,2,3
Dep. Variable:,학사여성비율,R-squared:,0.339
Model:,OLS,Adj. R-squared:,0.315
Method:,Least Squares,F-statistic:,14.35
Date:,"Mon, 22 Aug 2022",Prob (F-statistic):,8.409999999999999e-28
Time:,21:34:47,Log-Likelihood:,250.98
No. Observations:,407,AIC:,-472.0
Df Residuals:,392,BIC:,-411.8
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0857,0.025,3.407,0.001,0.036,0.135
"C(지역, Treatment(reference=""서울""))[T.경남]",-0.0706,0.019,-3.741,0.000,-0.108,-0.033
"C(지역, Treatment(reference=""서울""))[T.부산]",-0.0598,0.019,-3.129,0.002,-0.097,-0.022
"C(지역, Treatment(reference=""서울""))[T.울산]",-0.1026,0.019,-5.340,0.000,-0.140,-0.065
"C(산업별, Treatment(reference=""기계""))[T.IT 비즈니스]",0.2056,0.032,6.508,0.000,0.143,0.268
"C(산업별, Treatment(reference=""기계""))[T.디스플레이]",0.1123,0.033,3.442,0.001,0.048,0.176
"C(산업별, Treatment(reference=""기계""))[T.바이오ㆍ헬스]",0.0909,0.031,2.897,0.004,0.029,0.153
"C(산업별, Treatment(reference=""기계""))[T.반도체]",0.0884,0.033,2.685,0.008,0.024,0.153
"C(산업별, Treatment(reference=""기계""))[T.섬유]",0.2837,0.031,9.045,0.000,0.222,0.345

0,1,2,3
Omnibus:,247.357,Durbin-Watson:,1.672
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2625.16
Skew:,2.429,Prob(JB):,0.0
Kurtosis:,14.455,Cond. No.,13.7


In [60]:
# Region-only model: 학사여성비율 explained by 지역 alone.
model3 = sm.OLS.from_formula(
    formula='학사여성비율 ~ 지역',
    data=da,
)
result3 = model3.fit()
result3.summary()

0,1,2,3
Dep. Variable:,학사여성비율,R-squared:,0.063
Model:,OLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,9.052
Date:,"Mon, 22 Aug 2022",Prob (F-statistic):,8.21e-06
Time:,21:13:46,Log-Likelihood:,180.04
No. Observations:,407,AIC:,-352.1
Df Residuals:,403,BIC:,-336.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0971,0.015,6.461,0.000,0.068,0.127
지역[T.부산],0.0102,0.022,0.475,0.635,-0.032,0.053
지역[T.서울],0.0791,0.022,3.600,0.000,0.036,0.122
지역[T.울산],-0.0346,0.022,-1.599,0.111,-0.077,0.008

0,1,2,3
Omnibus:,247.821,Durbin-Watson:,1.869
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1691.836
Skew:,2.647,Prob(JB):,0.0
Kurtosis:,11.47,Cond. No.,4.67


In [61]:
# Pairwise correlation matrix of the numeric columns only.
# `da` also holds string columns (지역, 산업별). Older pandas dropped them silently
# inside corr(); pandas 2.x raises instead, so the numeric subset is selected
# explicitly to keep this cell runnable on any version.
da.select_dtypes(include='number').corr()

Unnamed: 0,시점,남성비율,학사여성비율,석사여성비율,박사여성비율,자연계여성비율,공학계여성비율,비이공계여성비율,시점2
시점,1.0,-0.163128,0.083786,0.149357,0.107497,-0.067601,0.170618,0.083001,1.0
남성비율,-0.163128,1.0,-0.73177,-0.457773,-0.169917,-0.614067,-0.712461,-0.601876,-0.163128
학사여성비율,0.083786,-0.73177,1.0,0.598014,0.170846,0.657089,0.876863,0.637266,0.083786
석사여성비율,0.149357,-0.457773,0.598014,1.0,0.14089,0.439556,0.590198,0.40229,0.149357
박사여성비율,0.107497,-0.169917,0.170846,0.14089,1.0,0.130793,0.219971,0.136135,0.107497
자연계여성비율,-0.067601,-0.614067,0.657089,0.439556,0.130793,1.0,0.555363,0.518849,-0.067601
공학계여성비율,0.170618,-0.712461,0.876863,0.590198,0.219971,0.555363,1.0,0.530859,0.170618
비이공계여성비율,0.083001,-0.601876,0.637266,0.40229,0.136135,0.518849,0.530859,1.0,0.083001
시점2,1.0,-0.163128,0.083786,0.149357,0.107497,-0.067601,0.170618,0.083001,1.0


In [70]:
# Interaction model: '산업별 * 지역' expands to both main effects plus their interaction.
# NOTE(review): this rebinds `model2` / `result2`, which an earlier cell used for
# the explicit-reference-level model; confirm that overwriting them is intended.
# NOTE(review): the recorded summary reports Cond. No. ~1e17, which suggests a
# rank-deficient design (possibly an empty industry-region cell); verify
# before interpreting individual coefficients.
model2 = sm.OLS.from_formula(
    formula='학사여성비율 ~ 산업별 * 지역',
    data=da,
)
result2 = model2.fit()
result2.summary()

0,1,2,3
Dep. Variable:,학사여성비율,R-squared:,0.483
Model:,OLS,Adj. R-squared:,0.417
Method:,Least Squares,F-statistic:,7.306
Date:,"Mon, 22 Aug 2022",Prob (F-statistic):,5.16e-30
Time:,21:51:27,Log-Likelihood:,300.95
No. Observations:,407,AIC:,-507.9
Df Residuals:,360,BIC:,-319.5
Df Model:,46,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1658,0.041,4.050,0.000,0.085,0.246
산업별[T.기계],-0.1431,0.058,-2.472,0.014,-0.257,-0.029
산업별[T.디스플레이],0.0492,0.058,0.851,0.396,-0.065,0.163
산업별[T.바이오ㆍ헬스],-0.0187,0.058,-0.323,0.747,-0.133,0.095
산업별[T.반도체],-0.0601,0.058,-1.039,0.300,-0.174,0.054
산업별[T.섬유],0.0238,0.058,0.412,0.681,-0.090,0.138
산업별[T.소프트웨어],-0.0534,0.058,-0.922,0.357,-0.167,0.061
산업별[T.자동차],-0.1300,0.058,-2.245,0.025,-0.244,-0.016
산업별[T.전자],-0.1037,0.058,-1.791,0.074,-0.218,0.010

0,1,2,3
Omnibus:,297.253,Durbin-Watson:,1.526
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5117.601
Skew:,2.921,Prob(JB):,0.0
Kurtosis:,19.36,Cond. No.,1.06e+17
