## 理解endog、exog
![](img/endogexog.png)

## statsmodels的api

In [3]:
# List every name exposed by the top-level statsmodels API namespace
import statsmodels.api as sm
print(dir(sm))

['BayesGaussMI', 'BinomialBayesMixedGLM', 'Factor', 'GEE', 'GLM', 'GLMGam', 'GLS', 'GLSAR', 'GeneralizedPoisson', 'Logit', 'MANOVA', 'MI', 'MICE', 'MICEData', 'MNLogit', 'MixedLM', 'NegativeBinomial', 'NegativeBinomialP', 'NominalGEE', 'OLS', 'OrdinalGEE', 'PCA', 'PHReg', 'Poisson', 'PoissonBayesMixedGLM', 'ProbPlot', 'Probit', 'QuantReg', 'RLM', 'RecursiveLS', 'SurvfuncRight', 'WLS', 'ZeroInflatedGeneralizedPoisson', 'ZeroInflatedNegativeBinomialP', 'ZeroInflatedPoisson', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '__version__', 'add_constant', 'categorical', 'cov_struct', 'datasets', 'distributions', 'duration', 'emplike', 'families', 'formula', 'gam', 'genmod', 'graphics', 'iolib', 'load', 'multivariate', 'nonparametric', 'qqline', 'qqplot', 'qqplot_2samples', 'regression', 'robust', 'show_versions', 'stats', 'test', 'tools', 'tsa', 'webdoc']


In [4]:
# Plotting-related API: names available under sm.graphics
print(dir(sm.graphics))

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'abline_plot', 'beanplot', 'fboxplot', 'hdrboxplot', 'influence_plot', 'interaction_plot', 'mean_diff_plot', 'plot_ccpr', 'plot_ccpr_grid', 'plot_corr', 'plot_corr_grid', 'plot_fit', 'plot_leverage_resid2', 'plot_partregress', 'plot_partregress_grid', 'plot_regress_exog', 'qqplot', 'rainbow', 'rainbowplot', 'tsa', 'violinplot']


In [5]:
# Time-series-related API: names available under sm.tsa
print(dir(sm.tsa))

['AR', 'ARIMA', 'ARMA', 'ArmaProcess', 'DynamicFactor', 'DynamicVAR', 'ExponentialSmoothing', 'Holt', 'MarkovAutoregression', 'MarkovRegression', 'SARIMAX', 'SVAR', 'SimpleExpSmoothing', 'UnobservedComponents', 'VAR', 'VARMAX', 'VECM', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'acf', 'acovf', 'add_lag', 'add_trend', 'adfuller', 'arma_generate_sample', 'arma_order_select_ic', 'bds', 'ccf', 'ccovf', 'coint', 'datetools', 'detrend', 'filters', 'graphics', 'innovations', 'interp', 'kpss', 'lagmat', 'lagmat2ds', 'pacf', 'pacf_ols', 'pacf_yw', 'periodogram', 'q_stat', 'seasonal_decompose', 'statespace', 'stattools', 'tsatools', 'var', 'x13_arima_analysis', 'x13_arima_select_order']


### statsmodels库的架构
statsmodels的子包是按照主题（topic）来组织的，例如discrete、tsa、stats、tools等。

```
statsmodels/
    __init__.py
    api.py
    discrete/
        __init__.py
        discrete_model.py
        tests/
            results/
    tsa/
        __init__.py
        api.py
        tsatools.py
        stattools.py
        arima_model.py
        arima_process.py
        vector_ar/
            __init__.py
            var_model.py
            tests/
                results/
        tests/
            results/
    stats/
        __init__.py
        api.py
        stattools.py
        tests/
    tools/
        __init__.py
        tools.py
        decorators.py
        tests/
```


### import方法
#### 函数和类

In [18]:
from statsmodels.regression.linear_model import OLS, WLS
from statsmodels.tools.tools import add_constant

#### 模块

In [20]:
from statsmodels.datasets import macrodata
from statsmodels.stats import diagnostic

#### 起别名

In [21]:
import statsmodels.regression.linear_model as lm
import statsmodels.stats.diagnostic as smsdia
import statsmodels.stats.outliers_influence as oi

## R语言的style
statsmodels自从0.5.0版本之后，允许R语言风格的公式写法。


**注意**
statsmodels.formula.api支持小写调用，如ols、gls等，而小写的ols支持R语言公式写法。

```
公式0 y ~ x
公式1 Lottery ~ Literacy + Wealth + Region
公式2 Lottery ~ Literacy + Wealth + Region-1
公式3 Lottery ~ Literacy:Wealth
公式4 Lottery ~ Literacy*Wealth
公式5 Lottery ~ np.log(Literacy)
```

- 公式0被默认执行为 y ~ 1 + x（自动加入常数项）
- "~"左侧是y，右侧是x变量，x之间使用"+"分隔
- 公式2中有“-1”，"-" 被用于移除某个变量或者列，公式2移除了常数项
- 公式3要研究的**自变量**仅有一个，就是交互项**Literacy:Wealth**
- 公式4要研究的**自变量**有3个，分别是**Literacy、Wealth、Literacy:Wealth**，即 `a*b` 等价于 `a + b + a:b`
- 可以在模型中使用向量化函数，操作公式中的变量，例如**公式5**

In [25]:
# smf exposes the lowercase, formula-aware model constructors (ols, gls, ...)
import pandas as pd
import statsmodels.formula.api as smf

# Load the Guerry data and keep only the columns used by the formulas below
df = pd.read_csv('data/guerry.csv')[
    ['Lottery', 'Literacy', 'Wealth', 'Region']
]
df.head()

Unnamed: 0,Lottery,Literacy,Wealth,Region
0,41,37,73,E
1,38,51,22,N
2,66,13,61,C
3,80,46,76,E
4,79,69,83,E


In [55]:
# Region is a categorical (string) column. No dummy columns are built by hand
# here; the fitted parameters show it expanded into Region[T.*] indicator terms.
model = smf.ols('Lottery ~ Literacy + Wealth + Region', data=df)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       0.358
Model:                            OLS   Adj. R-squared:                  0.300
Method:                 Least Squares   F-statistic:                     6.205
Date:                Thu, 19 Dec 2019   Prob (F-statistic):           8.69e-06
Time:                        16:16:23   Log-Likelihood:                -379.21
No. Observations:                  86   AIC:                             774.4
Df Residuals:                      78   BIC:                             794.1
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      75.4006     23.971      3.145      

In [35]:
# Fitted coefficients of the model above, one entry per term
result.params

Intercept      75.400576
Region[T.C]   -36.748921
Region[T.E]   -52.176706
Region[T.N]   -46.765882
Region[T.S]   -41.297178
Region[T.W]   -46.840197
Literacy       -0.185819
Wealth          0.451475
dtype: float64

In [59]:
# Keep only the terms that are significant at the 5% level (p < 0.05)
result.pvalues.loc[lambda p: p < 0.05]

Intercept      0.002347
Region[T.E]    0.018206
Region[T.N]    0.033076
Region[T.W]    0.035906
Wealth         0.000035
dtype: float64

In [51]:
# The formula string refers to `np.log` by name, and the formula engine
# resolves such names from the calling namespace, so this import is needed
# even though no Python statement in this cell calls np directly.
import numpy as np

model2 = smf.ols('Lottery ~ np.log(Literacy)', data=df)
result2 = model2.fit()
print(result2.params)

Intercept           113.405982
np.log(Literacy)    -19.622903
dtype: float64


In [52]:
# 'a:b' contributes only the interaction term, hence the single
# Literacy:Wealth coefficient (plus the intercept) in the printed params
model3 = smf.ols('Lottery ~ Literacy:Wealth', data=df)
result3 = model3.fit()
print(result3.params)

Intercept          38.185708
Literacy:Wealth     0.003344
dtype: float64


In [54]:
# 'a*b' expands to both main effects plus their interaction:
# Literacy, Wealth and Literacy:Wealth all appear in the printed params
model4 = smf.ols('Lottery ~ Literacy*Wealth', data=df)
result4 = model4.fit()
print(result4.params)

Intercept          38.843678
Literacy           -0.340264
Wealth              0.429771
Literacy:Wealth    -0.000429
dtype: float64
