# Modeling with StatsModels

# 1. Ordinary Least Squares
- Column Names
- Log Transformation
- Condition Number
- Standard Scaling

# 2. Dimensionality Reduction
- ANOVA
- F-test and Feature Influence

# 3. Outlier
- Cook's Distance

# 4. Regularization
- Lasso

# 5. Diagnosis of Regression
- Residual Normality Test
- Partial Regression Plot

# 6. Cross Validation

# 7. Test
- score

In [1]:
%matplotlib inline
import datetime
import re
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
import statsmodels.stats.api as sms
from patsy import dmatrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import utils.error_calculator as ec
import utils.feature_selection as fs
import utils.preprocessing as pp
import utils.statsmodel_helper as sh

pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

class SMWrapper(BaseEstimator, RegressorMixin):
    """Scikit-learn style wrapper around a statsmodels regression model class.

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(y, X)`` to build the model (e.g. ``sm.OLS``).
    fit_intercept : bool, default=True
        When true, ``sm.add_constant`` is applied to ``X`` before fitting.

    Attributes
    ----------
    model_ : object
        The model built by ``model_class(y, X)`` during ``fit``.
    results_ : object
        The value returned by ``model_.fit()``.
    constant_added_ : bool
        Whether ``fit`` actually appended a constant column to ``X``.
    """

    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    @staticmethod
    def _n_columns(X):
        """Return the number of columns of ``X`` (1 for one-dimensional input)."""
        shape = np.shape(X)
        return shape[1] if len(shape) > 1 else 1

    def fit(self, X, y):
        """Fit the wrapped model on ``X`` and ``y`` and return ``self``."""
        self.constant_added_ = False
        if self.fit_intercept:
            # 'skip' leaves X untouched when it already holds a constant
            # column, so remember whether a column was really added.
            design = sm.add_constant(X, has_constant='skip')
            self.constant_added_ = self._n_columns(design) > self._n_columns(X)
            X = design
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        # scikit-learn estimators return themselves from fit().
        return self

    def predict(self, X):
        """Predict with the fitted results, using the same layout as in ``fit``."""
        if self.constant_added_:
            # Force the column in: with 'skip', a batch that happens to contain
            # a constant column (e.g. a single row) would get no intercept
            # column and no longer match the fitted parameters.
            X = sm.add_constant(X, has_constant='add')
        return self.results_.predict(X)

# Settings consumed by the cells below.
degree = 2            # passed as `degree=` when the OLS formula is built
skewness_limit = 1    # columns with skewness above this are marked for log transform
num_of_f_test = 10    # passed as `repeat=` to the F-test feature selection
num_of_cooks = 2      # not referenced by any other cell in this notebook

# NOTE(review): presumably raised for deep recursion during formula handling - confirm.
sys.setrecursionlimit(1500)

# Train/test tables; the first CSV column becomes the index.
df_test_macro = pd.read_csv('./data/test_macro.csv', index_col=0)
df_train_macro = pd.read_csv('./data/train_macro.csv', index_col=0)

# 1. Ordinary Least Squares
## Column Names
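Replace `-`, `+`, `:`, `~`, and `*` in column names with underscores, because these characters are operators in the formula syntax, and prefix every column name with an underscore.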

In [2]:
# Characters that act as operators in a formula string; each becomes '_'.
_FORMULA_OPERATORS = str.maketrans({ch: '_' for ch in '-+:~*'})


def _sanitize_column_name(name):
    """Return `name` with formula operators replaced by '_' and a '_' prefix."""
    return '_' + name.translate(_FORMULA_OPERATORS)


# Apply the same renaming to both tables so their column names stay aligned.
df_train_macro.columns = [_sanitize_column_name(col) for col in df_train_macro.columns]
df_test_macro.columns = [_sanitize_column_name(col) for col in df_test_macro.columns]

_numeric_columns = df_train_macro._get_numeric_data().columns

# Every numeric column except the target is a numeric independent variable.
numeric_ivs = _numeric_columns.drop('_price_doc').tolist()

# Non-numeric columns, kept in DataFrame order. A set difference would give an
# order that changes between interpreter runs (string hashing is randomised),
# which makes the generated formula and the summary tables non-reproducible.
categorial_ivs = [col for col in df_train_macro.columns if col not in _numeric_columns]

## Log Transformation
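Mark features whose skewness is greater than 1 for log transformation. Numeric columns that contain non-positive values are first shifted so that their smallest value becomes 0.1, which keeps the logarithm defined.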

In [3]:
numeric_columns = df_train_macro._get_numeric_data().columns

# Numeric columns whose sample skewness exceeds the limit are marked for a
# log transform (the transform itself is applied through the formula).
features_to_log = [
    col for col in numeric_columns
    if sp.stats.skew(df_train_macro[col]) > skewness_limit
]

# Shift columns containing non-positive values so that their smallest value
# becomes 0.1. Series.min() is used instead of the builtin min(): it is
# vectorised and ignores NaN, whereas builtin min() over a Series gives an
# order-dependent answer as soon as a NaN is present.
for col in numeric_columns:
    if col != '_price_doc':
        # Use one offset for train and test so both stay on the same scale.
        min_val = min(df_train_macro[col].min(), df_test_macro[col].min())
        if min_val <= 0:
            offset = np.abs(min_val) + 0.1
            df_train_macro[col] += offset
            df_test_macro[col] += offset
    else:
        # The target is only shifted in the training table.
        min_val_train = df_train_macro[col].min()
        if min_val_train <= 0:
            df_train_macro[col] += (np.abs(min_val_train) + 0.1)

In [4]:
# Build the formula without scaling the regressors, then fit it by OLS.
formula = sh.make_statsmodels_ols_formula(
    numeric_ivs,
    categorial_ivs,
    '_price_doc',
    log_vs=features_to_log,
    degree=degree,
    scale=False,
)
model = sm.OLS.from_formula(formula, data=df_train_macro)
result = model.fit()
# Last expression of the cell: shows the regression summary.
result.summary()

0,1,2,3
Dep. Variable:,np.log(_price_doc),R-squared:,0.389
Model:,OLS,Adj. R-squared:,0.387
Method:,Least Squares,F-statistic:,184.0
Date:,"Mon, 28 Oct 2019",Prob (F-statistic):,0.0
Time:,21:24:31,Log-Likelihood:,-20041.0
No. Observations:,30404,AIC:,40290.0
Df Residuals:,30298,BIC:,41180.0
Df Model:,105,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.9655,1.912,3.643,0.000,3.218,10.713
C(_ecology)[T.good],-0.0382,0.015,-2.568,0.010,-0.067,-0.009
C(_ecology)[T.no data],-0.1856,0.026,-7.107,0.000,-0.237,-0.134
C(_ecology)[T.poor],-0.0045,0.013,-0.349,0.727,-0.030,0.021
C(_ecology)[T.satisfactory],0.0301,0.017,1.810,0.070,-0.002,0.063
C(_water_1line)[T.yes],-0.0003,0.012,-0.025,0.980,-0.023,0.022
C(_nuclear_reactor_raion)[T.yes],-0.0311,0.019,-1.615,0.106,-0.069,0.007
C(_big_road1_1line)[T.yes],-0.0233,0.019,-1.219,0.223,-0.061,0.014
C(_radiation_raion)[T.yes],-0.0437,0.008,-5.184,0.000,-0.060,-0.027

0,1,2,3
Omnibus:,14756.149,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,80275.479
Skew:,-2.361,Prob(JB):,0.0
Kurtosis:,9.408,Cond. No.,1.02e+16


## Condition Number
A large condition number can occur when the variables have very different scales because of their units; in that case scaling can decrease the condition number. Multicollinearity can also cause a large condition number, which we can handle by reducing dimensionality, for example with the variance inflation factor.

## Standard Scaling
Standardize variables by removing the mean and scaling to unit variance.

In [None]:
# Same formula as before, but with scaling of the regressors switched on.
formula = sh.make_statsmodels_ols_formula(
    numeric_ivs,
    categorial_ivs,
    '_price_doc',
    log_vs=features_to_log,
    degree=degree,
    scale=True,
)
model = sm.OLS.from_formula(formula, data=df_train_macro)
result = model.fit()
# Last expression of the cell: shows the regression summary.
result.summary()

0,1,2,3
Dep. Variable:,np.log(_price_doc),R-squared:,0.389
Model:,OLS,Adj. R-squared:,0.387
Method:,Least Squares,F-statistic:,184.0
Date:,"Mon, 28 Oct 2019",Prob (F-statistic):,0.0
Time:,21:25:02,Log-Likelihood:,-20041.0
No. Observations:,30404,AIC:,40290.0
Df Residuals:,30298,BIC:,41180.0
Df Model:,105,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,15.6357,0.015,1028.866,0.000,15.606,15.665
C(_ecology)[T.good],-0.0382,0.015,-2.568,0.010,-0.067,-0.009
C(_ecology)[T.no data],-0.1856,0.026,-7.107,0.000,-0.237,-0.134
C(_ecology)[T.poor],-0.0045,0.013,-0.349,0.727,-0.030,0.021
C(_ecology)[T.satisfactory],0.0301,0.017,1.810,0.070,-0.002,0.063
C(_water_1line)[T.yes],-0.0003,0.012,-0.025,0.980,-0.023,0.022
C(_nuclear_reactor_raion)[T.yes],-0.0311,0.019,-1.615,0.106,-0.069,0.007
C(_big_road1_1line)[T.yes],-0.0233,0.019,-1.219,0.223,-0.061,0.014
C(_radiation_raion)[T.yes],-0.0437,0.008,-5.184,0.000,-0.060,-0.027

0,1,2,3
Omnibus:,14756.149,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,80275.479
Skew:,-2.361,Prob(JB):,0.0
Kurtosis:,9.408,Cond. No.,1.37e+16


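Scaling did not significantly decrease the condition number (1.02e+16 before, 1.37e+16 after), which suggests that multicollinearity rather than a difference in scale is the cause.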

# 2. Dimensionality Reduction
## ANOVA

In [None]:
# Type 2 ANOVA table (`typ=2`) for the current fitted model; the bare name
# on the last line displays the table as the cell output.
anova = sm.stats.anova_lm(result, typ=2)
anova

Unnamed: 0,sum_sq,df,F,PR(>F)
C(_ecology),21.266066,4.0,24.213637,4.948295e-20
C(_water_1line),0.000138,1.0,0.000627,0.9800251
C(_nuclear_reactor_raion),0.572989,1.0,2.609631,0.1062269
C(_big_road1_1line),0.32613,1.0,1.485333,0.2229515
C(_radiation_raion),5.9001,1.0,26.87152,2.188339e-07
C(_incineration_raion),0.002834,1.0,0.012908,0.9095442
C(_product_type),29.757188,1.0,135.526667,2.952085e-31
C(_railroad_terminal_raion),0.324087,1.0,1.476029,0.2244063
C(_big_market_raion),0.020281,1.0,0.092369,0.7611883
C(_railroad_1line),26.271992,1.0,119.653629,8.491574e-28



We can remove features with a p-value equal to or greater than 0.05, since their influence on the dependent variable is not statistically significant at the 5% level.

## F-test and Feature Influence

In [None]:
# Feature selection through the project helper `fs.by_f_test`, called with the
# current formula and `repeat=num_of_f_test`. Its three return values replace
# the fitted result, the variable list and the formula used by later cells.
# NOTE(review): the selection rule lives in utils.feature_selection - presumably
# it drops terms by F-test p-value on each repeat; confirm there.
result, sms_vars, formula = fs.by_f_test(df_train_macro, formula, repeat=num_of_f_test)
result.summary()

# 3. Outlier
## Cook's Distance
- Find observations with large leverage and large residuals by calculating Cook's distance.

In [None]:
# Keep an untouched deep copy of the training data; the cross-validation cell
# builds its design matrix from this copy.
df_train_macro_with_outliers = df_train_macro.copy(deep=True)
# The project helper returns the filtered data together with a model and its
# fitted result, which replace the current ones.
# NOTE(review): `repeat=3` is hard-coded here while `num_of_cooks = 2` is
# defined in the setup cell and never used - confirm which value is intended.
df_train_macro, model, result = pp.remove_outliers(df_train_macro, formula, repeat=3)
result.summary()

# 4. Regularization
## Lasso
Find variables with zero coefficient when Lasso regularization is applied.

In [None]:
# Regularised fit of the model returned by the outlier-removal step, with
# penalty weight alpha=0.001. Per the statsmodels documentation, L1_wt=1 makes
# the elastic-net penalty a pure lasso (L1) penalty.
result_lasso = model.fit_regularized(alpha=0.001, L1_wt=1)

Let's remove features with zero coefficient to reduce dimensionality.

In [None]:
# Map every design-matrix column with a non-zero lasso coefficient back to the
# formula term it came from, e.g. 'C(_ecology)[T.good]' -> 'C(_ecology)'.
# All '[...]' level suffixes are stripped (an interaction column carries more
# than one), and each term is kept once even when several of its levels
# survive the lasso.
_level_suffix = re.compile(r'\[[^\]]*\]')

sms_vars = []
for name, coef in result_lasso.params.items():
    if coef == 0 or name == 'Intercept':
        # Zeroed-out columns are dropped; the intercept is not a formula term.
        continue
    term = _level_suffix.sub('', name)
    if term not in sms_vars:
        sms_vars.append(term)

In [None]:
# Refit plain OLS on the terms that survived the lasso step.
rhs_terms = " + ".join(sms_vars)
formula = 'np.log(_price_doc) ~ ' + rhs_terms
model = sm.OLS.from_formula(formula, data=df_train_macro)
result = model.fit()
# Last expression of the cell: shows the regression summary.
result.summary()

# 5. Diagnosis of Regression
## Residual Normality Test

In [None]:
# Probability (Q-Q) plot of the residuals of the current fit, i.e. the model
# refitted after outlier removal and lasso-based term selection.
sp.stats.probplot(result.resid, plot=plt)
plt.show()

In [None]:
# Omnibus normality test on the residuals; print each returned value next to its label.
test = sms.omni_normtest(result.resid)
labels = ['Chi^2', 'P-value']
for label, value in zip(labels, test):
    print("%-12s: %6.3f" % (label, value))

## Partial Regression Plot
Let's visualize the influence of a single independent variable.

In [None]:
# Tall 10x70 figure to hold the grid of partial regression plots drawn for
# the current fit.
fig = plt.figure(figsize=(10,70))
sm.graphics.plot_partregress_grid(result, fig=fig)
# Set the figure-level title to an empty string.
fig.suptitle("")
plt.show()

# 6. Cross Validation

In [None]:
# Build one design matrix holding the selected terms plus the log target,
# from the copy of the training data taken before outlier removal.
target = 'np.log(_price_doc)'
dm = dmatrix(" + ".join(sms_vars) + ' + np.log(_price_doc)', df_train_macro_with_outliers, return_type="dataframe")

# Split the matrix into predictors and target.
X = dm.drop(columns=[target])
y = dm[target]

# Shuffled K-fold with a fixed seed so the fold assignment is reproducible.
cv = KFold(n_splits=1000, shuffle=True, random_state=0)
r2s = cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2', cv=cv)
# Last expression of the cell: mean R^2 over all folds.
r2s.mean()

In [None]:
# Histogram (100 bins) of the per-fold R^2 scores from the cross validation.
plt.hist(r2s, bins=100)

In [None]:
# The model's response is np.log(_price_doc), so exponentiate the predictions
# to get back to the price scale.
log_price_pred = result.predict(df_test_macro)
y_pred = np.exp(log_price_pred).to_frame('price_doc')

# Write the predictions to a timestamped CSV, keeping header and index.
timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
y_pred.to_csv('./data/stats_models_{}.csv'.format(timestamp), header=True, index=True)

## Score

In [None]:
# Score the predictions with the project's RMSLE helper against the values
# read from sample_submission.csv.
# NOTE(review): a sample-submission file may hold placeholder prices rather
# than the true targets - confirm, otherwise this score is not a test error.
real = pd.read_csv('./data/sample_submission.csv', index_col=0)
score = ec.rmsle(y_pred, real)
score