# Modeling with StatsModels

# 1. Ordinary Least Squares
- Column Names
- Log Transformation
- Condition Number
- Standard Scaling

# 2. Dimensionality Reduction
- ANOVA
- F-test and Feature Influence

# 3. Outlier
- Cook's Distance

# 4. Regularization
- Lasso

# 5. Diagnosis of Regression
- Residual Normality Test
- Partial Regression Plot

# 6. Cross Validation

# 7. Test
- score

In [6]:
%matplotlib inline
import datetime
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
from patsy import dmatrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import utils.error_calculator as ec
import utils.feature_selection as fs
import utils.preprocessing as pp
import utils.statsmodel_helper as sh

pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)


class SMWrapper(BaseEstimator, RegressorMixin):
    """Scikit-learn style adapter around a statsmodels regression class.

    Lets a statsmodels model (e.g. ``sm.OLS``) be passed to scikit-learn
    utilities such as ``cross_val_score``.

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(y, X)``; the returned object must provide
        ``fit()`` whose result provides ``predict(X)``.
    fit_intercept : bool, default True
        When true, ``sm.add_constant`` is applied to ``X`` in both ``fit``
        and ``predict``.

    Attributes
    ----------
    model_ : the object built by ``model_class(y, X)`` during ``fit``.
    results_ : the object returned by ``model_.fit()``.
    """

    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Fit the wrapped model on ``X``/``y`` and return ``self``."""
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        # scikit-learn estimators return self from fit so that calls can be
        # chained and the estimator can be used inside Pipeline and friends.
        return self

    def predict(self, X):
        """Return predictions of the fitted results object for ``X``."""
        if self.fit_intercept:
            # Must mirror the transformation applied in fit().
            X = sm.add_constant(X)
        return self.results_.predict(X)



# Raise the interpreter recursion limit to 1500.
# NOTE(review): presumably needed for parsing very long model formulas — confirm.
sys.setrecursionlimit(1500)

# Forwarded as `degree=` to sh.make_statsmodels_ols_formula.
degree = 2
# Numeric columns whose skewness exceeds this value are collected in features_to_log.
skewness_limit = 1
# NOTE(review): not referenced in the visible cells (outlier removal passes repeat=3) — confirm intent.
num_of_cooks = 2
# Forwarded as `repeat=` to fs.by_f_test.
num_of_f_test = 10

In [1]:
import warnings
warnings.simplefilter("ignore")
import statsmodels.api as sm

# modeling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# Linear Regression Model - sklearn
from sklearn.model_selection import train_test_split

In [2]:
# Load the train and test tables; the first CSV column is used as the index.
df_train_macro = pd.read_csv('../code/data/train_macro3.csv', index_col=0)
df_test_macro = pd.read_csv('../code/data/test_macro3.csv', index_col=0)

In [3]:
# Show only the minimum and maximum of every numeric column.
df_train_macro.describe().loc[['min', 'max'], :]

Unnamed: 0,usdrub,full_sq,life_sq,floor,num_room,kitch_sq,state,area_m,preschool_education_centers_raion,school_education_centers_raion,...,market_count_5000,room_size,avg_price_ID_metro,avg_price_ID_railroad_station_walk,avg_price_ID_big_road1,avg_price_ID_big_road2,avg_price_ID_railroad_terminal,avg_price_ID_bus_terminal,avg_price_sub_area,price_doc
min,28.8082,10.0,0.0,0.0,0.0,0.0,1.0,2081628.0,0.0,0.0,...,0.0,0.0,3000000.0,0.0,3608626.0,4516778.0,6311649.0,5626025.0,1000000.0,395685.0
max,69.4666,729.0,802.0,44.0,21.356604,123.0,4.0,206071800.0,13.0,14.0,...,21.0,224.940665,56220124.0,19842280.0,15500000.0,13222690.0,11851720.0,10046540.0,20116570.0,95122496.0


In [None]:
# Numeric columns (the target 'price_doc' is deliberately kept in this list)
# and object-dtype columns, which are treated as categorical.
numeric_ivs = list(df_train_macro._get_numeric_data().columns)
is_object_col = df_train_macro.dtypes == object
categorial_ivs = [col for col, is_obj in is_object_col.items() if is_obj]

In [None]:
# One-hot encode the categorical columns.
df_cat_dummies = pd.get_dummies(df_train_macro[categorial_ivs])

In [None]:
# Numeric part of the training frame (numeric_ivs still contains 'price_doc').
df_real = df_train_macro[numeric_ivs]

In [None]:
# Join numeric columns with the dummy columns, then move the target
# 'price_doc' to the last position so it can be split off positionally.
df_train_macro = pd.concat([df_real, df_cat_dummies], axis=1)
cols = df_train_macro.columns.tolist()
cols.remove('price_doc')
df_train_macro = df_train_macro[cols + ['price_doc']]
df_train_macro.tail(2)

In [None]:
# Target is the last column ('price_doc' was moved there); everything else is a feature.
dfy = df_train_macro.iloc[:, -1]
dfx = df_train_macro.iloc[:, :-1]

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(dfx, dfy, test_size=0.3, random_state=1)
# Display the sizes of the four splits as a sanity check.
len(X_train), len(X_test), len(y_train), len(y_test)

In [None]:
# StandardScaler
# The scaler learns its mean/variance from the training split only and the
# same fitted transform is applied to both splits. Refitting on X_test would
# scale the test data with its own statistics (leakage) and make the two
# splits incomparable.
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train_std_scaler = std_scaler.transform(X_train)
X_test_std_scaler = std_scaler.transform(X_test)

In [None]:
# RobustScaler
# Fit on the training split only and reuse the fitted transform for the test
# split. The test result gets its own name so it no longer overwrites the
# scaled training data.
robust_scaler = RobustScaler()
robust_scaler.fit(X_train)
X_train_robust_scaler = robust_scaler.transform(X_train)
X_test_robust_scaler = robust_scaler.transform(X_test)

In [None]:
# Add an intercept column to the unscaled training features.
X_train = sm.add_constant(X_train)

In [None]:
# Baseline OLS on the unscaled training features.
result = sm.OLS(y_train, X_train).fit()
result.summary()

In [None]:
# Same OLS fit on the standard-scaled training features (intercept added first).
X_train_std_scaler = sm.add_constant(X_train_std_scaler)
result_std_scaler = sm.OLS(y_train, X_train_std_scaler).fit()
result_std_scaler.summary()

In [None]:
# X_train_robust_scaler = sm.add_constant(X_train_robust_scaler)
# result_robust_scaler = sm.OLS(y_train, X_train_robust_scaler).fit()
# result_robust_scaler.summary()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# scikit-learn pipeline: standard scaling followed by linear regression.
# NOTE(review): `model` is not fitted in the visible cells and is reassigned
# by the later statsmodels cells — confirm whether this cell is still needed.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])

## Column Names

In [4]:
def _formula_safe(name):
    """Return `name` with - + : ~ * replaced by '_' and a leading '_' added.

    NOTE(review): presumably done because these characters act as operators in
    patsy formulas, and the prefix guards against names starting with a digit.
    """
    for ch in ('-', '+', ':', '~', '*'):
        name = name.replace(ch, '_')
    return '_' + name


# Apply the same renaming to both frames so formula terms match in train and test.
new_cols = [_formula_safe(col) for col in df_train_macro.columns]
df_train_macro.columns = new_cols

new_cols = [_formula_safe(col) for col in df_test_macro.columns]
df_test_macro.columns = new_cols

# Rebuild the variable lists with the new names; the target is excluded from
# the numeric independent variables.
numeric_cols = df_train_macro._get_numeric_data().columns
categorial_ivs = list(set(df_train_macro.columns) - set(numeric_cols))
numeric_ivs = numeric_cols.drop('_price_doc').tolist()

## Log Transformation
Transform data with skewness greater than 1.

In [7]:
numeric_cols = df_train_macro._get_numeric_data().columns

# Numeric columns whose sample skewness exceeds `skewness_limit`; the list is
# passed to the formula builder as `log_vs`.
features_to_log = [
    col for col in numeric_cols
    if sp.stats.skew(df_train_macro[col]) > skewness_limit
]

# Shift every numeric column that contains non-positive values so that all
# values become strictly positive (smallest value maps to 0.1). Train and test
# share one offset so both stay on the same scale. The target column is only
# looked up in the training frame.
for col in numeric_cols:
    shift_test = col != '_price_doc'
    # Series.min() is vectorised and skips NaN; builtin min() over a Series is
    # order-dependent when NaNs are present.
    min_val = df_train_macro[col].min()
    if shift_test:
        # np.fmin ignores a NaN on either side (e.g. an all-NaN column).
        min_val = np.fmin(min_val, df_test_macro[col].min())
    if min_val <= 0:
        offset = np.abs(min_val) + 0.1
        df_train_macro[col] += offset
        if shift_test:
            df_test_macro[col] += offset

In [8]:
# Build the model formula with the project helper (scale=False) and fit OLS on
# the full training frame.
# NOTE(review): the summary shows np.log(_price_doc) as dependent variable, so
# the helper presumably log-transforms the target — confirm in utils.statsmodel_helper.
formula = sh.make_statsmodels_ols_formula(numeric_ivs, categorial_ivs, '_price_doc', log_vs=features_to_log, degree=degree, scale=False)
model = sm.OLS.from_formula(formula, data=df_train_macro)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,np.log(_price_doc),R-squared:,0.421
Model:,OLS,Adj. R-squared:,0.42
Method:,Least Squares,F-statistic:,580.1
Date:,"Sun, 17 Nov 2019",Prob (F-statistic):,0.0
Time:,22:04:40,Log-Likelihood:,-19244.0
No. Observations:,30404,AIC:,38570.0
Df Residuals:,30365,BIC:,38890.0
Df Model:,38,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0042,0.001,4.802,0.000,0.002,0.006
C(_product_type)[T.OwnerOccupier],0.1471,0.013,11.295,0.000,0.122,0.173
C(_ecology)[T.good],-0.0059,0.014,-0.432,0.666,-0.033,0.021
C(_ecology)[T.no data],0.0219,0.009,2.377,0.017,0.004,0.040
C(_ecology)[T.poor],-0.0020,0.013,-0.157,0.875,-0.027,0.023
C(_ecology)[T.satisfactory],-0.0281,0.014,-1.963,0.050,-0.056,-4.78e-05
C(_detention_facility_raion)[T.yes],0.0163,0.018,0.905,0.365,-0.019,0.051
C(_incineration_raion)[T.yes],-0.0323,0.015,-2.129,0.033,-0.062,-0.003
C(_thermal_power_plant_raion)[T.yes],-0.0242,0.018,-1.310,0.190,-0.060,0.012

0,1,2,3
Omnibus:,15579.967,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,92254.635
Skew:,-2.488,Prob(JB):,0.0
Kurtosis:,9.933,Cond. No.,1.75e+17


## Condition Number
A large condition number occurs when the features are on very different scales because of differing units; scaling can reduce it. Multicollinearity can also cause a large condition number. We can handle this by reducing dimensionality with the variance inflation factor (VIF).

## Standard Scaling
Standardize variables by removing the mean and scaling to unit variance.

In [None]:
# Same model as before, but the formula helper is asked to scale the variables
# (scale=True) so the effect on the condition number can be compared.
formula = sh.make_statsmodels_ols_formula(numeric_ivs, categorial_ivs, '_price_doc', log_vs=features_to_log, degree=degree, scale=True)
model = sm.OLS.from_formula(formula, data=df_train_macro)
result = model.fit()
result.summary()

Scaling did not significantly decrease the condition number.

# 2. Dimensionality Reduction
## ANOVA

In [None]:
# Type 2 ANOVA table for the fitted model.
anova = sm.stats.anova_lm(result, typ=2)
anova


We can remove features with a p-value greater than or equal to 0.05, since they have very little influence on the dependent variable.

## F-test and Feature Influence

In [None]:
# Feature selection with the project helper; returns the refitted result, the
# kept terms and the updated formula.
# NOTE(review): selection criterion lives in utils.feature_selection — presumably F-test p-values; confirm.
result, sms_vars, formula = fs.by_f_test(df_train_macro, formula, repeat=num_of_f_test)
result.summary()

# 3. Outlier
## Cook's Distance
- Find data with large leverage and residual by calculating Cook's distance.

In [None]:
# Keep an untouched copy of the training data; it is used later for cross validation.
df_train_macro_with_outliers = df_train_macro.copy(deep=True)
# Outlier removal via the project helper, which also returns the refitted model/result.
# NOTE(review): repeat=3 is hard-coded while `num_of_cooks` (= 2) is defined but unused — confirm which is intended.
df_train_macro, model, result = pp.remove_outliers(df_train_macro, formula, repeat=3)
result.summary()

# 4. Regularization
## Lasso
Find variables with zero coefficient when Lasso regularization is applied.

In [None]:
# Regularized fit with L1_wt=1, i.e. a pure L1 (Lasso) penalty, and penalty weight alpha=0.001.
result_lasso = model.fit_regularized(alpha=0.001, L1_wt=1)

Let's remove the features whose coefficients are zero to reduce dimensionality.

In [None]:
import re

# Matches a bracketed level suffix in a parameter name, e.g. the "[T.good]"
# in "C(_ecology)[T.good]".
_level_suffix = re.compile(r'\[[^\]]*\]')

# Collect the formula terms whose Lasso coefficient is non-zero.
# Every bracketed suffix is stripped (not only the first one) so interaction
# terms stay well formed, and each term is recorded once even when several
# levels of the same categorical variable survive.
sms_vars = []
for feature, coef in result_lasso.params.items():
    if coef == 0 or feature == 'Intercept':
        continue
    term = _level_suffix.sub('', feature)
    if term not in sms_vars:
        sms_vars.append(term)

In [None]:
# Refit plain OLS on the log target using only the terms kept after Lasso.
formula = 'np.log(_price_doc) ~ ' + " + ".join(sms_vars)
model =sm.OLS.from_formula(formula, data=df_train_macro)
result = model.fit()
result.summary()

# 5. Diagnosis of Regression
## Residual Normality Test

In [None]:
# outlier remove result 
sp.stats.probplot(result.resid, plot=plt)
plt.show()

In [None]:
# Omnibus normality test on the residuals; print the statistic and its p-value.
test = sms.omni_normtest(result.resid)
labels = ['Chi^2', 'P-value']
for label, value in zip(labels, test):
    print("%-12s: %6.3f" % (label, value))

## Partial Regression Plot
Let's visualize the influence of a single independent variable.

In [None]:
# Grid of partial regression plots for the fitted model, drawn on a tall figure.
fig = plt.figure(figsize=(10,70))
sm.graphics.plot_partregress_grid(result, fig=fig)
# Blank out the figure-level title.
fig.suptitle("")
plt.show()

# 6. Cross Validation

In [None]:
# Build one design matrix that holds the selected terms plus the log target,
# using the data as it was before outlier removal, then split it into X and y.
target_col = 'np.log(_price_doc)'
dm = dmatrix(" + ".join(sms_vars) + ' + np.log(_price_doc)', df_train_macro_with_outliers, return_type="dataframe")
y = dm[target_col]
X = dm.drop(columns=[target_col])

# Shuffled 1000-fold cross validation of OLS, scored by R^2 on each fold.
cv = KFold(n_splits=1000, shuffle=True, random_state=0)
r2s = cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2', cv=cv)
r2s.mean()

In [None]:
# Distribution of the per-fold R^2 scores.
plt.hist(r2s, bins=100)

In [None]:
# The model is fitted on np.log(_price_doc), so exponentiate the predictions
# to get back to the original price scale.
y_pred = np.exp(result.predict(df_test_macro))
y_pred = y_pred.to_frame('price_doc')
# Write the predictions to a timestamped CSV, keeping the index and header.
y_pred.to_csv('./data/stats_models_{}.csv'.format(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')), header=True, index=True)

## Score

0.39773