In [1]:
import scipy.stats as st 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [53]:
#> set.seed(666)
np.random.seed(seed=233423)
x1 = st.norm.rvs(size=10000)           # some continuous variables 
x2 = st.norm.rvs(size=10000)  
z = 1 + 2*x1 + 3*x2        # linear combination with a bias
pr = 1/(1+np.exp(-z))         # pass through an inv-logit function
y = st.binom.rvs(n=1,p=pr, size=10000) #rbinom(1000,1,pr)      # bernoulli response variable
 
X=np.column_stack([x1,x2])
# standardize the features since regularization requires all features to be on same scale
scaler = StandardScaler(copy=True)
# we have created a standardization based on the training data
X_train = scaler.fit(X).transform(X)
y_train = y

#now feed it to glm:
#df = data.frame(y=y,x1=x1,x2=x2)

In [54]:
# Initiate logistic regression object
logit = LogisticRegression(C=1e9,fit_intercept=True)

# Fit model. Let X_train = matrix of predictors, y_train = matrix of variable.
# NOTE: Do not include a column for the intercept when fitting the model.
resLogit = logit.fit(X_train, y_train)
#print(resLogit.intercept_,resLogit.coef_)

# Calculate matrix of predicted class probabilities. 
# Check resLogit.classes_ to make sure that sklearn ordered your classes as expected
predProbs = np.matrix(resLogit.predict_proba(X_train))

# Design matrix -- add column of 1's at the beginning of your X_train matrix
X_design = np.column_stack((np.ones(shape = X_train.shape[0]), X_train))
#np.ones(shape = X_train.shape[0])
#X_design =X_train

# Initiate matrix of 0's, fill diagonal with each predicted observation's variance
V = np.matrix(np.zeros(shape = (X_design.shape[0], X_design.shape[0])))
np.fill_diagonal(V, np.multiply(predProbs[:,0], predProbs[:,1]).A1)

# Covariance matrix
covLogit = np.linalg.inv(X_design.T * V * X_design)
#print("Covariance matrix: ", covLogit)

# Standard errors
print("Standard errors: ", np.sqrt(np.diag(covLogit)) )

# Wald statistic (coefficient / s.e.) ^ 2
logitParams = np.insert(resLogit.coef_, 0, resLogit.intercept_)
print("Coefficients:    ",logitParams)
#print( "Wald statistics: ", (logitParams / np.sqrt(np.diag(covLogit))) ** 2)

Standard errors:  [ 0.03531992  0.04883996  0.06290314]
Coefficients:     [ 0.90538252  1.98547107  2.91685876]


In [55]:
import statsmodels.formula.api as sm
 
model = sm.Logit(y_train, X_design)
 
result =model.fit() #model.fit(method='bfgs')
result.summary()

Optimization terminated successfully.
         Current function value: 0.310366
         Iterations 8


0,1,2,3
Dep. Variable:,y,No. Observations:,10000.0
Model:,Logit,Df Residuals:,9997.0
Method:,MLE,Df Model:,2.0
Date:,"Fri, 29 Sep 2017",Pseudo R-squ.:,0.541
Time:,00:11:01,Log-Likelihood:,-3103.7
converged:,True,LL-Null:,-6762.0
,,LLR p-value:,0.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.9054,0.035,25.634,0.000,0.836 0.975
x1,1.9855,0.049,40.653,0.000,1.890 2.081
x2,2.9169,0.063,46.371,0.000,2.794 3.040


In [56]:
print("Standard errors: ", np.sqrt(np.diag(covLogit)) )
logitParams = np.insert(resLogit.coef_, 0, resLogit.intercept_)
print([round(float(c+(1.96*v)),3) for c,v in zip(logitParams,np.sqrt(np.diag(covLogit)))])
print([round(float(x),3) for x in logitParams])
print([round(float(c-(1.96*v)),3) for c,v in zip(logitParams,np.sqrt(np.diag(covLogit)))])

Standard errors:  [ 0.03531992  0.04883996  0.06290314]
[0.975, 2.081, 3.04]
[0.905, 1.985, 2.917]
[0.836, 1.89, 2.794]


In [30]:
[round(float(c+1.96*v),3) for c,v in zip(logitParams,np.sqrt(np.diag(covLogit)))]
[round(float(c-1.96*v),3) for c,v in zip(logitParams,np.sqrt(np.diag(covLogit)))]

[0.489, 0.831, 1.335]

In [31]:
for c,v in zip(logitParams,np.sqrt(np.diag(covLogit))):
    print(v)

0.355785946657
0.420717909496
0.559726623114


In [32]:
1.1868-(1.96*0.356)

0.48904000000000014