In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy import stats

In [3]:
# Predictor values, one per observation (20 entries, aligned by position with `target`).
hours = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
# Binary outcome for each observation: 0 = did not pass, 1 = passed.
target = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]

In [4]:
hours

[0.5,
 0.75,
 1.0,
 1.25,
 1.5,
 1.75,
 1.75,
 2.0,
 2.25,
 2.5,
 2.75,
 3.0,
 3.25,
 3.5,
 4.0,
 4.25,
 4.5,
 4.75,
 5.0,
 5.5]

In [5]:
target 

[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]

In [6]:
# Assemble the two parallel lists into one table, one row per observation.
data = pd.DataFrame(dict(hours=hours, target=target))

In [7]:
data

Unnamed: 0,hours,target
0,0.5,0
1,0.75,0
2,1.0,0
3,1.25,0
4,1.5,0
5,1.75,0
6,1.75,1
7,2.0,0
8,2.25,1
9,2.5,0


In [8]:
# Split the frame into the response vector and the design matrix
# (every column except 'target').
y = data['target']
X = data.drop(columns='target')

In [9]:
# C is the inverse regularization strength; a very large value makes the
# penalty negligible, so the fit approximates the unpenalized
# maximum-likelihood estimate.
logreg = LogisticRegression(random_state=42, C=1e9)

In [10]:
logreg.fit(X, y)

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
logreg.coef_

array([[1.50464522]])

In [12]:
logreg.intercept_

array([-4.07771764])

In [13]:
def logit_pvalue(model, x):
    """Compute two-sided Wald p-values for a fitted binary LogisticRegression.

    Standard errors come from the asymptotic covariance of the maximum
    likelihood estimate, i.e. the inverse of the Fisher information
    ``X' diag(p * (1 - p)) X``, where ``X`` is the design matrix with a
    leading column of ones for the intercept.

    parameters:
        model: fitted sklearn.linear_model.LogisticRegression with intercept
               and large C (so the fit is close to the unpenalized MLE)
        x:     2-D matrix (n_samples, n_features) on which the model was fit

    returns:
        1-D numpy array of p-values: the intercept first, then one entry per
        feature in the column order of ``x``.
    """
    # predict_proba returns an (N, 2) matrix: column 0 is the probability of
    # class 0, column 1 the probability of class 1.
    proba = model.predict_proba(x)
    # Per-sample weight p * (1 - p) that scales each row's contribution.
    weights = proba[:, 0] * proba[:, 1]

    # Intercept and feature coefficients in one vector, intercept first.
    coefs = np.concatenate([model.intercept_, model.coef_[0]])

    # Design matrix with a leading column of ones matching the intercept.
    features = np.asarray(x, dtype=float)
    x_full = np.column_stack([np.ones(len(features)), features])

    # Sum over samples of (row' row) * p * (1 - p), computed in one product
    # instead of a Python loop over rows.
    information = x_full.T @ (x_full * weights[:, np.newaxis])

    # Asymptotic covariance matrix and the standard errors on its diagonal.
    vcov = np.linalg.inv(information)
    se = np.sqrt(np.diag(vcov))

    # Wald statistic per coefficient.
    z = coefs / se

    # Two-tailed p-value from the normal distribution. The survival function
    # is used instead of 1 - cdf so that very large |z| yields a tiny positive
    # p-value rather than rounding to exactly 0.
    return 2 * stats.norm.sf(np.abs(z))

In [14]:
logit_pvalue(logreg, X)

array([0.02058146, 0.01670279])

In [15]:
p = logreg.predict_proba(X)

In [16]:
sm_model = sm.Logit(data.target, sm.add_constant(data.hours)).fit(disp=0)

  return ptp(axis=axis, out=out, **kwargs)


In [17]:
print(sm_model.pvalues)

const    0.020582
hours    0.016703
dtype: float64


In [18]:
sm_model.summary()

0,1,2,3
Dep. Variable:,target,No. Observations:,20.0
Model:,Logit,Df Residuals:,18.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 04 Feb 2020",Pseudo R-squ.:,0.4208
Time:,14:23:13,Log-Likelihood:,-8.0299
converged:,True,LL-Null:,-13.863
Covariance Type:,nonrobust,LLR p-value:,0.0006365

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.0777,1.761,-2.316,0.021,-7.529,-0.626
hours,1.5046,0.629,2.393,0.017,0.272,2.737


In [19]:
def load_peace_sys_data(path='peace_sys.csv'):
    """Load the peace-systems table from a CSV file.

    parameters:
        path: location of the CSV file; defaults to 'peace_sys.csv' relative
              to the current working directory.
    returns:
        DataFrame indexed by the first CSV column, with cells equal to
        '(NA)' read as NaN.
    """
    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    return pd.read_csv(path, index_col=0, na_values=['(NA)'])

In [20]:
peace_sys = load_peace_sys_data()

In [21]:
peace_sys = peace_sys.drop(['SCCS', 'Coder'], axis=1)

In [22]:
symp6 = peace_sys[['SymP6', 'PSys']]

In [23]:
# 9 appears to be the dataset's code for a missing answer — TODO confirm against the codebook.
# replace() is applied to every column, so any cell equal to 9 becomes NaN
# and its row is then removed by dropna().
symp6 = symp6.replace(9, np.nan)
symp6 = symp6.dropna()

In [24]:
# Recode PSys value 2 as 0 so the column can serve as a 0/1 target —
# presumably 2 means "not a peace system"; confirm against the codebook.
symp6['PSys'] = symp6['PSys'].replace(2, 0)

In [25]:
symp6

Unnamed: 0_level_0,SymP6,PSys
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Marshallese,2.0,0
Konso,3.0,0
Pawnee,2.0,0
Kikuyu,2.0,0
Omaha,2.0,0
Natchez,2.0,0
Switzerland,4.0,1
Iroquois,4.0,1
Nordic Nats.,4.0,1
W. Australia,4.0,1


In [26]:
sm_model = sm.Logit(symp6.PSys, sm.add_constant(symp6.SymP6)).fit(disp=0)

In [27]:
sm_model.pvalues

const    0.034672
SymP6    0.036682
dtype: float64

In [28]:
df = pd.DataFrame(columns=['pvalue'])

In [29]:
# Take an explicit copy so that assigning a new column to curr_col modifies an
# independent DataFrame instead of raising pandas' SettingWithCopyWarning.
curr_col = peace_sys[['SymP6']].copy()

In [30]:
curr_col['PSys'] = peace_sys.PSys

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
curr_col

Unnamed: 0_level_0,SymP6,PSys
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Gilbertese,9,2
Marshallese,2,2
E. Pomo,9,2
Popoluca,9,2
Konso,3,2
Bribri,9,2
Tallensi,9,2
Russians,9,2
Trukese,9,2
Toraja,9,2


In [32]:
peace_sys['PSys'].name

'PSys'

In [84]:
def logit_pvalues_with_nan(data, target='PSys'):
    """Fit one single-predictor logistic regression per column and collect p-values.

    For every column of ``data`` other than ``target``, rows where either that
    column or ``target`` is NaN are dropped, a LogisticRegression is fit on the
    remaining rows, and the predictor's p-value is taken from ``logit_pvalue``.

    parameters:
        data:   DataFrame holding the candidate predictors and the target column
        target: name of the outcome column (default 'PSys')
    returns:
        DataFrame indexed by 'Variable' (the predictor name) with a single
        'pvalue' column, in the column order of ``data``.
    """
    predictors = data.drop(target, axis=1).columns
    pvalues = []

    for label in predictors:
        # Pairwise deletion: keep only rows where both the predictor and the
        # target are present.
        pair = data[[label, target]].dropna()
        outcome = pair[target]
        design = pair.drop(target, axis=1)

        logreg = LogisticRegression(random_state=42, C=1e9)
        logreg.fit(design, outcome)

        # logit_pvalue returns the intercept's p-value at index 0 and the
        # predictor's at index 1; only the predictor's is kept.
        pvalues.append(logit_pvalue(logreg, design)[1])

    res = pd.DataFrame({'Variable' : predictors, 'pvalue' : pvalues})
    return res.set_index('Variable')

In [85]:
# Reload the table, drop the SCCS / Coder / PSysRec columns, and turn every
# cell equal to 9 into NaN.
peace_sys = (
    load_peace_sys_data()
    .drop(['SCCS', 'Coder', 'PSysRec'], axis=1)
    .replace(9, np.nan)
)

# Columns handed to the per-variable regressions; 'PSys' is included because
# logit_pvalues_with_nan uses it as the target.
NON_WAR_VARS = [
    'SymP6',
    'NWNorm5.1',
    'RitP6',
    'Dep3.3Econ',
    'Int2.4Hist',
    'ID1.1Over',
    'NWVal4.1',
    'Int2.2Econ',
    'Dep3.2Ecol',
    'CM8.5Peace',
    'PSys',
]

res = logit_pvalues_with_nan(peace_sys[NON_WAR_VARS])
res

Unnamed: 0_level_0,pvalue
Variable,Unnamed: 1_level_1
SymP6,0.036682
NWNorm5.1,0.007431
RitP6,0.013339
Dep3.3Econ,0.010404
Int2.4Hist,0.028189
ID1.1Over,0.00798
NWVal4.1,0.01349
Int2.2Econ,0.019053
Dep3.2Ecol,0.031614
CM8.5Peace,0.932156


In [36]:
# NOTE(review): stale scratch cell — its execution count (36) is lower than that of
# the cell preceding it (85), and it rebuilds `res` from a global `X` and a previous
# `res` left in the kernel. Under Restart & Run All the result depends on whatever
# those names hold at this point; verify or remove.
res = pd.DataFrame({'Variable' : X.columns, 'pvalue' : res})
res = res.set_index('Variable')

In [37]:
len(X.columns) 

32

In [71]:
# Every column of peace_sys except 'PSys'.
data_copy = peace_sys.drop(columns='PSys')
# One zero-initialised slot per remaining column.
pvalues = np.zeros(data_copy.shape[1])



In [74]:
data_copy.columns

Index(['ID1.1Over', 'ID1.2Ethno', 'Int2.1Mar', 'Int2.2Econ', 'Int2.3Pol',
       'Int2.4Hist', 'Dep3.1Sec', 'Dep3.2Ecol', 'Dep3.3Econ', 'NWVal4.1',
       'WVal4.2', 'NWNorm5.1', 'WNorm5.2', 'MythsP6', 'MythsWar6', 'RitP6',
       'RitWar6', 'SymP6', 'SymWar6', 'SuperOrd7', 'CM8.1Neg', 'CM8.2Med',
       'CM8.3Arb', 'CM8.4Adj', 'CM8.5Peace', 'CM8.6War', 'Lead9.1P',
       'Lead9.2War', 'Cult10.1Com', 'Cult10.2Diff', 'Comp10.3', 'InComp10.4'],
      dtype='object')