# LDML

In [1]:
import pandas as pd
from double_ml_data import DoubleMLData
# Load the 401(k) data. The path is written as a raw string so the Windows
# backslashes stay literal: "\R", "\L" and "\d" are not valid escape sequences
# and a plain string literal triggers a SyntaxWarning on recent Python versions.
# NOTE(review): this is an absolute, machine-specific path -- consider a
# configurable data directory.
data=pd.read_csv(r"D:\Rworkspace\LDML\data.csv",index_col=0)
# Shift the index down by one (the summary printed below reports it as 0..9914).
data.index=data.index-1
# Outcome: net_tfa, treatment: p401, remaining listed columns are the covariates.
dataset=DoubleMLData(data,"net_tfa","p401",x_cols=["age","inc","educ","fsize","marr","twoearn","db","pira","hown" ])

In [2]:
# Print the DoubleMLData summary: outcome, treatment, covariates, instruments,
# number of observations and the underlying DataFrame info.
print(dataset)


------------------ Data summary      ------------------
Outcome variable: net_tfa
Treatment variable(s): ['p401']
Covariates: ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown']
Instrument variable(s): None
No. Observations: 9915

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9915 entries, 0 to 9914
Columns: 14 entries, nifa to hown
dtypes: float64(3), int64(11)
memory usage: 1.1 MB



In [3]:
from sklearn.ensemble import RandomForestRegressor
from ldml import LocalizedDML
# Random-forest regressor handed to LocalizedDML as its learner.
learner = RandomForestRegressor(
    n_estimators=50,
    max_depth=7,
    max_features=3,
    min_samples_leaf=3,
)
# The list holds the levels reported as `gamma` in the fit output (0.25, 0.5, 0.75).
# NOTE(review): the positional 5 is presumably the number of folds -- confirm
# against the LocalizedDML signature.
final_model = LocalizedDML(
    [0.25, 0.5, 0.75],
    dataset,
    5,
    learner,
    trim_type='clip',
    semiadaptive=False,
)

In [4]:
# Run the LocalizedDML estimation and print the result table: one row per gamma
# with columns q1, q0, qte and se1, se0, seqte (presumably the matching
# standard errors -- confirm against ldml).
result=final_model.fit()
print(result)

   gamma       q1       q0     qte        se1        se0      seqte
0   0.25    272.0   -791.0  1063.0   3.981826  16.431441  17.185666
1   0.50   5513.0    975.0  4538.0  24.726983  23.947594  37.513267
2   0.75  22899.0  15199.0  7700.0  40.303786  22.804206  53.125206


# 含缺失值线性模型估计

In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression,LogisticRegression
from missing_data import MissingDataLinear,MissingDataLogistics

In [6]:
# Linear model, Y partially missing
def test_data_generator(n=1000,miss_ratio=0.2,seed=None):
    """Simulate a linear-model sample in which part of the response Y is missing.

    X1 and X2 are drawn from N(1, 1), X3 from N(2, 1), and
    Y = 2*X1 + 0.5*X2 + 3*X3 + e with e ~ N(0, 1). Exactly
    ``int(n*miss_ratio)`` entries of Y, at randomly shuffled positions that do
    not depend on the data, are replaced by NaN.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    miss_ratio : float
        Fraction of Y values to blank out.
    seed : int or None
        ``None`` (default) draws from the global ``numpy.random`` state, exactly
        as before this parameter existed. Any other value seeds a private
        ``RandomState`` so the sample is reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``X1``, ``X2``, ``X3``, ``Y``; index ``0..n-1``.
    """
    # Both the numpy.random module and RandomState expose normal() and shuffle().
    rng=np.random if seed is None else np.random.RandomState(seed)
    n_miss=int(n*miss_ratio)
    n_obs=n-n_miss
    x1=rng.normal(1, 1, n)
    x2=rng.normal(1, 1, n)
    x3=rng.normal(2, 1, n)
    y=2*x1+0.5*x2+3*x3+rng.normal(0, 1, n)
    # 1 = observed, 0 = missing; shuffling spreads the missing rows at random.
    mask=np.append(np.ones(n_obs),np.zeros(n_miss))
    rng.shuffle(mask)
    y=np.where(mask==1,y,np.nan)
    return pd.DataFrame({'X1':x1,'X2':x2,'X3':x3,'Y':y},index=np.arange(n))

# Draw 1000 rows with the generator's default missing ratio and display the frame.
data=test_data_generator(n=1000)
data

Unnamed: 0,X1,X2,X3,Y
0,2.017284,0.969672,1.322154,9.048274
1,-0.384347,1.042059,1.934429,5.620539
2,1.718156,1.527338,2.110364,11.741008
3,0.596675,-0.165156,3.075305,9.653289
4,1.220425,0.828551,2.486482,11.982931
...,...,...,...,...
995,0.432514,-0.014072,1.296396,5.218871
996,0.666455,-0.595158,3.454589,8.383418
997,0.812427,2.275764,2.419934,9.767427
998,1.867103,0.472542,1.959441,9.226150


In [7]:
# Estimate the linear model of Y on X1, X2, X3 from the sample with missing Y.
# NOTE(review): the trailing 5 is presumably the number of folds -- confirm
# against the MissingDataLinear signature.
lm=LinearRegression()
m=MissingDataLinear(data,'Y',['X1','X2','X3'],lm,5)
# The cell output is a table with a `coef` and a `ste` column per regressor.
m.fit()

Unnamed: 0,coef,ste
X1,2.00705,0.030549
X2,0.432766,0.034
X3,2.999671,0.023252


In [8]:
# Linear model, X partially missing
def test_data_generator(n=1000,miss_ratio=0.1,seed=None):
    """Simulate a linear-model sample in which X1 and X2 are partly missing.

    X1 and X2 are drawn from N(1, 1), X3 from N(2, 1), and
    Y = 2*X1 + 0.5*X2 + 3*X3 + e with e ~ N(0, 1), computed from the complete
    covariates. Afterwards ``int(n*miss_ratio)`` entries of X1 and, using a
    fresh shuffle of the same mask, ``int(n*miss_ratio)`` entries of X2 are
    replaced by NaN, so the two missing patterns differ.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    miss_ratio : float
        Fraction of values to blank out in each of X1 and X2.
    seed : int or None
        ``None`` (default) draws from the global ``numpy.random`` state, exactly
        as before this parameter existed. Any other value seeds a private
        ``RandomState`` so the sample is reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``X1``, ``X2``, ``X3``, ``Y``; index ``0..n-1``.
    """
    # Both the numpy.random module and RandomState expose normal() and shuffle().
    rng=np.random if seed is None else np.random.RandomState(seed)
    n_miss=int(n*miss_ratio)
    n_obs=n-n_miss
    x1=rng.normal(1, 1, n)
    x2=rng.normal(1, 1, n)
    x3=rng.normal(2, 1, n)
    y=2*x1+0.5*x2+3*x3+rng.normal(0, 1, n)
    # 1 = observed, 0 = missing.
    mask=np.append(np.ones(n_obs),np.zeros(n_miss))
    rng.shuffle(mask)
    x1=np.where(mask==1,x1,np.nan)
    # Re-shuffle so X2 gets its own missing positions.
    rng.shuffle(mask)
    x2=np.where(mask==1,x2,np.nan)
    return pd.DataFrame({'X1':x1,'X2':x2,'X3':x3,'Y':y},index=np.arange(n))

# Draw 1000 rows with the generator's default missing ratio.
data=test_data_generator(n=1000)

In [9]:
# Set up the linear-model estimator on the sample with missing X1/X2 and print
# its summary (variables, which ones contain missing values, counts, model used).
# NOTE(review): the trailing 5 is presumably the number of folds -- confirm
# against the MissingDataLinear signature.
lm=LinearRegression()
m=MissingDataLinear(data,'Y',['X1','X2','X3'],lm,5)
print(m)

Dependent variable: Y
Independent variables:['X1', 'X2', 'X3']
Missing variable(s): ['X1', 'X2']
Non-missing independent variable(s): ['X3']
No. Observations: 1000
Missing num: 187.0
-----------------------
Using model:LinearRegression()


In [10]:
# Fit and print the result table (`coef` and `ste` columns, one row per regressor).
result=m.fit()
print(result)

        coef       ste
X1  2.058933  0.043987
X2  0.511401  0.034546
X3  2.978563  0.024917


In [11]:
# Logistic model, Y partially missing
def test_data_generator(n=1000,miss_ratio=0.2,seed=None):
    """Simulate a binary-response sample in which part of Y is missing.

    X1 and X2 are drawn from N(1, 1) and X3 from N(2, 1). The success
    probability is the logistic transform of
    2*X1 + 0.5*X2 + 3*X3 + e with e ~ N(0, 0.5) (standard deviation 0.5), so
    Gaussian noise enters the linear predictor itself. Y is a Bernoulli draw
    with that probability; ``int(n*miss_ratio)`` entries of Y, at randomly
    shuffled positions, are then replaced by NaN.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    miss_ratio : float
        Fraction of Y values to blank out.
    seed : int or None
        ``None`` (default) draws from the global ``numpy.random`` state, exactly
        as before this parameter existed. Any other value seeds a private
        ``RandomState`` so the sample is reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``X1``, ``X2``, ``X3``, ``Y``; index ``0..n-1``. Observed Y
        values are 0.0 or 1.0 (float, because of the NaN entries).
    """
    # The numpy.random module and RandomState share normal(), binomial(), shuffle().
    rng=np.random if seed is None else np.random.RandomState(seed)
    n_miss=int(n*miss_ratio)
    n_obs=n-n_miss
    x1=rng.normal(1, 1, n)
    x2=rng.normal(1, 1, n)
    x3=rng.normal(2, 1, n)
    p=1/(1+np.exp(-(2*x1+0.5*x2+3*x3+rng.normal(0, 0.5, n))))
    y=rng.binomial(1,p)
    # 1 = observed, 0 = missing; shuffling spreads the missing rows at random.
    mask=np.append(np.ones(n_obs),np.zeros(n_miss))
    rng.shuffle(mask)
    y=np.where(mask==1,y,np.nan)
    return pd.DataFrame({'X1':x1,'X2':x2,'X3':x3,'Y':y},index=np.arange(n))

# Draw 10000 rows with the generator's default missing ratio.
data=test_data_generator(n=10000)

In [12]:
# Set up the logistic-model estimator on the sample with missing Y and print
# its summary (variables, which ones contain missing values, counts, model used).
# NOTE(review): the trailing 5 is presumably the number of folds -- confirm
# against the MissingDataLogistics signature.
model=LogisticRegression(C=1)
m=MissingDataLogistics(data,'Y',['X1','X2','X3'],model,5)
print(m)

Dependent variable: Y
Independent variables:['X1', 'X2', 'X3']
Missing variable(s): Y
Non-missing independent variable(s): ['X1', 'X2', 'X3']
No. Observations: 10000
Missing num: 2000.0
-----------------------
Using model:LogisticRegression(C=1)


In [13]:
# Fit; the cell output is an array of three values, presumably the estimated
# coefficients for X1, X2, X3 in that order -- confirm against missing_data.
m.fit()

array([1.84425399, 0.47178011, 2.75488699])

In [14]:
# Logistic model, X partially missing
def test_data_generator(n=1000,miss_ratio=0.2,seed=None):
    """Simulate a binary-response sample in which X1 and X2 are partly missing.

    X1 and X2 are drawn from N(1, 1) and X3 from N(2, 1). The success
    probability is the logistic transform of
    2*X1 + 0.5*X2 + 3*X3 + e with e ~ N(0, 0.5) (standard deviation 0.5), so
    Gaussian noise enters the linear predictor itself. Y is a Bernoulli draw
    with that probability, computed from the complete covariates. Afterwards
    ``int(n*miss_ratio)`` entries of X1 and, using a fresh shuffle of the same
    mask, ``int(n*miss_ratio)`` entries of X2 are replaced by NaN.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    miss_ratio : float
        Fraction of values to blank out in each of X1 and X2.
    seed : int or None
        ``None`` (default) draws from the global ``numpy.random`` state, exactly
        as before this parameter existed. Any other value seeds a private
        ``RandomState`` so the sample is reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``X1``, ``X2``, ``X3``, ``Y``; index ``0..n-1``.
    """
    # The numpy.random module and RandomState share normal(), binomial(), shuffle().
    rng=np.random if seed is None else np.random.RandomState(seed)
    n_miss=int(n*miss_ratio)
    n_obs=n-n_miss
    x1=rng.normal(1, 1, n)
    x2=rng.normal(1, 1, n)
    x3=rng.normal(2, 1, n)
    p=1/(1+np.exp(-(2*x1+0.5*x2+3*x3+rng.normal(0, 0.5, n))))
    y=rng.binomial(1,p)
    # 1 = observed, 0 = missing.
    mask=np.append(np.ones(n_obs),np.zeros(n_miss))
    rng.shuffle(mask)
    x1=np.where(mask==1,x1,np.nan)
    # Re-shuffle so X2 gets its own missing positions.
    rng.shuffle(mask)
    x2=np.where(mask==1,x2,np.nan)
    return pd.DataFrame({'X1':x1,'X2':x2,'X3':x3,'Y':y},index=np.arange(n))

# Draw 5000 rows with a missing ratio of 0.1 (passed positionally as n, miss_ratio).
data=test_data_generator(5000,0.1)

In [15]:
# Set up the logistic-model estimator on the sample with missing X1/X2 and print
# its summary (variables, which ones contain missing values, counts, model used).
# NOTE(review): the trailing 5 is presumably the number of folds -- confirm
# against the MissingDataLogistics signature.
model=LogisticRegression(C=1)
m=MissingDataLogistics(data,'Y',['X1','X2','X3'],model,5)
print(m)

Dependent variable: Y
Independent variables:['X1', 'X2', 'X3']
Missing variable(s): ['X1', 'X2']
Non-missing independent variable(s): ['X3']
No. Observations: 5000
Missing num: 955.0
-----------------------
Using model:LogisticRegression(C=1)


In [16]:
# Fit; the cell output is an array of three values, presumably the estimated
# coefficients for X1, X2, X3 in that order -- confirm against missing_data.
m.fit()

array([1.86171282, 0.52375148, 2.79711691])