In [1]:
import xlearn as xl
import pandas as pd, numpy as np
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load data
d = pd.read_csv("turnout_train.csv")
ta = pd.read_csv("turnout_to_assign.csv")

In [3]:
d.columns

Index(['id', 'state', 'treat', 'voted_2006', 'voted_2008', 'voted_2009',
       'voted_2010', 'voted_2011', 'voted_2012', 'voted_2013', 'voted_2014',
       'i_age', 'age_miss', 'voting_history_label', 'black', 'hispanic',
       'other_race', 'white', 'female', 'notfem', 'married'],
      dtype='object')

In [4]:
# no treat, voted_2014. 
# We want to know whether to treat them or not by predicting the differences in Pr(voted_2014) between the treated and untreated.
ta.columns

Index(['id', 'state', 'voted_2006', 'voted_2008', 'voted_2009', 'voted_2010',
       'voted_2011', 'voted_2012', 'voted_2013', 'i_age', 'age_miss',
       'voting_history_label', 'black', 'hispanic', 'other_race', 'white',
       'female', 'notfem', 'married'],
      dtype='object')

In [5]:
# Combine data and create train/test split
comb = pd.concat([d, ta], ignore_index=True)
comb['to_assign'] = comb['voted_2014'].isna() # people for which we want to assign treatment
train_idx = comb[~comb['to_assign']].sample(frac=0.8, random_state=72540).index # train set
comb['train'] = comb.index.isin(train_idx)
comb

Unnamed: 0,id,state,treat,voted_2006,voted_2008,voted_2009,voted_2010,voted_2011,voted_2012,voted_2013,...,voting_history_label,black,hispanic,other_race,white,female,notfem,married,to_assign,train
0,2426706,AZ,0.0,0,1,0,0,0,1,0,...,above,0,0,0,1,1,0,0,False,True
1,2426707,FL,1.0,0,1,0,1,0,1,0,...,above,1,0,0,0,0,1,0,False,True
2,2426709,FL,1.0,0,1,0,0,0,1,0,...,average,0,0,0,1,1,0,0,False,True
3,2426710,LA,1.0,0,0,0,1,0,1,0,...,average,1,0,0,0,1,0,0,False,True
4,2426711,WI,1.0,0,1,0,1,0,1,0,...,above,0,0,0,1,1,0,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2826357,4414214,WI,,1,1,0,1,0,1,0,...,above,0,0,0,1,1,0,0,True,False
2826358,4414215,AR,,0,1,0,1,0,1,0,...,above,1,0,0,0,0,1,0,True,False
2826359,4414216,KY,,0,1,0,1,1,1,0,...,above,1,0,0,0,1,0,1,True,False
2826360,4414217,FL,,1,0,0,0,0,1,0,...,average,0,0,0,1,1,0,0,True,False


In [6]:
# Fit separate models for treatment and control
comb = pd.get_dummies(comb, columns=['state', 'voting_history_label'], drop_first=True)

comb_train = comb[(comb['train']) & (~comb['to_assign'])]
comb_test = comb[(~comb['train']) & (~comb['to_assign'])]
comb_to_predict = comb[comb['to_assign']]

X_train = comb_train.drop(['voted_2014', 'to_assign', 'train'], axis=1)
X_test = comb_test.drop(['voted_2014', 'to_assign', 'train'], axis=1)
X_to_predict = comb_to_predict.drop(['voted_2014', 'to_assign', 'train'], axis=1)

X_train_treat = X_train[X_train['treat'] == 1]
X_train_control = X_train[X_train['treat'] == 0]
y_train_treat = comb_train[comb_train['treat'] == 1]['voted_2014']
y_train_control = comb_train[comb_train['treat'] == 0]['voted_2014']

X_train_treat = X_train_treat.drop('treat', axis=1)
X_train_control = X_train_control.drop('treat', axis=1)
X_test = X_test.drop('treat', axis=1)
X_to_predict = X_to_predict.drop('treat', axis=1)

In [7]:
model_treat = LogisticRegression().fit(X_train_treat, y_train_treat)
model_control = LogisticRegression().fit(X_train_control, y_train_control)

: 

In [None]:
X_to_predict

Unnamed: 0,id,treat,voted_2006,voted_2008,voted_2009,voted_2010,voted_2011,voted_2012,voted_2013,i_age,...,state_LA,state_ME,state_MI,state_NC,state_NH,state_SD,state_TX,state_WI,voting_history_label_average,voting_history_label_below
1412144,3000001,,0,1,0,1,0,1,0,61.000000,...,False,False,False,False,False,False,False,False,False,False
1412145,3000002,,0,0,0,1,1,1,0,41.813999,...,False,False,False,False,False,False,False,True,False,False
1412146,3000003,,0,1,0,0,0,1,0,36.000000,...,False,False,False,False,False,False,False,False,True,False
1412147,3000004,,0,0,0,0,0,0,0,26.000000,...,False,False,False,False,False,False,False,False,False,True
1412148,3000005,,0,1,0,0,0,1,0,40.000000,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2826357,4414214,,1,1,0,1,0,1,0,46.000000,...,False,False,False,False,False,False,False,True,False,False
2826358,4414215,,0,1,0,1,0,1,0,63.000000,...,False,False,False,False,False,False,False,False,False,False
2826359,4414216,,0,1,0,1,1,1,0,32.000000,...,False,False,False,False,False,False,False,False,False,False
2826360,4414217,,1,0,0,0,0,1,0,67.000000,...,False,False,False,False,False,False,False,False,True,False


In [18]:
model_treat.predict_proba(X_to_predict)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [12]:
# Get predictions
comb['y_1_hat'] = model_treat.predict_proba(X_to_predict)[:, 1]
comb['y_0_hat'] = model_control.predict_proba(X_to_predict)[:, 1]

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- state
- voting_history_label
Feature names seen at fit time, yet now missing:
- state_AR
- state_AZ
- state_CO
- state_FL
- state_GA
- ...


In [None]:


# Calculate treatment effect and decision
comb['effect_hat'] = comb['y_1_hat'] - comb['y_0_hat']
min_effect_to_treat = 0.80 / 150  # From the R code
comb['effect_hat_net'] = comb['effect_hat'] - min_effect_to_treat
comb['should_treat'] = comb['effect_hat_net'] > 0

# Export predictions
output = comb[comb['to_assign']][['id', 'should_treat']].rename(columns={'should_treat': 'treat'})
output['treat'] = output['treat'].astype(int)
output.to_csv("example_output.csv.gz", index=False, compression='gzip')



In [13]:
# Read small_train.txt
df = pd.read_csv("./small_train.txt", sep=" ", header=None)
df

ParserError: Error tokenizing data. C error: Expected 18 fields in line 9, saw 22


In [14]:
ffm_model = xl.create_ffm() # Use field-aware factorization machine
ffm_model.setTrain("./turnout_to_assign.csv")  # Training data


In [11]:
# Training task
ffm_model = xl.create_ffm() # Use field-aware factorization machine
ffm_model.setTrain("./small_train.txt")  # Training data
ffm_model.setValidate("./small_test.txt")  # Validation data

# param:
#  0. binary classification
#  1. learning rate: 0.2
#  2. regular lambda: 0.002
#  3. evaluation metric: accuracy
param = {'task':'binary', 'lr':0.2,
         'lambda':0.002, 'metric':'acc'}

# Start to train
# The trained model will be stored in model.out
ffm_model.fit(param, './model.out')

# Prediction task
ffm_model.setTest("./small_test.txt")  # Test data
ffm_model.setSigmoid()  # Convert output to 0-1

# Start to predict
# The output result will be stored in output.txt
ffm_model.predict("./model.out", "./output.txt")


[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 2 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./small_train.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./small_test.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 9991
[32m[------------