In [1]:
from fastai.structured import *
from fastai.column_data import *
from fastai import metrics
import pandas as pd
from torch.nn import functional as F

In [2]:
class MixedInputModel(nn.Module):
    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                 y_range=None, use_bn=False):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        for emb in self.embs: emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont=n_emb, n_cont
        
        szs = [n_emb+n_cont] + szs
        self.lins = nn.ModuleList([
            nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in szs[1:]])
        for o in self.lins: kaiming_normal(o.weight.data)
        self.outp = nn.Linear(szs[-1], out_sz)
        kaiming_normal(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn,self.y_range = use_bn,y_range

    def forward(self, x_cat, x_cont):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
        for l,d,b in zip(self.lins, self.drops, self.bns):
            x = F.relu(l(x))
            if self.use_bn: x = b(x)
            x = d(x)
        x = self.outp(x)
        if self.y_range:
            x = F.sigmoid(x)
            x = x*(self.y_range[1] - self.y_range[0])
            x = x+self.y_range[0]
        return x

In [3]:
class ColumnarDataset(Dataset):
    def __init__(self, cats, conts, y):
        n = len(cats[0]) if cats else len(conts[0])
        self.cats = np.stack(cats, 1).astype(np.int64) if cats else np.zeros((n,1))
        self.conts = np.stack(conts, 1).astype(np.float32) if conts else np.zeros((n,1))
        self.y = np.zeros((n,1)) if y is None else y.values # THIS LINE IS CHANGED FROM y[:, None]
        
    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]

    @classmethod
    def from_data_frames(cls, df_cat, df_cont, y=None):
        cat_cols = [c.values for n,c in df_cat.items()]
        cont_cols = [c.values for n,c in df_cont.items()]
        return cls(cat_cols, cont_cols, y)

    @classmethod
    def from_data_frame(cls, df, cat_flds, y=None):
        return cls.from_data_frames(df[cat_flds], df.drop(cat_flds, axis=1), y)

In [4]:
class ColumnarModelData(ModelData):
    def __init__(self, path, trn_ds, val_ds, bs=None, ts_bs=None,  sampler=None, test_ds=None, shuffle=None): ## add batch_sampler
        test_dl = DataLoader(test_ds, ts_bs, shuffle=False, num_workers=1) if test_ds is not None else None
        super().__init__(path, DataLoader(trn_ds, batch_size=bs, sampler=sampler, shuffle=shuffle, num_workers=1),
            DataLoader(val_ds, bs, shuffle=False, num_workers=1), test_dl)

    @classmethod
    def from_arrays(cls, path, val_idxs, xs, y, bs=None, ts_bs=None, sampler=None, test_xs=None, shuffle=True):
        ((val_xs, trn_xs), (val_y, trn_y)) = split_by_idx(val_idxs, xs, y)
        test_ds = PassthruDataset(*(test_xs.T), [0] * len(test_xs)) if test_xs is not None else None
        return cls(path, PassthruDataset(*(trn_xs.T), trn_y), PassthruDataset(*(val_xs.T), val_y),
                   bs, shuffle=shuffle, test_ds=test_ds)

    @classmethod
    def from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs=None, ts_bs=None, sampler=None, test_df=None):
        test_ds = ColumnarDataset.from_data_frame(test_df, cat_flds) if test_df is not None else None
        return cls(path, ColumnarDataset.from_data_frame(trn_df, cat_flds, trn_y),
                    ColumnarDataset.from_data_frame(val_df, cat_flds, val_y), bs, sampler=sampler, test_ds=test_ds)

    @classmethod
    def from_data_frame(cls, path, val_idxs, df, y, cat_flds, bs=None,  ts_bs=None, sampler=None, test_df=None):
        ((val_df, trn_df), (val_y, trn_y)) = split_by_idx(val_idxs, df, y)
        return cls.from_data_frames(path, trn_df, val_df, trn_y, val_y, cat_flds, bs, sampler=sampler, test_df=test_df)

    def get_learner(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                    y_range=None, use_bn=False, **kwargs):
        model = MixedInputModel(emb_szs, n_cont, emb_drop, out_sz, szs, drops, y_range, use_bn)
        return StructuredLearner(self, StructuredModel(to_gpu(model)), opt_fn=optim.Adam, **kwargs)


In [5]:
path='/home/paperspace/data/talkingdata/'
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

In [6]:
train_df = pd.read_feather(f'{path}train_df')
test_df = pd.read_feather(f'{path}test_df')

In [7]:
cat_vars =['app', 'channel', 'device', 'ip'
           , 'os', 'hour', 'day', 'wday', 'qty', 'ip_app_count',
       'ip_app_os_count']
dep = 'is_attributed'
n = len(train_df); n

40000000

In [8]:
train_df = train_df[cat_vars+[dep]]

In [9]:
for v in cat_vars: train_df[v] = train_df[v].astype('category').cat.as_ordered()
for v in cat_vars: test_df[v] = test_df[v].astype('category').cat.as_ordered()

In [10]:
apply_cats(test_df, train_df)

'Categorical.from_codes(codes, categories)'?
  df[n] = pd.Categorical(c, categories=trn[n].cat.categories, ordered=True)


In [11]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
test_df[['click_id']] = test_df[['click_id']].astype("int32")

In [12]:
idxs = get_cv_idxs(n, val_pct=0.001)
train_df_samp = train_df.iloc[idxs]
samp_size = len(train_df_samp); samp_size

40000

In [13]:
cat_sz = [(c, len(train_df[c].cat.categories)+1) for c in cat_vars]

In [14]:
df, y, nas = proc_df(train_df_samp, 'is_attributed', do_scale=False)
del train_df

In [15]:
df_test, y_test, nas = proc_df(test_df[cat_vars+[dep]], 'is_attributed', do_scale=False)
del test_df

In [16]:
import gc
gc.collect()

351

In [17]:
one_pct =np.sum(y==1)/len(y)
one_pct

0.002575

In [18]:
val_idx = list(range(round(len(df)*0.8), len(df)))

In [19]:
trn_y = np.delete(y, val_idx)
weights = np.zeros(len(trn_y))

for i in range(len(trn_y)):
    if(trn_y[i]==1):
        weights[i] = 1
    else:
        weights[i] = 0.1
del trn_y

In [20]:
md = ColumnarModelData.from_data_frame(path, val_idx, df, pd.Series(y.astype('int')), cat_flds=cat_vars,bs=128*2, ts_bs=128,
                                       sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights)),
                                       test_df = df_test) 

In [21]:
# weighted sample see flag
#next(iter(md.trn_dl))[2]

In [22]:
emb_szs = [(c, max(50, round(500*(1/c)))) for _,c in cat_sz]

In [23]:
model = MixedInputModel(emb_szs, n_cont=0, emb_drop=0.1, out_sz=2, szs=[1000, 500, 100], drops=[0.1, 0.1, 0.1]).cuda()

In [24]:
bm = BasicModel(model, 'binary_classifier')

In [25]:
# you can simply create learner with any custom model and data
# source code is here
class StructuredLearner(Learner):
    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)
        self.crit = F.mse_loss


learn = StructuredLearner(md, bm)

In [26]:
# note that we've changed the model class' crit attribute
# this is not recommended from an OOP perspective 
# but it's handy here
learn.crit = F.cross_entropy
learn.crit

<function torch.nn.functional.cross_entropy>

In [31]:
learn.lr_find()

epoch      trn_loss   val_loss                               
    0      0.207456   35.510422 



In [27]:
%matplotlib inline
learn.sched.plot()

AttributeError: 'NoneType' object has no attribute 'plot'

In [28]:
lr = 0.01
learn.fit(lr, 1, cycle_len=1)

epoch      trn_loss   val_loss                               
    0      0.132066   0.030631  



[0.030631274]

In [34]:
#learn.save('mod_rs_1e')

In [27]:
learn.load('mod_rs_1e')

In [36]:
sklearn.metrics.roc_auc_score(y[val_idx], learn.predict()[:, 1])

0.9771188565697092

In [None]:
lr = 0.01*0.1
learn.fit(lr, 1, 1)

In [None]:
#learn.save('mod_4_4e')

In [None]:
#learn.load('mod_4_3e')

In [None]:
sklearn.metrics.roc_auc_score(y[val_idx], learn.predict()[:, 1])

In [None]:
## if memory runs out, delete dataloader object (md), create only test_dl.
## in the future, should ve create separate dataloader for train and valid, and test sets.

pred = learn.predict_dl(md.test_dl)

In [None]:
i_batch, sample_batched in 
enumerate(dataloader)

In [None]:
# softmax
expsums = np.exp(preds).sum(axis=1)
probs = np.exp(preds) / expsums[:, None]

In [None]:
test_df['is_attributed'] = probs[:, 1]

In [None]:
sub = test_df[['click_id','is_attributed']]

In [None]:
sub.to_csv('/home/paperspace/data/talkingdata/sub/'+"sub_dp_6.csv", index=False)