In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
torch.cuda.is_available()

False

In [10]:
nn.Sequential(*[nn.Linear(1,2), nn.ReLU()])

Sequential(
  (0): Linear(in_features=1, out_features=2, bias=True)
  (1): ReLU()
)

In [225]:
class MLPClassifer(nn.Module):
    def __init__(self, nfeature, hidden_units=32, nlayers=3, dropout=0):
        super().__init__()
        
        models = [nn.Linear(nfeature, hidden_units), nn.Dropout(p=dropout)]
        for _ in range(nlayers-1):
            models.extend([nn.Linear(hidden_units, hidden_units)])
            
        models.append(nn.Linear(hidden_units, 2))
        
        self.model = nn.Sequential(*models)
        
    def forward(self, features):
        
        return self.model(features)

In [78]:
class TokenFeatureDataset(torch.utils.data.Dataset):
    def __init__(self, src, tgt):
        super().__init__()
        
        self.src = src
        self.tgt = tgt
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def __getitem__(self, index):
        return{
            "id": index,
            "feature": self.src[index],
            "label": self.tgt[index],
        }
    
    def __len__(self):
        return len(self.src)
    
    def collater(self, samples):
        def merge(key, dtype=torch.float32):
            return torch.tensor([s[key] for s in samples], 
                                dtype=dtype,
                                device=self.device)
        
        batch = {
            'id': merge('id'),
            'feature': merge('feature'),
            'label': merge('label', dtype=torch.long),
        }
        return batch
    
    def get_nfeature(self):
        return len(self.src[0])
    

In [14]:
from collections import Counter

def get_ngram_freq(file, ngram=4):
    res = Counter()
    prev = ['</s>']
    with open(file) as fin:
        for i, line in enumerate(fin):
            if i % 100000 == 0:
                print(f'procesed {i} lines')
            for tok in line.strip().split():
                prev = prev[-ngram:]
                for j in range(1, ngram+1):
                    res[' '.join(prev[-j:])] += 1  
                prev.append(tok)
                
            prev.append('</s>')
    
    return res

In [29]:
def preprocess_data(hypos, freq_cnt, ngram=4):
    features = []
    labels = []
    prev = ['</s>']
    for hypo in hypos:
        for i in range(len(hypo['string'])):
            local_f = []
            # confidence-related features
            local_f.extend([hypo['lm_entropy'][i], np.exp(hypo['lm_max'][i])])
            labels.append(int((hypo['positional_scores'][i] - hypo['lm_scores'][i]) > 0.01))
            # frequency-related featuress
            tok = hypo['string'][i]
            prev = prev[-ngram:]
            for j in range(1, ngram+1):
                local_f.append(freq_cnt[' '.join(prev[-j:])])
            prev.append(tok)
            
            features.append(local_f)
    
    return features, labels

In [2]:
hypos = []
with open('../analysis_new.jsonl') as fin:
    for line in fin:
        hypos.append(json.loads(line.strip()))

In [17]:
hypos[0].keys()

dict_keys(['string', 'tokens', 'positional_scores', 'knn_scores', 'lm_scores', 'lm_entropy', 'lm_max'])

In [5]:
np.exp(np.array(hypos[0]['lm_max'][:10]))

array([0.37130827, 0.55252217, 0.99160496, 0.65741908, 0.9926836 ,
       0.99929167, 0.99984313, 0.98193099, 0.99318737, 0.38685434])

In [18]:
training_data = '/projects/junxianh/knnlmXS/examples/language_model/wikitext-103/wiki.train.tokens'

In [16]:
freq_cnt = get_ngram_freq(training_data)

procesed 0 lines
procesed 100000 lines
procesed 200000 lines
procesed 300000 lines
procesed 400000 lines
procesed 500000 lines
procesed 600000 lines
procesed 700000 lines
procesed 800000 lines
procesed 900000 lines
procesed 1000000 lines
procesed 1100000 lines
procesed 1200000 lines
procesed 1300000 lines
procesed 1400000 lines
procesed 1500000 lines
procesed 1600000 lines
procesed 1700000 lines
procesed 1800000 lines


In [24]:
freq_cnt["you hate me"]

2

In [19]:
freq_cnt = Counter()

In [125]:
features, labels = preprocess_data(hypos, freq_cnt)

In [32]:
len(features)

217646

In [33]:
len(labels)

217646

In [35]:
labels[:20]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [28]:
features[2]

[0.10498046875, 0.9916049558943966, 0, 0, 0, 0]

In [189]:
x_train, x_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=22)

In [38]:
len(x_train)

174116

In [48]:
x_train[10]

[9.119510650634766e-06, 0.9999994039537299, 0, 0, 0, 0]

In [40]:
len(y_train)

174116

In [190]:
scaler = StandardScaler()
scaler = scaler.fit(x_train)

In [192]:
x_train_norm = scaler.transform(x_train)
x_val_norm = scaler.transform(x_val)

In [148]:
features[:10]

[[4.2578125, 0.3713082658051863, 0, 0, 0, 0],
 [2.84375, 0.5525221705240402, 0, 0, 0, 0],
 [0.10498046875, 0.9916049558943966, 0, 0, 0, 0],
 [1.0830078125, 0.6574190806558238, 0, 0, 0, 0],
 [0.0640869140625, 0.9926836038585435, 0, 0, 0, 0],
 [0.006256103515625, 0.9992916709664463, 0, 0, 0, 0],
 [0.0019893646240234375, 0.9998431328798847, 0, 0, 0, 0],
 [0.1448974609375, 0.9819309852047077, 0, 0, 0, 0],
 [0.048583984375, 0.9931873743710079, 0, 0, 0, 0],
 [1.2412109375, 0.38685434308746147, 0, 0, 0, 0]]

In [171]:
x_train_norm[:10]

array([[-1.52558698,  1.78679802,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.50609792, -1.30979425,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.49712464, -0.83988585,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.36261144, -0.42616263,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.99092171,  1.07408044,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.83561193, -1.05754866,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.35653887, -0.82163974,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.48210935,  0.00942679,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.50145036, -0.81683021,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.20147851,  0.44173925,  0.        ,  0.        ,  0.        ,
         0.        ]])

In [149]:
labels[:10]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [172]:
x_val_norm = scaler.transform(x_val)

In [50]:
x_train_norm.shape

(174116, 6)

In [183]:
x_val_norm[0]

array([-0.22743281,  0.07513852,  0.        ,  0.        ,  0.        ,
        0.        ])

In [193]:
training_set = TokenFeatureDataset(x_train_norm, y_train)
val_set = TokenFeatureDataset(x_val_norm, y_val)

In [174]:
labels[:10]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [194]:
train_dataloader = torch.utils.data.DataLoader(training_set, 
                                               batch_size=64,
                                               shuffle=False,
                                               collate_fn=training_set.collater)

val_dataloader = torch.utils.data.DataLoader(val_set,
                                             batch_size=64,
                                             shuffle=False,
                                             collate_fn=val_set.collater)

In [176]:
sample_check = next(iter(train_dataloader))

In [131]:
x_train_norm[0]

array([ 0.7721734 , -0.28048421,  0.        ,  0.        ,  0.        ,
        0.        ])

In [132]:
features[0]

[4.2578125, 0.3713082658051863, 0, 0, 0, 0]

In [118]:
hypos[0]['lm_entropy'][:10]

[4.2578125,
 2.84375,
 0.10498046875,
 1.0830078125,
 0.0640869140625,
 0.006256103515625,
 0.0019893646240234375,
 0.1448974609375,
 0.048583984375,
 1.2412109375]

In [178]:
sample_check

{'id': tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
         14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
         28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
         42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54., 55.,
         56., 57., 58., 59., 60., 61., 62., 63.]),
 'feature': tensor([[-1.5256e+00,  1.7868e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00],
         [ 1.5061e+00, -1.3098e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00],
         [ 4.9712e-01, -8.3989e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00],
         [-3.6261e-01, -4.2616e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00],
         [-9.9092e-01,  1.0741e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00],
         [ 8.3561e-01, -1.0575e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00],
         [ 3.5654e-01,

In [227]:
def validate(val_dataloader, model):
    model.eval()
    running_loss = 0.
    nsamples = 0
    truth_list = []
    prediction_list = []
    for i, sample in enumerate(val_dataloader, 0):
        inputs, truth = sample['feature'], sample['label']
        truth_list.extend(truth.tolist())
#         inputs, labels = sample_check['feature'], sample_check['label']
        outputs = model(inputs)
        loss = criterion(outputs, truth)
        
        # (batch)
        _, prediction = torch.max(outputs, dim=1)
#         import pdb; pdb.set_trace()
        
        prediction_list.extend(prediction.tolist())
        
        running_loss += loss.item() * inputs.size(0)
        nsamples += inputs.size(0)
        
    print(f"val loss: {running_loss/nsamples:.3f}")
    print(f"val accuracy: {accuracy_score(truth_list, prediction_list):.3f}")
    precision, recall, _, _ = precision_recall_fscore_support(truth_list, 
                                                              prediction_list, 
                                                              average='binary', 
                                                              pos_label=1)
    print(f"val precision: {precision:.3f}, recall: {recall:.3f}")
    
    return truth_list, prediction_list
        

In [226]:
nepochs = 30
hidden_units = 128
nlayers = 2
dropout = 0.
lr = 1e-5

model = MLPClassifer(training_set.get_nfeature(), 
                     hidden_units=hidden_units,
                     nlayers=nlayers,
                     dropout=dropout)

if torch.cuda.is_available():
    model.cuda()

In [228]:
truth_list, prediction_list = validate(val_dataloader, model)

val loss: 0.647
val accuracy: 0.659
val precision: 0.340, recall: 0.038


In [229]:
len(truth_list)

43530

In [214]:
print(truth_list[:100])

[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]


In [215]:
len(prediction_list)

43530

In [216]:
print(prediction_list[:100])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [230]:
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
model.train()
for epoch in range(nepochs):
    running_loss = 0.
    nsamples = 0
    for i, sample in enumerate(train_dataloader, 0):
        inputs, truth = sample['feature'], sample['label']
#         inputs, labels = sample_check['feature'], sample_check['label']
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, truth)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
        nsamples += inputs.size(0)
        
        if (i+1) % 500 == 0:
            print(f'epoch: {epoch}, step: {i},  training loss: {running_loss/nsamples:.3f}')
            running_loss = 0
            nsamples = 0
            
    validate(val_dataloader, model)
    model.train()
            
    
            

MLPClassifer(
  (model): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): Dropout(p=0.0, inplace=False)
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): Linear(in_features=128, out_features=2, bias=True)
  )
)
epoch: 0, step: 499,  training loss: 0.618
epoch: 0, step: 999,  training loss: 0.606
epoch: 0, step: 1499,  training loss: 0.604
epoch: 0, step: 1999,  training loss: 0.601
epoch: 0, step: 2499,  training loss: 0.601
val loss: 0.601
val accuracy: 0.669
val precision: 0.459, recall: 0.037
epoch: 1, step: 499,  training loss: 0.601
epoch: 1, step: 999,  training loss: 0.602
epoch: 1, step: 1499,  training loss: 0.602
epoch: 1, step: 1999,  training loss: 0.601
epoch: 1, step: 2499,  training loss: 0.601
val loss: 0.600
val accuracy: 0.668
val precision: 0.456, recall: 0.054
epoch: 2, step: 499,  training loss: 0.601
epoch: 2, step: 999,  training loss: 0.602
epoch: 2, step: 1499,  training loss: 0.602
epoch: 2, step: 1999,  tra

In [165]:
model.model[0].weight

Parameter containing:
tensor([[-0.2765, -0.1965,  0.2563,  0.2765, -0.2565,  0.0878],
        [-0.3355,  0.1549,  0.0846,  0.2896, -0.0298, -0.3650],
        [ 0.4995,  0.5734, -0.1531,  0.0785,  0.2103, -0.3887],
        [-0.0129, -0.1014,  0.0956,  0.0179,  0.0496,  0.0615],
        [ 0.1619,  0.2508, -0.0199,  0.3697,  0.0483,  0.1909],
        [ 0.0747,  0.1110,  0.0884, -0.2797,  0.2881, -0.1727],
        [ 0.1522,  0.0783,  0.2609,  0.0388, -0.2520,  0.1852],
        [ 0.0173, -0.0549, -0.2127,  0.0557,  0.2349,  0.3436],
        [-0.0514, -0.0788, -0.2656, -0.2521, -0.0080, -0.2199],
        [ 0.1538,  0.1476,  0.2338, -0.1833,  0.3058,  0.1342],
        [ 0.0370, -0.1428, -0.0290, -0.1752,  0.3829,  0.0872],
        [ 0.1598,  0.2272, -0.0186, -0.0433,  0.2779,  0.1212],
        [ 0.2339,  0.5931, -0.3093, -0.0167, -0.1937, -0.2987],
        [-0.0907, -0.1262,  0.3418,  0.3039, -0.1901,  0.3110],
        [-0.4579,  0.1241, -0.3085, -0.2843, -0.3176,  0.3345],
        [-0.1248, 