In [18]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [75]:
dat = pd.read_csv("qualified_clean.csv")
dat['times'] = pd.to_datetime(dat['times'])
dat = dat.sort_values(by=['user', 'entry', 'times'])
dat = dat.reset_index(drop=True)
print(dat.columns)
dat = dat.drop(['times', 'next_char'], axis=1)
dat = dat.dropna()
dat.head()

Index(['user', 'entry', 'character', 'next_char', 'times', 'digraph',
       'phrasetime', 'del', 'err'],
      dtype='object')


Unnamed: 0,user,entry,character,digraph,phrasetime,del,err
0,1,1,8,0.0,0.303296,1.0,0.0
1,1,1,10,0.013945,0.303296,1.0,0.0
2,1,1,1,0.001499,0.303296,1.0,0.0
3,1,1,6,0.001758,0.303296,1.0,0.0
4,1,1,23,0.015259,0.303296,1.0,0.0


In [76]:
MAX_LEN = 100


x = []
y = []
for user in np.unique(dat['user']):
    u_x = dat[dat['user'] == user]
    u_e = []
    for entry in np.unique(u_x['entry']):
        new_entry = u_x[u_x['entry'] == entry]
        new_entry = np.pad(new_entry, [(MAX_LEN - new_entry.shape[0], 0), (0, 0)], mode='constant')
        u_e.append(new_entry)
    x.append(np.stack(u_e))
    num_entries = len(u_e)
    y.append(np.ones([num_entries]) * user)
    print(user)

x = np.concatenate(x)
y = np.concatenate(y)

print(x.shape, y.shape)
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
(14120, 100, 7) (14120,)


In [77]:
print(x.dtype)

float64


In [88]:
dataset = torch.utils.data.TensorDataset(torch.Tensor(x), torch.Tensor(y))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=500, shuffle=True)

In [89]:
class LogisticRegression(torch.nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(7, 26)

    def forward(self, x):
        x_simple = x[:, -1, :]
        outputs = self.linear(x_simple)
        
        return outputs

In [136]:
class GRUModel(torch.nn.Module):
    def __init__(self, hidden_size=100):
        super(GRUModel, self).__init__()
        self.gru = torch.nn.GRU(7, hidden_size, num_layers=2)
        self.linear = torch.nn.Linear(10 * hidden_size, 26)

    def forward(self, x):
        x = x[:, -10:, :]
        x = torch.transpose(x, 0, 1) # now (seq_len, batch, dim)
#         print(x.shape)
        gruOut, gruHN = self.gru(x)
#         print(gruHN)
        gruOut = torch.transpose(gruOut, 0, 1) # now (batch, seq_len, dim)
        gruOut = gruOut.reshape(gruOut.size(0), -1)
#         print(last_layer.shape)
        out = self.linear(gruOut)
        
        
        return out

In [138]:
model = GRUModel()

criterion = torch.nn.CrossEntropyLoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01) 

PRINT_RATE = 20

## Optimization Loop

for epoch in range(1000): 
    loss_sum = 0
    total, correct = 0, 0
    for idx, (x_batch, y_batch) in enumerate(dataloader):
        print(idx)
        y_pred = model(x_batch)
#         print(y_pred[:5])
#         print("y_pred shape is", y_pred.shape)
#         print("y_batch shape is", y_batch.shape)
        loss = criterion(y_pred, y_batch.long())
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step()
        loss_sum += loss
        y_pred_i = torch.argmax(y_pred, dim=-1)
        correct += torch.sum(torch.eq(y_pred_i, y_batch)).item()
        total += len(y_batch)
    print('epoch {}, loss {}'.format(epoch, loss_sum.item())) 
#     print(y_pred_i[:1])
    print(total, correct, correct/total)

        

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
epoch 0, loss 93.57019805908203
14120 786 0.0556657223796034
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
epoch 1, loss 92.33280181884766
14120 923 0.06536827195467422
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
epoch 2, loss 92.00379180908203
14120 917 0.06494334277620396
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
epoch 3, loss 91.70684051513672
14120 923 0.06536827195467422
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
epoch 4, loss 91.4397964477539
14120 935 0.06621813031161473
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
epoch 5, loss 91.27296447753906
14120 1040 0.07365439093484419
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
epoch 6, loss 90.79420471191406
14120 1042 0.07379603399433428
0
1
2
3
4
5
6
7
8
9
10
11
1

KeyboardInterrupt: 

In [2]:
dat = pd.read_csv("clean_wide.csv")
dat.head()

Unnamed: 0,user,entry,character_1,character_2,character_3,character_4,character_5,character_6,character_7,character_8,...,digraph_35,digraph_36,digraph_37,digraph_38,digraph_39,digraph_40,digraph_41,del,err,phrasetime
0,1,1,8,10,1,6,23,21,13,10,...,0.00477,0.01747,0.016675,0.001636,0.017868,,,1,0,0.303296
1,1,2,8,10,1,6,23,21,13,10,...,0.016757,0.00835,0.009592,0.018797,0.000709,0.00169,0.006765,2,1,0.305861
2,1,3,8,10,1,6,23,21,13,13,...,0.001077,0.001366,0.005116,0.002441,0.001095,0.008573,,2,1,0.30562
3,1,4,8,10,1,6,23,21,13,13,...,0.01478,0.001881,0.001936,0.008713,0.007559,0.00924,0.011532,2,0,0.272646
4,1,5,8,10,1,6,23,21,13,10,...,0.004859,0.016099,0.008431,0.004978,0.007762,,,1,0,0.336338


In [7]:
df = dat.to_numpy()
print(df.shape)

(16815, 128)


In [22]:
x = df[:,2:]
y = df[:,0]

print(x.shape)
print(y.shape)
print(x)
print(y)

(16815, 126)
(16815,)
[[ 8.         10.          1.         ...  1.          0.
   0.30329609]
 [ 8.         10.          1.         ...  2.          1.
   0.305861  ]
 [ 8.         10.          1.         ...  2.          1.
   0.30562019]
 ...
 [ 8.         10.          1.         ...  2.          1.
   1.5771718 ]
 [ 8.         10.          1.         ...  2.          1.
   1.55091119]
 [ 8.         10.          1.         ...  1.          0.
   1.48837495]]
[ 1.  1.  1. ... 25. 25. 25.]


In [13]:
dataset = torch.utils.data.TensorDataset(torch.Tensor(x), torch.Tensor(y))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=500, shuffle=True)

In [19]:
class LogisticRegression(torch.nn.Module):
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear1 = torch.nn.Linear(126, 200)
        self.linear2 = torch.nn.Linear(200, 200)
        self.linear3 = torch.nn.Linear(200, 26)

    def forward(self, x):
        x = F.sigmoid(self.linear1(x))
        x = F.sigmoid(self.linear2(x))
        out = F.sigmoid(self.linear3(x))
        
        return out

In [20]:
model = LogisticRegression()

criterion = torch.nn.CrossEntropyLoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01) 

PRINT_RATE = 20

## Optimization Loop

for epoch in range(1000): 
    loss_sum = 0
    total, correct = 0, 0
    for idx, (x_batch, y_batch) in enumerate(dataloader):
        print(idx)
        y_pred = model(x_batch)
        loss = criterion(y_pred, y_batch.long())
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step()
        loss_sum += loss
        y_pred_i = torch.argmax(y_pred, dim=-1)
        correct += torch.sum(torch.eq(y_pred_i, y_batch)).item()
        total += len(y_batch)
    print('epoch {}, loss {}'.format(epoch, loss_sum.item())) 
    print(total, correct, correct/total)

0
1
2
3
4
5
6
7
8
9
10




11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
epoch 0, loss nan
16815 0 0.0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
epoch 1, loss nan
16815 0 0.0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
epoch 2, loss nan
16815 0 0.0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
epoch 3, loss nan
16815 0 0.0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
epoch 4, loss nan
16815 0 0.0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
epoch 5, loss nan
16815 0 0.0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
epoch 6, loss nan
16815 0 0.0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
epoch 7, loss nan
16815 0 0.0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18


KeyboardInterrupt: 