<a href="https://colab.research.google.com/github/jigsawfallingintoplace/KAUST/blob/master/Spanish_to_English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install laserembeddings
!pip install ekphrasis
!pip install emoji

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

omit_list = ['url', 'email', 'phone', 'user']

text_processor = TextPreProcessor(
    # omit=['url', 'email', 'phone', 'user'],
    # terms that will be normalized
    normalize=omit_list,
    # terms that will be annotated
    # annotate={"hashtag", "allcaps", "elongated", "repeated",
    #     'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=False,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=lambda s: s.split(),
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

sentences = [
    "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
    "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/."
]

for s in sentences:
    print(" ".join(text_processor.pre_process_doc(s)))

In [None]:
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.6/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [17]:
import csv
import emoji
from laserembeddings import Laser
import torch
from torch import optim, nn

laser = Laser()

en_trainfile = "/content/2018-E-c-En-train.txt"
en_devfile = "/content/2018-E-c-En-dev.txt"
en_testfile = "/content/2018-E-c-En-test-gold.txt"

es_trainfile = "/content/2018-E-c-Es-train.txt"
es_testfile = "/content/2018-E-c-Es-test-gold.txt"
es_devfile = "/content/2018-E-c-Es-dev.txt"
savepath = "/content/LASERSentiment.model"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def file_to_data(file):
    with open(file) as f:
        reader = csv.reader(f, delimiter="\t")
        data = list(reader)
    return data

traindata = file_to_data(en_trainfile)
devdata = file_to_data(es_devfile)
testdata = file_to_data(es_testfile)
en_testdata = file_to_data(en_testfile)

def process(s):
  ret = " ".join(text_processor.pre_process_doc(emoji.demojize(s)))
  for item in omit_list:
    ret.replace("<" + item + ">", '')
  return ret

def get_all_tweets(data): 
    return [process(d[1]) for d in data[1:]]

def get_label_lists(data):
    return [[int(x) for x in d[2:]] for d in data[1:]]

def get_label_tensors(data):
    label_tensors = []
    for d in data[1:]:
        tmp = torch.zeros(11)
        for i in range(11):
            if d[2 + i] == '1':
                tmp[i] = 1
        label_tensors.append(tmp)
    return label_tensors

train_tweets = get_all_tweets(traindata)
train_embeddings = laser.embed_sentences(train_tweets, lang='en')
dev_tweets = get_all_tweets(devdata)
dev_embeddings = laser.embed_sentences(dev_tweets, lang='es')
test_tweets = get_all_tweets(testdata)
test_embeddings = laser.embed_sentences(test_tweets, lang='es')

In [18]:
en_test_tweets = get_all_tweets(en_testdata)
en_test_embeddings = laser.embed_sentences(en_test_tweets, lang='en')

In [19]:
class FeatureExtractor(nn.Module):
    def __init__(self):
        super(FeatureExtractor, self).__init__()
        self.layer_1 = nn.Linear(1024, 512)
        self.layer_2 = nn.Linear(512, 512)
        self.layer_3 = nn.Linear(512, 512)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(512)
        self.batchnorm3 = nn.BatchNorm1d(512)
    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        # x = self.dropout(x)
        return x

In [20]:
class SentimentClassifier(nn.Module):
    def __init__(self):
        super(SentimentClassifier, self).__init__()
        self.layer_4 = nn.Linear(512, 512)
        self.layer_out = nn.Linear(512, 11) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm4 = nn.BatchNorm1d(512)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        x = self.layer_4(x)
        x = self.batchnorm4(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_out(x)
        # x = self.sigmoid(x)
        return x

In [21]:
class LanguageClassifier(nn.Module):
    def __init__(self):
        super(LanguageClassifier, self).__init__()
        self.layer_4 = nn.Linear(512, 512)
        self.layer_out = nn.Linear(512, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm4 = nn.BatchNorm1d(512)
    def forward(self, x):
        x = self.layer_4(x)
        x = self.batchnorm4(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_out(x)
        # x = nn.Softmax(dim=1)(x)
        return x

In [22]:
from torch.utils.data import TensorDataset, DataLoader

train_embeddings_tensors = torch.from_numpy(train_embeddings)
label_tensors = get_label_tensors(traindata)
assert len(train_embeddings_tensors) == len(label_tensors)

train_dataset = TensorDataset(train_embeddings_tensors, torch.stack(label_tensors))
train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True)
train_iter_source = iter(train_dataloader)


test_label_tensors = get_label_tensors(testdata)
y_true = torch.stack(test_label_tensors)

test_dataset = TensorDataset(torch.from_numpy(test_embeddings), y_true)
test_dataloader = DataLoader(test_dataset, batch_size=100, shuffle=False)

train_iter_target = iter(test_dataloader)

In [23]:
loss_function = nn.BCEWithLogitsLoss()
lambd = 0.01
learning_rate = 0.005
Q_learning_rate = 0.005
F = FeatureExtractor()
P = SentimentClassifier()
Q = LanguageClassifier()
F, P, Q = F.to(device), P.to(device), Q.to(device)
optimizer = optim.Adam(list(F.parameters()) + list(P.parameters()), lr=learning_rate)
q_optimizer = optim.Adam(Q.parameters(), lr=Q_learning_rate)

In [24]:
# net.load_state_dict(torch.load(savepath))

In [25]:
def freeze(net):
    for p in net.parameters():
        p.requires_grad = False

def unfreeze(net):
    for p in net.parameters():
        p.requires_grad = True

def get_batch_source():
  global train_iter_source
  try:
    return next(train_iter_source)
  except:
    train_iter_source = iter(train_dataloader)
    return next(train_iter_source)

def get_batch_target():
  global train_iter_target
  try:
    return next(train_iter_target)
  except:
    train_iter_target = iter(test_dataloader)
    return next(train_iter_target)

In [None]:
from sklearn.metrics import label_ranking_average_precision_score, f1_score, jaccard_score, hamming_loss


scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=60, verbose=True)


dev_label_tensors = get_label_tensors(devdata)
dev_y_true = torch.stack(dev_label_tensors)
up = 0

dev_dataset = TensorDataset(torch.from_numpy(dev_embeddings), dev_y_true)
dev_dataloader = DataLoader(dev_dataset, batch_size=10, shuffle=False)

loss_diagram = []
num_epochs = 100
q_iter = 5
cnt = 0
clip_lower = -0.01
clip_upper = 0.01
running_loss = 0
jac = 0
while (jac < 0.45):
    freeze(P)
    freeze(F)
    unfreeze(Q)
    for q in range(q_iter):
      for p in Q.parameters():
        p.data.clamp_(clip_lower, clip_upper)
      
      Q.zero_grad()

      X_source, _ = get_batch_source()
      X_target, _ = get_batch_target()
      feature_source = F(X_source)
      feature_target = F(X_target)
      loss_q = torch.mean(-Q(feature_source)) + torch.mean(Q(feature_target))
      q_optimizer.zero_grad()
      loss_q.backward()
      q_optimizer.step()

    unfreeze(F)
    unfreeze(P)
    freeze(Q)

    for p in Q.parameters():
        p.data.clamp_(clip_lower, clip_upper) 

    F.zero_grad()
    P.zero_grad()

    X_source, Y_source = get_batch_source()
    X_target, Y_target = get_batch_target()
    feature_source = F(X_source)
    sentiment_source = P(feature_source)
    language_source = Q(feature_source)

    feature_target = F(X_target)
    language_target = Q(feature_target)

    loss_sentiment = loss_function(sentiment_source, Y_source)
    loss_sentiment.backward(retain_graph=True)
    loss_language = lambd*(torch.mean(language_source) - torch.mean(language_target))
    loss_language.backward(retain_graph=True)

    optimizer.step()

    
        

    all_list = []
    for x, y in dev_dataloader:
      guess = P(F(x))
      all_list.append(guess)
    concat_tensor = torch.cat(all_list)
    dev_y_score = concat_tensor.detach().numpy()
    dev_y_pred = dev_y_score.copy()
    for i in range(dev_y_pred.shape[0]):
      for j in range(dev_y_pred.shape[1]):
        if (dev_y_pred[i][j] >= 0.5):
          dev_y_pred[i][j] = 1
        else:
          dev_y_pred[i][j] = 0
    jac = jaccard_score(dev_y_true, dev_y_pred, average='samples')
    print("Jaccard Score:", jac)
    scheduler.step(jac)


  _warn_prf(average, modifier, msg_start, len(result))


Jaccard Score: 0.010309278350515464
Jaccard Score: 0.05608738340697103
Jaccard Score: 0.10088365243004419
Jaccard Score: 0.13193421698576338
Jaccard Score: 0.14727540500736377
Jaccard Score: 0.16512027491408937
Jaccard Score: 0.16666666666666666
Jaccard Score: 0.18544428080510553
Jaccard Score: 0.20189003436426115
Jaccard Score: 0.22140402552773683
Jaccard Score: 0.25184094256259204
Jaccard Score: 0.24975454099165437
Jaccard Score: 0.240672557682867
Jaccard Score: 0.2373588610702013
Jaccard Score: 0.22022582228767795
Jaccard Score: 0.2068483063328424
Jaccard Score: 0.19653902798232695
Jaccard Score: 0.1990427098674521
Jaccard Score: 0.1940844378988709
Jaccard Score: 0.19592538046146293
Jaccard Score: 0.1947717231222386
Jaccard Score: 0.20851742758959252
Jaccard Score: 0.21961217476681397
Jaccard Score: 0.22476681394207168
Jaccard Score: 0.23649975454099167
Jaccard Score: 0.2540500736377025
Jaccard Score: 0.2601865488463426
Jaccard Score: 0.2601865488463426
Jaccard Score: 0.270618556701

In [None]:
net = lambda x: P(F(x))

In [None]:
test_label_tensors = get_label_tensors(en_testdata)
y_true = torch.stack(test_label_tensors)

test_dataset = TensorDataset(torch.from_numpy(en_test_embeddings), y_true)
test_dataloader = DataLoader(test_dataset, batch_size=100, shuffle=False)

In [None]:
THRESHOLD = 0.5
print("Threshold:", THRESHOLD)
up = 0

all_list = []

for x, y in test_dataloader:
    guess = net(x)
    all_list.append(guess)
    # print(guess)
    # values, indices = torch.topk(guess, 2)
    # print("values:", values)
    # print("indices:", indices)
    for i in range(x.shape[0]):
        for j in range(11):
            if (guess[i][j] >= THRESHOLD and y[i][j] == 1) or (guess[i][j] < THRESHOLD and y[i][j] == 0):
              up += 1

concat_tensor = torch.cat(all_list)
y_score = concat_tensor.detach().numpy()

print("ACCURACY:", up / (len(test_label_tensors)*11))

from sklearn.metrics import f1_score, jaccard_score, hamming_loss
y_pred = y_score.copy()
for i in range(y_pred.shape[0]):
  for j in range(y_pred.shape[1]):
    if (y_pred[i][j] >= THRESHOLD):
      y_pred[i][j] = 1
    else:
      y_pred[i][j] = 0
f1score = f1_score(y_true, y_pred, average=None)
print("F1 Score:", f1score)
f1_micro = f1_score(y_true, y_pred, average='micro')
print("F1 Micro Score:", f1_micro)
f1_macro = f1_score(y_true, y_pred, average='macro')
print("F1 Macro Score:", f1_macro)
jac = jaccard_score(y_true, y_pred, average='samples')
print("Jaccard Score:", jac)
# print(y_pred)

lrap_score = label_ranking_average_precision_score(y_true, y_score)
print("LRAP Score:", lrap_score)

hl = hamming_loss(y_true, y_pred)
print("Hamming Loss:", hl)

In [None]:
torch.save(net.state_dict(), savepath)

In [None]:
lrap_score = label_ranking_average_precision_score(y_true, y_score)
print("LRAP Score:", lrap_score)

In [None]:
from sklearn.metrics import f1_score, jaccard_score, hamming_loss
y_pred = y_score.copy()
for i in range(y_pred.shape[0]):
  for j in range(y_pred.shape[1]):
    if (y_pred[i][j] >= 0):
      y_pred[i][j] = 1
    else:
      y_pred[i][j] = 0
f1score = f1_score(y_true, y_pred, average=None)
print("F1 Score:", f1score)
f1_micro = f1_score(y_true, y_pred, average='micro')
print("F1 Micro Score:", f1_micro)
f1_macro = f1_score(y_true, y_pred, average='macro')
print("F1 Macro Score:", f1_macro)
jac = jaccard_score(y_true, y_pred, average='samples')
print("Jaccard Score:", jac)
print(y_pred)

hl = hamming_loss(y_true, y_pred)
print("Hamming Loss:", hl)

In [None]:
from sklearn.metrics import hamming_loss
hl = hamming_loss(y_true, y_pred)
print(hl)

In [None]:
torch.save(P.state_dict(), '/content/P_es_0713.model')
torch.save(F.state_dict(), '/content/F_es_0713.model')
torch.save(Q.state_dict(), '/content/Q_es_0713.model')