In [32]:
import torch
import sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import os

# Tab-separated training file. The second field of each line is the text whose
# characters are indexed below (presumably the tweet text -- confirm against the data).
INPUT_PATH = "data/train_conll_spanglish_tags.csv"
# Number of rows (character positions) allocated per encoded line.
MAX_TWEET = 280

# Bidirectional character vocabulary; index 0 is reserved for unknown characters.
char_to_ind = {"UNK": 0}
ind_to_char = {0: "UNK"}

# Next free vocabulary index.
count = 1

# The vocabulary contains emoji and accented letters, so decode explicitly
# instead of relying on the platform's locale default
# (NOTE(review): UTF-8 assumed -- confirm the file's encoding).
with open(INPUT_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        for char in line.split('\t')[1]:
            # Lower-case per character so lookups elsewhere, which also
            # lower-case one character at a time, hit the same keys.
            key = char.lower()
            if key not in char_to_ind:
                char_to_ind[key] = count
                ind_to_char[count] = key
                count += 1

print(char_to_ind)

# Size of the character one-hot segment (includes the 'UNK' slot).
n_letters = len(char_to_ind)
            

{'UNK': 0, 's': 1, 'o': 2, ' ': 3, 't': 4, 'h': 5, 'a': 6, 'm': 7, 'e': 8, 'n': 9, 'r': 10, 'w': 11, 'c': 12, 'u': 13, 'd': 14, 'g': 15, 'l': 16, 'i': 17, 'p': 18, 'v': 19, 'j': 20, 'b': 21, '#': 22, 'y': 23, 'k': 24, 'z': 25, 'f': 26, 'ñ': 27, '@': 28, '_': 29, "'": 30, 'x': 31, '✌': 32, '😂': 33, '😆': 34, ',': 35, '>': 36, ':': 37, '*': 38, '-': 39, '?': 40, ';': 41, ')': 42, 'q': 43, '❤': 44, '/': 45, '.': 46, '"': 47, '2': 48, '♥': 49, '1': 50, '4': 51, '!': 52, '(': 53, '5': 54, '0': 55, '$': 56, '8': 57, '3': 58, '&': 59, '7': 60, '[': 61, ']': 62, '<': 63, '6': 64, '😔': 65, '=': 66, '️': 67, '😖': 68, '👌': 69, '9': 70, '💯': 71, '✋': 72, '👊': 73, '👏': 74, '💁': 75, '😢': 76, '😳': 77, '😱': 78, '😭': 79, '😡': 80, '🔫': 81, '😄': 82, '👅': 83, '🙌': 84, '💃': 85, '💙': 86, '👐': 87, '🎧': 88, 'ت': 89, '👍': 90, '😁': 91, '💕': 92, '😃': 93, '♩': 94, '♪': 95, '♬': 96, '😩': 97, '💔': 98, '😍': 99, '♡': 100, 'ü': 101, '👎': 102, '😒': 103, '\\': 104, '❄': 105, '⛄': 106, '😎': 107, '🎉': 108, '😫': 109, '💪': 1

In [15]:
# Bidirectional mapping between tag strings and integer ids, assigned in order
# of first appearance in the file.
tag_to_ind = {}
ind_to_tag = {}

# word_counts[tag_id][word] -> number of times `word` appeared with that tag.
word_counts = {}

# Next free tag id.
tag_count = 0

# Decode explicitly rather than relying on the platform's locale default
# (NOTE(review): UTF-8 assumed -- confirm the file's encoding).
with open(INPUT_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split('\t')
        words = fields[1].split(' ')
        # The first space-separated token of the tag field is discarded
        # (NOTE(review): presumably an empty string from a leading space -- confirm).
        tags = fields[3].replace('\n', '').split(' ')[1:]
        for i, word in enumerate(words):
            # Indexing (rather than zip) keeps the IndexError when a line has
            # fewer tags than words, so misaligned rows are not silently dropped.
            tag = tags[i]
            if tag not in tag_to_ind:
                tag_to_ind[tag] = tag_count
                ind_to_tag[tag_count] = tag
                word_counts[tag_count] = {}
                tag_count += 1
            per_tag = word_counts[tag_to_ind[tag]]
            per_tag[word] = per_tag.get(word, 0) + 1

# Size of the tag one-hot segment.
n_tags = len(tag_to_ind)

In [38]:
def letterToTensor(letter, language):
    """Encode one character and its language tag as a (1, n_letters + n_tags) tensor.

    The first ``n_letters`` columns are a one-hot over the character
    vocabulary (the character is lower-cased; characters missing from
    ``char_to_ind`` use the 'UNK' slot). The remaining ``n_tags`` columns are
    a one-hot over ``tag_to_ind``.

    Raises:
        KeyError: if ``language`` is not a key of ``tag_to_ind``.
    """
    tensor = torch.zeros(1, n_letters + n_tags)
    # Fall back to the 'UNK' index; the original evaluated the UNK cell but
    # never assigned to it, leaving unknown characters with no character bit.
    char_index = char_to_ind.get(letter.lower(), char_to_ind['UNK'])
    tensor[0][char_index] = 1
    tensor[0][n_letters + tag_to_ind[language]] = 1
    return tensor
    
def lineToTensor(line):
    """Encode a space-separated line as a (MAX_TWEET, n_letters + n_tags) tensor.

    Every character of the line occupies one row: a one-hot over the
    character vocabulary (lower-cased; characters missing from
    ``char_to_ind`` use the 'UNK' slot) followed by a one-hot over the tags.
    A word's tag is the tag id under which ``word_counts`` recorded it most
    often (lowest id wins ties); words with no recorded count get the 'unk'
    tag. The single space between consecutive words is tagged 'other'.
    Rows past the end of the line stay all-zero. Input longer than MAX_TWEET
    characters is truncated instead of raising IndexError.
    """
    tensor = torch.zeros(MAX_TWEET, n_letters + n_tags)
    words = line.split(' ')

    # Decide each word's tag up front from the per-tag frequency tables.
    tags = []
    for word in words:
        counts = [word_counts[i].get(word, 0) for i in range(n_tags)]
        best = max(counts)
        tags.append(counts.index(best) if best > 0 else tag_to_ind['unk'])

    unk_index = char_to_ind['UNK']
    last_word = len(words) - 1
    position = 0

    for i, word in enumerate(words):
        for letter in word:
            if position >= MAX_TWEET:
                # No rows left: drop the remainder of the line.
                return tensor
            tensor[position][char_to_ind.get(letter.lower(), unk_index)] = 1
            tensor[position][n_letters + tags[i]] = 1
            position += 1
        if i != last_word:
            if position >= MAX_TWEET:
                return tensor
            # Re-insert the separator that split(' ') removed.
            tensor[position][char_to_ind[' ']] = 1
            tensor[position][n_letters + tag_to_ind['other']] = 1
            position += 1
    return tensor

def batchToTensor(batch):
    """Encode a sequence of lines into one (len(batch), MAX_TWEET, n_letters + n_tags) tensor.

    Slice ``k`` along the first dimension is ``lineToTensor(batch[k])``.
    """
    feature_width = n_letters + n_tags
    encoded = torch.zeros(len(batch), MAX_TWEET, feature_width)
    row = 0
    for text in batch:
        encoded[row] = lineToTensor(text)
        row += 1
    return encoded

# Smoke-test the three encoders on small inputs and print the resulting tensors.
# Single character tagged 'lang1'.
print(letterToTensor('h','lang1'))
# One line; word tags are looked up inside lineToTensor.
print(lineToTensor('hello how are tu'))
# Two lines stacked along a leading batch dimension.
print(batchToTensor(['hello how are tu','estoy bien thanks']))

tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.,