In [41]:
import torch
import sklearn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import os

INPUT_PATH = "data/train_conll_hinglish_tags.csv"
MAX_TWEET = 280

char_to_ind = {}
ind_to_char = {}

char_to_ind.update({"UNK":0})
ind_to_char.update({0:"UNK"})

count = 1

with open(INPUT_PATH, 'r') as f:
    for line in f:
        for char in line.split('\t')[1]:
            if char.lower() not in char_to_ind:
                char_to_ind.update({char.lower():count})
                ind_to_char.update({count:char.lower()})
                count += 1

print(char_to_ind)
#print(ind_to_char)

n_letters = len(char_to_ind)       
            

{'UNK': 0, '@': 1, ' ': 2, 'a': 3, 'd': 4, 'i': 5, 'l': 6, 'n': 7, 's': 8, 'r': 9, 'b': 10, 'u': 11, 't': 12, 'p': 13, 'k': 14, 'g': 15, 'h': 16, 'q': 17, 'e': 18, 'o': 19, 'm': 20, '-': 21, 'c': 22, 'y': 23, 'j': 24, 'v': 25, '…': 26, '/': 27, '.': 28, 'x': 29, 'f': 30, '8': 31, '3': 32, '1': 33, '_': 34, 'w': 35, '7': 36, '6': 37, '♥': 38, '+': 39, '#': 40, '2': 41, '😂': 42, 'z': 43, '4': 44, '0': 45, '5': 46, '9': 47, '(': 48, '’': 49, "'": 50, '?': 51, '!': 52, '❤': 53, '😘': 54, '😍': 55, '☺': 56, '💓': 57, '😜': 58, '🥺': 59, '👍': 60, '🏻': 61, '🤣': 62, '~': 63, ')': 64, '%': 65, '—': 66, '👌': 67, '🔥': 68, '🌹': 69, '&': 70, '✴': 71, '❔': 72, '❓': 73, '⁉': 74, '❗': 75, '😏': 76, '😗': 77, '💏': 78, '🏵': 79, '️': 80, '🇮': 81, '🇳': 82, '💪': 83, '🌺': 84, '😉': 85, '😆': 86, '=': 87, '😓': 88, '😩': 89, '“': 90, '”': 91, '🙏': 92, '😊': 93, '🐖': 94, '😝': 95, '😛': 96, '😖': 97, '🤗': 98, '💐': 99, '😁': 100, '➕': 101, '✔': 102, '👋': 103, '😭': 104, '😹': 105, '–': 106, '💜': 107, '*': 108, '🤨': 109, 'ạ': 11

In [42]:
tag_to_ind = {}
ind_to_tag = {}

word_counts = {}

tag_count = 0

with open(INPUT_PATH, 'r') as f:
    for line in f:
        words = line.split('\t')[1].split(' ')
        tags = line.split('\t')[3].replace('\n','').split(' ')[1:]
        for i in range(len(words)):
            if tags[i] not in tag_to_ind:
                tag_to_ind.update({tags[i]:tag_count})
                ind_to_tag.update({tag_count:tags[i]})
                word_counts.update({tag_count:{}})
                tag_count += 1
            if words[i] not in word_counts[tag_to_ind[tags[i]]]:
                word_counts[tag_to_ind[tags[i]]].update({words[i]:1})
            else:
                word_counts[tag_to_ind[tags[i]]].update({words[i]:word_counts[tag_to_ind[tags[i]]][words[i]]+1})

print(tag_to_ind)
n_tags = len(tag_to_ind)

{'O': 0, 'Hin': 1, 'Eng': 2, 'EMT': 3}


In [43]:
def letterToTensor(letter, language):
    tensor = torch.zeros(1, n_letters + n_tags)
    if letter.lower() not in char_to_ind:
        tensor[0][char_to_ind['UNK']]
    else:
        tensor[0][char_to_ind[letter.lower()]] = 1
    tensor[0][n_letters + tag_to_ind[language]] = 1
    return tensor
    
def lineToTensor(line):
    tensor = torch.zeros(MAX_TWEET, n_letters + n_tags)
    words = line.split(' ')
    tags = []
    
    for word in words:
        counts = []
        for i in range(n_tags):
            if word in word_counts[i]:
                counts.append(word_counts[i][word])
            else:
                counts.append(0)
        if max(counts) > 0:
            tag = counts.index(max(counts))
        else:
            #tag = tag_to_ind['unk']
            tag = tag_to_ind['O']
        tags.append(tag)
        
    position = 0
    
    for i in range(len(words)):
        tag = tags[i]
        for letter in words[i]:
            if letter.lower() not in char_to_ind:
                tensor[position][char_to_ind['UNK']] = 1
            else:
                tensor[position][char_to_ind[letter.lower()]] = 1
            tensor[position][n_letters + tag] = 1
            position += 1
        if i != len(words) - 1:
            tensor[position][char_to_ind[' ']] = 1
            #tensor[position][n_letters + tag_to_ind['other']] = 1
            tensor[position][n_letters + tag_to_ind['O']] = 1
            position += 1
    return tensor

def batchToTensor(batch):
    tensor = torch.zeros(len(batch),MAX_TWEET,n_letters + n_tags)
    for sentence, line in enumerate(batch):
        tensor[sentence] = lineToTensor(line)
    return tensor

#print(letterToTensor('h','lang1'))
print(letterToTensor('h','Hin'))
print(lineToTensor('hello how are tu'))
print(batchToTensor(['hello how are tu','estoy bien thanks']))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.,

In [44]:
trainpath = os.path.join("data", "train_conll_hinglish.csv")
train = pd.read_csv(trainpath, sep='\\t', names=["ID","SENTENCE","LABEL"])
test = pd.read_csv("sample_test.csv", names=["ID", "SENTENCE", "LABEL"])

  


In [45]:
print(train['SENTENCE'][0].lower())
train_char_features = torch.sum(batchToTensor(train['SENTENCE']),1)

@ adilnisarbutt pakistan ka ghra tauq he pakistan israel ko tasleem nahein kerta isko palestine kehta he- occupied palestine


In [46]:
char_features = train_char_features

print(char_features.shape)
char_features = pd.DataFrame(char_features)
labels = pd.Series.as_matrix(train.LABEL)

torch.Size([15131, 902])


  """


In [47]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
model_NB = MultinomialNB()

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(char_features, labels, train_size=0.75)
# X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(char_features, train.LABEL, train_size=0.75)

print(X_train_tfidf)
print(y_train_tfidf.shape)

model_NB.fit(X_train_tfidf, y_train_tfidf)
predictions_tfidf = model_NB.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test_tfidf, predictions_tfidf)

              0           1            2            3           4    \
8292   tensor(0.)  tensor(2.)  tensor(20.)  tensor(17.)  tensor(4.)   
6449   tensor(0.)  tensor(1.)  tensor(20.)  tensor(15.)  tensor(1.)   
4579   tensor(0.)  tensor(0.)  tensor(25.)  tensor(14.)  tensor(2.)   
13294  tensor(0.)  tensor(1.)  tensor(25.)  tensor(19.)  tensor(7.)   
11176  tensor(0.)  tensor(0.)  tensor(13.)   tensor(6.)  tensor(2.)   
9564   tensor(0.)  tensor(1.)  tensor(25.)  tensor(16.)  tensor(3.)   
7287   tensor(0.)  tensor(0.)   tensor(9.)   tensor(9.)  tensor(0.)   
1765   tensor(0.)  tensor(1.)  tensor(24.)  tensor(18.)  tensor(2.)   
7879   tensor(0.)  tensor(2.)  tensor(15.)  tensor(16.)  tensor(2.)   
14535  tensor(0.)  tensor(2.)  tensor(24.)  tensor(14.)  tensor(7.)   
1355   tensor(0.)  tensor(1.)  tensor(24.)  tensor(12.)  tensor(2.)   
5877   tensor(0.)  tensor(2.)  tensor(23.)  tensor(15.)  tensor(4.)   
11816  tensor(0.)  tensor(2.)  tensor(20.)  tensor(14.)  tensor(5.)   
9318  

In [48]:
precision, recall, f1, _ = precision_recall_fscore_support(y_test_tfidf, predictions_tfidf, average='macro')
print("[NB] accuracy: {}, f1-score: {}, precision: {}, recall: {}".format(accuracy_tfidf, f1, precision, recall))

[NB] accuracy: 0.45545863071636267, f1-score: 0.4409353151261845, precision: 0.4672481912821436, recall: 0.47828980825700734


In [40]:
clf = SVC(kernel='linear', gamma='auto')
clf.fit(X_train_tfidf, y_train_tfidf)
predictions_svc = clf.predict(X_test_tfidf)
accuracy_svc = accuracy_score(y_test_tfidf, predictions_svc)
# accuracy_svc
precision, recall, f1, _ = precision_recall_fscore_support(y_test_tfidf, predictions_svc, average='macro')
print("[Linear SVM] accuracy: {}, f1-score: {}, precision: {}, recall: {}".format(accuracy_svc, f1, precision, recall))

[Linear SVM] accuracy: 0.5074666666666666, f1-score: 0.2757442933654795, precision: 0.47132970555486636, recall: 0.35421163848227094
