# Bibliotecas importadas

In [1]:
import re
import pandas as pd
import numpy as np
import nltk
import time

from nltk import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
#from sklearn.linear_model import LogisticRegression
#from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from collections import defaultdict, Counter
from sklearn import preprocessing
from nltk.stem import SnowballStemmer
from string import punctuation

## Limpieza en los datos
* Cambiar todas las palabras de mayúsculas a minúsculas
* Se han eliminado las '@' de @USUARIO con el fin de facilitar el etiquetado morfológico
* Quitar los links 
* Quitar los emojis
* Cambiar los slangs, abreviaturas y contracciones en su significado
* Se han reemplazado todos los números por el símbolo '0'
* Cambiar los hashtag por su palabra agresiva o odiosa
* Quitar los signos de puntuación y quitar espacios (tabuladores, etc)

In [6]:
pattern_URL="(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"

def procesar(file, namefile):    
    file[file.columns[1]] = [clean_text(i) for i in file[file.columns[1]]]    
    file.to_csv(namefile, sep='\t', encoding='utf-8', index=False)
    return
    
def clean_text(text):
    text = text.lower()   
    #text=re.sub("@([A-Za-z0-9_]{1,15})", "@USUARIO", text)
    text=re.sub("@([A-Za-z0-9_]{1,15})", " ", text)
    text=re.sub(pattern_URL, " ", text)
    text= remove_emoji(text)
    
    text= replace_all('Dictionary/SP/SPabb.txt', text)      
    text= replace_all('Dictionary/SP/SPslang.txt', text)
    text= replace_all('Dictionary/SP/SPcontractions.txt', text)
    text= remove_stopwords(text)
    text=re.sub("\d+", "0", text)      
    text= change_hashtag(text)
    text=re.sub(r" +", " ", re.sub(r"\t", " ", re.sub(r"\n+", "\n", re.sub('(?:[.,\/!$%?¿?!¡\^&\*;:{}=><\-_`~()”“"\'\|])', " ",text))))
    text = text.strip()
    return text

def remove_stopwords(text):    
    stopwords=set(nltk.corpus.stopwords.words("spanish"))
    for i in stopwords:
        text = re.sub(r"\b%s\b" % i, "", text)
    return text

def extract_hashtag(s):
    hs = re.findall(r"#(\w+)", s)
    return hs

def change_hashtag(text):    
    input_file_agresiva = open('Dictionary/agresivas_es.txt', 'r', encoding="utf8")
    input_file_agresiva.seek(0)
    input_file_agresiva = input_file_agresiva.read().splitlines()
    h = extract_hashtag(text)
    for cadena in h:
        for agresivo in input_file_agresiva:
            if cadena.find(agresivo) != -1:
                text = text.replace("#"+cadena,agresivo)
        text = text.replace("#"+cadena,"")
    return text

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # emoticons
                               "\U0001F300-\U0001F5FF"  # symbols & pictographs                               
                               "\U0001F680-\U0001F6FF"  # transport & map symbols
                               "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "\U00002702-\U000027B0"
                               "\U000024C2-\U0001F251"
                               "\U0001f926-\U0001f937"
                               "\u200d"
                               "\u2640-\u2642"
                               "\U0001F1F2-\U0001F1F4"  # Macau flag
                               "\U0001F1E6-\U0001F1FF"  # flags
                               "\U0001F600-\U0001F64F"
                               "\U0001F1F2"
                               "\U0001F1F4"
                               "\U0001F620"
                               "]+", flags=re.UNICODE)   
    text = emoji_pattern.sub(r'', text) # no emoji
    return text

def replace_all(path, text):
    dic = create_dictionary_words(path)    
    for i, j in dic.items():
        text = re.sub(r"(^|\s)%s(\s|$)" % i, " "+j+" ", text)
        # r"\b%s\b"% enables replacing by whole word matches only
    return text

def create_dictionary_words(path):
    # create a dictionary of words-to-replace and words-to-replace-with
    input_file = open(path, 'r', encoding="utf8")
    input_file.seek(0)
    input_file = input_file.read().splitlines()
    input_array = [w.strip().split('\t') for w in input_file]
    output_dict = dict()
    for s in input_array:
        output_dict[s[0]]= s[1]
    return output_dict

### Extraer los hashtag

In [7]:
import re
corpus_train_es = pd.read_csv('corpus/public_development_esTaskA/train_es.tsv',delimiter='\t',encoding='utf-8')
corpus_dev_es = pd.read_csv('corpus/public_development_esTaskA/dev_es.tsv',delimiter='\t',encoding='utf-8')

def extract_hash_tags(s):
    hs = re.findall(r"#(\w+)", s)
    return hs

def lista(text):
    lista = []
    for w in text:
        array = extract_hash_tags(w)
        if array !=[]:
            for x in array:
                lista.append(x)
    return lista

# sacar hashtag hate text
hate_train=corpus_train_es[corpus_train_es['HS'] != 0]
text1 = hate_train[hate_train.columns[1]]
lista1 = lista(text1)
hate_dev=corpus_dev_es[corpus_dev_es['HS'] != 0]
text2 = hate_dev[hate_dev.columns[1]]
lista2 = lista(text2)
a = set(lista1)
b = set(lista2)
c = a | b

# sacar hashtag aggressive text
aggressive_train=corpus_train_es[corpus_train_es['AG'] != 0]
text1 = aggressive_train[aggressive_train.columns[1]]
lista1=lista(text1)
aggressive_dev=corpus_dev_es[corpus_dev_es['AG'] != 0]
text2 = aggressive_dev[aggressive_dev.columns[1]]
lista2=lista(text2)
a = set(lista1)
b = set(lista2)
d = a | b

#sacar todos los hashtag
c | d

{'17A',
 '21DARV',
 '3Ago',
 'ASCO',
 'AcogidaDigna',
 'Afregar',
 'AgresiónManterosBcnPdV',
 'AlCongresoPorLosJubilados',
 'Algeciras',
 'AsiaCentral',
 'AsíVivimosElRacismo',
 'AñoNuevoEnCombate',
 'Barcelona',
 'BatiArjona',
 'BestBoyBand',
 'Betis',
 'Bienvenidos13',
 'C',
 'CARMENA',
 'CNCO',
 'CaerEnTentacion',
 'CamilaVallejo',
 'Canarias',
 'Carmena',
 'Catalunya',
 'Ceuta',
 'Chile',
 'ChileParaLosChilenos',
 'ChilePrimero',
 'CloseBorders',
 'Closeborders',
 'CostaRica',
 'CállateYfriega',
 'DeNada',
 'DeRegaloTeMereces',
 'DebatePorElFuturo',
 'DefiendeChile',
 'DefiendeEspaña',
 'Desmontando',
 'DimisionInutilMarlaska',
 'DonaldCasado',
 'ELECCIONESGENERALESYA',
 'ELECCIONESYA',
 'ESP',
 'EXCLUSIVA',
 'ElCascabel01A',
 'EleccionesGeneralesYA',
 'EnTuJeta',
 'EsMuyDeProVida',
 'Espana',
 'España',
 'EspañaLoPrimero',
 'Europa',
 'ExatlonMx',
 'ExperimentoHistorico',
 'FelizDomingo',
 'FelizLunes',
 'FelizSabado',
 'Femimoda',
 'FronterasSeguras',
 'FueraRodolfoNoriega',
 'Gr

### Procesando el corpus B

In [8]:
#Leyendo el corpus B
corpus_train_esB = pd.read_csv('corpus/public_development_esTaskB/train_es.tsv',delimiter='\t',encoding='utf-8')
corpus_dev_esB = pd.read_csv('corpus/public_development_esTaskB/dev_es.tsv',delimiter='\t',encoding='utf-8')

## Guardar texto medio limpio para sacar etiquetas POS

In [4]:
def procesarme(file, namefile):    
    file[file.columns[1]] = [clean(i) for i in file[file.columns[1]]]    
    file.to_csv(namefile, sep='\t', encoding='utf-8', index=False)
    return

def clean(text):
    #text=re.sub("@([A-Za-z0-9_]{1,15})", "@USUARIO", text)
    text=re.sub("@([A-Za-z0-9_]{1,15})", " ", text)
    text=re.sub(pattern_URL, " ", text)
    text= remove_emoji(text)
    
    text= replace_all('Dictionary/SP/SPabb.txt', text)      
    text= replace_all('Dictionary/SP/SPslang.txt', text)
    text= replace_all('Dictionary/SP/SPcontractions.txt', text)
        
    text= change_hashtag(text)
    text=re.sub(r" +", " ", re.sub(r"\t", " ", re.sub(r"\n+", "\n", text)))
    text = text.strip()
    return text

#Guardando el corpus
procesarme(corpus_train_esB, "corpus/public_development_esTaskB/train_es_cPOSB.tsv")
procesarme(corpus_dev_esB, "corpus/public_development_esTaskB/dev_es_cPOSB.tsv")

### Guardar el texto limpio B

In [4]:
#Guardando el corpus ya procesado B
procesar(corpus_train_esB, "corpus/public_development_esTaskB/train_es_cleanB.tsv")
procesar(corpus_dev_esB, "corpus/public_development_esTaskB/dev_es_cleanB.tsv")

# Procesando el corpus limpio B

In [2]:
#Leyendo el corpus ya procesado B
corpus_train_esB = pd.read_csv('corpus/public_development_esTaskB/train_es_cleanB.tsv',delimiter='\t',encoding='utf-8')
corpus_dev_esB = pd.read_csv('corpus/public_development_esTaskB/dev_es_cleanB.tsv',delimiter='\t',encoding='utf-8')

#corpus_train_esB[corpus_train_esB.columns[1]]

train_idB = corpus_train_esB[corpus_train_esB.columns[0]]
X_train_textB = corpus_train_esB[corpus_train_esB.columns[1]].fillna(' ')
y_train_hsB = corpus_train_esB[corpus_train_esB.columns[2]]
y_train_trB = corpus_train_esB[corpus_train_esB.columns[3]]
y_train_agB = corpus_train_esB[corpus_train_esB.columns[4]]

test_idB = corpus_dev_esB[corpus_train_esB.columns[0]]
X_test_textB = corpus_dev_esB[corpus_dev_esB.columns[1]].fillna(' ')
y_test_hsB = corpus_dev_esB[corpus_dev_esB.columns[2]]
y_test_trB = corpus_dev_esB[corpus_dev_esB.columns[3]]
y_test_agB = corpus_dev_esB[corpus_dev_esB.columns[4]]

#leyendo el corpus medio limpio para extracción de otras caracts
corpus_train_esCB = pd.read_csv('corpus/public_development_esTaskB/train_es_cPOSB.tsv',delimiter='\t',encoding='utf-8')
corpus_dev_esCB = pd.read_csv('corpus/public_development_esTaskB/dev_es_cPOSB.tsv',delimiter='\t',encoding='utf-8')
train_B = corpus_train_esCB[corpus_train_esCB.columns[1]].fillna(' ')
test_B = corpus_dev_esCB[corpus_dev_esCB.columns[1]].fillna(' ')

#leyendo el corpus etiqueta POS
corpus_train_esPOSB = pd.read_csv('corpus/public_development_esTaskB/train_es_cPOSTAGB.tsv',delimiter='\t',encoding='utf-8')
corpus_dev_esPOSB = pd.read_csv('corpus/public_development_esTaskB/dev_es_cPOSTAGB.tsv',delimiter='\t',encoding='utf-8')
train_posB = corpus_train_esPOSB[corpus_train_esPOSB.columns[1]].fillna(' ')
test_posB = corpus_dev_esPOSB[corpus_dev_esPOSB.columns[1]].fillna(' ')

In [10]:
#Procesar los corpus_train que sean hs
hate=corpus_train_esB[corpus_train_esB['HS'] != 0]

X_train_hs_textB = hate[hate.columns[1]].fillna(' ')
y_train_hs_agB = hate[hate.columns[4]]

#Procesar los corpus_c que sean hs
hate_c=corpus_train_esCB[corpus_train_esCB['HS'] != 0]
train_hs_B = hate_c[hate_c.columns[1]].fillna(' ')

#Procesar los corpus_pos que sean hs
hate_pos=corpus_train_esPOSB[corpus_train_esPOSB['HS'] != 0]
train_hs_posB = hate_pos[hate_pos.columns[1]].fillna(' ')

## Extract the features

In [3]:
#Extracts character n-grams
def charNgrams(text, n):
    ngrams = []
    ngrams = [text[i:i+n]+'_cng' for i in range(len(text)-n+1)]
    return ngrams

In [4]:
#Extracts word-ngrams, when n=1 is equal to bag of words
def wordNgrams(text, n):
    ngrams = []
    text = [word for word in text.split()]
    ngrams = [' '.join(text[i:i+n])+'' for i in range(len(text)-n+1)]
    return ngrams

In [5]:
#Extracts pos-ngrams, when n=1 is equal to bag of pos
def posNgrams(text, n):
    ngrams = []
    text = [pos for pos in text.split()]
    ngrams = [' '.join(text[i:i+n])+'_png' for i in range(len(text)-n+1)]
    return ngrams

In [6]:
conjunto_agresivas = set()
words_agresiva = open('Dictionary/agresivas_es.txt', 'r', encoding="utf8")
words_agresiva.seek(0)
words_agresiva = words_agresiva.read().splitlines()
stemmer = SnowballStemmer("spanish")
for agresiva in words_agresiva:
    conjunto_agresivas.add(stemmer.stem(agresiva))

def AggressiveNgrams(text, n):
    n_grams = []
    tokens = text.split(" ")
    fws = []
    for word in tokens:
        if stemmer.stem(word) in conjunto_agresivas:
            fws.append(word)
    n_grams=[('_'.join(fws[i:i+n])) + "_awn" for i in range(len(fws)-n+1)]
    return n_grams

def lexPatterns(text):
    patterns=[]
    #Extracts patterns
    for word in words_agresiva:
        w = re.findall(word, text)
        w = ['lex_patt' for p in w]
        patterns.extend(w)   
    return patterns

In [7]:
def morfoPatterns(text):
    patterns=[]
    #Extracts patterns
    
    Vb_adj = re.findall(r'vm..000 aq.....',text)
    Vb_adj= ['morfo_patt' for p in Vb_adj]
    patterns.extend(Vb_adj)
    
    adj_vb = re.findall(r'aq..... vm..000',text)
    adj_vb= ['morfo_patt' for p in adj_vb]
    patterns.extend(adj_vb)
    
    sust_adj = re.findall(r'n.0.000 aq.....',text)
    sust_adj= ['morfo_patt' for p in sust_adj]
    patterns.extend(sust_adj)
    
    adj_sust = re.findall(r'aq..... n.0.000',text)
    adj_sust= ['morfo_patt' for p in adj_sust]
    patterns.extend(adj_sust)
    
    pron_vb = re.findall(r'pd...... vm..000',text)
    pron_vb= ['morfo_patt' for p in pron_vb]
    patterns.extend(pron_vb)
    
    return patterns

In [8]:
def wordSkipgrams(text,n):
    skipgrams = []
    text = [word for word in text.split()]
    lista = list(nltk.skipgrams(text, 2, n))
    skipgrams = [' '.join(i[0]+' '+ i[1])+'' for i in lista]
    return skipgrams

In [9]:
def funcNgrams(text, n):
    stop_words = nltk.corpus.stopwords.words("spanish")
    patt=r'\b(' + ('|'.join(re.escape(key) for key in stop_words)).lstrip('|') + r')\b'
    pattern = re.compile(patt)
    text = re.sub(r'[.,\/!$%?¿?!¡\^&\*;:{}=><\-_`~()”“"\'\|]*', "",text)
    #text = re.sub(r"[" + punctuation + "]*", "", text)
    terms = pattern.findall(text)
    n_grams=[('_'.join(terms[i:i+n])) + "_fwn" for i in range(len(terms)-n+1)]
    return n_grams

In [10]:
def simbPunctNgrams(text, n):
    simb_punt = '.,\/!$%?¿!¡^&*;:{}=><-_`~()”“\'\|'
    lis_character = list(text)
    fws = []
    for c in lis_character:
        if c in simb_punt:
            fws.append(c)
    n_grams=[(' '.join(fws[i:i+n])) + "_pwn" for i in range(len(fws)-n+1)]
    return n_grams

In [11]:
def extract_features(text,pos,tfs,cn,wn,pn,an,hs_ag,tr,sn,fn,sp):
    features = []
    for n in cn:
        if n != 0:
            features.extend(charNgrams(text,n))
    for n in wn:
        if n != 0:
            features.extend(wordNgrams(text,n))
    for n in pn:
        if n != 0:
            features.extend(posNgrams(pos,n))
    for n in an:
        if n != 0:
            features.extend(AggressiveNgrams(text,n))
    for n in sn:
        if n!=0:
            features.extend(wordSkipgrams(text,n))
    for n in fn:
        if n!=0:
            features.extend(funcNgrams(tfs,n))
    for n in sn:
        if n!=0:
            features.extend(simbPunctNgrams(tfs,n))
    
    if hs_ag:
        features.extend(lexPatterns(text))
    if tr:
        features.extend(morfoPatterns(text))
    return features

In [12]:
# Extracts all features in a set of 'texts' and return as a string separated with the simbol '&%$'
def process_texts(texts,poss,textfs,cn,wn,pn,an,hs_ag,tr,sn,fn,sp):
    occurrences=defaultdict(int)
    featuresList=[]
    featuresDict=Counter()
    text_pos= list(zip(texts,poss,textfs))   
    for (text,pos,tfs) in text_pos:
        features=extract_features(text,pos,tfs,cn,wn,pn,an,hs_ag,tr,sn,fn,sp)
        featuresDict.update(features)
        featuresList.append('&%$'.join(features))
    return featuresList, featuresDict

# Clasificador B

### clasificador B - HS

In [13]:
def clasificadorHS(cn, wn, pn, an, sn, fn, sp):
    start_time = time.time()
    print('Reading file') 
    
    '''
    vect = CountVectorizer(min_df=3, ngram_range=(2,5)).fit(X_train_textB)
    vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train_textB)
    X_train_vectorized = vect.transform(X_train_textB)
    '''
    
    print(' - Extracting features')
    train_features, dicOfFeatures = process_texts(X_train_textB, train_posB, train_B,cn,wn,pn,an,True, False,sn,fn,sp)
    
    vectorizer = CountVectorizer(lowercase=False, min_df=3, tokenizer=lambda x: x.split('&%$'))
    #vectorizer = TfidfVectorizer(lowercase=False, min_df=5, tokenizer=lambda x: x.split('&%$'))
    X_train_vectorized = vectorizer.fit_transform(train_features)
    X_train_vectorized = X_train_vectorized.astype(float)
    print('\t', 'labels', len(y_train_hsB))
    print('\t', 'tweets', len(X_train_textB))
    print('\t', 'vocabulary size',len(dicOfFeatures))
    print('\t', 'class dictribution',Counter(y_train_hsB) )
    
    ###### Clasificador
    print(' - Training Classifier')
        
    modelMnB=MultinomialNB()
    modelSVC = SVC(C=10000, random_state=0)   
    #modelLR = LogisticRegression(C=100)
    #modelMLPC = MLPClassifier()
    #modelReg = MLPRegressor()
    
    cvScoreMnb=cross_val_score(modelMnB, X_train_vectorized, y_train_hsB, cv=10, scoring='f1').mean()
    print('10-Fold Cross-validation Multinomial Naive Bayes',cvScoreMnb)
    
    cvScoreSVC=cross_val_score(modelSVC, X_train_vectorized, y_train_hsB, cv=10, scoring='f1').mean()
    print('10-Fold Cross-validation Linear SVC',cvScoreSVC)
    
    #cvScoreLG=cross_val_score(modelLR, X_train_vectorized, y_train_hsB, cv=10, scoring='f1').mean()
    #print('10-Fold Cross-validation Logistic Regression',cvScoreLG)
    
    ######Entrenar clasificador#########
    
    modelMnB.fit(X_train_vectorized, y_train_hsB) #ajusta al calificador    
    modelSVC.fit(X_train_vectorized, y_train_hsB)      
    #modelLR.fit(X_train_vectorized, y_train_hsB)
    #modelMLPC.fit(X_train_vectorized, y_train_hsB) 
    #modelReg.fit(X_train_vectorized, y_train_hsB)
    
    ###### Test ########################
    print ('Reading Test files')
    
    print(' - Extracting Test features')
    #X_test_vectorized = vect.transform(X_test_textB)
    test_features, dicOfFeaturesTest = process_texts(X_test_textB, test_posB,test_B,cn,wn,pn,an, True, False,sn,fn,sp)
    
    X_test_vectorized = vectorizer.transform(test_features)
    X_test_vectorized = X_test_vectorized.astype(float)
    X_test_vectorized = preprocessing.Binarizer().fit_transform(X_test_vectorized)
    print('\t', len(X_test_textB), 'unknown texts')
        
    # Predicting Test
    print(' - Predicting Test')
    
    predictionsMnB = modelMnB.predict(X_test_vectorized) #funcion para predecir
    predictionsSVC = modelSVC.predict(X_test_vectorized)
    #predictions = cross_val_predict(model, X_test_vectorized, cv=10) #probando validacion cruzada predict
    #predictionsLR = modelLR.predict(X_test_vectorized)
    #predictionsMPLC = modelMLPC.predict(X_test_vectorized)
    #predictionsReg = modelReg.predict(X_test_vectorized)
    #predictions = [round(w) for w in predictionsMPLC]
    
    print('elapsed time:', time.time() - start_time)
    
    ###### Evaluation metrics ########################
    print('Evaluation metrics')
    print(' - ACC')
    print('\t', 'MultinomialNB', accuracy_score(y_test_hsB, predictionsMnB))
    print('\t', 'SVC', accuracy_score(y_test_hsB, predictionsSVC))
    #print('\t', 'LogisticRegression', accuracy_score(y_test_hsB, predictionsLR))
    #print('\t', 'MLPClassifier', accuracy_score(y_test_hsB, predictionsMPLC))
    #print('\t', 'MLPRegressor', accuracy_score(y_test_hsB, predictionsReg))
    print(' - F1')
    print('\t', 'MultinomialNB', f1_score(y_test_hsB, predictionsMnB))
    print('\t', 'SVC', f1_score(y_test_hsB, predictionsSVC))
    #print('\t', 'LogisticRegression', f1_score(y_test_hsB, predictionsLR))
    #print('\t', 'MLPClassifier', f1_score(y_test_hsB, predictionsMPLC))
    #print('\t', 'MLPRegressor', f1_score(y_test_hsB, predictionsReg))
    
    return predictionsMnB

In [14]:
cnvalues=[3,4,5]#character n-grams
wnvalues=[1,2,3]# word n-grams
pnvalues=[2,3]#  pos n-grams
anvalues=[2]# aggressive words n-grams
skipgrams=[2,3,4] #skipgrams n-grams
fngrams=[3,4] # stop words n-grams
spgrams=[3,4] #punctuacion simbol n-gramas

predictionsHS = clasificadorHS(cnvalues, wnvalues, pnvalues, anvalues,skipgrams, fngrams, spgrams)

Reading file
 - Extracting features
	 labels 4469
	 tweets 4469
	 vocabulary size 401670
	 class dictribution Counter({0: 2631, 1: 1838})
 - Training Classifier
10-Fold Cross-validation Multinomial Naive Bayes 0.7542817725587252
10-Fold Cross-validation Linear SVC 0.7332681798984042
Reading Test files
 - Extracting Test features
	 500 unknown texts
 - Predicting Test
elapsed time: 379.10306000709534
Evaluation metrics
 - ACC
	 MultinomialNB 0.794
	 SVC 0.764
 - F1
	 MultinomialNB 0.7706013363028953
	 SVC 0.7107843137254902


### clasificador B - TR

In [15]:
def clasificadorTR(cn, wn, pn, an, sn, fn, sp):
    start_time = time.time()
    print('Reading file') 
    
    '''
    vect = CountVectorizer(min_df=3, ngram_range=(2,5)).fit(X_train_textB)
    vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train_textB)
    X_train_vectorized = vect.transform(X_train_textB)
    '''
    
    print(' - Extracting features')
    train_features, dicOfFeatures = process_texts(X_train_textB, train_posB,train_B,cn,wn,pn,an, False, True,sn, fn, sp)
    
    vectorizer = CountVectorizer(lowercase=False, min_df=3, tokenizer=lambda x: x.split('&%$'))
    #vectorizer = TfidfVectorizer(lowercase=False, min_df=5, tokenizer=lambda x: x.split('&%$'))
    X_train_vectorized = vectorizer.fit_transform(train_features)
    X_train_vectorized = X_train_vectorized.astype(float)
    print('\t', 'labels', len(y_train_trB))
    print('\t', 'tweets', len(X_train_textB))
    print('\t', 'vocabulary size',len(dicOfFeatures))
    print('\t', 'class dictribution',Counter(y_train_trB) )
    
    ###### Clasificador
    print(' - Training Classifier')
        
    modelMnB=MultinomialNB()
    modelSVC = SVC(C=10000, random_state=0)   
    #modelLR = LogisticRegression(C=100)
    #modelMLPC = MLPClassifier()
    #modelReg = MLPRegressor()
    
    cvScoreMnb=cross_val_score(modelMnB, X_train_vectorized, y_train_trB, cv=10, scoring='f1').mean()
    print('10-Fold Cross-validation Multinomial Naive Bayes',cvScoreMnb)
    
    cvScoreSVC=cross_val_score(modelSVC, X_train_vectorized, y_train_trB, cv=10, scoring='f1').mean()
    print('10-Fold Cross-validation Linear SVC',cvScoreSVC)
    
    #cvScoreLG=cross_val_score(modelLR, X_train_vectorized, y_train_trB, cv=10, scoring='f1').mean()
    #print('10-Fold Cross-validation Logistic Regression',cvScoreLG)
    
    ######Entrenar clasificador#########
    
    modelMnB.fit(X_train_vectorized, y_train_trB) #ajusta al calificador    
    modelSVC.fit(X_train_vectorized, y_train_trB)      
    #modelLR.fit(X_train_vectorized, y_train_trB)
    #modelMLPC.fit(X_train_vectorized, y_train_trB) 
    #modelReg.fit(X_train_vectorized, y_train_trB)
    
    ###### Test ########################
    print ('Reading Test files')
    
    print(' - Extracting Test features')
    #X_test_vectorized = vect.transform(X_test_textB)
    test_features, dicOfFeaturesTest = process_texts(X_test_textB, test_posB, test_B,cn,wn,pn,an, False, True,sn, fn, sp)
    
    X_test_vectorized = vectorizer.transform(test_features)
    X_test_vectorized = X_test_vectorized.astype(float)
    X_test_vectorized = preprocessing.Binarizer().fit_transform(X_test_vectorized)
    print('\t', len(X_test_textB), 'unknown texts')
        
    # Predicting Test
    print(' - Predicting Test')
    
    predictionsMnB = modelMnB.predict(X_test_vectorized) #funcion para predecir
    predictionsSVC = modelSVC.predict(X_test_vectorized)
    #predictions = cross_val_predict(model, X_test_vectorized, cv=10) #probando validacion cruzada predict
    #predictionsLR = modelLR.predict(X_test_vectorized)
    #predictionsMPLC = modelMLPC.predict(X_test_vectorized)
    #predictionsReg = modelReg.predict(X_test_vectorized)
    #predictions = [round(w) for w in predictionsMPLC]
    
    print('elapsed time:', time.time() - start_time)
    
    ###### Evaluation metrics ########################
    print('Evaluation metrics')
    print(' - ACC')
    print('\t', 'MultinomialNB', accuracy_score(y_test_trB, predictionsMnB))
    print('\t', 'SVC', accuracy_score(y_test_trB, predictionsSVC))
    #print('\t', 'LogisticRegression', accuracy_score(y_test_trB, predictionsLR))
    #print('\t', 'MLPClassifier', accuracy_score(y_test_trB, predictionsMPLC))
    #print('\t', 'MLPRegressor', accuracy_score(y_test_trB, predictionsReg))
    print(' - F1')
    print('\t', 'MultinomialNB', f1_score(y_test_trB, predictionsMnB))
    print('\t', 'SVC', f1_score(y_test_trB, predictionsSVC))
    #print('\t', 'LogisticRegression', f1_score(y_test_trB, predictionsLR))
    #print('\t', 'MLPClassifier', f1_score(y_test_trB, predictionsMPLC))
    #print('\t', 'MLPRegressor', f1_score(y_test_trB, predictionsReg))
    
    return predictionsSVC

In [16]:
cnvalues=[3,4,5]#character n-grams
wnvalues=[1,2,3]# word n-grams
pnvalues=[2,3]#  pos n-grams
anvalues=[0]# aggressive words n-grams
skipgrams=[2,3,4] #skipgrams n-grams
fngrams=[3,4] # stop words n-grams
spgrams=[3,4] #punctuacion simbol n-gramas

predictionsTR = clasificadorTR(cnvalues, wnvalues, pnvalues, anvalues,skipgrams,fngrams,spgrams)

Reading file
 - Extracting features
	 labels 4469
	 tweets 4469
	 vocabulary size 400633
	 class dictribution Counter({0: 3352, 1: 1117})
 - Training Classifier
10-Fold Cross-validation Multinomial Naive Bayes 0.698207039890715
10-Fold Cross-validation Linear SVC 0.7481261649587411
Reading Test files
 - Extracting Test features
	 500 unknown texts
 - Predicting Test
elapsed time: 179.0982208251953
Evaluation metrics
 - ACC
	 MultinomialNB 0.806
	 SVC 0.86
 - F1
	 MultinomialNB 0.7015384615384617
	 SVC 0.7388059701492536


### clasificador B - AG

In [17]:
def clasificadorAG(cn, wn, pn, an,sn,fn, sp):
    start_time = time.time()
    print('Reading file') 
    
    '''
    vect = CountVectorizer(min_df=3, ngram_range=(2,5)).fit(X_train_textB)
    vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train_textB)
    X_train_vectorized = vect.transform(X_train_textB)
    '''
    
    print(' - Extracting features')
    train_features, dicOfFeatures = process_texts(X_train_textB, train_posB,train_B,cn,wn,pn,an, True, False,sn,fn, sp)
    
    vectorizer = CountVectorizer(lowercase=False, min_df=3, tokenizer=lambda x: x.split('&%$'))
    #vectorizer = TfidfVectorizer(lowercase=False, min_df=5, tokenizer=lambda x: x.split('&%$'))
    X_train_vectorized = vectorizer.fit_transform(train_features)
    X_train_vectorized = X_train_vectorized.astype(float)
    print('\t', 'labels', len(y_train_agB))
    print('\t', 'tweets', len(X_train_textB))
    print('\t', 'vocabulary size',len(dicOfFeatures))
    print('\t', 'class dictribution',Counter(y_train_agB) )
    
    ###### Clasificador
    print(' - Training Classifier')
        
    modelMnB=MultinomialNB()
    modelSVC = SVC(C=10000, random_state=0)   
    #modelLR = LogisticRegression(C=100)
    #modelMLPC = MLPClassifier()
    #modelReg = MLPRegressor()
    
    cvScoreMnb=cross_val_score(modelMnB, X_train_vectorized, y_train_agB, cv=10, scoring='f1').mean()
    print('10-Fold Cross-validation Multinomial Naive Bayes',cvScoreMnb)
    
    cvScoreSVC=cross_val_score(modelSVC, X_train_vectorized, y_train_agB, cv=10, scoring='f1').mean()
    print('10-Fold Cross-validation Linear SVC',cvScoreSVC)
    
    #cvScoreLG=cross_val_score(modelLR, X_train_vectorized, y_train_agB, cv=10, scoring='f1').mean()
    #print('10-Fold Cross-validation Logistic Regression',cvScoreLG)
    
    ######Entrenar clasificador#########
    
    modelMnB.fit(X_train_vectorized, y_train_agB) #ajusta al calificador    
    modelSVC.fit(X_train_vectorized, y_train_agB)      
    #modelLR.fit(X_train_vectorized, y_train_agB)
    #modelMLPC.fit(X_train_vectorized, y_train_agB) 
    #modelReg.fit(X_train_vectorized, y_train_agB)
    
    ###### Test ########################
    print ('Reading Test files')
    
    print(' - Extracting Test features')
    #X_test_vectorized = vect.transform(X_test_textB)
    test_features, dicOfFeaturesTest = process_texts(X_test_textB, test_posB, test_B,cn,wn,pn,an, True, False,sn,fn, sp)
    
    X_test_vectorized = vectorizer.transform(test_features)
    X_test_vectorized = X_test_vectorized.astype(float)
    X_test_vectorized = preprocessing.Binarizer().fit_transform(X_test_vectorized)
    print('\t', len(X_test_textB), 'unknown texts')
        
    # Predicting Test
    print(' - Predicting Test')
    
    predictionsMnB = modelMnB.predict(X_test_vectorized) #funcion para predecir
    predictionsSVC = modelSVC.predict(X_test_vectorized)
    #predictions = cross_val_predict(model, X_test_vectorized, cv=10) #probando validacion cruzada predict
    #predictionsLR = modelLR.predict(X_test_vectorized)
    #predictionsMPLC = modelMLPC.predict(X_test_vectorized)
    #predictionsReg = modelReg.predict(X_test_vectorized)
    #predictions = [round(w) for w in predictionsMPLC]
    
    print('elapsed time:', time.time() - start_time)
    
    ###### Evaluation metrics ########################
    print('Evaluation metrics')
    print(' - ACC')
    print('\t', 'MultinomialNB', accuracy_score(y_test_agB, predictionsMnB))
    print('\t', 'SVC', accuracy_score(y_test_agB, predictionsSVC))
    #print('\t', 'LogisticRegression', accuracy_score(y_test_agB, predictionsLR))
    #print('\t', 'MLPClassifier', accuracy_score(y_test_agB, predictionsMPLC))
    #print('\t', 'MLPRegressor', accuracy_score(y_test_agB, predictionsReg))
    print(' - F1')
    print('\t', 'MultinomialNB', f1_score(y_test_agB, predictionsMnB))
    print('\t', 'SVC', f1_score(y_test_agB, predictionsSVC))
    #print('\t', 'LogisticRegression', f1_score(y_test_agB, predictionsLR))
    #print('\t', 'MLPClassifier', f1_score(y_test_agB, predictionsMPLC))
    #print('\t', 'MLPRegressor', f1_score(y_test_agB, predictionsReg))
    
    return predictionsMnB

In [18]:
cnvalues=[3,4,5]#character n-grams
wnvalues=[1,2,3]# word n-grams
pnvalues=[2,3]#  pos n-grams
anvalues=[2]# aggressive words n-grams
skipgrams = [2,3,4] #skipgrams n-grams
fngrams=[3,4] # stop words n-grams
spgrams=[3,4] #punctuacion simbol n-gramas

predictionsAG = clasificadorAG(cnvalues, wnvalues, pnvalues, anvalues,skipgrams,fngrams,spgrams)

Reading file
 - Extracting features
	 labels 4469
	 tweets 4469
	 vocabulary size 401670
	 class dictribution Counter({0: 2984, 1: 1485})
 - Training Classifier
10-Fold Cross-validation Multinomial Naive Bayes 0.6877082818827899
10-Fold Cross-validation Linear SVC 0.6689278295273798
Reading Test files
 - Extracting Test features
	 500 unknown texts
 - Predicting Test
elapsed time: 357.78141927719116
Evaluation metrics
 - ACC
	 MultinomialNB 0.792
	 SVC 0.804
 - F1
	 MultinomialNB 0.712707182320442
	 SVC 0.69375


### clasificador B - AG (a partir del corpus train_HS)

In [29]:
def clasificadorAG_hs(cn, wn, pn, an,sn,fn, sp):
    start_time = time.time()
    print('Reading file') 
    
    '''
    vect = CountVectorizer(min_df=3, ngram_range=(2,5)).fit(X_train_textB)
    vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train_textB)
    X_train_vectorized = vect.transform(X_train_textB)
    '''
    
    print(' - Extracting features')
    train_features, dicOfFeatures=process_texts(X_train_hs_textB,train_hs_posB,train_hs_B,cn,wn,pn,an,True,False,sn,fn,sp)
    
    #vectorizer = CountVectorizer(lowercase=False, min_df=3, tokenizer=lambda x: x.split('&%$'))
    vectorizer = TfidfVectorizer(lowercase=False, min_df=5, tokenizer=lambda x: x.split('&%$'))
    X_train_vectorized = vectorizer.fit_transform(train_features)
    X_train_vectorized = X_train_vectorized.astype(float)
    print('\t', 'labels', len(y_train_hs_agB))
    print('\t', 'tweets', len(X_train_hs_textB))
    print('\t', 'vocabulary size',len(dicOfFeatures))
    print('\t', 'class dictribution',Counter(y_train_hs_agB) )
    
    ###### Clasificador
    print(' - Training Classifier')
        
    modelMnB=MultinomialNB()
    modelSVC = SVC(C=10000, random_state=0)   
    #modelLR = LogisticRegression(C=100)
    #modelMLPC = MLPClassifier()
    #modelReg = MLPRegressor()
    
    cvScoreMnb=cross_val_score(modelMnB, X_train_vectorized, y_train_hs_agB, cv=10, scoring='f1').mean()
    print('10-Fold Cross-validation Multinomial Naive Bayes',cvScoreMnb)
    
    cvScoreSVC=cross_val_score(modelSVC, X_train_vectorized, y_train_hs_agB, cv=10, scoring='f1').mean()
    print('10-Fold Cross-validation Linear SVC',cvScoreSVC)
    
    #cvScoreLG=cross_val_score(modelLR, X_train_vectorized, y_train_hs_agB, cv=10, scoring='f1').mean()
    #print('10-Fold Cross-validation Logistic Regression',cvScoreLG)
    
    ######Entrenar clasificador#########
    
    modelMnB.fit(X_train_vectorized, y_train_hs_agB) #ajusta al calificador    
    modelSVC.fit(X_train_vectorized, y_train_hs_agB)      
    #modelLR.fit(X_train_vectorized, y_train_hs_agB)
    #modelMLPC.fit(X_train_vectorized, y_train_hs_agB) 
    #modelReg.fit(X_train_vectorized, y_train_hs_agB)
    
    ###### Test ########################
    print ('Reading Test files')
    
    print(' - Extracting Test features')
    #X_test_vectorized = vect.transform(X_test_textB)
    test_features, dicOfFeaturesTest = process_texts(X_test_textB, test_posB,test_B,cn,wn,pn,an, True, False,sn,fn, sp)
    
    X_test_vectorized = vectorizer.transform(test_features)
    X_test_vectorized = X_test_vectorized.astype(float)
    X_test_vectorized = preprocessing.Binarizer().fit_transform(X_test_vectorized)
    print('\t', len(X_test_textB), 'unknown texts')
        
    # Predicting Test
    print(' - Predicting Test')
    
    predictionsMnB = modelMnB.predict(X_test_vectorized) #funcion para predecir
    predictionsSVC = modelSVC.predict(X_test_vectorized)
    #predictions = cross_val_predict(model, X_test_vectorized, cv=10) #probando validacion cruzada predict
    #predictionsLR = modelLR.predict(X_test_vectorized)
    #predictionsMPLC = modelMLPC.predict(X_test_vectorized)
    #predictionsReg = modelReg.predict(X_test_vectorized)
    #predictions = [round(w) for w in predictionsMPLC]
    
    print('elapsed time:', time.time() - start_time)
    
    ###### Evaluation metrics ########################
    print('Evaluation metrics')
    print(' - ACC')
    print('\t', 'MultinomialNB', accuracy_score(y_test_agB, predictionsMnB))
    print('\t', 'SVC', accuracy_score(y_test_agB, predictionsSVC))
    #print('\t', 'LogisticRegression', accuracy_score(y_test_agB, predictionsLR))
    #print('\t', 'MLPClassifier', accuracy_score(y_test_agB, predictionsMPLC))
    #print('\t', 'MLPRegressor', accuracy_score(y_test_agB, predictionsReg))
    print(' - F1')
    print('\t', 'MultinomialNB', f1_score(y_test_agB, predictionsMnB))
    print('\t', 'SVC', f1_score(y_test_agB, predictionsSVC))
    #print('\t', 'LogisticRegression', f1_score(y_test_agB, predictionsLR))
    #print('\t', 'MLPClassifier', f1_score(y_test_agB, predictionsMPLC))
    #print('\t', 'MLPRegressor', f1_score(y_test_agB, predictionsReg))
    
    return predictionsMnB

In [30]:
cnvalues=[3,4,5]#character n-grams
wnvalues=[1,2,3]# word n-grams
pnvalues=[2,3]#  pos n-grams
anvalues=[2]# aggressive words n-grams
skipgrams=[2,3,4] #skipgrams n-grams
fngrams=[3,4] # stop words n-grams
spgrams=[3,4] #punctuacion simbol n-gramas

predictionsAG = clasificadorAG_hs(cnvalues, wnvalues, pnvalues, anvalues,skipgrams,fngrams,spgrams)

Reading file
 - Extracting features
	 labels 1838
	 tweets 1838
	 vocabulary size 172329
	 class dictribution Counter({1: 1485, 0: 353})
 - Training Classifier
10-Fold Cross-validation Multinomial Naive Bayes 0.8932615602217251




10-Fold Cross-validation Linear SVC 0.9007236282231977




Reading Test files
 - Extracting Test features
	 500 unknown texts
 - Predicting Test
elapsed time: 36.748313665390015
Evaluation metrics
 - ACC
	 MultinomialNB 0.356
	 SVC 0.434
 - F1
	 MultinomialNB 0.5208333333333334
	 SVC 0.5486443381180224


### Función para el archivo de salida B

In [19]:
def output_tsv(testid, predictionsHS, predictionsTR, predictionsAG):    
    d = {'id': testid, 'HS': predictionsHS, 'TR': predictionsTR, 'AG': predictionsAG}
    file = pd.DataFrame(data=d)  
    file.to_csv('corpus/public_development_esTaskB/es_b.tsv', sep='\t', encoding='utf-8', index=False)
    return

In [20]:
###### File output ########################
print('Writing output file')
output_tsv(test_idB, predictionsHS, predictionsTR, predictionsAG)
print('- File created...', 'answers saved to file:','corpus/public_development_esTaskB/es_b.tsv')

Writing output file
- File created... answers saved to file: corpus/public_development_esTaskB/es_b.tsv
