## Carga de paquetes

In [366]:
import re
import string

import emoji
import nltk
import numpy as np
import pandas as pd
import unidecode
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [306]:
# Show up to 500 characters per cell so full tweets are readable in outputs.
# The fully qualified option name is used because abbreviated keys are
# resolved by pattern matching and can become ambiguous in later pandas versions.
pd.set_option('display.max_colwidth', 500)

## Importar datos dentro de un dataframe de Pandas

In [307]:
# Load the semicolon-separated classifications, keeping only the three
# columns used downstream.
df = pd.read_csv(
    'data/simonve_data.csv',
    sep=';',
    usecols=['comment_id', 'text', 'polarity'],
)

In [308]:
# Preview the first rows of the raw classifications.
df.head()

Unnamed: 0,comment_id,text,polarity
0,38768,donde lo compra a ese precio???? ..... le estan viendo la cara.,10
1,41962,"David Foronda (Podemos): “A favor de regular los transgénicos, hay que tener en cuenta la soberanía alimentar... https://t.co/9dhQHZNfXn",-1
2,46156,"RT @segbruce: 21% la aprueba, 73% la rechaza. Así como va, Michelle Bachelet desaparecerá de las encuestas, ya no la apoyan ni sus votante…",-1
3,52072,RT @rpl2010: #CUIDATUDINEROMV #LANUEVAMAYORÍA O #LAVIEJAPILLERÍA #AFP #INJUSTAS EDUARDO #FREI RICARDO #LAGOS Y MICHELLE #BACHELET https://t…,-1
4,40533,"RT @Chevige: Ricardo Lagos el mismo que utilizó la LEY ANTITERRORISTA DE PINOCHET para reprimir, aplaude a Almagro #NuevaMayoria https://t.…",-1


In [309]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43527 entries, 0 to 43526
Data columns (total 3 columns):
comment_id    43527 non-null int64
text          43522 non-null object
polarity      43527 non-null int64
dtypes: int64(2), object(1)
memory usage: 1020.2+ KB


## Conteo del número de clasificaciones por comentario

In [310]:
# One row per comment_id with a single 'count' column holding how many
# classifications that comment received.
comment_id_groups = df.groupby(['comment_id'])['comment_id']
classifications_by_comment = comment_id_groups.agg(['count'])
classifications_by_comment.head()

Unnamed: 0_level_0,count
comment_id,Unnamed: 1_level_1
34624,4
34693,3
34772,3
34863,3
35030,3


## Remoción de comentarios con solo una clasificación

In [311]:
# Keep only comments that were classified more than once: look up each row's
# comment_id in the precomputed counts and filter on the result.
n_classifications = df['comment_id'].map(classifications_by_comment['count'])
df = df[n_classifications > 1]

## Cálculo del valor de clasificación más frecuente

In [312]:
# Collapse the classifications of each (comment_id, text) pair into the most
# frequent polarity. When several polarities tie, the aggregated cell holds an
# array rather than a scalar; a later cell filters those rows out.
# NOTE(review): rows whose text is null (5 per df.info()) presumably vanish
# here because they cannot form a group key — confirm.
df = df.groupby(['comment_id', 'text'])[['polarity']].agg(pd.Series.mode)

In [313]:
# Turn the (comment_id, text) group keys back into regular columns.
df = df.reset_index()

In [314]:
# comment_id was only needed for grouping; rebind instead of mutating in place.
df = df.drop(columns=['comment_id'])

In [315]:
# Preview the aggregated frame: one row per comment with its modal polarity.
df.head()

Unnamed: 0,text,polarity
0,Abogado de Michelle Bachelet otorgó asesoría jurídica a mujer que realizó la denuncia. https://t.co/lV5gnWcfmm,1
1,RT @Alitop_: Faltan 635 dias para que se acabe esta pesadilla llamada Michelle Bachelet #CuentaRegresiva #ChaoBachelet,-1
2,Michelle Bachelet está trotando para estar en forma. Michelle Bachelet está tratando de aprobar sus reformas chavo!! https://t.co/0QAX2Hu2Gh,0
3,"RT @ElLibido: 2/15 Hace pocos días, los “amigos” de @derechatuitera masificaron imagen, sobre supuesto vino de Michelle Bachelet. https://t…",-1
4,"Alcalde de Pozo Almonte, José Fernando Muñoz junto a la Presidenta, Michelle Bachelet e Intendenta de Tarapacá. https://t.co/v1IxZ4D3aG",0


In [316]:
# Non-null values per column after aggregation.
df.count()

text        13711
polarity    13711
dtype: int64

## Remoción de comentarios con más de una clasificación más frecuente (moda)

In [317]:
# A tie between polarities leaves an array in the 'polarity' cell; keep only
# the comments whose aggregated polarity is a single value.
# numpy is referenced directly because the pd.np alias was deprecated in
# pandas 1.0 and removed in 2.0. The check runs on the column alone instead
# of a row-wise apply over the whole frame.
has_single_mode = df['polarity'].map(lambda p: not isinstance(p, np.ndarray))
df = df[has_single_mode]

In [318]:
# Non-null values per column after dropping tied comments.
df.count()

text        12685
polarity    12685
dtype: int64

In [319]:
# Number of comments per polarity label.
df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5485
0,4647
1,2497
10,56


## Remoción de los indefinidos

In [320]:
# Polarity 10 is the label this section removes as "indefinidos" (undefined).
is_defined = df['polarity'] != 10
df = df[is_defined]

In [321]:
# Non-null values per column after removing polarity 10.
df.count()

text        12629
polarity    12629
dtype: int64

## Funciones de limpieza de texto

In [322]:
def remove_tweet_user_mentions(s):
    """Replace every Twitter user mention in ``s`` with a single space.

    A mention is an ``@`` followed by one or more non-whitespace characters.

    Args:
        s: Text to clean.

    Returns:
        The text with each mention replaced by ``' '``.
    """
    # flags must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so a positional re.UNICODE (== 32) would cap the
    # number of substitutions at 32 and set no flag at all.
    return re.sub(r'@\S+', ' ', s, flags=re.UNICODE)

def remove_tweet_hashtags(s):
    """Replace every hashtag in ``s`` with a single space.

    A hashtag is a ``#`` followed by one or more non-whitespace characters.

    Args:
        s: Text to clean.

    Returns:
        The text with each hashtag replaced by ``' '``.
    """
    # flags must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so a positional re.UNICODE (== 32) would cap the
    # number of substitutions at 32 and set no flag at all.
    return re.sub(r'#\S+', ' ', s, flags=re.UNICODE)

def remove_links(s):
    """Replace every link in ``s`` with a single space.

    A link is ``http`` or ``https`` followed by one or more non-whitespace
    characters.

    Args:
        s: Text to clean.

    Returns:
        The text with each link replaced by ``' '``.
    """
    # flags must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so a positional re.UNICODE (== 32) would cap the
    # number of substitutions at 32 and set no flag at all.
    return re.sub(r'(http|https)\S+', ' ', s, flags=re.UNICODE)

def normalize_string(s):
    """Return ``s`` passed through ``unidecode.unidecode``.

    Case is left untouched; use ``to_lower`` for lower-casing.
    """
    transliterated = unidecode.unidecode(s)
    return transliterated

def to_lower(s):
    """Return a lower-cased copy of ``s``."""
    lowered = s.lower()
    return lowered

def remove_emojis(s):
    """Replace every emoji in ``s`` with a single space.

    Args:
        s: Text to clean.

    Returns:
        The text with each emoji replaced by ``' '``.
    """
    # emoji.get_emoji_regexp() was removed in emoji 2.0; replace_emoji is its
    # supported successor. Fall back to the regexp on releases that predate it.
    replace_emoji = getattr(emoji, 'replace_emoji', None)
    if replace_emoji is not None:
        return replace_emoji(s, replace=u' ')
    return emoji.get_emoji_regexp().sub(u' ', s)

def remove_punctuation(s):
    """Strip punctuation from ``s``.

    ASCII punctuation plus the Spanish opening marks are deleted outright;
    the ellipsis, right-pointing pointer and curly double quotes are each
    turned into a space.
    """
    # Characters that disappear entirely.
    table = {ord(symbol): None for symbol in string.punctuation + '¿¡'}
    # Characters that become a space: horizontal ellipsis, black
    # right-pointing pointer, left and right double quotation marks.
    for symbol in ('\u2026', '\u25ba', '\u201c', '\u201d'):
        table[ord(symbol)] = ' '
    return s.translate(table)

def remove_numbers(s):
    """Replace each run of digits in ``s`` with a single space."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub(' ', s)

_DEFAULT_STOPLIST = None


def _default_stoplist():
    """Build once, then reuse, the default set of tokens to drop."""
    global _DEFAULT_STOPLIST
    if _DEFAULT_STOPLIST is None:
        words = set(nltk.corpus.stopwords.words('spanish'))
        # 'no' is kept in the text, so it is taken out of the stoplist.
        words.discard('no')
        # Extra tokens dropped alongside the NLTK list.
        words.update(['rt', 'q', 'd', 'x'])
        _DEFAULT_STOPLIST = frozenset(words)
    return _DEFAULT_STOPLIST


def remove_stopwords(s, stoplist=None):
    """Drop stopwords from the whitespace-separated tokens of ``s``.

    Args:
        s: Text to filter; matching is exact, so lower-case it first.
        stoplist: Optional iterable of tokens to drop. Defaults to the NLTK
            Spanish stopwords without 'no', plus 'rt', 'q', 'd' and 'x'.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The default list is cached: reading the NLTK corpus and scanning a list
    # for every token of every comment is needlessly slow.
    if stoplist is None:
        stoplist = _default_stoplist()
    else:
        stoplist = frozenset(stoplist)
    return u' '.join(w for w in s.split() if w not in stoplist)

def remove_extra_whites(s):
    """Collapse whitespace runs in ``s`` to single spaces and trim both ends."""
    tokens = s.split()
    return ' '.join(tokens)

def normalize(s):
    """Run the full text-cleaning pipeline over ``s`` and return the result.

    Steps are applied in order: mentions, hashtags and links are stripped,
    the text is lower-cased, punctuation and stopwords are removed, then
    emojis and numbers, and finally whitespace is collapsed.

    NOTE(review): stopwords are removed before emojis and numbers, so a
    stopword glued to an emoji or digit is not recognized — confirm intended.
    """
    steps = (
        remove_tweet_user_mentions,
        remove_tweet_hashtags,
        remove_links,
        to_lower,
        remove_punctuation,
        remove_stopwords,
        remove_emojis,
        remove_numbers,
        remove_extra_whites,
    )
    for step in steps:
        s = step(s)
    return s

## Limpieza de texto

In [323]:
# Work on a copy so the un-normalized frame stays available in df.
normalized_df = df.copy()

In [324]:
# Clean every comment with the normalize pipeline.
cleaned_text = normalized_df['text'].apply(normalize)
normalized_df['text'] = cleaned_text

In [325]:
# Preview the cleaned text.
normalized_df.head()

Unnamed: 0,text,polarity
0,abogado michelle bachelet otorgó asesoría jurídica mujer realizó denuncia,1
1,faltan dias acabe pesadilla llamada michelle bachelet,-1
2,michelle bachelet trotando forma michelle bachelet tratando aprobar reformas chavo,0
3,hace pocos días amigos masificaron imagen supuesto vino michelle bachelet,-1
4,alcalde pozo almonte josé fernando muñoz junto presidenta michelle bachelet intendenta tarapacá,0


## Remoción de comentarios con solo una palabra

In [326]:
# Keep only comments that still have more than one token after cleaning.
token_counts = normalized_df['text'].str.split().str.len()
normalized_df = normalized_df[token_counts > 1]

In [327]:
# Non-null values per column after dropping one-word comments.
normalized_df.count()

text        12436
polarity    12436
dtype: int64

## Datasets de entrenamiento y prueba

In [350]:
# Features are the cleaned texts; targets are the integer polarity labels.
X = normalized_df['text']
y = normalized_df['polarity'].astype(int)
# Hold out 20% of the comments for evaluation, with a fixed seed.
# NOTE(review): the split is not stratified although the classes are
# imbalanced — consider stratify=y (this would change the reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Naive Bayes

In [357]:
# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(nb_steps)
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [358]:
# Predict polarity for the held-out comments with the Naive Bayes pipeline.
y_pred = nb.predict(X_test)

In [359]:
# Share of held-out comments whose predicted polarity equals the true label
# (arguments in the conventional y_true, y_pred order).
accuracy = accuracy_score(y_test, y_pred)
print('accuracy %s' % accuracy)

accuracy 0.610128617363344


In [360]:
# Per-class precision, recall and F1 for the Naive Bayes predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.60      0.87      0.71      1112
           0       0.59      0.53      0.56       881
           1       0.85      0.17      0.29       495

    accuracy                           0.61      2488
   macro avg       0.68      0.52      0.52      2488
weighted avg       0.65      0.61      0.57      2488



## Support Vector Machine

In [361]:
# Bag-of-words counts -> TF-IDF weighting -> linear classifier trained with
# SGD on the hinge loss, for a fixed 5 passes (tol=None disables early stop).
sgd_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_classifier),
])
sgd.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [362]:
# Predict polarity for the held-out comments with the SGD pipeline.
y_pred = sgd.predict(X_test)

In [363]:
# Share of held-out comments whose predicted polarity equals the true label
# (arguments in the conventional y_true, y_pred order).
accuracy = accuracy_score(y_test, y_pred)
print('accuracy %s' % accuracy)

accuracy 0.6089228295819936


In [365]:
# Per-class precision, recall and F1 for the SGD predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.59      0.89      0.71      1112
           0       0.63      0.41      0.50       881
           1       0.67      0.34      0.45       495

    accuracy                           0.61      2488
   macro avg       0.63      0.55      0.55      2488
weighted avg       0.62      0.61      0.58      2488



## Regresión Logística

In [367]:
# Bag-of-words counts -> TF-IDF weighting -> logistic regression.
# C=1e5 is a very large inverse regularization strength, i.e. the model is
# fitted almost without regularization.
logreg_classifier = LogisticRegression(n_jobs=1, C=1e5)
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', logreg_classifier),
])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [368]:
# Predict polarity for the held-out comments with the logistic regression pipeline.
y_pred = logreg.predict(X_test)

In [369]:
# Share of held-out comments whose predicted polarity equals the true label
# (arguments in the conventional y_true, y_pred order).
accuracy = accuracy_score(y_test, y_pred)
print('accuracy %s' % accuracy)

accuracy 0.5727491961414791


In [371]:
# Per-class precision, recall and F1 for the logistic regression predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.66      0.66      0.66      1112
           0       0.50      0.54      0.52       881
           1       0.51      0.43      0.47       495

    accuracy                           0.57      2488
   macro avg       0.56      0.54      0.55      2488
weighted avg       0.57      0.57      0.57      2488

