In [1]:
import emoji, re, string, time, os
import pandas as pd
import numpy as np
from scipy.stats import randint
import pickle

#nlp
import nltk
from nltk.corpus import stopwords
import spacy

#dataviz
import matplotlib.pyplot as plt
import seaborn as sns

#features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.calibration import CalibratedClassifierCV

#models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble  import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

#data balancing
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

get_ipython().run_line_magic('matplotlib', 'inline')
sns.set(style="darkgrid")


In [2]:
subset = 'viral-15-words'
path_dir = 'results/' + subset + '/ml/'
path_dir

'results/viral-15-words/ml/'

In [3]:
data_dir = '../data' #+ '/vis_processed_texts.p'
preprocessed = False # the texts were already pre-processed
processed_texts_filename = 'processed_texts-'+subset+'.p'
for filename in os.listdir(data_dir):
    print(filename)
    if filename == processed_texts_filename:
        preprocessed = True
preprocessed 

filepath = '../data/telegram_experimentos.csv'
df = pd.read_csv(filepath)    

.ipynb_checkpoints
dataframe_golpe_telegram.csv
dataframe_rotulado_completo.csv
dataset_sem_repeticoes.csv
dataset_telegram_processado.csv
grouped_texts.pickle
preprocessed_corpus.p.pickle
processed_texts.p
telegram_content_only.csv
telegram_experimentos.csv
train-test
word2vec.model


# Funções que serão usadas para processamento de texto

In [4]:
unicode_emoji = {}
for key, value in emoji.EMOJI_DATA.items():
    try:
        unicode_emoji[key] = value['pt']
    except:
        pass

#emojis and punctuation
emojis_list = list(unicode_emoji)
punct = list(string.punctuation)
emojis_punct = emojis_list + punct

def processEmojisPunctuation(text, remove_punct = True):
    '''
    Put spaces between emojis. Removes punctuation.
    '''
    #get all unique chars
    chars = set(text)
    #for each unique char in text, do:
    for c in chars:
        #remove punctuation
        if remove_punct:
            if c in emojis_list:
                text = text.replace(c, ' ' + c + ' ')
            if c in punct:
                text = text.replace(c, ' ')

        #put spaces between punctuation
        else:
            if c in emojis_punct:
                text = text.replace(c, ' ' + c + ' ')          

    text = text.replace('  ', ' ')
    return text

#stop words removal
stop_words = list(stopwords.words('portuguese'))
new_stopwords = ['aí','pra','vão','vou','onde','lá','aqui',
                 'tá','pode','pois','so','deu','agora','todo',
                 'nao','ja','vc', 'bom', 'ai','kkk','kkkk','ta', 'voce', 'alguem', 'ne', 'pq',
                 'cara','to','mim','la','vcs','tbm', 'tudo']
stop_words = stop_words + new_stopwords
final_stop_words = []
for sw in stop_words:
    sw = ' '+ sw + ' '
    final_stop_words.append(sw)

def removeStopwords(text):
    for sw in final_stop_words:
        text = text.replace(sw,' ')
    text = text.replace('  ',' ')
    return text

#lemmatization
nlp = spacy.load('pt_core_news_sm')
def lemmatization(text):
    doc = nlp(text)
    for token in doc:
        if token.text != token.lemma_:
            text = text.replace(token.text, token.lemma_)
    return text


def domainUrl(text):
    '''
    Substitutes an URL in a text for the domain of this URL
    Input: an string
    Output: the string with the modified URL
    '''    
    if 'http' in text:
        re_url = '[^\s]*https*://[^\s]*'
        matches = re.findall(re_url, text, flags=re.IGNORECASE)
        for m in matches:
            domain = m.split('//')
            domain = domain[1].split('/')[0]
            text = re.sub(re_url, domain, text, 1)
        return text
    else:
        return text 

def preprocess(text):
    text = text.lower().strip()
    text = domainUrl(text)
    text = processEmojisPunctuation(text)
    text = removeStopwords(text)
    text = lemmatization(text)
    return text


# Definir quais experimentos serão feitos

In [5]:
experiments = ['ml-tfidf-unibitri_gram-random_oversampling',
 'ml-tfidf-unibi_gram-random_oversampling',
 'ml-tfidf-unibitri_gram-processed-random_oversampling',
 'ml-tfidf-unibitriquad_gram-processed-random_oversampling',
 'ml-tfidf-bigram-random_oversampling',
 'ml-bow-unibitri_gram-random_oversampling',
 'ml-tfidf-processed-smote',
 'ml-bow-processed-random_oversampling',
 'ml-tfidf-random_oversampling',
 'ml-tfidf-processed-random_oversampling',
 'ml-tfidf-smote',
 'ml-tfidf-undersampling',
 'ml-tfidf',
 'ml-tfidf-processed',
 'ml-bow-unibitri_gram-processed-random_oversampling',
 'ml-bow-random_oversampling',
 'ml-bow',
 'ml-bow-processed',
 'ml-bow-random_oversampling-processed',
 'ml-bow-random_oversampling-max_features',
 'ml-tfidf-random_oversampling-max_features',
 'ml-tfidf-processed-random_oversampling-max_features',
 'ml-bow-processed-random_oversampling-max_features',
 'ml-tfidf-trigram-random_oversampling',
 'ml-bow-processed-smote']


In [5]:
experiments1 = ['ml-tfidf-unibi_gram-processed-random_oversampling',
               'ml-bow-unibi_gram-processed-random_oversampling',
               'ml-bow-unibi_gram-random_oversampling'] 

# Definindo a função getTestMetrics

In [6]:
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score, roc_auc_score

def getTestMetrics(y_test, y_pred, y_prob,full_metrics=True, print_charts=False):
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    precision_neg = precision_score(y_test, y_pred, pos_label=0)
    recall = recall_score(y_test, y_pred)
    recall_neg = recall_score(y_test, y_pred, pos_label=0)
    f1 = f1_score(y_test, y_pred)
    f1_neg = 2 * (precision_neg * recall_neg) / (precision_neg + recall_neg)
    roc_auc = roc_auc_score(y_test, y_prob)
    
    return (acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc)

In [7]:
for experiment in experiments1:
    start_time = time.time()
    
    if subset == 'viral':
        df = df[df['viral']==1]
        
    if subset == 'viral-15-words':
        df = df[df['sharings']>1]
        df = df[df['words']>15]
    
    
    texts = df['text_content']
    y = df['Golpe']
       
    #removing duplicates
        
    df = df.drop_duplicates(subset=['text_content'])    
    texts = df['text_content']
    y = df['Golpe']
    
    
    
    # # Pre-processing
    # * convert url in just the domain
    # * separate emojis
    # * punctuation
    
    # [Some suggestions in this work](https://github.com/miguelfzafra/Latest-News-Classifier/blob/master/0.%20Latest%20News%20Classifier/03.%20Feature%20Engineering/03.%20Feature%20Engineering.ipynb)
    # 
    # * **Special character cleaning**
    # 
    # * **Upcase/downcase**
    # 
    # * **Punctuation signs** 
    # 
    # * **Possessive pronouns**
    # 
    # * **Stemming or Lemmatization**
    # 
    # * **Stop words**  
    
    #if experiment is with pre-processed text
    if 'processed' in experiment:
            #text was already pre-processed
            if preprocessed:
                if subset != 'viral':
                    pro_texts = pickle.load(open( "../data/processed_texts.p", "rb" ))
                else:
                    pro_texts = pickle.load(open( "../data/processed_texts-viral.p", "rb" ))
            else:
                pro_texts = [preprocess(t) for t in texts]
                if subset != 'viral':
                    pickle.dump(pro_texts, open( "../data/processed_texts.p", "wb" ))
                else:
                    pickle.dump(pro_texts, open( "../data/processed_texts-viral.p", "wb" ))
    else:
        #only use lowercase and separates emojis and punctuation
        pro_texts = [processEmojisPunctuation(t.lower(),remove_punct = False) for t in texts]
    
    # Train-test split
    
    #random state = 42 for reprudictibility
    texts_train, texts_test, y_train, y_test = train_test_split(pro_texts, y, test_size=0.2, 
                                                                        stratify = y, random_state=42)
    
    full_texts_train, full_texts_test, y_train, y_test = train_test_split(texts, y, test_size=0.2, 
                                                                        stratify = y, random_state=42)
    
    # Vectorization
    
    max_feat = 500
    #print(experiment)
    #vectorizer = None
    
    if 'tfidf' in experiment:
        if 'max_features' in experiment:
            vectorizer = TfidfVectorizer(max_features = max_feat)
        elif 'bigram' in experiment:
            vectorizer = TfidfVectorizer(ngram_range =(2,2))
        elif 'trigram' in experiment:
            vectorizer = TfidfVectorizer(ngram_range =(3,3)) 
        elif 'unibi_gram' in experiment:
            vectorizer = TfidfVectorizer(ngram_range =(1,2))
        elif 'unibitri_gram' in experiment:
            vectorizer = TfidfVectorizer(ngram_range =(1,3))       
        elif 'unibitriquad_gram' in experiment:
            vectorizer = TfidfVectorizer(ngram_range =(1,3))  
        else:
            vectorizer = TfidfVectorizer()
            
    elif 'bow' in experiment:
        if 'max_features' in experiment:
            vectorizer = CountVectorizer(max_features = max_feat, binary=True)
        elif 'bigram' in experiment:
            vectorizer = CountVectorizer(binary=True, ngram_range =(2,2))
        elif 'trigram' in experiment:
            vectorizer = CountVectorizer(binary=True, ngram_range =(3,3)) 
        elif 'unibi_gram' in experiment:
            vectorizer = CountVectorizer(binary=True, ngram_range =(1,2))
        elif 'unibitri_gram' in experiment:
            vectorizer = CountVectorizer(binary=True, ngram_range =(1,3))
        else:
            vectorizer = CountVectorizer(binary=True)
    print("Olá", vectorizer)
    vectorizer.fit(texts_train)   
    X_train = vectorizer.transform(texts_train)
    X_test = vectorizer.transform(texts_test)
    X = vectorizer.transform(pro_texts)
    
    
    if 'smote' in experiment:
        #oversampling with SMOTE
        sm = SMOTE(random_state = 42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    elif 'undersampling' in experiment:
        rus = RandomUnderSampler(random_state = 42)
        X_train, y_train = rus.fit_resample(X_train, y_train)
    elif 'random_oversampling' in experiment:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train = ros.fit_resample(X_train, y_train)
    
    vocab_size = X_train.shape[1]
    
    # Metrics
    scenario = []
    model = []
    accuracy_score_list = []
    precision_score_list = []
    precision_score_neg_list = []
    recall_score_list = []
    recall_score_neg_list = []
    f1_score_list = []
    f1_score_neg_list = []
    auc_score_list = []
       
    # ## Models training and test
    
    # ## Models training and test
    
    # In[39]:
    
    
    print('Logistic Regression')
    logreg = LogisticRegression().fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    y_prob = logreg.predict_proba(X_test)[:,1]
    model.append('logistic regression')
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)
    
    
    # In[25]:
    
    
    print('Bernoulli Naive-Bayes')
    bnb = BernoulliNB().fit(X_train, y_train)
    y_pred = bnb.predict(X_test)
    y_prob = bnb.predict_proba(X_test)[:,1]
    model.append('bernoulli naive-bayes')
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)
    
    
    # In[40]:
    
    
    print('Multinomial Naive-Bayes')
    mnb = MultinomialNB().fit(X_train, y_train)
    y_pred = mnb.predict(X_test)
    y_prob = mnb.predict_proba(X_test)[:,1]
    model.append('multinomial naive-bayes')
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)
    
    
    # In[41]:
    
    
    print('Linear Support Vector Machine')
    svm = LinearSVC(dual=False).fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    svm2 = LinearSVC()
    clf = CalibratedClassifierCV(svm2) 
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:,1]
    model.append('linear svm')
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)
    
    
    # In[42]:
    
    
    print('KNN')
    knn = KNeighborsClassifier().fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    y_prob = knn.predict_proba(X_test)[:,1]
    model.append('knn')
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)
    
    
    # In[45]:
    
    
    print('Linear SVM with SGD training.')
    sgd = SGDClassifier().fit(X_train, y_train)
    y_pred = sgd.predict(X_test)
    model.append('sgd')
    svm.fit(X_train, y_train)
    svm2 = SGDClassifier(loss='hinge')
    clf = CalibratedClassifierCV(svm2) 
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:,1]
    
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)
    
    
    # In[43]:
    
    
    print('Random Forest')
    rf = RandomForestClassifier().fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)[:,1]
    model.append('random forest')
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)
    
    
    # In[44]:
    
    
    print('Gradient Boosting')
    gb = GradientBoostingClassifier(n_estimators=200).fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    y_prob = gb.predict_proba(X_test)[:,1]
    model.append('gradient boosting')
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)    
    
    # In[46]:
    
    
    print('Multilayer perceptron')
    mlp = MLPClassifier(max_iter = 6, verbose=True, early_stopping= True).fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    y_prob = mlp.predict_proba(X_test)[:,1]
    model.append('mlp')
    acc, precision, precision_neg, recall, recall_neg, f1, f1_neg, roc_auc = getTestMetrics(y_test, y_pred, y_prob, full_metrics = True, print_charts = False)
    accuracy_score_list.append(acc)
    precision_score_list.append(precision)
    precision_score_neg_list.append(precision_neg)
    recall_score_list.append(recall)
    recall_score_neg_list.append(recall_neg)
    f1_score_list.append(f1)
    f1_score_neg_list.append(f1_neg)
    auc_score_list.append(roc_auc)
    end_time = time.time()
    ellapsed_time = end_time - start_time
    print('ellapsed time (min):', ellapsed_time/60)    
    
    df_metrics = pd.DataFrame({'model':model,                                 
                                     'vocab':[vocab_size]*len(model),
                                     'auc score': auc_score_list,
                                     'accuracy':accuracy_score_list,
                                     'precision 1': precision_score_list,
                                     'recall 1': recall_score_list,
                                     'f1 score 1': f1_score_list,
                                     'precision 0': precision_score_neg_list,
                                     'recall 0': recall_score_neg_list,                                 
                                     'f1 score 0': f1_score_neg_list
                                     })
    
    df_metrics['precision avg'] = (df_metrics['precision 1'] + df_metrics['precision 0'])/2
    df_metrics['recall avg'] = (df_metrics['recall 1'] + df_metrics['recall 0'])/2
    df_metrics['f1 avg'] = (df_metrics['f1 score 1'] + df_metrics['f1 score 0'])/2
    df_metrics.set_index('model', inplace=True)

    filepath = '../results/' + experiment + '.csv'
    print(filepath)
    df_metrics.to_csv(filepath) 
    
#%% update files
#    df_update = pd.read_csv(filepath)
#    df_update.set_index('model', inplace=True)
#    df_update.update(df_metrics)
#    df_update = df_update.reset_index()
#    df_update.to_csv(filepath, index = False)    
    
    
    # In[35]:
    
    
    #df_metrics.to_csv(filepath, index = False)

ERROR! Session/line number was not unique in database. History logging moved to new session 653
Olá TfidfVectorizer(ngram_range=(1, 2))
Logistic Regression
Bernoulli Naive-Bayes
Multinomial Naive-Bayes
Linear Support Vector Machine
KNN
Linear SVM with SGD training.
Random Forest
Gradient Boosting
Multilayer perceptron
Iteration 1, loss = 0.58602275
Validation score: 0.993421
Iteration 2, loss = 0.31341921
Validation score: 0.993421
Iteration 3, loss = 0.15182036
Validation score: 0.995614
Iteration 4, loss = 0.07940967
Validation score: 0.995614
Iteration 5, loss = 0.04756176
Validation score: 0.995614
Iteration 6, loss = 0.03172811
Validation score: 0.995614
ellapsed time (min): 3.434911223252614
../results/ml-tfidf-unibi_gram-processed-random_oversampling.csv




Olá CountVectorizer(binary=True, ngram_range=(1, 2))
Logistic Regression
Bernoulli Naive-Bayes
Multinomial Naive-Bayes
Linear Support Vector Machine




KNN
Linear SVM with SGD training.
Random Forest
Gradient Boosting
Multilayer perceptron
Iteration 1, loss = 0.29265859
Validation score: 0.995614
Iteration 2, loss = 0.03648675
Validation score: 0.997807
Iteration 3, loss = 0.01432455
Validation score: 0.997807
Iteration 4, loss = 0.00907153
Validation score: 0.997807
Iteration 5, loss = 0.00659063
Validation score: 0.997807
Iteration 6, loss = 0.00522914
Validation score: 0.997807
ellapsed time (min): 3.1309083461761475
../results/ml-bow-unibi_gram-processed-random_oversampling.csv




Olá CountVectorizer(binary=True, ngram_range=(1, 2))
Logistic Regression
Bernoulli Naive-Bayes
Multinomial Naive-Bayes
Linear Support Vector Machine




KNN
Linear SVM with SGD training.
Random Forest
Gradient Boosting
Multilayer perceptron
Iteration 1, loss = 0.24709085
Validation score: 0.993421
Iteration 2, loss = 0.02369632
Validation score: 0.995614
Iteration 3, loss = 0.00876925
Validation score: 0.995614
Iteration 4, loss = 0.00592191
Validation score: 0.995614
Iteration 5, loss = 0.00455981
Validation score: 0.995614
Iteration 6, loss = 0.00373556
Validation score: 0.997807
ellapsed time (min): 2.81797620455424
../results/ml-bow-unibi_gram-random_oversampling.csv




In [None]:
df_model_statistics = pd.read_csv('../results/ml-tfidf-unibitri_gram-random_oversampling.csv')

df_model_statistics