In [1]:
from threading import Thread
from queue import Queue
import sys
import os
import pandas as pd
import numpy as np
from multiprocessing import Pool
import pickle
from sklearn.metrics import f1_score

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import random
import time

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score, auc, confusion_matrix, accuracy_score

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import tree



In [2]:
import tldextract

In [3]:
HOME_PATH = '/home/joaomcouto/git/E02_misinformation_detection/'
MODELS_PATH = HOME_PATH + 'MM/Models/'
TABLES_PATH = HOME_PATH
GRAPHS_PATH = HOME_PATH + 'MM/Graphs/'
IMAGES_PATH = HOME_PATH + 'MM/Clusters/'

#N_FOLDS = 5
N_FOLDS = 5
RANDOM_STATE = 1

TASK_NAME = 'test_b4_data_split_mod'

In [4]:
def dataframe_summary(df):
    print(df.shape)
    print(df.describe())
    print("\n")
    print(df.info())
    return df.head(100)
    
def dataframe_sample(dfaa):
    return dfaa.sample(n=10).head(30)

pd.set_option('display.max_colwidth',500)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [5]:
#Recebe features e gera x combinações de tamanho r
def random_combinations(iterable, r, x, seed=10):
    #r é o tamanho dos modelos a serem gerados
    #x é o numero de modelos a serem gerados
    #iterable é o conjunto de features sobre as quais cada modelo sera definidos
    #ps: aqui, um modelo é o conjunto de features contida nele
    pool = tuple(iterable) #Transforma o conjunto de features numa tupla
    n = len(pool) #n é o numero de features 
    a = [] #a vai ser uma lista de listas onde cada elemento é um modelo gerado
    random.seed(seed) 
    for i in range(x): #Vai gerar um modelo x vezes
        indices = sorted(random.sample(range(n), r)) #Seleciona r indices aleatorios do conjunto de features
        a.insert(len(a), tuple(pool[i] for i in indices)) #Insere as features desses indices, em a, como um tupla
    return list(set(a)) #Ao final "a" tem até x modelos unicos com r features cada

In [6]:
#Recebe uma lista de de modelos de mesmo tamanho encontra melhores hiperparametros para esse tamanho
#Os hiperparametros encontrados são salvos em um arquivo
#Isso é calculado treinando todos os modelos em comb com cada combinação de hipeparamentro e,
# pegando a combinação de maior F1 (antes era AUC)
def gridSearch(df, comb, c):
    #comb é uma lista de tuplas onde cada uma é um modelo (conjunto de features)
                #essa lista é um subset de no maximo 50 dos modelos em eval_panel
    #df é o dataframe com os dados (features e label) das instancias
    #c é o tamanho dos modelos em comb
    bestParams = {'random_state': 20200225, 'criterion': 'gini', 'max_depth': None,
                  'min_samples_split': 71, 'min_samples_leaf': 29, 'min_impurity_decrease': 0.0}
    bestAUC = -1

    mdrange = [None, 3, 5, 10]
    criterions = ['gini', 'entropy']
    mssrange = list(range(5, 101, 5))
    mslrange = list(range(5, 51, 5))
    midrange = [0.0, 0.01, 0.1]

    #SÓ PRA TESTE RAPIDO DO CODIGO
#     mdrange = [None, 3, 10]
#     criterions = ['gini']
#     mssrange = list(range(5, 101, 25))
#     mslrange = list(range(5, 51, 25))
#     midrange = [0.0, 0.01, 0.1]
    
    #numero total de combinações de parametros que serão testados COM CADA modelo
    combinations = len(mdrange)*len(mssrange)*len(mslrange)*len(midrange)*len(criterions)
    
    sys.stdout.write(' GridSearch: pegando %d modelos tamanho %d, avaliando em %d combinações de parametros\n' % (
    len(comb), c, combinations))
    sys.stdout.flush()
    

    #Verifica se o gridsearch ja foi feito pra esse tamanho de modelo (salvo em $TASK_NAME$-size%TAMANHO%-gridsearch.pkl)
    #Se for, 
    #coloca os parametros na lista griddone onde cada posição indice i é bestParams pra modelos tamanho i e,
    #ja retorna o melhor conjunto de parametros.
    if os.path.isfile(MODELS_PATH + 'MultipleModels_DecisionTrees/' + TASK_NAME + '-size%d-gridsearch.pkl' % c):
        with open(MODELS_PATH + 'MultipleModels_DecisionTrees/' + TASK_NAME + '-size%d-gridsearch.pkl' % c, 'rb') as pkldic:
            bestParams = pickle.load(pkldic)
            griddone[c] = combinations
        return bestParams

    tdone = 0.0
    begin = time.time()
    z = 0
    for crit in criterions:
        for md in mdrange:
            for mss in mssrange:
                for msl in mslrange:
                    for mid in midrange:
                        tdone += 1
                        aucs = []
                        # ff eh um modelo pq comb eh uma lista de tuplas onde cada tupla é um modelo(conjunto de features) de mesmo tamanho
                        for ff in comb:
                            f = []
                            for x in ff:  # transforma a tupla com o modelo em uma lista (f)
                                f.insert(len(f), x)
                            auc, accmedia, preds, probs,f1,_ = select_features_platelabel(df, f,{'random_state': 1, 'max_depth': md, 'min_samples_split': mss, 'min_samples_leaf': msl, 'min_impurity_decrease': mid, 'criterion': crit}, nfolds=4,f1i=True)
                            aucs.extend(auc)
                            z += 1
                            # print(z,len(comb))

                        if np.mean(f1) > bestAUC:
                            bestParams['max_depth'] = md
                            bestParams['min_samples_split'] = mss
                            bestParams['min_samples_leaf'] = msl
                            bestParams['min_impurity_decrease'] = mid
                            bestParams['criterion'] = crit
                            bestAUC = np.mean(f1)

                        if c != 0:
                            now = time.time()
                            elapsed = now-begin
                            perinstance = float(elapsed)/float(tdone)
                            predicted = perinstance * combinations
                            griddone[c] += 1
                            sys.stdout.write('  GridSearch (size %02d) Progress: %.3f%% (%d/%d) [Elapsed: %ds | Predicted %ds | Avg: %ds]\r' % (
                                c, 100.0*tdone/combinations, tdone, combinations, elapsed, predicted, perinstance))
                            sys.stdout.flush()
                            if(tdone == combinations):
                                sys.stdout.write('\n')
                                sys.stdout.flush()

    with open(MODELS_PATH + 'MultipleModels_DecisionTrees/' + TASK_NAME + '-size%d-gridsearch.pkl' % c, 'wb') as pkldic:
        pickle.dump(bestParams, pkldic)
    return(bestParams)


In [7]:
# Recebe varios modelos DE MESMO TAMANHO, cada uma é uma tupla em comb

# Efetiva ou carrega o gridsearch para o tamanho, 
    #dai pra cada modelo de comb:
        #chama o select_panel com os params do grid search 
        #(que treina,classifica todas as instancias via folds, retorna performance e preds),
        # e salva stats de performance do modelo no arquivo exit_stat e,
        # as preds dada por ele em exit_outp e por fim,
        # retorna uma lista com as auc medias dos modelos do tamanho
        
#PS: em exit_stat teremos
    #[features modelo];media auc dos folds;media f1 dos folds;media f1w dos folds;,
    #[auc por fold],[f1 por fold],[f1w por fold];
    #media acuracia dos folds

#classifica, salva resultados+preds em exit_stat,exit_outp e,
# retorna uma lista com as aucs medias dos modelos daquele tamanho
def eval_panel_platelabel(df, comb, c, exit_stat, exit_outp):
    #df dado
    #comb lista com modelos com c features
    #exit_stat path para o csv onde resultados serão guardados 
    #exit_outp path para o csv onde predições serão guardadas 
    
    #fpath parece ser a mesma coisa que exit_outp
    fpath = MODELS_PATH + 'MultipleModels_DecisionTrees/' + TASK_NAME + '-size%d-preds.csv' % c

    performed = [] #Uma lista com todas as combinações de features (modelos) ja efetuados
    
    #Verifica se o csv com as predições ja foi criado e tem conteudo, isso indica uma execução parcial previa para esse tamanho)
    #Sendo o caso, carrega as combinações de features ja exploradas na lista performed
    if os.path.isfile(fpath) and os.path.getsize(fpath) > 0:
        performed = list(pd.read_csv(
            fpath, delimiter=';', header=0)['features'])
        
        #done[i] armazena o número de combinações(modelos) exploradas com i features
        done[c] += len(performed)
        global predone
        #predone armazena o número total de combinações que já haviam sido exploradas (encontradas no csv)
        predone += len(performed)
        
        #assegura que os proximos resultados e predições serão appendados numa nova linha
        exit_outp.write('\n')
        exit_stat.write('\n')
        
    #Não sendo o caso estruturamos os headers (primeira linha)do csv para receber as predições e probabilidades,
    #de cada instancia no dado (df)
    else:
        exit_outp.write('features')
        for i in range(len(df)):
            exit_outp.write(';pred%d' % (i+1))
        for i in range(len(df)):
            exit_outp.write(';prob%d' % (i+1))
        exit_outp.write('\n')

    #Faz o gridsearch usando 0,1% dos modelos em comb até no maximo 50 modelos daquele tamanho
    params = gridSearch(df, comb[:max(50, int(0.001*float(len(comb))))], c)

    ncomb = [] #Sera uma list com os modelos em comb menos daqueles que já foram explorados (estao em performed)
    begin = time.time()
    tdone = 0.0 #Armazena o total de novas combinações exploradas (não estavam em performed e foram efetivadas)

    for ff in comb:
        f = []
        for x in ff:  # transforma a tupla com o modelo em uma lista só pra conseguir ver se ta no performed
            f.insert(len(f), x)

        if str(f) not in performed:
            ncomb.append(ff)
            
    comb = ncomb
    
    sys.stdout.write(' MultipleModels: treinando e avaliando %d modelos tamanho %d c/ params do gridsearch\n' % (
    len(comb), c))
    sys.stdout.flush()
    
    res = []
    # ff eh um modelo pois comb eh uma lista de tuplas onde cada tupla é um modelo(conjunto de features) de mesmo tamanho
    #lembrando que comb foi atualizado para conter apenas as combinações que já não estavam em performed
    for ff in comb:
        tdone += 1
        now = time.time()
        elapsed = now-begin
        perinstance = float(elapsed)/float(tdone) #Tempo médio gasto em cada combinação
        predicted = perinstance * len(comb) #Tempo estimado até o fim de todas as combinações DESSE TAMANHO
        sys.stdout.write('  MM (size %02d) Progress: %.3f%% (%d/%d) [Elapsed: %ds | Predicted %ds | Avg: %ds]\r' % (
            c, 100.0*tdone/len(comb), tdone, len(comb), elapsed, predicted, perinstance))
        sys.stdout.flush()
        if(tdone == len(comb)):
            sys.stdout.write('\n\t\n')
            sys.stdout.flush()

        global s #Variavel global que define o número maximos de modelos que exploraremos pra um tamanho
                    #Até segunda ordem setado em 10000
                        #Necessario pous com um tamanho elevado de features as combinações possiveis são MUITA
                            #Nao precisamos explorar mais que s
        if done[c] > s: 
            break

        f = []
        for x in ff:  # transforma a tupla com o modelo em uma lista
            f.insert(len(f), x)
        auc, accmedia, preds, probs, f1, f1w = select_features_platelabel(
            df, f, params, nfolds=N_FOLDS, f1i=True)  # Chama a funcao central de treinamento p/ modelo f
        done[c] += 1
        res.append(np.mean(auc))
        exit_stat.write("%s;%f;%f;%f;%s;%s;%s;%s\n" %
                        (str(f), np.mean(auc),np.mean(f1),np.mean(f1w),auc,f1,f1w,accmedia))

        exit_outp.write("%s" % str(f))
        for p in preds:
            exit_outp.write(';%d' % p)
        for p in probs:
            exit_outp.write(';%f' % p)
        exit_outp.write('\n')
    return(res)

In [8]:
def delete_last_lines(ifile):
    with open(ifile, "r+", encoding="utf-8") as file:

        # Move the pointer (similar to a cursor in a text editor) to the end of the file
        file.seek(0, os.SEEK_END)

        # This code means the following code skips the very last character in the file -
        # i.e. in the case the last line is null we delete the last line
        # and the penultimate one
        pos = file.tell() - 1

        # Read each character in the file one at a time from the penultimate
        # character going backwards, searching for a newline character
        # If we find a new line, exit the search
        while pos > 0 and file.read(1) != "\n":
            pos -= 1
            file.seek(pos, os.SEEK_SET)

        # So long as we're not at the start of the file, delete all the characters ahead
        # of this position
        if pos > 0:
            file.seek(pos, os.SEEK_SET)
            file.truncate()

In [9]:
# Recebe UM modelo/parametros de decision tree e retorna scores, predições
# Retorna: aucs por fold, média de acuracia nos folds,
def select_features_platelabel(df, features, params, nfolds, f1i=False):  
    #

    X = df[features].values
    y = df[label_column_name].values
    predList = np.zeros(len(df))
    probList = np.zeros(len(df))

    cv = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=1)
    foldNum = 0
    a = []
    b = []
    c = []
    d = []
    for (train, val) in cv.split(X, y):
        #print(np.sum(y[train]),np.sum(y[val],len(y))
        foldNum = foldNum + 1

        # Modelo arvore
        classifier = DecisionTreeClassifier(class_weight='balanced', 
                                            max_depth=params['max_depth'], 
                                            min_samples_leaf=params['min_samples_leaf'],
                                            min_samples_split=params['min_samples_split'], 
                                            min_impurity_decrease=params['min_impurity_decrease'], 
                                            criterion=params['criterion'])
        classifier = classifier.fit(X[train], y[train])
        probas_ = classifier.predict_proba(X[val])
        
        #Extraimos[0] pois predict_proba retorna uma coluna pra cada classe, pegamos as probabilidades da 0
        probas = [probas_[x][0] for x in range(len(probas_))]

        pred = classifier.predict(X[val])
        area1 = roc_auc_score(y[val], probas_[:, 1])
        area2 = accuracy_score(y[val], pred)  # guarda acuracia

        #print('b',np.sum(y[val]),np.sum(pred),len(y[val]))
        f1 = f1_score(y[val],pred, average='binary')
        #print('w',np.sum(y[val]),np.sum(pred),len(y[val]))
        f1w = f1_score(y[val],pred, average='weighted')
        
        #a guarda o AUC score de cada fold
        a.insert(len(a), area1)
        #b guarda a acuraria score de cada fold
        b.insert(len(b), area2)
        #c guarda o F1-binary score de cada fold
        c.insert(len(c),f1)
        #d guarda o F1-weighted score de cada fold
        d.insert(len(d),f1w)

        for j in range(len(val)):
            #Como cada instancia em df vai estar no conjunto de val em algum fold,
            #predlist armazena as predições para todos eles em seus respectivos folds
            #probLIst armazena as probabilidades para todos eles em seus respectivos folds
            
            #val[i] contem o indice em X da i-esima instancia atualmente na validação
            #Assim se a instancia indice 3 do dataframe é o primeiro elemento no conjunto de validação atual,
            #  estamos fazendo predList[3] = pred[0] já que pred é indexado na ordem de val
            predList[val[j]] = pred[j]
            probList[val[j]] = probas[j]

    if f1i:
        return a, np.mean(b), predList, probList,c,d
    return a, np.mean(b), predList, probList

In [10]:
#df = pd.read_csv(TABLES_PATH + 'totalcx7.csv')
df = pd.read_pickle("./dfSourceFeatures28Aug2021.pkl")

In [11]:
dfSource = pd.read_pickle('dfSubdomainSourceFeaturesBaseSamuel_comRankingFeatures_v2.pkl')
newRows = []
for index, row in dfSource.iterrows():
    if('www.' in row['subdomain']):
        rowCopy = row.copy()
        ext = tldextract.extract(row['subdomain'])
        rowCopy['subdomain'] = ext.domain +'.'+ ext.suffix
        if(rowCopy['subdomain'] not in list(dfSource['subdomain'])):
            newRows.append(rowCopy.values)
dfSource = dfSource.append(pd.DataFrame(newRows, columns=dfSource.columns)).reset_index(drop=True)
dfSource = dfSource.infer_objects()
        
        

In [12]:
dataframe_summary(dfSource)

(214, 15)
       desinformacao_label  subdomain_ip_latitude  subdomain_ip_longitude  \
count           214.000000             214.000000              214.000000   
mean              0.705607              25.841168              -91.378972   
std               0.456838              24.982042               33.831517   
min               0.000000             -30.040000             -122.390000   
25%               0.000000              32.995000             -121.840000   
50%               1.000000              37.420000              -96.800000   
75%               1.000000              38.980000              -73.980000   
max               1.000000              52.310000               31.240000   

       domain_route_hops  domain_dns_caa_txt_count  subdomain_tenMill_rank  \
count         214.000000                214.000000            1.470000e+02   
mean           12.387850                  3.056075            1.877128e+06   
std             4.282736                  3.094237            

Unnamed: 0,desinformacao_label,subdomain,subdomain_ip,subdomain_ip_cc,subdomain_ip_is_brazil,subdomain_ip_is_us,subdomain_ip_latitude,subdomain_ip_longitude,subdomain_as_n,subdomain_as_cc,subdomain_ipcc_equal_ascc,domain_route_hops,domain_dns_caa_txt_count,subdomain_tenMill_rank,subdomain_tenMill_open_page_rank
0,0,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57
1,0,www.uol.com.br,54.239.152.111,US,False,True,38.91,-77.03,16509,US,True,12,6,11962.0,5.05
2,0,www.correio24horas.com.br,204.199.44.210,BR,True,False,-19.92,-43.95,3549,US,False,11,3,51590.0,4.51
3,0,tribunaonline.com.br,35.201.90.53,US,False,True,39.11,-94.54,15169,US,True,14,2,3138498.0,3.19
4,0,www.opovo.com.br,104.21.52.3,US,False,True,37.33,-121.84,13335,US,True,9,3,25745.0,4.71
5,0,www.diariodolitoral.com.br,172.67.150.33,US,False,True,37.77,-122.39,13335,US,True,9,2,238745.0,4.18
6,0,www.zmnoticias.com.br,143.208.10.6,BR,True,False,-23.55,-46.63,28209,BR,True,14,1,,
7,0,www.estadaomatogrosso.com.br,64.225.27.79,US,False,True,38.91,-77.03,14061,US,True,11,3,,
8,0,www.folhape.com.br,104.21.48.71,US,False,True,37.33,-121.84,13335,US,True,9,4,145293.0,4.27
9,0,www.gazetadopovo.com.br,13.224.210.67,US,False,True,39.99,-75.14,16509,US,True,12,5,18758.0,4.82


In [13]:
dfSource['desinformacao_label']= dfSource['desinformacao_label'].astype('bool')
dfSource['subdomain_ip_cc']= dfSource['subdomain_ip_cc'].astype('category')
dfSource['subdomain_as_cc']= dfSource['subdomain_as_cc'].astype('category')

In [14]:
dfEnvironment = pd.read_pickle('features_env_v1.pkl')
dfEnvironment = dfEnvironment.infer_objects()

In [15]:
dataframe_summary(dfEnvironment) 

(7146, 12)
            retweets          likes  initial_tweets  tweet_count_max  \
count    7146.000000    7146.000000     7146.000000     7.146000e+03   
mean      372.032326    1398.328296       91.600756     2.651567e+05   
std      4531.019018   19762.137200      754.983432     3.385045e+05   
min         0.000000       0.000000        1.000000     0.000000e+00   
25%         0.000000       0.000000        1.000000     8.322800e+04   
50%         2.000000       2.000000        2.000000     1.634510e+05   
75%        15.000000      67.000000        9.000000     3.174220e+05   
max    232628.000000  776532.000000    20037.000000     3.077916e+06   

       tweet_count_avg  followers_count_max  followers_count_avg  \
count     7.146000e+03         7.146000e+03         7.146000e+03   
mean      1.100409e+05         8.993120e+05         3.233824e+05   
std       1.128256e+05         2.405971e+06         7.913217e+05   
min       0.000000e+00         0.000000e+00         0.000000e+00   


Unnamed: 0,retweets,likes,initial_tweets,tweet_count_max,tweet_count_avg,followers_count_max,followers_count_avg,following_count_max,following_count_avg,verified_proportion,url,desinformacao_label
0,4.0,17.0,6,718820,144588.5,12971449,2174644.0,614,241.0,0.333333,https://g1.globo.com/am/amazonas/noticia/a-gente-sai-de-casa-com-a-sensacao-de-que-nao-vai-voltar-diz-agente-sobre-rotina-em-cadeia-no-am.ghtml,0
1,0.0,0.0,1,317422,317422.0,1333445,1333445.0,163,163.0,1.0,https://www.uol.com.br/esporte/olimpiadas/ultimas-noticias/2021/08/04/aulas-galvao-ensina-andreoli-a-narrar-nomes-dificeis-de-atletas-olimpicos.htm,0
2,2.0,3.0,2,132680,67299.0,12481,6255.0,128,107.5,0.0,http://agorarn.com.br/chamada/apos-ataque-onibus-transporte-publico-de-natal-volta-funcionar-neste-domingo/,0
3,0.0,0.0,2,132680,67736.0,12481,6315.5,315,221.5,0.0,http://agorarn.com.br/cidades/governo-cidadao-garante-continuidade-de-investimentos-no-distrito-irrigado-do-baixo-acu/,0
4,0.0,0.0,7,132680,61114.714286,12481,3895.286,2110,866.0,0.0,http://agorarn.com.br/destaquefotos/juiz-quebra-sigilo-bancario-de-empresa-que-prestou-servicos-a-campanha-de-fatima/,0
5,0.0,0.0,2,132680,132205.5,12481,10981.5,468,298.0,0.0,http://agorarn.com.br/destaques/nordestao-inaugura-seu-segundo-atacarejo-no-rn-nesta-quinta-feira/,0
6,0.0,0.0,4,132680,39521.5,12481,3574.0,1703,773.75,0.0,http://agorarn.com.br/politica/dinheiro-sonegado-seria-suficiente-para-tampar-rombo-da-previdencia-diz-styvenson/,0
7,84.0,62.0,74,873842,119939.081081,3997654,114096.3,17121,2045.72973,0.054054,http://josiasdesouza.blogosfera.uol.com.br/2014/10/31/aecio-chama-aliados-para-reaparicao-na-quarta/,0
8,106.0,119.0,40,980382,129596.125,3997654,117263.1,55035,4134.75,0.1,http://josiasdesouza.blogosfera.uol.com.br/2015/06/21/dilma-e-pesadelo-do-qual-lula-nao-pode-acordar/,0
9,358.0,450.0,59,505006,136371.474576,3997654,317978.7,15397,1999.016949,0.118644,http://josiasdesouza.blogosfera.uol.com.br/2016/06/24/lava-jato-agora-prepara-denuncias-contra-lula/,0


In [16]:
dfEnvironment['retweets']= dfEnvironment['retweets'].astype('int')
dfEnvironment['likes']= dfEnvironment['likes'].astype('int')
dfEnvironment['desinformacao_label']= dfEnvironment['desinformacao_label'].astype('bool')

In [17]:
import json
with open("filtradas_fake_titles.txt") as f:
    news_dict = []
    for news in f:
        news_dict.append(json.loads(news))
    dfContentFalse = pd.DataFrame.from_dict(news_dict)
    
#jornal21brasil.com.br tava como fonte apenas .com por isso nao dava match
dfContentFalse.loc[dfContentFalse['url'] == 'https://www.jornal21brasil.com.br/2019/12/congresso-corta-saude-e-educacao-para.html', 'fonte'] = 'jornal21brasil.com.br'
dfContentFalse = dfContentFalse.infer_objects() #Botaram features numa coluna só... o infer n vai fazer nada tem que tratar dps

In [18]:
dataframe_summary(dfContentFalse)

(292, 4)
                                                                                                      url  \
count                                                                                                 292   
unique                                                                                                292   
top     https://revistaoeste.com/tecnologia/ivermectina-reduz-em-56-as-mortes-por-covid-19-mostra-estudo/   
freq                                                                                                    1   

                         fonte  \
count                      292   
unique                     105   
top     tribunanacional.com.br   
freq                         9   

                                                                                     Titulo  \
count                                                                                   292   
unique                                                                              

Unnamed: 0,url,fonte,Titulo,features
0,https://revistaoeste.com/tecnologia/ivermectina-reduz-em-56-as-mortes-por-covid-19-mostra-estudo/,revistaoeste.com,"Ivermectina reduz em 56% as mortes por covid-19, mostra estudo","{'Toxicity': 0.09523559, 'Threat': 0.42579982, 'Insult': 0.019164484, 'sentiment_score': 0.10000000149011612, 'sentiment_magnitude': 0.10000000149011612, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 6.9, 'ARI': 5.5, 'Coleman-Liau': 8.2, 'Flesch Index': 64.9, 'Fog Index': 8.0, 'Lix': 20.1, 'SMOG-Grading': 8.5, 'n_characters': 50, 'n_words': 11, 'word_avg_lenght': 4.55, 'n_sentences': 1, 'sentence_avg_length': 11.0, 'n_short_sen..."
1,https://revistaoeste.com/mundo/coronavac-nao-e-adequada-para-combater-a-pandemia-diz-primeiro-ministro-da-italia/,revistaoeste.com,"CoronaVac não é adequada para combater a pandemia, diz primeiro-ministro da Itália","{'Toxicity': 0.09367218, 'Threat': 0.28688467, 'Insult': 0.09072053, 'sentiment_score': -0.5, 'sentiment_magnitude': 0.5, 'sentiment': 'Negative', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 12.2, 'ARI': 9.3, 'Coleman-Liau': 12.2, 'Flesch Index': 30.9, 'Fog Index': 17.5, 'Lix': 51.5, 'SMOG-Grading': 14.0, 'n_characters': 67, 'n_words': 13, 'word_avg_lenght': 5.15, 'n_sentences': 1, 'sentence_avg_length': 13.0, 'n_short_sentence': 0, 'n_long_sentence..."
2,https://revistaoeste.com/mundo/com-54-da-populacao-vacinada-chile-tem-91-das-utis-ocupadas/,revistaoeste.com,"Com 54% da população vacinada, Chile tem 91% das UTIs ocupadas","{'Toxicity': 0.026644873, 'Threat': 0.07751938, 'Insult': 0.022364318, 'sentiment_score': 0.10000000149011612, 'sentiment_magnitude': 0.10000000149011612, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 6.8, 'ARI': 3.0, 'Coleman-Liau': 4.8, 'Flesch Index': 67.8, 'Fog Index': 11.5, 'Lix': 28.7, 'SMOG-Grading': 10.7, 'n_characters': 47, 'n_words': 12, 'word_avg_lenght': 3.92, 'n_sentences': 1, 'sentence_avg_length': 12.0, 'n_short_..."
3,https://gazetabrasil.com.br/celebridades/2021/07/21/eric-clapton-diz-que-nao-vai-fazer-shows-em-locais-em-que-vacinacao-seja-obrigatoria/,gazetabrasil.com.br,eric clapton diz que nao vai fazer shows em locais em que vacinacao seja obrigatoria,"{'Toxicity': 0.038852096, 'Threat': 0.07751938, 'Insult': 0.029333333, 'sentiment_score': -0.20000000298023224, 'sentiment_magnitude': 0.20000000298023224, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 8.4, 'ARI': 8.1, 'Coleman-Liau': 9.7, 'Flesch Index': 61.9, 'Fog Index': 11.3, 'Lix': 35.0, 'SMOG-Grading': 10.7, 'n_characters': 70, 'n_words': 15, 'word_avg_lenght': 4.67, 'n_sentences': 1, 'sentence_avg_length': 15.0, 'n_short..."
4,https://pleno.news/brasil/politica-nacional/pazuello-stf-impediu-plano-do-governo-de-combate-a-pandemia.html,pleno.news,Pazuello: STF impediu plano do governo de combate à pandemia,"{'Toxicity': 0.099415205, 'Threat': 0.23644228, 'Insult': 0.21493441, 'sentiment_score': 0.0, 'sentiment_magnitude': 0.0, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 1, 'hashtags_number': 0, 'Kincaid': 4.5, 'ARI': 6.5, 'Coleman-Liau': 9.6, 'Flesch Index': 70.7, 'Fog Index': 1.8, 'Lix': 60.1, 'SMOG-Grading': 3.0, 'n_characters': 49, 'n_words': 9, 'word_avg_lenght': 5.44, 'n_sentences': 2, 'sentence_avg_length': 4.5, 'n_short_sentence': 1, 'n_long_sentence': 0, '..."
5,https://pleno.news/mundo/cidade-do-mexico-ivermectina-reduziu-internacoes-em-ate-76.html,pleno.news,Cidade do México: Ivermectina reduziu internações em até 76%,"{'Toxicity': 0.036300257, 'Threat': 0.07751938, 'Insult': 0.015363104, 'sentiment_score': 0.10000000149011612, 'sentiment_magnitude': 0.10000000149011612, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 3.7, 'ARI': 1.0, 'Coleman-Liau': 3.4, 'Flesch Index': 78.2, 'Fog Index': 5.8, 'Lix': 32.8, 'SMOG-Grading': 6.9, 'n_characters': 46, 'n_words': 11, 'word_avg_lenght': 4.18, 'n_sentences': 2, 'sentence_avg_length': 5.5, 'n_short_sen..."
6,https://www.mises.org.br/article/3298/lockdown-a-nova-ideologia-totalitaria,mises.org.br,Lockdown: a nova ideologia totalitária,"{'Toxicity': 0.08244847, 'Threat': 0.2877095, 'Insult': 0.061078124, 'sentiment_score': 0.0, 'sentiment_magnitude': 0.0, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 8.4, 'ARI': 16.8, 'Coleman-Liau': 1.7, 'Flesch Index': 36.6, 'Fog Index': 0.4, 'Lix': 101.0, 'SMOG-Grading': 3.0, 'n_characters': 8, 'n_words': 1, 'word_avg_lenght': 8.0, 'n_sentences': 1, 'sentence_avg_length': 1.0, 'n_short_sentence': 1, 'n_long_sentence': 0, 'n..."
7,https://www.mises.org.br/article/3277/comecamos-com-os-lockdowns-e-estamos-indo-para-o-grande-reset-atualizado,mises.org.br,"Começamos com os lockdowns. E estamos indo para ""O Grande Reset""","{'Toxicity': 0.025995096, 'Threat': 0.08718721, 'Insult': 0.015294225, 'sentiment_score': 0.10000000149011612, 'sentiment_magnitude': 0.20000000298023224, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 2, 'hashtags_number': 0, 'Kincaid': 3.5, 'ARI': 1.2, 'Coleman-Liau': 3.8, 'Flesch Index': 80.9, 'Fog Index': 5.7, 'Lix': 22.7, 'SMOG-Grading': 6.9, 'n_characters': 50, 'n_words': 12, 'word_avg_lenght': 4.17, 'n_sentences': 2, 'sentence_avg_length': 6.0, 'n_short_sen..."
8,https://brasilsemmedo.com/anvisa-confessa-ineficacia-de-mascaras-e-diz-se-basear-apenas-na-oms/,brasilsemmedo.com,Anvisa confessa ineficácia de máscaras e diz se basear apenas na OMS,"{'Toxicity': 0.2248475, 'Threat': 0.20398833, 'Insult': 0.31592187, 'sentiment_score': 0.0, 'sentiment_magnitude': 0.0, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 1, 'hashtags_number': 0, 'Kincaid': 7.6, 'ARI': 4.1, 'Coleman-Liau': 5.2, 'Flesch Index': 65.7, 'Fog Index': 11.3, 'Lix': 21.1, 'SMOG-Grading': 10.7, 'n_characters': 55, 'n_words': 14, 'word_avg_lenght': 3.93, 'n_sentences': 1, 'sentence_avg_length': 14.0, 'n_short_sentence': 0, 'n_long_sentence': 0,..."
9,https://brasilsemmedo.com/eric-clapton-se-nega-a-tocar-em-locais-que-exijam-vacinacao/,brasilsemmedo.com,Eric Clapton se nega a tocar em locais que exijam vacinação,"{'Toxicity': 0.081871085, 'Threat': 0.07751938, 'Insult': 0.06836176, 'sentiment_score': 0.0, 'sentiment_magnitude': 0.0, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 6.8, 'ARI': 3.0, 'Coleman-Liau': 4.8, 'Flesch Index': 67.8, 'Fog Index': 8.1, 'Lix': 20.3, 'SMOG-Grading': 8.5, 'n_characters': 47, 'n_words': 12, 'word_avg_lenght': 3.92, 'n_sentences': 1, 'sentence_avg_length': 12.0, 'n_short_sentence': 0, 'n_long_sentence': 0,..."


In [19]:
with open("filtradas_true_titles.txt") as f:
    news_dict = []
    for news in f:
        news_dict.append(json.loads(news))
    dfContentTrue = pd.DataFrame.from_dict(news_dict)
dfContentTrue = dfContentTrue.infer_objects()

In [20]:
dataframe_summary(dfContentTrue)

(1181, 4)
                                                                                                                            url  \
count                                                                                                                      1181   
unique                                                                                                                     1180   
top     https://www.diariodolitoral.com.br/mundo/japao-ve-disseminacao-inedita-da-covid-19-e-recorde-de-casos-em-toquio/147888/   
freq                                                                                                                          2   

               fonte  \
count           1181   
unique            33   
top     g1.globo.com   
freq             288   

                                                                     Titulo  \
count                                                                  1181   
unique                                                 

Unnamed: 0,url,fonte,Titulo,features
0,https://g1.globo.com/politica/noticia/2021/06/24/governo-diz-que-bolsonaro-repassou-denuncias-sobre-compra-da-covaxin-a-pazuello.ghtml,g1.globo.com,Governo diz que Bolsonaro repassou denúncias sobre contrato da Covaxin a Pazuello,"{'Toxicity': 0.24297374, 'Threat': 0.2184854, 'Insult': 0.3228127, 'sentiment_score': -0.10000000149011612, 'sentiment_magnitude': 0.10000000149011612, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 8.5, 'ARI': 10.1, 'Coleman-Liau': 13.1, 'Flesch Index': 57.0, 'Fog Index': 11.4, 'Lix': 59.2, 'SMOG-Grading': 10.7, 'n_characters': 69, 'n_words': 13, 'word_avg_lenght': 5.31, 'n_sentences': 1, 'sentence_avg_length': 13.0, 'n_short_s..."
1,https://www.uol.com.br/vivabem/noticias/redacao/2020/05/22/da-para-receber-cabeleireiro-e-manicure-em-casa-durante-a-pandemia-entenda.htm,www.uol.com.br,Dá para receber cabeleireiro e manicure em casa durante a pandemia? Entenda,"{'Toxicity': 0.28396788, 'Threat': 0.52348995, 'Insult': 0.26752764, 'sentiment_score': 0.20000000298023224, 'sentiment_magnitude': 0.4000000059604645, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 7.4, 'ARI': 5.9, 'Coleman-Liau': 9.6, 'Flesch Index': 52.7, 'Fog Index': 12.4, 'Lix': 56.0, 'SMOG-Grading': 9.7, 'n_characters': 62, 'n_words': 12, 'word_avg_lenght': 5.17, 'n_sentences': 2, 'sentence_avg_length': 6.0, 'n_short_sente..."
2,https://www.correio24horas.com.br/noticia/nid/a-poupanca-e-uma-alternativa-de-investimento-na-pandemia/,www.correio24horas.com.br,A poupança é uma alternativa de investimento na pandemia?,"{'Toxicity': 0.012811388, 'Threat': 0.07751938, 'Insult': 0.013248383, 'sentiment_score': 0.0, 'sentiment_magnitude': 0.0, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 1, 'hashtags_number': 0, 'Kincaid': 10.2, 'ARI': 7.1, 'Coleman-Liau': 11.0, 'Flesch Index': 37.9, 'Fog Index': 12.5, 'Lix': 42.3, 'SMOG-Grading': 10.7, 'n_characters': 46, 'n_words': 9, 'word_avg_lenght': 5.11, 'n_sentences': 1, 'sentence_avg_length': 9.0, 'n_short_sentence': 0, 'n_long_sentence':..."
3,https://tribunaonline.com.br/associacao-medica-entra-com-acao-para-receitar-cloroquina-no-espirito-santo,tribunaonline.com.br,Associação Médica entra com ação para receitar cloroquina no Espírito Santo,"{'Toxicity': 0.033501644, 'Threat': 0.07751938, 'Insult': 0.02169453, 'sentiment_score': 0.0, 'sentiment_magnitude': 0.0, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 5.2, 'ARI': 4.6, 'Coleman-Liau': 5.4, 'Flesch Index': 84.5, 'Fog Index': 11.3, 'Lix': 35.0, 'SMOG-Grading': 10.7, 'n_characters': 59, 'n_words': 15, 'word_avg_lenght': 3.93, 'n_sentences': 1, 'sentence_avg_length': 15.0, 'n_short_sentence': 0, 'n_long_sentence': ..."
4,https://www.correio24horas.com.br/noticia/nid/conheca-vini-boca-de-09-o-diabo-ozado-em-forma-de-guri-que-ja-mandou-o-corona-partir/,www.correio24horas.com.br,"Conheça Vini Boca de 09, o ‘diabo ozado’ em forma de guri que já mandou o corona partir","{'Toxicity': 0.68951297, 'Threat': 0.23167875, 'Insult': 0.71376973, 'sentiment_score': 0.30000001192092896, 'sentiment_magnitude': 0.30000001192092896, 'sentiment': 'Positive', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 5.5, 'ARI': 4.2, 'Coleman-Liau': 2.8, 'Flesch Index': 89.6, 'Fog Index': 7.6, 'Lix': 19.0, 'SMOG-Grading': 3.0, 'n_characters': 65, 'n_words': 19, 'word_avg_lenght': 3.42, 'n_sentences': 1, 'sentence_avg_length': 19.0, 'n_short_sen..."
5,https://www.uol.com.br/nossa/noticias/afp/2020/05/18/cornavirus-famoso-monte-fuji-no-japao-ficara-fechado-este-verao.htm,www.uol.com.br,"Coronavírus: famoso Monte Fuji, no Japão, ficará fechado este verão","{'Toxicity': 0.035832137, 'Threat': 0.22077091, 'Insult': 0.029333333, 'sentiment_score': 0.10000000149011612, 'sentiment_magnitude': 0.10000000149011612, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 2.5, 'ARI': -0.8, 'Coleman-Liau': 1.3, 'Flesch Index': 87.9, 'Fog Index': 5.7, 'Lix': 22.7, 'SMOG-Grading': 6.9, 'n_characters': 45, 'n_words': 12, 'word_avg_lenght': 3.75, 'n_sentences': 2, 'sentence_avg_length': 6.0, 'n_short_se..."
6,https://g1.globo.com/politica/cpi-da-covid/noticia/2021/07/20/secretaria-do-ministerio-da-saude-sugeriu-a-portugal-atendimento-precoce-contra-a-covid.ghtml,g1.globo.com,Secretária do Ministério da Saúde sugeriu a Portugal 'atendimento precoce' contra a Covid,"{'Toxicity': 0.06344788, 'Threat': 0.22395833, 'Insult': 0.070534416, 'sentiment_score': 0.30000001192092896, 'sentiment_magnitude': 0.30000001192092896, 'sentiment': 'Positive', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 9.8, 'ARI': 7.8, 'Coleman-Liau': 8.8, 'Flesch Index': 53.1, 'Fog Index': 11.4, 'Lix': 41.0, 'SMOG-Grading': 10.7, 'n_characters': 72, 'n_words': 16, 'word_avg_lenght': 4.5, 'n_sentences': 1, 'sentence_avg_length': 16.0, 'n_short_s..."
7,https://www.opovo.com.br/coronavirus/2021/05/14/tempo-medio-de-internacao-aumenta-mais-de-100--em-relacao-a-1---onda-no-ceara.html,www.opovo.com.br,Covid-19: Tempo médio de internação aumenta mais de 100% em relação à 1ª onda no Ceará,"{'Toxicity': 0.030144952, 'Threat': 0.07751938, 'Insult': 0.021756342, 'sentiment_score': 0.30000001192092896, 'sentiment_magnitude': 0.30000001192092896, 'sentiment': 'Positive', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 0, 'Kincaid': 1.8, 'ARI': -1.8, 'Coleman-Liau': -0.3, 'Flesch Index': 99.2, 'Fog Index': 3.8, 'Lix': 20.0, 'SMOG-Grading': 3.0, 'n_characters': 60, 'n_words': 19, 'word_avg_lenght': 3.16, 'n_sentences': 2, 'sentence_avg_length': 9.5, 'n_short_..."
8,https://g1.globo.com/df/distrito-federal/noticia/2021/08/02/vacinacao-contra-covid-19-2a-dose-da-pfizer-sera-aplicada-nesta-segunda-veja-locais.ghtml,g1.globo.com,Vacinação contra Covid-19: 2ª dose da Pfizer é aplicada nesta segunda; veja locais no DF,"{'Toxicity': 0.041336272, 'Threat': 0.15131783, 'Insult': 0.027591081, 'sentiment_score': 0.0, 'sentiment_magnitude': 0.0, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 1, 'hashtags_number': 0, 'Kincaid': 5.1, 'ARI': 3.6, 'Coleman-Liau': 6.8, 'Flesch Index': 70.1, 'Fog Index': 5.7, 'Lix': 21.9, 'SMOG-Grading': 6.9, 'n_characters': 60, 'n_words': 13, 'word_avg_lenght': 4.62, 'n_sentences': 2, 'sentence_avg_length': 6.5, 'n_short_sentence': 0, 'n_long_sentence': 0,..."
9,https://g1.globo.com/bemestar/podcast/noticia/2021/06/23/bem-estar-96-por-que-a-covid-reduziu-a-expectativa-de-vida-do-brasileiro.ghtml,g1.globo.com,Bem Estar #96: Por que a Covid reduziu a expectativa de vida do brasileiro?,"{'Toxicity': 0.08244847, 'Threat': 0.16374269, 'Insult': 0.08803569, 'sentiment_score': 0.0, 'sentiment_magnitude': 0.0, 'sentiment': 'Neutral', 'exclamation_number': 0, 'uppercase_words_number': 0, 'hashtags_number': 1, 'Kincaid': 4.8, 'ARI': 1.9, 'Coleman-Liau': 4.8, 'Flesch Index': 72.8, 'Fog Index': 8.5, 'Lix': 28.4, 'SMOG-Grading': 8.5, 'n_characters': 59, 'n_words': 14, 'word_avg_lenght': 4.21, 'n_sentences': 2, 'sentence_avg_length': 7.0, 'n_short_sentence': 1, 'n_long_sentence': 0, '..."


In [21]:
print("Dimensões originais content falso, content true")
print(len(list(dfContentFalse['url'])))
print(len(list(dfContentTrue['url'])))
#print(len(list(dfSource[dfSource['desinformacao_label']==1]['subdomain'])))
#print(len(list(dfSource[dfSource['desinformacao_label']==0]['subdomain'])))


a = pd.merge(dfContentTrue,dfSource, left_on='fonte', right_on='subdomain')
b = pd.merge(dfContentFalse,dfSource, left_on='fonte', right_on='subdomain')
dfFeatures = pd.concat([a,b],ignore_index=True)
print("Dimensões false,true pós merge entre Content e Source")
print(len(list(dfFeatures[dfFeatures['desinformacao_label']==1]['url'])))
print(len(list(dfFeatures[dfFeatures['desinformacao_label']==0]['url'])))

dfFeatures = dfFeatures.drop('fonte', 1)
dfFeatures = pd.merge(dfFeatures,dfEnvironment , left_on='url', right_on='url')
print("Dimensões false,true pós merge entre Content+Source e environment")
print(len(list(dfFeatures[dfFeatures['desinformacao_label_x']==1]['url'])))
print(len(list(dfFeatures[dfFeatures['desinformacao_label_x']==0]['url'])))

dfFeatures = dfFeatures.drop('desinformacao_label_y', 1)
dfFeatures = dfFeatures.rename(columns={'desinformacao_label_x': 'desinformacao_label'})

#Extraindo features de content da coluna unica de features
#Cria uma lista pra cada feature com os valores de todas as instancias
content_cols = list(dict(dfFeatures['features'][0]).keys())
extracted_content_features = dict()
for col in content_cols:
    extracted_content_features[col] = []
for index, row in dfFeatures.iterrows():
    row_content_features = dict(row['features'])
    for col in content_cols:
        extracted_content_features[col].append(row_content_features[col])

for col in content_cols:
    dfFeatures[col] = extracted_content_features[col]
    
dfFeatures = dfFeatures.drop('features',1)
dfFeatures = dfFeatures.infer_objects()
    


    

Dimensões originais content falso, content true
292
1181
Dimensões false,true pós merge entre Content e Source
283
1189
Dimensões false,true pós merge entre Content+Source e environment
240
754


  dfFeatures = dfFeatures.drop('fonte', 1)
  dfFeatures = dfFeatures.drop('desinformacao_label_y', 1)
  dfFeatures[col] = extracted_content_features[col]
  dfFeatures = dfFeatures.drop('features',1)


In [22]:
dataframe_summary(dfFeatures)

(994, 130)
       subdomain_ip_latitude  subdomain_ip_longitude  domain_route_hops  \
count             994.000000              994.000000         994.000000   
mean                7.976942              -71.275141          11.726358   
std                30.372807               34.667188           2.876366   
min               -30.040000             -122.390000           0.000000   
25%               -22.870000             -116.652500          11.000000   
50%                -3.720000              -51.230000          11.000000   
75%                37.770000              -43.950000          12.000000   
max                52.310000               31.240000          25.000000   

       domain_dns_caa_txt_count  subdomain_tenMill_rank  \
count                994.000000            9.100000e+02   
mean                   5.467807            6.350626e+05   
std                    4.315301            1.425072e+06   
min                    0.000000            9.000000e+01   
25%               

Unnamed: 0,url,Titulo,desinformacao_label,subdomain,subdomain_ip,subdomain_ip_cc,subdomain_ip_is_brazil,subdomain_ip_is_us,subdomain_ip_latitude,subdomain_ip_longitude,subdomain_as_n,subdomain_as_cc,subdomain_ipcc_equal_ascc,domain_route_hops,domain_dns_caa_txt_count,subdomain_tenMill_rank,subdomain_tenMill_open_page_rank,retweets,likes,initial_tweets,tweet_count_max,tweet_count_avg,followers_count_max,followers_count_avg,following_count_max,following_count_avg,verified_proportion,Toxicity,Threat,Insult,sentiment_score,sentiment_magnitude,sentiment,exclamation_number,uppercase_words_number,hashtags_number,Kincaid,ARI,Coleman-Liau,Flesch Index,Fog Index,Lix,SMOG-Grading,n_characters,n_words,word_avg_lenght,n_sentences,sentence_avg_length,n_short_sentence,n_long_sentence,n_paragraphs,paragraph_avg_length,n_questions,n_passive_sentence,longest_sentence,shortest_sentence,n_verbs_to_be,n_verbs_auxiliary,n_conjunctions,%_conjunctions,n_pronouns,%_pronouns,n_prepositions,%_prepositions,n_nomilizations,%_nomilizations,funct,pronoun,ppron,i,we,you,shehe,they,ipron,article,verb,auxverb,past,present,future,adverb,preps,conj,negate,quant,number,swear,social,family,friend,humans,affect,posemo,negemo,anx,anger,sad,cogmech,insight,cause,discrep,tentat,certain,inhib,incl,excl,percept,see,hear,feel,bio,body,health,sexual,ingest,relativ,motion,space,time,work,achieve,leisure,home,money,relig,death,assent,nonfl,filler
0,https://g1.globo.com/politica/noticia/2021/06/24/governo-diz-que-bolsonaro-repassou-denuncias-sobre-compra-da-covaxin-a-pazuello.ghtml,Governo diz que Bolsonaro repassou denúncias sobre contrato da Covaxin a Pazuello,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,19,186,15,401981,54042.87,108641,8170.8,4987,1140.2,0.066667,0.242974,0.218485,0.322813,-0.1,0.1,Neutral,0,0,0,8.5,10.1,13.1,57.0,11.4,59.2,10.7,69,13,5.31,1,13.0,0,0,1,1.0,0,0,13,13,0,0,0,0.0,0,0.0,2,0.15,0,0.0,33.333333,16.666667,8.333333,0.0,0.0,8.333333,8.333333,0.0,16.666667,8.333333,8.333333,0.0,0.0,8.333333,0.0,0.0,25.0,8.333333,0.0,0.0,0.0,8.333333,25.0,0.0,0.0,8.333333,16.666667,8.333333,8.333333,0.0,0.0,0.0,16.666667,0.0,0.0,8.333333,8.333333,0.0,0.0,8.333333,8.333333,8.333333,0.0,8.333333,0.0,8.333333,0.0,0.0,0.0,8.333333,8.333333,0.0,16.666667,0.0,16.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.333333,0.0
1,https://g1.globo.com/politica/cpi-da-covid/noticia/2021/07/20/secretaria-do-ministerio-da-saude-sugeriu-a-portugal-atendimento-precoce-contra-a-covid.ghtml,Secretária do Ministério da Saúde sugeriu a Portugal 'atendimento precoce' contra a Covid,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,41,239,8,26504,12826.25,136618,17443.88,1556,605.375,0.125,0.063448,0.223958,0.070534,0.3,0.3,Positive,0,0,0,9.8,7.8,8.8,53.1,11.4,41.0,10.7,72,16,4.5,1,16.0,0,0,1,1.0,0,0,16,16,0,0,0,0.0,0,0.0,4,0.25,0,0.0,38.461538,15.384615,15.384615,0.0,0.0,15.384615,15.384615,0.0,15.384615,15.384615,0.0,0.0,0.0,0.0,0.0,0.0,38.461538,0.0,0.0,0.0,0.0,0.0,23.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.076923,0.0,0.0,0.0,0.0,0.0,0.0,15.384615,7.692308,0.0,0.0,0.0,0.0,7.692308,0.0,7.692308,0.0,7.692308,7.692308,0.0,15.384615,0.0,15.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.384615,0.0
2,https://g1.globo.com/df/distrito-federal/noticia/2021/08/02/vacinacao-contra-covid-19-2a-dose-da-pfizer-sera-aplicada-nesta-segunda-veja-locais.ghtml,Vacinação contra Covid-19: 2ª dose da Pfizer é aplicada nesta segunda; veja locais no DF,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,0,0,1,48982,48982.0,50911,50911.0,1,1.0,1.0,0.041336,0.151318,0.027591,0.0,0.0,Neutral,0,1,0,5.1,3.6,6.8,70.1,5.7,21.9,6.9,60,13,4.62,2,6.5,0,0,1,2.0,0,0,8,5,0,0,0,0.0,0,0.0,1,0.08,0,0.0,26.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,13.333333,0.0,13.333333,0.0,0.0,13.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.333333,6.666667,0.0,0.0,0.0,0.0,0.0,0.0,6.666667,6.666667,6.666667,0.0,0.0,6.666667,0.0,6.666667,0.0,0.0,26.666667,0.0,13.333333,6.666667,6.666667,6.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://g1.globo.com/bemestar/podcast/noticia/2021/06/23/bem-estar-96-por-que-a-covid-reduziu-a-expectativa-de-vida-do-brasileiro.ghtml,Bem Estar #96: Por que a Covid reduziu a expectativa de vida do brasileiro?,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,6,21,1,27775,27775.0,710573,710573.0,15,15.0,1.0,0.082448,0.163743,0.088036,0.0,0.0,Neutral,0,0,1,4.8,1.9,4.8,72.8,8.5,28.4,8.5,59,14,4.21,2,7.0,1,0,1,2.0,1,1,11,3,1,0,0,0.0,0,0.0,2,0.14,1,0.07,50.0,21.428571,14.285714,0.0,0.0,14.285714,14.285714,0.0,21.428571,14.285714,7.142857,7.142857,0.0,0.0,0.0,0.0,35.714286,7.142857,0.0,0.0,0.0,0.0,21.428571,0.0,0.0,0.0,14.285714,14.285714,0.0,0.0,0.0,0.0,42.857143,0.0,0.0,14.285714,14.285714,0.0,0.0,28.571429,7.142857,0.0,0.0,0.0,0.0,7.142857,0.0,7.142857,0.0,0.0,14.285714,7.142857,14.285714,7.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.285714,0.0
4,https://g1.globo.com/bemestar/vacina/noticia/2021/06/14/ministro-diz-que-3-milhoes-de-doses-da-janssen-devem-chegar-ao-brasil-na-quarta.ghtml,Ministro diz que 3 milhões de doses da Janssen devem chegar ao Brasil na quarta,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,76,675,7,718820,163504.1,12971449,1861950.0,4784,1282.714286,0.285714,0.082448,0.465385,0.065396,0.1,0.1,Neutral,0,0,0,6.9,5.4,5.9,74.3,6.4,28.5,3.0,64,16,4.0,1,16.0,0,0,1,1.0,0,0,16,16,0,0,0,0.0,0,0.0,2,0.12,0,0.0,40.0,6.666667,0.0,0.0,0.0,0.0,0.0,0.0,6.666667,0.0,20.0,6.666667,0.0,13.333333,6.666667,0.0,20.0,6.666667,0.0,0.0,6.666667,6.666667,13.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.666667,6.666667,0.0,13.333333,13.333333,0.0,0.0,13.333333,6.666667,6.666667,0.0,6.666667,0.0,6.666667,0.0,6.666667,0.0,6.666667,13.333333,6.666667,6.666667,0.0,0.0,0.0,0.0,0.0,6.666667,6.666667,0.0,0.0,0.0,0.0
5,https://g1.globo.com/jornal-nacional/noticia/2021/05/05/bolsonaro-volta-a-insinuar-que-a-china-teria-criado-o-coronavirus-propositalmente.ghtml,Bolsonaro volta a insinuar que a China teria criado o coronavírus propositalmente,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,7,18,7,98613,27219.14,18959,5769.143,7218,1776.285714,0.285714,0.199834,0.28771,0.224856,-0.1,0.1,Neutral,0,0,0,10.4,10.1,13.1,44.0,17.5,43.8,14.0,69,13,5.31,1,13.0,0,0,1,1.0,0,0,13,13,0,0,0,0.0,0,0.0,0,0.0,0,0.0,41.666667,33.333333,25.0,0.0,0.0,25.0,25.0,0.0,33.333333,25.0,8.333333,8.333333,0.0,0.0,0.0,0.0,16.666667,8.333333,0.0,0.0,0.0,0.0,41.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.666667,8.333333,8.333333,8.333333,8.333333,0.0,8.333333,16.666667,8.333333,8.333333,8.333333,0.0,0.0,8.333333,8.333333,0.0,0.0,0.0,8.333333,0.0,16.666667,0.0,0.0,0.0,0.0,0.0,8.333333,0.0,0.0,0.0,16.666667,0.0
6,https://g1.globo.com/pr/norte-noroeste/noticia/2021/04/01/sambista-canta-para-equipe-de-saude-apos-ser-vacinado-contra-a-covid-em-londrina-diga-espelho-meu-se-ha-um-idoso-mais-feliz-que-eu.ghtml,"Sambista canta para equipe de saúde após ser vacinado contra a Covid, em Londrina: 'Diga espelho meu se há um idoso mais feliz que eu'",False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,6,33,2,90004,90004.0,273197,273197.0,3,3.0,1.0,0.012811,0.077519,0.01525,0.6,0.6,Positive,0,0,0,5.4,3.3,4.4,80.3,6.9,28.3,6.9,103,27,3.81,2,13.5,0,0,1,2.0,0,0,16,11,0,0,1,0.04,2,0.07,5,0.19,0,0.0,56.0,20.0,16.0,8.0,0.0,4.0,8.0,0.0,12.0,8.0,12.0,8.0,0.0,12.0,0.0,0.0,28.0,12.0,0.0,8.0,4.0,4.0,20.0,0.0,0.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0,40.0,4.0,0.0,8.0,12.0,0.0,4.0,16.0,12.0,8.0,4.0,4.0,0.0,4.0,0.0,4.0,0.0,8.0,20.0,0.0,12.0,12.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0
7,https://g1.globo.com/mundo/noticia/2021/06/19/brasil-tem-quase-250-mil-mortes-por-covid-desde-marco-e-diminui-distancia-para-os-eua.ghtml,Brasil tem quase 250 mil mortes por Covid desde março e diminui distância para os EUA,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,415,2876,22,2585775,275180.8,12971449,1186407.0,38014,3772.636364,0.136364,0.611661,0.881356,0.444507,0.0,0.0,Neutral,0,1,0,5.9,5.4,4.8,85.2,7.2,23.6,3.0,68,18,3.78,1,18.0,0,0,1,1.0,0,0,18,18,0,0,1,0.06,0,0.0,3,0.17,0,0.0,50.0,6.25,6.25,0.0,0.0,6.25,0.0,6.25,0.0,6.25,12.5,6.25,0.0,18.75,0.0,6.25,18.75,6.25,0.0,6.25,6.25,0.0,6.25,0.0,0.0,6.25,6.25,6.25,6.25,0.0,6.25,6.25,37.5,6.25,6.25,0.0,6.25,6.25,12.5,12.5,6.25,0.0,0.0,0.0,0.0,18.75,12.5,6.25,0.0,0.0,18.75,6.25,18.75,18.75,0.0,0.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0
8,https://g1.globo.com/mg/triangulo-mineiro/noticia/2021/04/06/covid-19-veja-boletim-da-prefeitura-de-uberlandia-de-06042021.ghtml,Covid-19: veja boletim da Prefeitura de Uberlândia de 06/04/2021,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,4,4,1,49279,49279.0,13609,13609.0,3,3.0,1.0,0.055042,0.28771,0.057847,0.1,0.1,Neutral,0,0,0,1.7,-1.0,0.6,91.8,6.0,15.0,6.9,38,10,3.8,2,5.0,1,0,1,2.0,0,0,8,2,0,0,0,0.0,0,0.0,2,0.2,0,0.0,33.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.111111,0.0,0.0,11.111111,0.0,0.0,33.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.444444,22.222222,0.0,0.0,0.0,0.0,0.0,22.222222,0.0,11.111111,11.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,https://g1.globo.com/mg/triangulo-mineiro/noticia/2021/04/09/covid-19-veja-boletim-da-prefeitura-de-uberlandia-de-09042021.ghtml,Covid-19: veja boletim da Prefeitura de Uberlândia de 09/04/2021,False,g1.globo.com,186.192.81.31,BR,True,False,-22.87,-42.35,28604,BR,True,11,13,2136.0,5.57,5,7,1,49279,49279.0,13609,13609.0,3,3.0,1.0,0.055042,0.38785,0.061515,0.1,0.1,Neutral,0,0,0,1.7,-1.0,0.6,91.8,6.0,15.0,6.9,38,10,3.8,2,5.0,1,0,1,2.0,0,0,8,2,0,0,0,0.0,0,0.0,2,0.2,0,0.0,33.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.111111,0.0,0.0,11.111111,0.0,0.0,33.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.444444,22.222222,0.0,0.0,0.0,0.0,0.0,22.222222,0.0,11.111111,11.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
dfFeatures.dtypes

url                                   object
Titulo                                object
desinformacao_label                     bool
subdomain                             object
subdomain_ip                          object
subdomain_ip_cc                     category
subdomain_ip_is_brazil                  bool
subdomain_ip_is_us                      bool
subdomain_ip_latitude                float64
subdomain_ip_longitude               float64
subdomain_as_n                        object
subdomain_as_cc                     category
subdomain_ipcc_equal_ascc               bool
domain_route_hops                      int64
domain_dns_caa_txt_count               int64
subdomain_tenMill_rank               float64
subdomain_tenMill_open_page_rank     float64
retweets                               int64
likes                                  int64
initial_tweets                         int64
tweet_count_max                        int64
tweet_count_avg                      float64
followers_

In [24]:
dfFeatures['sentiment']= dfFeatures['sentiment'].astype('category')

In [25]:
label_column_name = 'desinformacao_label'
numericalFeatures = []
boolFeatures = []
categorialFeatures = []
nonFeatures = []
for column in list(dfFeatures.columns):
    if(column) == label_column_name:
        continue
    elif(dfFeatures.dtypes[column].name == 'object'):
        nonFeatures.append(column)
    elif(dfFeatures.dtypes[column].name == 'category'):
        categorialFeatures.append(column)
    elif(dfFeatures.dtypes[column].name == 'bool'):
        boolFeatures.append(column)
    elif(dfFeatures.dtypes[column].name == 'int64' or dfFeatures.dtypes[column].name == 'float64'):
        numericalFeatures.append(column)
    else:
        raise("Alguma feature ta com tipo indefinido")


In [26]:
print(nonFeatures) #Verificar no olho principalmente essa pq uma feature nova inserida como object vai cair aqui
#Como aconteceria no caso de uma feature nova que o infer_objetcs() nao pegou e tu nao setou manualmente dps de inspecionar a saida dele
print('\n')
print(boolFeatures)
print('\n')
print(categorialFeatures)
print('\n')
print(numericalFeatures)

['url', 'Titulo', 'subdomain', 'subdomain_ip', 'subdomain_as_n']


['subdomain_ip_is_brazil', 'subdomain_ip_is_us', 'subdomain_ipcc_equal_ascc']


['subdomain_ip_cc', 'subdomain_as_cc', 'sentiment']


['subdomain_ip_latitude', 'subdomain_ip_longitude', 'domain_route_hops', 'domain_dns_caa_txt_count', 'subdomain_tenMill_rank', 'subdomain_tenMill_open_page_rank', 'retweets', 'likes', 'initial_tweets', 'tweet_count_max', 'tweet_count_avg', 'followers_count_max', 'followers_count_avg', 'following_count_max', 'following_count_avg', 'verified_proportion', 'Toxicity', 'Threat', 'Insult', 'sentiment_score', 'sentiment_magnitude', 'exclamation_number', 'uppercase_words_number', 'hashtags_number', 'Kincaid', 'ARI', 'Coleman-Liau', 'Flesch Index', 'Fog Index', 'Lix', 'SMOG-Grading', 'n_characters', 'n_words', 'word_avg_lenght', 'n_sentences', 'sentence_avg_length', 'n_short_sentence', 'n_long_sentence', 'n_paragraphs', 'paragraph_avg_length', 'n_questions', 'n_passive_sentence', 'longest_sentence

In [27]:
#Analise
print("Diferenças na porcentagem do total em valores de features categoricas entre instancias True e False")
for i, feature in enumerate(categorialFeatures+boolFeatures):
    dfTrue = dfFeatures[dfFeatures[label_column_name]==0]
    dfFalse = dfFeatures[dfFeatures[label_column_name]==1]
    display(
        (
            (dfTrue[feature].value_counts()/len(dfTrue)*100)
            -
            (dfFalse[feature].value_counts()/len(dfFalse)*100)
        ).round(2).to_frame(feature +" (%)").T
    )

Diferenças na porcentagem do total em valores de features categoricas entre instancias True e False


Unnamed: 0,BR,DE,EG,ES,FR,NL,US
subdomain_ip_cc (%),42.98,-0.42,-1.67,-2.08,-2.92,-2.08,-33.81


Unnamed: 0,BR,CA,DE,EG,ES,FR,US
subdomain_as_cc (%),22.74,-2.5,-3.75,-1.67,-2.08,-2.92,-9.82


Unnamed: 0,Neutral,Positive,Negative,Mixed
sentiment (%),2.78,3.7,-4.37,-2.1


Unnamed: 0,False,True
subdomain_ip_is_brazil (%),-42.98,42.98


Unnamed: 0,False,True
subdomain_ip_is_us (%),33.81,-33.81


Unnamed: 0,True,False
subdomain_ipcc_equal_ascc (%),-16.9,16.9


In [28]:
columns = list(dfFeatures.columns)

unwanted_columns = [label_column_name]+nonFeatures #+ categorialFeatures 
features_columns = [
    item for item in columns if item not in unwanted_columns
                   ]
print(len(features_columns))

124


###### Tratamento dos dados (criar one hot pras categoricas ,decidir o que fazer com nans)

In [29]:
dfFeatures_encoded = pd.get_dummies(dfFeatures, columns = categorialFeatures)

In [30]:
dfFeatures_encoded.isnull().sum().to_frame("Missing data").join((dfFeatures_encoded.isnull().sum().to_frame("%")*100/len(dfFeatures_encoded)).round(2).sort_values(by=['%'])).sort_values(by=['%'],ascending=False )

Unnamed: 0,Missing data,%
subdomain_tenMill_open_page_rank,84,8.45
subdomain_tenMill_rank,84,8.45
url,0,0.0
tentat,0,0.0
sad,0,0.0
cogmech,0,0.0
insight,0,0.0
cause,0,0.0
discrep,0,0.0
certain,0,0.0


In [31]:
dfFeatures_encoded['subdomain_tenMill_rank'] = dfFeatures_encoded['subdomain_tenMill_rank'].replace(np.nan, 10000000)
dfFeatures_encoded['subdomain_tenMill_open_page_rank'] = dfFeatures_encoded['subdomain_tenMill_open_page_rank'].replace(np.nan, dfFeatures_encoded.mean()['subdomain_tenMill_open_page_rank'])

  dfFeatures_encoded['subdomain_tenMill_open_page_rank'] = dfFeatures_encoded['subdomain_tenMill_open_page_rank'].replace(np.nan, dfFeatures_encoded.mean()['subdomain_tenMill_open_page_rank'])


In [32]:
encoded_features_columns = [feature for feature in dfFeatures_encoded.columns if feature not in unwanted_columns]
encoded_features_columns

['subdomain_ip_is_brazil',
 'subdomain_ip_is_us',
 'subdomain_ip_latitude',
 'subdomain_ip_longitude',
 'subdomain_ipcc_equal_ascc',
 'domain_route_hops',
 'domain_dns_caa_txt_count',
 'subdomain_tenMill_rank',
 'subdomain_tenMill_open_page_rank',
 'retweets',
 'likes',
 'initial_tweets',
 'tweet_count_max',
 'tweet_count_avg',
 'followers_count_max',
 'followers_count_avg',
 'following_count_max',
 'following_count_avg',
 'verified_proportion',
 'Toxicity',
 'Threat',
 'Insult',
 'sentiment_score',
 'sentiment_magnitude',
 'exclamation_number',
 'uppercase_words_number',
 'hashtags_number',
 'Kincaid',
 'ARI',
 'Coleman-Liau',
 'Flesch Index',
 'Fog Index',
 'Lix',
 'SMOG-Grading',
 'n_characters',
 'n_words',
 'word_avg_lenght',
 'n_sentences',
 'sentence_avg_length',
 'n_short_sentence',
 'n_long_sentence',
 'n_paragraphs',
 'paragraph_avg_length',
 'n_questions',
 'n_passive_sentence',
 'longest_sentence',
 'shortest_sentence',
 'n_verbs_to_be',
 'n_verbs_auxiliary',
 'n_conjunctio

In [33]:
global done
global griddone
global predone
global queue_finished
queue_finished = 0
predone = 0
global s
#s = 10000
s = 100 #Numero maximo de modelos POR TAMANHO
totalmodels = 0
combs = []
done = []
griddone = []

print('Creating Feature Combinations')
numeroMaximoDeFeatures = len(encoded_features_columns) #Por modelo
for c in range(0,numeroMaximoDeFeatures+1 ):
    # print('\t Size:%d'%c)
    if c == 0:
        combs.append([])
    else:
        combs.append(list(set(random_combinations(encoded_features_columns, c, s))))
    done.append(0)
    griddone.append(0)
    totalmodels += len(combs[-1])

    
#Ao final, combs[i] é uma lista de até s tuplas onde cada tupla é um conjunto de i features

Creating Feature Combinations


In [34]:
len(combs[3])

100

In [35]:
len(combs[1])

70

In [36]:
#Chama o eval_panel pra modelos de tamanho c
#Retorna uma lista com a media de aucs dos modelos de tamanho c
def run_mmpool(c):
    comb = combs[c]
    sys.stdout.write("Starting MM size %d (%d models total)\n" % (c, len(comb)))
    #sys.stdout.write("\t with %d models total \n" % len(comb))
    sys.stdout.flush()

    
    
    exit1 = open(MODELS_PATH + 'MultipleModels_DecisionTrees/' + TASK_NAME +
                 '-size%d-result.csv' % c, 'a+')
    exit2 = open(MODELS_PATH + 'MultipleModels_DecisionTrees/' + TASK_NAME +
                 '-size%d-preds.csv' % c, 'a+')

    a = eval_panel_platelabel(dfFeatures_encoded, comb, c, exit1, exit2)

    global queue_finished
    queue_finished += 1
    exit1.close()
    exit2.close()
    return(a)


In [37]:
if 1==1:
    print('Creating Directories')
    if (not os.path.isdir(MODELS_PATH + 'MultipleModels_DecisionTrees')):
        os.mkdir(MODELS_PATH + 'MultipleModels_DecisionTrees')
    #c é o tamanho de modelo
    #for c in range(1, numeroMaximoDeFeatures+1):
    #Deletamos a ultima linha pro caso de ter rolado uma execução parcial
    for c in range(1, 5):
        if os.path.isfile('MultipleModels_DecisionTrees/' + TASK_NAME + '-size%d-result.csv' % c):
            delete_last_lines('MultipleModels_DecisionTrees/' + TASK_NAME +
                              '-size%d-result.csv' % c)
            delete_last_lines('MultipleModels_DecisionTrees/' + TASK_NAME +
                              '-size%d-preds.csv' % c)
    results = []
    for c in range(1,5):
        res = run_mmpool(c)
        results.append(res)
        
        
#     pool = Pool(processes=10)
#     #results = pool.map(run_mmpool, list(range(1, numeroMaximoDeFeatures+1)))
#     results = pool.map(run_mmpool, list(range(1, 3)))
#     time.sleep(10)
#     pool.join()



Creating Directories
Starting MM size 1 (70 models total)
 GridSearch: pegando 50 modelos tamanho 1, avaliando em 72 combinações de parametros
 MultipleModels: treinando e avaliando 0 modelos tamanho 1 c/ params do gridsearch
Starting MM size 2 (100 models total)
 GridSearch: pegando 50 modelos tamanho 2, avaliando em 72 combinações de parametros
 MultipleModels: treinando e avaliando 0 modelos tamanho 2 c/ params do gridsearch
Starting MM size 3 (100 models total)
 GridSearch: pegando 50 modelos tamanho 3, avaliando em 72 combinações de parametros
 MultipleModels: treinando e avaliando 0 modelos tamanho 3 c/ params do gridsearch
Starting MM size 4 (100 models total)
 GridSearch: pegando 50 modelos tamanho 4, avaliando em 72 combinações de parametros
 MultipleModels: treinando e avaliando 0 modelos tamanho 4 c/ params do gridsearch


In [38]:
#Imprime a auc media do modelo de melhor performance em cada tamanho
#SÓ FUNCIONA SE TIVER ROLADO NUMA EXECUÇÃO UNICA!!
#SE ELE TIVER PARADO NO MEIO POR MAIS QUE O EVAL CONSIGA CONTINUAR NORMALMENTE
#ELE NAO EXTRAI OS RESULTS DO CSV(se realmente quiser fazer isso da pra fazer dps)
try:
    for i in range(0, len(results)):
        print(np.max(results[i]))
except:
    print("Esse print só funciona se todos os tamanho tiveram sido processados pelo eval num execução unica")

Esse print só funciona se todos os tamanho tiveram sido processados pelo eval num execução unica
