Análise de sentimentos com word2vec

Código adaptado daqui: https://www.kaggle.com/code/nitin194/twitter-sentiment-analysis-word2vec-doc2vec/notebook

In [2]:
import re    # expressao regular
import nltk  # processamento de textos
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns  # gráficos
import matplotlib.pyplot as plt
import gensim

pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
#nfile = '/content/drive/MyDrive/LMs-v2/opcovidbr.csv'
nfile = 'tweetsentbr.csv'
df  = pd.read_csv(nfile)

# Inspecionando o csv

In [4]:
df.shape

(7769, 2)

In [5]:
df.head()

Unnamed: 0,tweet,class
0,Que coisa linda! O Programa #encontro estava mostrando uma familia que adotou um adolescente de 18 anos. Que amor!!!,1
1,"Por mais #Encontro com as Irmãs Galvão, adorei elas.",1
2,Mr. CATRA @OficialMrCatra lançando sua nova música PPK CHORA no @SBTTheNoite k k k��\r\n#TheNoite #MrCatra #PpkChora,1
3,"Cibele arrasou, humilhou!! #VideoShowAoVivo",1
4,A Cearamirinense mais famosa do Brasil no #Edecasa @_robertasa ♥ Linda,1


In [6]:
df['class'].value_counts()

class
1    4773
0    2996
Name: count, dtype: int64

In [7]:
df[df['class'] == 1].head(10)

Unnamed: 0,tweet,class
0,Que coisa linda! O Programa #encontro estava mostrando uma familia que adotou um adolescente de 18 anos. Que amor!!!,1
1,"Por mais #Encontro com as Irmãs Galvão, adorei elas.",1
2,Mr. CATRA @OficialMrCatra lançando sua nova música PPK CHORA no @SBTTheNoite k k k��\r\n#TheNoite #MrCatra #PpkChora,1
3,"Cibele arrasou, humilhou!! #VideoShowAoVivo",1
4,A Cearamirinense mais famosa do Brasil no #Edecasa @_robertasa ♥ Linda,1
5,"nào sei vocês, mas quero @marcos_harter no #powercoupleBrasil com a @emillyaraujoof e num reality tipo Dr Rey. Até no #MasterChefBR tá bom",1
6,Tem q passar essa entrevista em telões nas praças\r\n#TheNoite,1
7,Estou todo derretido com o amor do Serginho e Thomas no #AltasHoras,1
8,Augusto Cury no #ConversaComBial IMPERDÍVEL!!!,1
9,#VideoShowAoVivo Sophia super sensual com essa roupa!!!!!!!!,1


In [8]:
df_new = df[['tweet', 'class']]
df_new.head()

Unnamed: 0,tweet,class
0,Que coisa linda! O Programa #encontro estava mostrando uma familia que adotou um adolescente de 18 anos. Que amor!!!,1
1,"Por mais #Encontro com as Irmãs Galvão, adorei elas.",1
2,Mr. CATRA @OficialMrCatra lançando sua nova música PPK CHORA no @SBTTheNoite k k k��\r\n#TheNoite #MrCatra #PpkChora,1
3,"Cibele arrasou, humilhou!! #VideoShowAoVivo",1
4,A Cearamirinense mais famosa do Brasil no #Edecasa @_robertasa ♥ Linda,1


# Limpeza dos dados (adaptado de [TweetSentBR](https://bitbucket.org/HBrum/tweetsentbr/src/master/) )

In [9]:
# limpeza do word2vec do tweetsentbr original: https://bitbucket.org/HBrum/tweetsentbr/src/master/

# Punctuation list
punctuations = re.escape('!"#%\'()*+,./:;<=>?@[\\]^_`{|}~')

# ##### #
# Regex #
# ##### #
re_remove_brackets = re.compile(r'\{.*\}')
re_remove_html = re.compile(r'<(\/|\\)?.+?>', re.UNICODE)
re_transform_numbers = re.compile(r'\d', re.UNICODE)
re_transform_emails = re.compile(r'[^\s]+@[^\s]+', re.UNICODE)
re_transform_url = re.compile(r'(http|https)://[^\s]+', re.UNICODE)
re_transform_username = re.compile(r'@[^\s]+', re.UNICODE)
# Different quotes are used.
re_quotes_1 = re.compile(r"(?u)(^|\W)[‘’′`']", re.UNICODE)
re_quotes_2 = re.compile(r"(?u)[‘’`′'](\W|$)", re.UNICODE)
re_quotes_3 = re.compile(r'(?u)[‘’`′“”]', re.UNICODE)
re_dots = re.compile(r'(?<!\.)\.\.(?!\.)', re.UNICODE)
re_punctuation = re.compile(r'([,";:]){2},', re.UNICODE)
re_hiphen = re.compile(r' -(?=[^\W\d_])', re.UNICODE)
re_tree_dots = re.compile(u'…', re.UNICODE)
# Differents punctuation patterns are used.
re_punkts = re.compile(r'(\w+)([%s])([ %s])' %
                       (punctuations, punctuations), re.UNICODE)
re_punkts_b = re.compile(r'([ %s])([%s])(\w+)' %
                         (punctuations, punctuations), re.UNICODE)
re_punkts_c = re.compile(r'(\w+)([%s])$' % (punctuations), re.UNICODE)
re_changehyphen = re.compile(u'–')
re_doublequotes_1 = re.compile(r'(\"\")')
re_doublequotes_2 = re.compile(r'(\'\')')
re_trim = re.compile(r' +', re.UNICODE)


def clean_text(text):
    """Apply all regex above to a given string."""
    text = text.lower()
    text = text.replace('\xa0', ' ')
    text = text.replace('#', ' ')
    text = re_tree_dots.sub('...', text)
    text = re.sub('\.\.\.', '', text)
    text = re_remove_brackets.sub('', text)
    text = re_changehyphen.sub('-', text)
    text = re_remove_html.sub(' ', text)
    text = re_transform_numbers.sub('0', text)
    text = re_transform_url.sub('URL', text)
    text = re_transform_username.sub('USERNAME', text)
    text = re_transform_emails.sub('EMAIL', text)
    text = re_quotes_1.sub(r'\1"', text)
    text = re_quotes_2.sub(r'"\1', text)
    text = re_quotes_3.sub('"', text)
    text = re.sub('"', '', text)
    text = re_dots.sub('.', text)
    text = re_punctuation.sub(r'\1', text)
    text = re_hiphen.sub(' - ', text)
    text = re_punkts.sub(r'\1 \2 \3', text)
    text = re_punkts_b.sub(r'\1 \2 \3', text)
    text = re_punkts_c.sub(r'\1 \2', text)
    text = re_doublequotes_1.sub('\"', text)
    text = re_doublequotes_2.sub('\'', text)
    text = re_trim.sub(' ', text)
    return text.strip()

In [11]:
#df_new['clean_tweet'] = df_new['tweet'].apply(lambda x: clean_text(x))

df_new['clean_tweet'] = np.vectorize(clean_text)(df['tweet'])
df_new.head(10)

Unnamed: 0,tweet,class,clean_tweet
0,Que coisa linda! O Programa #encontro estava mostrando uma familia que adotou um adolescente de 18 anos. Que amor!!!,1,que coisa linda ! o programa encontro estava mostrando uma familia que adotou um adolescente de 00 anos . que amor ! !!
1,"Por mais #Encontro com as Irmãs Galvão, adorei elas.",1,"por mais encontro com as irmãs galvão , adorei elas ."
2,Mr. CATRA @OficialMrCatra lançando sua nova música PPK CHORA no @SBTTheNoite k k k��\r\n#TheNoite #MrCatra #PpkChora,1,mr . catra USERNAME lançando sua nova música ppk chora no USERNAME k k k��\r\n thenoite mrcatra ppkchora
3,"Cibele arrasou, humilhou!! #VideoShowAoVivo",1,"cibele arrasou , humilhou ! ! videoshowaovivo"
4,A Cearamirinense mais famosa do Brasil no #Edecasa @_robertasa ♥ Linda,1,a cearamirinense mais famosa do brasil no edecasa USERNAME ♥ linda
5,"nào sei vocês, mas quero @marcos_harter no #powercoupleBrasil com a @emillyaraujoof e num reality tipo Dr Rey. Até no #MasterChefBR tá bom",1,"nào sei vocês , mas quero USERNAME no powercouplebrasil com a USERNAME e num reality tipo dr rey . até no masterchefbr tá bom"
6,Tem q passar essa entrevista em telões nas praças\r\n#TheNoite,1,tem q passar essa entrevista em telões nas praças\r\n thenoite
7,Estou todo derretido com o amor do Serginho e Thomas no #AltasHoras,1,estou todo derretido com o amor do serginho e thomas no altashoras
8,Augusto Cury no #ConversaComBial IMPERDÍVEL!!!,1,augusto cury no conversacombial imperdível ! !!
9,#VideoShowAoVivo Sophia super sensual com essa roupa!!!!!!!!,1,videoshowaovivo sophia super sensual com essa roupa ! !!!!!!!


In [None]:
from emoji import demojize
df_new.clean_tweet = df_new.clean_tweet.apply(lambda x: demojize(x, language='pt'))


In [14]:
# remover palavras com menos que 3 letras
df_new.clean_tweet = df_new.clean_tweet.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
df_new.head(10)

Unnamed: 0,tweet,class,clean_tweet
0,Que coisa linda! O Programa #encontro estava mostrando uma familia que adotou um adolescente de 18 anos. Que amor!!!,1,coisa linda programa encontro estava mostrando familia adotou adolescente anos amor
1,"Por mais #Encontro com as Irmãs Galvão, adorei elas.",1,mais encontro irmãs galvão adorei elas
2,Mr. CATRA @OficialMrCatra lançando sua nova música PPK CHORA no @SBTTheNoite k k k��\r\n#TheNoite #MrCatra #PpkChora,1,catra USERNAME lançando nova música chora USERNAME thenoite mrcatra ppkchora
3,"Cibele arrasou, humilhou!! #VideoShowAoVivo",1,cibele arrasou humilhou videoshowaovivo
4,A Cearamirinense mais famosa do Brasil no #Edecasa @_robertasa ♥ Linda,1,cearamirinense mais famosa brasil edecasa USERNAME linda
5,"nào sei vocês, mas quero @marcos_harter no #powercoupleBrasil com a @emillyaraujoof e num reality tipo Dr Rey. Até no #MasterChefBR tá bom",1,vocês quero USERNAME powercouplebrasil USERNAME reality tipo masterchefbr
6,Tem q passar essa entrevista em telões nas praças\r\n#TheNoite,1,passar essa entrevista telões praças thenoite
7,Estou todo derretido com o amor do Serginho e Thomas no #AltasHoras,1,estou todo derretido amor serginho thomas altashoras
8,Augusto Cury no #ConversaComBial IMPERDÍVEL!!!,1,augusto cury conversacombial imperdível
9,#VideoShowAoVivo Sophia super sensual com essa roupa!!!!!!!!,1,videoshowaovivo sophia super sensual essa roupa !!!!!!!


# Tokenizando por palavras com [NLTK](https://www.nltk.org/)

In [25]:
def tokenize(text):
  final = ""
  for sent in sent_tokenizer.tokenize(text):

    if sent.count(' ') >= 3 and sent[-1] in ['.', '!', '?', ';']:

      if sent[0:2] == '- ':
        sent = sent[2:]

      elif sent[0] == ' ' or sent[0] == '-':
        sent = sent[1:]

    final += sent
  return final

In [36]:
import nltk
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer
nltk.download('punkt')

sent_tokenizer = PunktSentenceTokenizer('portuguese')

df_new['tokenized_tweet'] = df_new.clean_tweet.apply(lambda x: tokenize(x))
df_new.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oknotok\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,tweet,class,clean_tweet,tokenized_tweet
0,Que coisa linda! O Programa #encontro estava mostrando uma familia que adotou um adolescente de 18 anos. Que amor!!!,1,coisa linda programa encontro estava mostrando familia adotou adolescente anos amor,coisa linda programa encontro estava mostrando familia adotou adolescente anos amor
1,"Por mais #Encontro com as Irmãs Galvão, adorei elas.",1,mais encontro irmãs galvão adorei elas,mais encontro irmãs galvão adorei elas
2,Mr. CATRA @OficialMrCatra lançando sua nova música PPK CHORA no @SBTTheNoite k k k��\r\n#TheNoite #MrCatra #PpkChora,1,catra USERNAME lançando nova música chora USERNAME thenoite mrcatra ppkchora,catra USERNAME lançando nova música chora USERNAME thenoite mrcatra ppkchora
3,"Cibele arrasou, humilhou!! #VideoShowAoVivo",1,cibele arrasou humilhou videoshowaovivo,cibele arrasou humilhou videoshowaovivo
4,A Cearamirinense mais famosa do Brasil no #Edecasa @_robertasa ♥ Linda,1,cearamirinense mais famosa brasil edecasa USERNAME linda,cearamirinense mais famosa brasil edecasa USERNAME linda


In [35]:
tokenized_tweet = df_new['tokenized_tweet'].apply(lambda x: x.split()) # tokenizing
tokenized_tweet.head()

0    [coisa, linda, programa, encontro, estava, mostrando, familia, adotou, adolescente, anos, amor]
1                                                      [mais, encontro, irmãs, galvão, adorei, elas]
2            [catra, USERNAME, lançando, nova, música, chora, USERNAME, thenoite, mrcatra, ppkchora]
3                                                       [cibele, arrasou, humilhou, videoshowaovivo]
4                                   [cearamirinense, mais, famosa, brasil, edecasa, USERNAME, linda]
Name: tokenized_tweet, dtype: object

# Treinando vetores word2vec

In [37]:
model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            vector_size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2, # Ignores all words with total frequency lower than 2.
            #sg = 1, # 1 for skip-gram model
            #hs = 0,
            #negative = 10, # for negative sampling
            workers= 32, # no.of cores
            seed = 34
)

model_w2v.train(tokenized_tweet, total_examples= len(df_new['tokenized_tweet']), epochs=20)

(795065, 1106640)

In [38]:
model_w2v.wv.most_similar(positive="amor")

[('deus', 0.8877990245819092),
 ('menos', 0.8791422843933105),
 ('lindooooooooo', 0.8093745708465576),
 ('pelo', 0.8069652318954468),
 ('desfilando', 0.8059484362602234),
 ('apaixonada', 0.8036425113677979),
 ('abençoe', 0.801310658454895),
 ('previa', 0.7879616022109985),
 ('encanto', 0.7858639359474182),
 ('felicidades', 0.7791611552238464)]

In [39]:
model_w2v.wv.most_similar(positive="ódio")

[('perspectiva', 0.9949758648872375),
 ('certa', 0.9944947957992554),
 ('peixes', 0.9938503503799438),
 ('outras', 0.9936400651931763),
 ('pega', 0.9936354160308838),
 ('lulaeuconfio', 0.9936087131500244),
 ('baixa', 0.9936063289642334),
 ('comprar', 0.9935808777809143),
 ('geladinho', 0.9933967590332031),
 ('corre', 0.9931867122650146)]

In [40]:
model_w2v.wv.get_vector('amor')

array([-0.07959905, -0.7473567 , -0.1133026 , -0.6843543 , -0.04944119,
        0.08701448, -0.6818579 , -0.33894506,  0.42806485, -0.17309412,
        0.42322665,  0.02096842,  0.21752028,  0.436957  ,  0.03955846,
        0.19776239,  0.28866455, -0.4251821 , -0.6558236 , -0.05020785,
       -0.52346236,  0.18229143,  0.17406195,  0.59931123, -0.3296758 ,
       -0.17533989, -1.1450188 , -0.5059546 ,  0.35460034, -0.7915325 ,
       -0.13855655, -0.7099911 ,  0.00137176,  0.43151554, -0.2114024 ,
       -0.3921081 ,  0.194141  , -0.3318932 , -0.37627128,  0.1319061 ,
       -0.73377585, -0.13796327, -0.04486416, -0.434089  , -0.5325656 ,
       -0.41329348,  0.880198  , -0.22801626, -1.3256093 , -0.18249083,
        0.37824255, -0.03803816, -0.5831077 ,  0.2543889 , -0.10865976,
        0.07385721, -0.3212606 , -0.31643915,  0.7924858 ,  0.12494197,
       -0.1105767 , -0.30371794,  0.23742802, -0.42363182,  0.34272537,
        0.31027588,  0.91599995,  0.07548937, -0.6989698 , -0.48

# Recuperando vetores das palavras

In [41]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += model_w2v.wv.get_vector(word).reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

In [42]:
len(tokenized_tweet)

7769

In [43]:
wordvec_arrays = np.zeros((len(tokenized_tweet), 200))
#tokenized_tweet = df_final['tokenized_tweet']
for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(tokenized_tweet[i], 200)
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(7769, 200)

In [44]:
labels = df['class']

In [None]:
import sklearn

In [None]:
#iDividindo aleatoriamente em treinamento e teste

In [None]:

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(wordvec_df, labels, test_size=0.2, random_state=42, shuffle=True)

In [None]:
X_train



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
235,0.032690,-0.412711,0.134898,-0.022031,-0.236021,0.033374,-0.236985,-0.114925,0.163744,0.023553,...,0.118782,-0.171647,-0.011126,-0.206461,0.234685,0.116466,-0.179183,-0.154362,0.226066,0.152767
9,0.022089,-0.344124,0.111946,-0.020244,-0.205390,0.027785,-0.200417,-0.090436,0.139959,0.021090,...,0.090611,-0.135877,-0.016131,-0.179710,0.191682,0.102967,-0.149805,-0.126419,0.187249,0.123639
227,0.026141,-0.386151,0.128481,-0.024439,-0.232111,0.028857,-0.227433,-0.098350,0.153813,0.021678,...,0.105152,-0.153431,-0.020034,-0.196213,0.216376,0.111033,-0.164336,-0.144748,0.208971,0.136222
353,0.025002,-0.390405,0.127558,-0.022677,-0.232416,0.026794,-0.226453,-0.102550,0.151881,0.020620,...,0.104722,-0.156680,-0.017072,-0.198039,0.217525,0.114431,-0.164674,-0.143079,0.210843,0.137938
429,0.037166,-0.444359,0.149015,-0.026213,-0.256706,0.035297,-0.258749,-0.117877,0.176622,0.026713,...,0.125193,-0.185749,-0.018625,-0.225016,0.249459,0.118329,-0.192166,-0.170086,0.243788,0.162244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.021430,-0.281239,0.093138,-0.013142,-0.162396,0.023937,-0.160836,-0.076405,0.109907,0.012673,...,0.076818,-0.115074,-0.008476,-0.141436,0.158288,0.080351,-0.123330,-0.105948,0.151489,0.098833
106,0.032126,-0.442325,0.144643,-0.026617,-0.259212,0.034839,-0.256662,-0.117279,0.175113,0.021635,...,0.120160,-0.178225,-0.019144,-0.225370,0.246016,0.126309,-0.187619,-0.162691,0.240386,0.157636
270,0.028692,-0.409473,0.135419,-0.023556,-0.245394,0.031695,-0.238504,-0.111797,0.162221,0.019856,...,0.114663,-0.165574,-0.015139,-0.209564,0.234292,0.118390,-0.176655,-0.156976,0.229299,0.150291
435,0.029272,-0.408487,0.136038,-0.024007,-0.238859,0.029613,-0.236142,-0.105435,0.165341,0.017651,...,0.111799,-0.166930,-0.019931,-0.206468,0.226475,0.117260,-0.170820,-0.149911,0.223183,0.143979


In [None]:
y_train

Unnamed: 0,class
7495,0
5842,0
2570,1
3937,1
6214,0
...,...
5226,0
5390,0
860,1
7603,0


In [None]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
1421,-0.070567,-0.340661,0.028878,-0.286708,-0.222296,-0.275200,-0.070390,-0.462738,0.147165,0.284257,...,0.584039,-0.337444,-0.263881,-0.246271,0.364063,0.353290,-0.657011,-0.336045,0.276815,0.626865
2746,-0.249204,-0.237717,0.096210,0.098576,-0.236199,0.140721,-0.355866,-0.249361,-0.066407,-0.126227,...,-0.097418,-0.253842,0.076759,-0.301491,0.098828,-0.197372,-0.027038,-0.200215,0.121516,0.057116
3473,0.182812,-0.280493,0.120618,-0.055462,-0.059121,-0.060184,-0.194226,-0.264890,0.273775,-0.142281,...,0.492742,-0.204972,-0.043803,-0.154977,0.406465,0.307599,-0.402404,-0.230894,0.261513,0.221199
6689,0.103522,-0.603885,0.118117,0.037311,-0.314256,-0.088050,-0.307257,-0.216213,0.225006,0.186425,...,0.104693,-0.235265,-0.142711,-0.058890,-0.065937,-0.081636,-0.253248,-0.237033,0.164390,0.096638
3000,0.373693,-0.503952,0.202077,-0.241752,-0.149916,0.005993,-0.198392,-0.264619,0.325170,-0.295480,...,0.562604,-0.351902,-0.058710,-0.195954,0.458798,0.444327,-0.245973,-0.100718,0.404251,0.161327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4745,0.135066,-0.479942,0.155575,-0.075956,-0.165609,0.005889,-0.253148,-0.236047,0.109793,-0.112912,...,0.290698,-0.240006,0.050195,-0.183657,0.162508,0.188020,-0.189921,-0.125391,0.298320,0.162955
5537,-0.030681,-0.260632,0.094580,0.035138,-0.160559,-0.001635,-0.207831,-0.200649,0.069796,0.010634,...,0.128361,-0.134733,0.012106,-0.172925,0.192811,0.013596,-0.248075,-0.230543,0.176052,0.108728
904,0.156455,-0.285065,0.149937,-0.125860,-0.060583,-0.059702,-0.178306,-0.213009,0.254935,-0.117608,...,0.493764,-0.124572,0.014265,-0.107893,0.351134,0.248873,-0.322458,-0.199423,0.270623,0.131041
1446,0.482751,-0.641929,0.285520,-0.323617,-0.135831,0.009637,-0.213136,-0.277783,0.385623,-0.340971,...,0.663760,-0.441834,-0.061860,-0.257147,0.541184,0.544731,-0.268254,-0.102747,0.522683,0.141113


# treinando um classificador logistic regression

In [None]:

lreg = sklearn.linear_model.LogisticRegression(solver='lbfgs')



In [None]:
lreg.fit(X_train, y_train)

prediction = lreg.predict_proba(X_test)

prediction_int = prediction[:,1] >= 0.3


sklearn.metrics.f1_score(y_test, prediction_int)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7878532264867145

In [None]:
prediction

array([[0.17993914, 0.82006086],
       [0.69064149, 0.30935851],
       [0.29489066, 0.70510934],
       ...,
       [0.32341961, 0.67658039],
       [0.01605539, 0.98394461],
       [0.49021957, 0.50978043]])

In [None]:

sklearn.metrics.confusion_matrix(y_test, prediction_int)

array([[117, 473],
       [ 30, 934]])

In [None]:
# Treinando um classificador MLP

In [None]:
mlp = sklearn.neural_network.MLPClassifier(solver='lbfgs', alpha=1e-5,\
                    hidden_layer_sizes=(100, 2), random_state=1)
mlp.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
mlp_prediction = mlp.predict(X_test)
mlp_prediction

array([1, 0, 1, ..., 1, 1, 0])

In [None]:
prediction = mlp.predict_proba(X_test)

prediction_int = prediction[:,1] >= 0.3


sklearn.metrics.f1_score(y_test, prediction_int)

0.8014028934677774

In [None]:

sklearn.metrics.confusion_matrix(y_test, prediction_int)

array([[187, 403],
       [ 50, 914]])