In [None]:
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk import FreqDist
import numpy as np

# Data set and text column used by the rest of this cell.
base = tt_perfil
variavel = 'text'

### Building the per-candidate frequency tables


def _tabela_frequencias(frequencias, candidato):
    """Turn a token -> count mapping into a per-candidate frequency table.

    Args:
        frequencias: Mapping from token to its number of occurrences
            (here an ``nltk.FreqDist``).
        candidato: Label written to the ``candidato`` column of every row.

    Returns:
        DataFrame with the columns ``token``, ``freq``, ``candidato`` and
        ``total``; ``total`` repeats the sum of ``freq`` on every row.
    """
    tabela = pd.Series(dict(frequencias)).to_frame().reset_index()
    tabela['candidato'] = candidato
    # reset_index() leaves generic column names; give them their real ones.
    tabela.columns = ['token', 'freq', 'candidato']
    tabela['total'] = tabela['freq'].sum()
    return tabela


# Tokenize each candidate's rows (via the project helper `tokenizar`) and
# count how many times every token occurs.
base_tokensb = tokenizar(base[base['Apelido'] == 'Bolsonaro'], variavel)
frequencia = FreqDist(base_tokensb)
dfbolsonaro = _tabela_frequencias(frequencia, 'Bolsonaro')

base_tokensl = tokenizar(base[base['Apelido'] == 'Lula'], variavel)
# `frequencia` is rebound here, so from now on it holds Lula's counts.
frequencia = FreqDist(base_tokensl)
dflula = _tabela_frequencias(frequencia, 'Lula')

# Outer-join the two frequency tables on the token so that words used by
# only one of the candidates are kept as well.
dfmerge = pd.merge(dfbolsonaro, dflula, how='outer', on='token', suffixes=('_bolsonaro', '_lula'))

# The outer join leaves NaN in a candidate's columns on rows whose token
# that candidate never used: restore the corpus-wide totals on every row
# and treat a missing frequency as zero occurrences.
dfmerge['total_bolsonaro'] = dfbolsonaro['freq'].sum()
dfmerge['total_lula'] = dflula['freq'].sum()
dfmerge['freq_bolsonaro'] = dfmerge['freq_bolsonaro'].fillna(0)
dfmerge['freq_lula'] = dfmerge['freq_lula'].fillna(0)

# Add-one smoothed usage rate of each token: (count + 1) / (total + 1).
# The parentheses matter: without them `freq + 1 / total + 1` evaluates as
# `freq + (1 / total) + 1`, which compares raw counts and ignores how much
# text each candidate produced.
taxa_lula = (dfmerge['freq_lula'] + 1) / (dfmerge['total_lula'] + 1)
taxa_bolsonaro = (dfmerge['freq_bolsonaro'] + 1) / (dfmerge['total_bolsonaro'] + 1)

# Log of the Lula/Bolsonaro rate ratio: negative values lean towards
# Bolsonaro, positive values towards Lula. The column name 'logoodsratio'
# (sic) is kept because the code that follows reads it.
dfmerge['partlogoddsratio'] = taxa_lula / taxa_bolsonaro
dfmerge['logoodsratio'] = np.log(dfmerge['partlogoddsratio'])
dfmerge = dfmerge.sort_values(by='logoodsratio')

# Keep the 35 first and the 35 last rows of dfmerge, i.e. (with dfmerge
# sorted by ascending log odds ratio) the 35 lowest and 35 highest values:
# the most distinctive tokens of each candidate.
# NOTE: the variable names say "15" for historical reasons; 35 rows are taken.
dfmergetop15 = dfmerge[['token', 'logoodsratio']].head(35)
dfmergelast15 = dfmerge[['token', 'logoodsratio']].tail(35)

# Stack both extremes into a single table (up to 70 rows).
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
# and keeps the original row index just like append did.
dfoddsratio = pd.concat([dfmergetop15, dfmergelast15])

#### Drawing the chart

plt.rcParams["figure.dpi"] = 500
fig, ax = plt.subplots(figsize=(5, 12))

# White background for both the figure and the plotting area.
fig.patch.set_facecolor("#ffffff")
ax.set_facecolor("#ffffff")

# Horizontal range shown on the log odds ratio axis.
minimo = -7
maximo = 7

# Frame: every spine in the same grey; the bottom spine is drawn only
# across the x range.
for lado in ("left", "bottom", "right", "top"):
    ax.spines[lado].set_color(GREY75)
ax.spines["bottom"].set_bounds((minimo, maximo))

# Tick marks: short on the token axis, long on the value axis.
ax.tick_params(axis="y", length=3, color=GREY75)
ax.tick_params(axis="x", length=12, color=GREY91)

# Fixed axis limits; the y range is sized for 70 bars (positions 0..69).
ax.set_xlim(minimo, maximo)
ax.set_ylim(-0.5, 69.5)

# Light grid lines, pushed behind the bars with zorder=0.
for i in range(minimo, maximo):
    ax.axvline(i, color=GREY91, lw=0.9, zorder=0)
for i in range(1, 70):
    ax.axhline(i, color=GREY91, lw=0.9, zorder=0)

ax.set_xlabel('log da razão de chances', fontsize=11, fontname="Montserrat", color=GREY30)

# Palette, in legend order: the first colour marks negative values
# (Bolsonaro), the second one everything else (Lula).
legend_labels = ['Bolsonaro', 'Lula']
colors = [(0.24, 0.44, 0.88), (0.71, 0.12, 0.43)]
cor_negativa, cor_nao_negativa = colors
clrs = [cor_negativa if valor < 0 else cor_nao_negativa for valor in dfoddsratio['logoodsratio']]

# One horizontal bar per token.
ax.barh(dfoddsratio['token'], dfoddsratio['logoodsratio'], height=0.8, color=clrs)
plt.yticks(fontsize=10, fontname="Montserrat")
plt.xticks(fontsize=10, fontname="Montserrat")

# The legend uses plain coloured rectangles as proxies, because the bars
# were drawn in a single call and carry no per-candidate label.
handles = [plt.Rectangle((0, 0), 1, 1, color=cor) for cor in colors]
ax.legend(handles, legend_labels, loc='lower right')

plt.show()