In [1]:
import re
import spacy
import tweepy
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import plotly.express as px
from spacy.tokens import Doc
from spacy.tokens import DocBin
import plotly.graph_objects as go
from spacy.training import Example
from datetime import datetime, timedelta
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
from spacytextblob.spacytextblob import SpacyTextBlob
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report

Carregar base de tweets.

In [2]:
data_df = pd.read_csv('tweets_ner.csv')

Pré-processamento: Removendo espaços em branco seguidos, tabs, quebras de linhas, citações (@) e links.

In [3]:
def clean_text(text):
    if pd.isnull(text):
        return None
    return re.sub('( +)', " ", re.sub(r"(ICYMI)|( +)|([\t\n])|(@[a-zA-Z0-9_\-+\.:]+)|(http[a-zA-Z0-9/\-:&\.]+)", " ", text)).strip()

In [4]:
data_df['tweets'] = data_df['tweets'].apply(clean_text)

Carregando pipeline com reconhecimento de sentimento pré-treinado (disponível [neste link](https://spacy.io/universe/project/spacy-textblob)).

In [5]:
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x7f48d163cb70>

In [6]:
tqdm.pandas()

Classificando os tweets de acordo com o sentimento: -1: sentimento negativo; 0: sentimento neutro; 1: sentimento positivo.

In [7]:
data_df['sentiment'] = data_df['tweets'].progress_apply(lambda x: nlp(x)._.polarity)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=788.0), HTML(value='')))




In [8]:
data_df

Unnamed: 0,tweets,timestamps,sentiment
0,Fortnite is getting another PlayStation charac...,2021-04-14 03:01:00,-0.071429
1,The series premiere of HBO's The Nevers is a m...,2021-04-14 02:31:01,0.750000
2,Horizon Zero Dawn's Aloy is coming to Fornite ...,2021-04-14 02:01:01,-0.400000
3,"A new Persona mobile game is being developed, ...",2021-04-14 01:46:01,0.056061
4,Nintendo announced a 20-minute Indie World Sho...,2021-04-14 01:31:01,0.000000
...,...,...,...
783,We won't tell your boss,2021-04-04 21:31:44,0.000000
784,Scalpers reckon you'll pay $2600 for Super Mar...,2021-04-04 20:30:07,0.333333
785,"There's some ""really cool"" Titanfall content c...",2021-04-04 20:15:01,0.350000
786,Lead a perfect life with our full guide to The...,2021-04-04 19:59:35,0.675000


Identificando as organizações e classificando os sentimentos, além de atribuir os sentimentos às organizações.

In [9]:
tweets = []
sentiment = []
orgs = []

In [10]:
for row in data_df.itertuples():
    doc = nlp(row.tweets)
    entities = doc.ents
    for org in entities:
        if org.label_ == 'ORG':
            tweets.append(row.tweets)
            sentiment.append(row.sentiment)
            orgs.append(str(org[0]))

In [12]:
data_df = pd.DataFrame({'sentiment': sentiment, 'orgs': orgs})

Cada ocorrência de cada organização está listada abaixo. Como algumas organizações aparecem mais de uma vez, existe mais de uma ocorrência destas.

In [13]:
data_df

Unnamed: 0,sentiment,orgs
0,-0.071429,Fortnite
1,0.750000,HBO
2,0.000000,Nintendo
3,0.000000,Indie
4,0.141667,SlotsWise
...,...,...
441,0.400000,PS
442,0.400000,Marvel
443,0.136364,Mortal
444,0.500000,PSVR


Contando as ocorrências de cada organização. Esta contagem definirá a ordem da exibição de cada organização no gráfico de barras no final deste notebook.

In [14]:
orgs_counter = data_df.groupby('orgs').count().reset_index()

In [15]:
orgs_counter

Unnamed: 0,orgs,sentiment
0,AI,2
1,AMD,1
2,ARPG,1
3,AUG,1
4,Activision,2
...,...,...
200,non,1
201,o⁠n,1
202,the,12
203,🏀,1


Corrigindo o nome da coluna de contagem.

In [16]:
orgs_counter.columns = ['orgs', 'counter']

In [17]:
orgs_counter

Unnamed: 0,orgs,counter
0,AI,2
1,AMD,1
2,ARPG,1
3,AUG,1
4,Activision,2
...,...,...
200,non,1
201,o⁠n,1
202,the,12
203,🏀,1


Separando os sentimentos negativos dos positivos.

In [18]:
pos_sentiments = data_df[data_df['sentiment'] >= 0]
neg_sentiments = data_df[data_df['sentiment'] < 0]

Somando os scores dos sentimentos positivos e negativos.

In [19]:
grouped_pos_sentiments = pd.DataFrame(pos_sentiments.groupby('orgs')['sentiment'].sum()).reset_index()
grouped_neg_sentiments = pd.DataFrame(neg_sentiments.groupby('orgs')['sentiment'].sum()).reset_index()

In [20]:
grouped_pos_sentiments

Unnamed: 0,orgs,sentiment
0,AI,0.000000
1,AMD,0.400000
2,ARPG,0.500000
3,Activision,0.000000
4,Age,0.142857
...,...,...
168,non,0.000000
169,o⁠n,0.100000
170,the,1.824975
171,🏀,0.362121


In [21]:
grouped_neg_sentiments

Unnamed: 0,orgs,sentiment
0,AUG,-0.05
1,Apex,-0.146296
2,Arclight,-0.125
3,Avengers,-0.025
4,BioShock,-0.153788
5,Black,-0.333333
6,COVID,-1.4
7,CallOfDutyModernWarfare,-0.3
8,Capcom,-0.896939
9,Core,-0.031818


Ordenando os contadores pela contagem de ocorrências.

In [22]:
orgs_counter = orgs_counter.sort_values('counter')

In [23]:
orgs_counter

Unnamed: 0,orgs,counter
204,🚨,1
104,Melrose,1
103,Media,1
203,🏀,1
162,Speed,1
...,...,...
56,Falcon,12
102,Marvel,12
121,Nintendo,15
116,Netflix,17


Criando os vetores ordenados de sentimentos negativos e positivos. A ordenação é feita de acordo com a contagem aterior.

In [24]:
sorted_pos_sentiments = []
sorted_neg_sentiments = []
for org in orgs_counter['orgs']:
    if org in grouped_pos_sentiments['orgs'].values:
        sorted_pos_sentiments.append(grouped_pos_sentiments[grouped_pos_sentiments['orgs'] == org]['sentiment'].values[0])
    else:
        sorted_pos_sentiments.append(0)
    if org in grouped_neg_sentiments['orgs'].values:
        sorted_neg_sentiments.append(grouped_neg_sentiments[grouped_neg_sentiments['orgs'] == org]['sentiment'].values[0])
    else:
        sorted_neg_sentiments.append(0)

Exibindo o gráfico de barras. A ordem de exibição é decrescente no número de ocorrências.

In [27]:
top = 50

fig = go.Figure()
fig.add_trace(go.Bar(y=orgs_counter['orgs'][-top:], x=sorted_pos_sentiments[-top:], orientation='h'))
fig.add_trace(go.Bar(y=orgs_counter['orgs'][-top:], x=sorted_neg_sentiments[-top:], orientation='h'))
fig.update_layout(barmode='relative', title_text='Relative Barmode')
fig.show()