# Projeto 3 - Análise de sentimento sobre o mercado financeiro

In [None]:
# instalação dos pacotes necessários na versão mais nova
%pip install -U scikit-learn pandas numpy seaborn spacy --quiet

#preparing spacy, hang on we're download over 400MB of data :)
%pip install "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl" --quiet


In [12]:
import numpy as np
import pandas as pd

import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline

#estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
import lightgbm as lgb


In [157]:
# # aqui a gente cala a boca do sklearn
# # importa o filtro
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning


# # monta o filtro
simplefilter("ignore", category=ConvergenceWarning)
simplefilter("ignore", category=UndefinedMetricWarning)

In [None]:
def multi_classifier_fit_score(cleaner, vectorizer, classifiers: dict, X, y) -> dict:
  """
  Fit and Test multiple classifiers at the same time
  Returns a dictonary of scores and a dictionary of all fitted pipelines accessable by the key of the classifiers' dict

  Arguments:
    cleaner: a pipeline-ready object to clean data
    vectorizer: a pipeline-ready object to vectorize the data
    classifiers: a dictionary of classifiers to fit and score {'estimator_name': estimator()}
    X: features that explain the target
    y: target
  """
  from sklearn.metrics import accuracy_score, precision_score, recall_score
  from sklearn.model_selection import train_test_split

  scores = {}
  models = {}

  X_train_mcfs, X_test_mcfs, y_train_mcfs, y_test_mcfs = train_test_split(X, y, test_size=0.3, random_state=10)

  for index, classifier in enumerate(classifiers):
    classifier_ready = classifiers[classifier]
  
    pipe_multi_classifier = Pipeline(
    steps=[('cleaner', cleaner),
          ('vectorizer', vectorizer),
          ('classifier', classifier_ready)]
    )

    pipe_multi_classifier.fit(X_train_mcfs, y_train_mcfs)
    
    y_pred_mcfs = pipe_multi_classifier.predict(X_test_mcfs)
    
    scores[classifier] = [
      (round(accuracy_score(y_test_mcfs, y_pred_mcfs), 3)), 
      (round(precision_score(y_test_mcfs, y_pred_mcfs, average='macro'), 3)), 
      (round(recall_score(y_test_mcfs, y_pred_mcfs, average='macro'), 3))
    ]

    models[classifier] = pipe_multi_classifier
  
  return scores, models

In [None]:
def best_model(scores: dict, metric: str = 'accuracy') -> list:
  """
  Checks a dictionary of estimators scores and return a list containing the best model
  
  Arguments:
    scores: {'estimator': [accuracy, precision, recall]}
    metric: accuracy | precision | recall
  
  Returns:
    list['estimator', 'score', 'metric']
  """
  metrics_dict = {
    'accuracy': 0,
    'precision': 1,
    'recall': 2
  }

  if metric not in metrics_dict.keys():
    metric = 'accuracy'
  
  position = metrics_dict[metric]
  
  melhor_modelo = []

  for name, score in scores.items():
    if len(melhor_modelo) == 0:
      melhor_modelo = [name, score[position]]
    
    if (score[position] >= melhor_modelo[1]):
      melhor_modelo = [name, score[position]]
  
  melhor_modelo.append(metric)
  return melhor_modelo

In [4]:
nlp = spacy.load('en_core_web_lg')

#### 1. Construa as funções e a pipeline, separe os dados em treino e teste, execute a pipeline para classificar em positivo, negativo e neutro. Quais foram os valores de acurácia, precisão e sensitividade deste modelo? (3.0 pontos)

In [16]:
# Esta funcao remove espacos em branco no inicio e
# no fim do texto e converte todo o texto em letras
# minusculas
def clean_text(texto):     
    return texto.strip().lower()

# Criamos uma classe para gerenciar X e y
class predictors(TransformerMixin): #não faz diferença essa herança do TransformerMixin
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Esta funcao remove todas as stopwords e pontuacoes
def tokenizer(texto):
    doc = nlp(texto)
    tokens = [token for token in doc if ((not token.is_stop) & (not token.is_punct))]
    tokens = [token.lemma_.lower().strip() for token in tokens]
    return tokens

# Criamos um objeto CountVectorizer para vetorizar cada
# texto
vectorizer = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,1), dtype=np.float64)

# Criamos um modelo SVM
classifier = lgb.LGBMClassifier(n_estimators=100)

In [17]:
pipe = Pipeline(
  steps=[('cleaner', predictors()),
         ('vectorizer', vectorizer),
         ('classifier', classifier)]
)

In [7]:
financial = pd.read_csv('data.csv')
financial.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [9]:
# Separando em X e y
X = financial.Sentence
y = financial.Sentiment

# Separando em teste e treino
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
pipe.fit(X_train, y_train)

In [19]:
y_pred = pipe.predict(X_test)

print(round(accuracy_score(y_test, y_pred), 3))
print(round(precision_score(y_test, y_pred, average='macro'), 3))
print(round(recall_score(y_test, y_pred, average='macro'), 3))

0.689
0.601
0.577


In [94]:
classifiers = {
              'LGBM': lgb.LGBMClassifier(n_estimators=100), 
              'SVC': SVC(), 
              'RandomForest': RandomForestClassifier(), 
              'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=100),
              'Logistic Regression': LogisticRegressionCV(cv=5, solver='sag', random_state=42, n_jobs=-1)
              }


In [None]:
scores, models = multi_classifier_fit_score(cleaner= predictors(), vectorizer= vectorizer, classifiers=classifiers, X = financial.Sentence, y = financial.Sentiment)

scores

In [None]:
melhor_modelo = best_model(scores, metric='precision')
melhor_modelo

['LGBM', 0.601, 'precision']

#### 2. Use o seu modelo para classificar os seguintes textos extraídos do site Financial Times. Faça uma tabela com o valor esperado e o valor obtido, e responda: houve divergência entre o esperado e o obtido? O que poderia ser feito para corrigir? (1.0 ponto)

In [100]:
new_data_dict = {
                  "Sentence": ["Central banks’ rate rises, geopolitical risk and slowing growth trigger investors’ stampede for safety.",
                          "China opens up bond market in bid to woo foreign investors.",
                          "HM Revenue & Customs says residents had £850bn in accounts overseas but it does not estimate if tax paid on this.",
                          "Japan’s horrifying crop of data falsification is also encouraging. The scandals have emerged from a distinct new phase in the evolution of the country’s shareholder capitalism.",
                          "Despite internal problems, the group continues to exert a tight grip on the US’s gun control debate."],
                  "Sentiment": ["negative", "neutral", "negative", "negative", "neutral"]        
                          }
                          
new_data = pd.DataFrame(new_data_dict)

new_data.head()

Unnamed: 0,Sentence,Sentiment
0,"Central banks’ rate rises, geopolitical risk a...",negative
1,China opens up bond market in bid to woo forei...,neutral
2,HM Revenue & Customs says residents had £850bn...,negative
3,Japan’s horrifying crop of data falsification ...,negative
4,"Despite internal problems, the group continues...",neutral


In [None]:
models[melhor_modelo[0]]

In [111]:
y_pred2 = models[melhor_modelo[0]].predict(new_data.Sentence)
print(round(accuracy_score(new_data.Sentiment, y_pred2), 3))
print(round(precision_score(new_data.Sentiment, y_pred2, average='macro'), 3))
print(round(recall_score(new_data.Sentiment, y_pred2, average='macro'), 3))

0.2
0.111
0.167


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [120]:
y_pred2 = models['Logistic Regression'].predict(new_data.Sentence)
print(round(accuracy_score(new_data.Sentiment, y_pred2), 3))
print(round(precision_score(new_data.Sentiment, y_pred2, average='macro'), 3))
print(round(recall_score(new_data.Sentiment, y_pred2, average='macro'), 3))

0.4
0.167
0.333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [123]:
sentences = new_data_dict['Sentence']
sentiments_or = new_data_dict['Sentiment']
sentiments_pred = y_pred2

values_dict = {"Sentence": sentences, "Original Sentiment": sentiments_or, "Predicted Sentiment": sentiments_pred}

values_df = pd.DataFrame(values_dict)

values_df['Divergence'] = values_df["Predicted Sentiment"] == values_df["Original Sentiment"]

values_df

Unnamed: 0,Sentence,Original Sentiment,Predicted Sentiment,Divergence
0,"Central banks’ rate rises, geopolitical risk a...",negative,positive,False
1,China opens up bond market in bid to woo forei...,neutral,neutral,True
2,HM Revenue & Customs says residents had £850bn...,negative,neutral,False
3,Japan’s horrifying crop of data falsification ...,negative,neutral,False
4,"Despite internal problems, the group continues...",neutral,neutral,True


In [125]:
values_df['Divergence'].value_counts()

False    3
True     2
Name: Divergence, dtype: int64

#### 3. Faça uma análise exploratória, onde identifique as três empresas mais citadas e quantifique os níveis de positividade, negatividade e neutralidade dos textos sobre estas empresas. (3.0 pontos)

##### a. Extraia de todos os textos as entidades, há quantas entidades? (0.6 pontos) 

##### b. Quantas entidades são empresas? (0.6 pontos)

##### c. Quais são as três empresas mais citadas? (0.6 pontos)

##### d. Faça uma tabela onde demonstre as três empresas mais citadas e o total de textos positivos, negativos e neutros de cada uma. (1.2 pontos)

#### 4. Faça gráficos a partir da tabela obtida acima. Descreva cada gráfico de forma que estivesse apresentando à diretoria dessas três empresas. (3.0 pontos)