# Publico scraper

In [None]:
from pathlib import Path
import sys

# Put the parent of the current working directory first on the import path.
# NOTE(review): presumably this is the repository root, so that the `src`,
# `resources` and `scripts` imports in later cells resolve -- confirm.
parent = Path.cwd().parents[0].as_posix()
sys.path.insert(0, parent)


In [None]:
import numpy as np
import pandas as pd
import spacy

from nlpiper.core import Compose
from nlpiper.transformers import cleaners
from nlpiper.core import Document

from src.cleaners import TextCleaner
from resources.stopwords import WORDS
import tensorflow as tf

In [None]:
# Load the gzip-compressed scraper CSV and preview its first 10 rows.
d_ = pd.read_csv('../data/raw/publico_scraper.csv.gz', compression='gzip')
d_.head(10)

In [None]:
# (rows, columns) before any cleaning.
d_.shape

In [None]:
# Drop, in place, every row that has at least one missing value, then
# re-check the shape to see how many rows were lost.
d_.dropna(inplace=True)
d_.shape

In [None]:
# Number of rows per main_tag value.
d_.main_tag.value_counts()

In [None]:
# Number of rows per city value.
d_.city.value_counts()

In [None]:
d_.shape

In [None]:
# Ten most frequent tag values; the count column is named N.
d_.tag.value_counts().reset_index(name='N').head(10)

In [None]:
# Row count for every (main_tag, tag) pair; the count column is named N.
# NOTE: `df` is only a scratch frame here -- a later cell rebinds it to the
# categorised data set.
df = d_.groupby(['main_tag', 'tag']).size().reset_index(name='N')

In [None]:
# The 50 most frequent tags inside the 'edicoes' main_tag.
df[df['main_tag'] == 'edicoes'].sort_values(by=['N'], ascending=False).head(50)

In [None]:
# Values of the `main_tag` column that are kept for the classifier.
tags = [
    'sociedade', 'local', 'fugas', 'politica', 'desporto', 'p3',
    'culturaipsilon', 'economia', 'ciencia', 'opiniao', 'tecnologia',
    'ecosfera',
]

# main_tag values that are not used as a category directly: their rows are
# categorised from the finer-grained `tag` column through `sub_tags`.
tags_to_split = ['sociedade', 'local', 'p3']

# main_tag -> category name, for every kept main_tag that is not split.
tags_rename = {
    'fugas': 'Turismo/Lazer',
    'desporto': 'Desporto',
    'politica': 'Politica',
    'economia': 'Economia',
    'ciencia': 'Ciencia',
    'culturaipsilon': 'Cultura',
    'tecnologia': 'Tecnologia',
    'ecosfera': 'Ambiente',
    'opiniao': 'Opiniao',
}

# Category name -> `tag` values that belong to it.  `assign_category` scans
# the categories in this order and returns the first one that contains the
# tag, so the order of the entries is significant.
sub_tags = {
    'Saude': [
        'SAÚDE', 'CORONAVÍRUS', 'COVID-19', 'HOSPITAIS',
        'SERVIÇO NACIONAL DE SAÚDE', 'NATALIDADE', 'ASAE', 'INEM',
    ],
    'Ambiente': [
        'METEOROLOGIA', 'MAU TEMPO', 'CLIMA', 'IPMA', 'AMBIENTE', 'ÁGUA',
        'SUSTENTABILIDADE', 'FLORESTAS',
    ],
    'Incendios': ['INCÊNDIOS', 'INCÊNDIO', 'INCÊNDIOS FLORESTAIS'],
    'Forças-Segurança': [
        'GNR', 'POLÍCIA JUDICIÁRIA', 'PROTECÇÃO CIVIL', 'PSP',
        'SERVIÇO DE ESTRANGEIROS E FRONTEIRAS', 'BOMBEIROS',
    ],
    'Educacao': ['ENSINO SUPERIOR', 'EDUCAÇÃO'],
    'Justica': ['JUSTIÇA', 'MINISTÉRIO PÚBLICO'],
    'Religiao': ['IGREJA CATÓLICA', 'RELIGIÃO'],
    'Crime': [
        'CRIME', 'VIOLÊNCIA DOMÉSTICA', 'PRISÕES',
        'TRÁFICO DE SERES HUMANOS', 'TRÁFICO DE DROGA',
    ],
    'Acidentes': ['ACIDENTES', 'SEGURANÇA RODOVIÁRIA', 'ESTRADAS'],
    # 'MOBILIDADE' used to be listed twice here; the duplicate is removed.
    'Transportes': [
        'TRANSPORTES', 'COMBOIOS', 'MOBILIDADE', 'BICICLETAS', 'CP',
        'AVIAÇÃO',
    ],
    'Local': [
        'LISBOA', 'COIMBRA', 'AVEIRO', 'SETÚBAL', 'VIANA DO CASTELO', 'PORTO',
        'BRAGANÇA', 'BRAGA', 'BEJA', 'VISEU', 'ÉVORA',
        'CÂMARA DE VISEU', 'CÂMARA DE LISBOA', 'VILA REAL', 'ALGARVE',
        'LEIRIA', 'CASTELO BRANCO', 'FARO', 'SANTARÉM', 'ALENTEJO',
        'CÂMARA DO PORTO', 'PORTALEGRE', 'CÂMARA DE COIMBRA',
        'CÂMARA DE BRAGA', 'GUARDA', 'AUTARQUIAS',
    ],
    'Habitacao': ['HABITAÇÃO', 'PATRIMÓNIO'],
    'Cultura': [
        'ARTES', 'MÚSICA', 'CULTURA', 'FESTIVAL', 'EVENTO', 'ARTE URBANA',
        'STREET ART', 'TEATRO', 'MUSEUS',
    ],
    'Tecnologia': ['TECNOLOGIA'],
    'Opiniao': ['OPINIÃO', 'REPORTAGEM'],
    'Turismo/Lazer': ['TURISMO'],
    'Sociedade': [
        'CRIANÇAS', 'SOLIDARIEDADE', 'CIGANOS', 'SEGURANÇA SOCIAL', 'IDOSOS',
        'ANIMAIS',
    ],
}

In [None]:
data_ = d_[d_['main_tag'].isin(tags)]
data_.shape

In [None]:
data_to_split = data_[data_['main_tag'].isin(tags_to_split)]
print(data_to_split.shape)
data_not_to_split = data_[~data_['main_tag'].isin(tags_to_split)]
print(data_not_to_split.shape)

In [None]:
def assign_category(val, sub_tags):
    """Return the category whose tag collection contains ``val``.

    ``sub_tags`` maps a category name to a collection of tag values.  The
    categories are scanned in the mapping's iteration order and the first
    one whose collection contains ``val`` is returned; ``'Outros'`` is
    returned when none of them does.
    """
    matches = (category for category, members in sub_tags.items() if val in members)
    return next(matches, 'Outros')
    

In [None]:
data_to_split['category'] = data_to_split['tag'].apply(lambda val: assign_category(val, sub_tags))

In [None]:
data_to_split['category'].value_counts()

In [None]:
# Apply Renaming
data_not_to_split['category'] = data_not_to_split['main_tag'].apply(lambda val: tags_rename[val])

In [None]:
data_not_to_split['category'].value_counts()

In [None]:
df = pd.concat([data_not_to_split, data_to_split], ignore_index=True)

In [None]:
df.category.value_counts()

In [None]:
from scripts.data_prep import process_stop_words, apply_cleaning

In [None]:
# spaCy pipeline `pt_core_news_lg`, handed to the TextCleaner below.
# NOTE: later cells rebind the name `model` to other objects.
model = spacy.load("pt_core_news_lg")

# nlpiper cleaners combined into a single Compose pipeline.
cleaning_steps = [
    cleaners.CleanURL(),
    cleaners.CleanEOF(),
    cleaners.CleanMarkup(),
    cleaners.CleanAccents(),
    cleaners.CleanNumber(),
]
pipeline = Compose(cleaning_steps)

# The stop words are derived with the same pipeline before being given to
# the text cleaner.
stop_words = process_stop_words(pipeline)
tc = TextCleaner(model=model, stop_words=stop_words)


In [None]:
# from joblib import Parallel, delayed 
# docs = Parallel(n_jobs=2)(delayed(apply_cleaning)(doc, pipeline, tc) for doc in df['title'].to_list()) 

In [None]:
df = df[~df['category'].isin(['Opiniao', 'Outros'])]

In [None]:
df.category.value_counts()

In [None]:
from tqdm import tqdm
data_p = pd.DataFrame()
data_p['body'] = [apply_cleaning(doc, pipeline, tc) for doc in tqdm(df['body'].to_list())]
data_p['title'] = [apply_cleaning(doc, pipeline, tc) for doc in tqdm(df['title'].to_list())]
data_p.to_csv('../data/raw/publico_data_processed.csv.gz', compression='gzip', index=False)

In [None]:
# Reload the cleaned texts from the cache file written by the cleaning cell.
data_p = pd.read_csv('../data/raw/publico_data_processed.csv.gz', compression='gzip')

In [None]:
# Features are the cleaned bodies; labels come from the in-memory `df`.
# NOTE(review): the cache holds only `body` and `title`, not the category,
# so X and Y line up only while `df` is exactly the frame the cache was
# built from -- confirm before reusing an older cache file.
X = np.array(data_p['body'].to_list())
Y = np.array(df['category'].tolist())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedShuffleSplit


In [None]:
# Baseline: TF-IDF features + logistic regression, scored on two stratified
# shuffle splits that each hold out 20% of the data.
score = []
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # The vectorizer is fitted on the training part only and then reused
    # unchanged on the held-out part.
    vectorizer = TfidfVectorizer()
    X_train_ = vectorizer.fit_transform(X_train)
    X_test_ = vectorizer.transform(X_test)

    print('Training classifier')
    clf = linear_model.LogisticRegression()
    clf.fit(X_train_, y_train)

    print('Testing classifier')
    y_pred = clf.predict(X_test_)
    print(classification_report(y_test, y_pred))
    # One macro-F1 scalar per split, the same metric the other model cells
    # collect (this cell used to append the per-class array instead).
    score.append(f1_score(y_test, y_pred, average='macro'))

# Mean macro F1 over the splits.
print(np.mean(score))

In [None]:
# TF-IDF features + SGD-trained perceptron with balanced class weights,
# scored with macro F1 on two stratified 80/20 shuffle splits.
score = []
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(X, Y):
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]

    vectorizer = TfidfVectorizer()
    X_train_ = vectorizer.fit_transform(X_train)
    X_test_ = vectorizer.transform(X_test)

    print('Training classifier')
    svc = linear_model.SGDClassifier(loss='perceptron', class_weight='balanced')
    svc = svc.fit(X_train_, y_train)

    print('Testing classifier')
    y_pred = svc.predict(X_test_)
    print(classification_report(y_test, y_pred))
    split_f1 = f1_score(y_test, y_pred, average='macro')
    score.append(split_f1)

# Mean macro F1 over the splits.
print(np.mean(score))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# TF-IDF features + extremely randomised trees (100 estimators), scored
# with macro F1 on two stratified 80/20 shuffle splits.
# NOTE: X_train / y_train / X_test / y_test of the last split stay defined
# after the loop; the fastText and RNN cells further down reuse them.
score = []
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(X, Y):
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]

    vectorizer = TfidfVectorizer()
    X_train_ = vectorizer.fit_transform(X_train)
    X_test_ = vectorizer.transform(X_test)

    print('Training classifier')
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    clf = clf.fit(X_train_, y_train)

    print('Testing classifier')
    y_pred = clf.predict(X_test_)
    print(classification_report(y_test, y_pred))
    split_f1 = f1_score(y_test, y_pred, average='macro')
    score.append(split_f1)

# Mean macro F1 over the splits.
print(np.mean(score))

In [None]:
# Build the fastText frames from the train/test arrays left over by the last
# cross-validation split.  Every label gets the `__label__` prefix.
# NOTE: the text column is called 'title' but holds whatever X contains
# (the cleaned bodies, as X is built from data_p['body']).
df_train = pd.DataFrame({
    'category': [f'__label__{val}' for val in y_train],
    'title': X_train,
})

# One "<label> <text>" line per training row.
np.savetxt('categories-train.txt', df_train.values, fmt="%s")

print(df_train.shape)

df_test = pd.DataFrame({
    'category': [f'__label__{val}' for val in y_test],
    'title': X_test,
})

print(df_test.shape)

In [None]:
import fasttext
# Train a supervised fastText classifier on the file written by the previous
# cell, then keep the first predicted label for every test text.
# NOTE: this rebinds `model`, which earlier held the spaCy pipeline.
model = fasttext.train_supervised(
    input="categories-train.txt",
    lr=1,
    epoch=100,
    wordNgrams=5,
)


def _top_label(text):
    """Return the first label fastText predicts for ``text``."""
    labels, _probabilities = model.predict(text)
    return labels[0]


df_test['Pred'] = df_test['title'].apply(_top_label)
print(classification_report(df_test['category'], df_test['Pred']))

## Neural Net (RNN)

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing

# Encode the string labels of the training split as integer class ids.
le = preprocessing.LabelEncoder()
le.fit(y_train)

n_classes = len(np.unique(y_train))

# (text, one-hot label) pairs; X_train / X_test / y_train / y_test are the
# arrays left over by the last cross-validation split in the cells above.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, to_categorical(le.transform(y_train), num_classes=n_classes)))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, to_categorical(le.transform(y_test), num_classes=n_classes)))

BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 100

# Only the training data is shuffled; the test data keeps its order, which
# lets the predictions be compared against y_test later on.
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)


# Text -> token-id layer, with its vocabulary (capped at VOCAB_SIZE) learned
# from the training texts only.
VOCAB_SIZE = 100000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))


# Raw text in, class probabilities out: vectorize -> embed -> bidirectional
# LSTM -> dense ReLU -> softmax over the classes.
# NOTE: this rebinds `model` once more, now to the Keras network.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=100,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_classes, activation='softmax')
])
# Stop once val_loss has not improved for 3 epochs and restore the weights
# of the best epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    patience=3, 
    restore_best_weights=True, 
    mode='min'
)

# Hyperparameters
epochs = 10
learning_rate = 0.001
# NOTE: decay_rate is computed but currently unused -- the `decay` argument
# of the optimizer below is commented out.
decay_rate = learning_rate / epochs

opt = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    #decay=decay_rate 
    )

# Categorical cross-entropy matches the one-hot labels built above.
model.compile(
    optimizer=opt, 
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.metrics.Precision()]
)

In [None]:

# Train for at most `epochs` epochs with early stopping.
# NOTE(review): the test split doubles as validation data, so early stopping
# and best-weight restoration are tuned on the same data that the final
# report is computed on -- consider a separate validation split.
model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=epochs,
    # Keras documents `callbacks` as a list of Callback instances.
    callbacks=[callback],
)


In [None]:
model.evaluate(test_dataset)

In [None]:
preds = model.predict(test_dataset)
preds = [le.inverse_transform([np.argmax(pred)])[0] for pred in preds]

In [None]:
print(classification_report(y_test, preds))

## Sentiment


In [None]:
model = fasttext.load_model('../models/trained/fasttext-sentiment.bin')


In [None]:
data_p

In [None]:
data_p['sentiment'] = data['']

In [None]:
model.predict(data_p['title'].iloc[0])[1]

In [None]:
# Predict the sentiment of every cleaned title.  Titles that fastText
# cannot score are skipped; `indexes` records the positions that succeeded
# so the predictions can be matched back to rows of `data_p`.
sent = []
indexes = []
skipped = 0
for i, val in enumerate(data_p['title'].to_list()):
    try:
        prediction = model.predict(val)
    except Exception:
        # Best effort, as before (presumably for missing / non-string
        # titles), but unlike a bare `except:` this no longer swallows
        # KeyboardInterrupt or SystemExit.
        skipped += 1
        continue
    sent.append(prediction)
    indexes.append(i)

# Make the number of silently dropped titles visible.
print(f'Skipped {skipped} of {len(indexes) + skipped} titles')

In [None]:
data_p.iloc[indexes]

In [None]:
results = data_p.iloc[indexes]
results['sentiment'] = [val[0][0] for val in sent]
results['score'] = [val[1][0] for val in sent]

In [None]:
results[['title', 'sentiment']].head(50)

In [None]:
results.score.hist()

In [None]:
results[(results.sentiment == '__label__Negative') & (results.score > 0.80)].head(50)

In [None]:
# Naive aspect extraction on the last row of `d_`: for every '.'-separated
# chunk of its body, pair the last noun subject with the last adjective
# (prefixed by the adjective's adverb children) and keep the chunk when
# both were found.
aspects = []
nlp = spacy.load("pt_core_news_lg")

for sentence in d_.iloc[-1].body.split('.'):
    doc = nlp(sentence)
    target = ''
    descriptive_term = ''
    for token in doc:
        if token.dep_ == 'nsubj' and token.pos_ == 'NOUN':
            target = token.text
        if token.pos_ == 'ADJ':
            prepend = ''.join(
                child.text + ' ' for child in token.children if child.pos_ == 'ADV'
            )
            descriptive_term = prepend + token.text
    if target and descriptive_term:
        aspects.append({
            'aspect': target,
            'description': descriptive_term,
            'sentence': sentence,
        })
print(aspects)

In [None]:
for sentence in d_.iloc[-1].body.split('.'):
  doc = nlp(sentence)
  for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
      token.pos_,[child for child in token.children])

In [None]:
for sentence in d_.iloc[-3].body.split('.'):
  doc = nlp(sentence)
  descriptive_term = ''
  for token in doc:
    if token.pos_ == 'ADJ':
      descriptive_term = token
  #print(sentence)
  print(descriptive_term)

In [None]:
for sentence in d_.iloc[0].body.split('.'):
  doc = nlp(sentence)
  descriptive_term = ''
  for token in doc:
    if token.pos_ == 'ADJ':
      prepend = ''
      for child in token.children:
        if child.pos_ != 'ADV':
          continue
        prepend += child.text + ' '
      descriptive_term = prepend + token.text
  print(sentence)
  print(descriptive_term)

In [None]:
d_.iloc[0].body.split('.')