# 03) Improve labels

A key problem with the headlines data is that it was collected by scrapping online news sources, keyword matching and then feeding the keyword matches to Google Gemini for labelling. Because the keyword matching proccess likely missed risk headlines, the default non-risk category probably contains a significant amount of mislabelled data. To tackle this problem, a regression model is trained on half of the data at a time to generate predictions for the other half's non-risk headlines. A slice of low probability headlines from each half is kept, eliminating many false negatives and also tackling the class imbalance problem (see notebook 1). 

## Read-in data

In [1]:
import pandas as pd
import numpy as np

# read-in data
df = pd.read_csv('../Data/original_headlines.csv', encoding='utf-8')

# include only spanish 
spanish_df = df[df.country.isin(['Argentina', 'Colombia', 'Mexico'])].reset_index(drop=True)
print(str(round(len(spanish_df)/1000, 1)) + 'K Spanish headlines')

# include only portuguese 
portuguese_df = df[df.country == 'Brazil'].reset_index(drop=True)
print(str(round(len(portuguese_df)/1000, 1)) + 'K Portuguese headlines')

61.7K Spanish headlines
12.7K Portuguese headlines


## Remove duplicates & thumbnails

Headlines containing the word thumbnail are normally videos which cannot be scraped. The presence of many of these with a similar format in our non-risk headlines data will add little variety and therefore value to our dataset so therefore they are removed.

In [2]:
# remove duplicates
spanish_df.drop_duplicates(subset='headline', inplace=True)
portuguese_df.drop_duplicates(subset='headline', inplace=True)

# remove thumbnails
spanish_df = spanish_df[~spanish_df['headline'].str.lower().str.contains('thumbnail', na=False)]
portuguese_df = portuguese_df[~portuguese_df['headline'].str.lower().str.contains('thumbnail', na=False)]

# reset index
spanish_df.reset_index(drop=True, inplace=True)
portuguese_df.reset_index(drop=True, inplace=True)

## Clean text

In [24]:
import string
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
spanish_stop_words = set(stopwords.words('Spanish'))
portuguese_stop_words = set(stopwords.words('Portuguese'))

# common text cleaning techniques
def clean_text(text, language):
    text = text.strip()
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + '¡¿'))

    if language=='Spanish':
        text = ' '.join([word for word in text.split() if word not in spanish_stop_words])
    elif language=='Portuguese':
        text = ' '.join([word for word in text.split() if word not in portuguese_stop_words])
    
    return text

spanish_df['headline'] = [clean_text(x, 'Spanish') for x in spanish_df['headline']]
portuguese_df['headline'] = [clean_text(x, 'Portuguese') for x in portuguese_df['headline']]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jack-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Split dataframe

The data is split into two sets randomly so a model can be trained on each and used to predict headlines for the other one.

In [4]:
import random 

# randomly split a dataframe into 2 equal size groups 
def split_dataframes(df):
    population = list(range(len(df)))
    half_headlines = int(np.floor(len(population) / 2))
    random_samples = random.sample(population, half_headlines)
    return df.loc[random_samples,:].reset_index(drop=True), df.loc[~df.index.isin(random_samples), :].reset_index(drop=True)

## Fit model

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# vectorizes data and fits a model 
def fit_model(df):
    X, y = df.headline, [int(pd.notna(x)) for x in df.risk_type]
    vectorizer = TfidfVectorizer()
    train_tfidf = vectorizer.fit_transform(X)
    model = LogisticRegression()
    model.fit(train_tfidf, y)
    return vectorizer, model

## Add predictions

In [6]:
# returns predictions as binary decisions and probabilities
def predict_headlines(df, vectorizer, model):
    tfidf_vectors = vectorizer.transform(df.headline)
    y_preds = model.predict(tfidf_vectors)
    y_pred_prob = [np.mean(model.predict_proba(x)[:, 1]) for x in tfidf_vectors]
    return y_preds, y_pred_prob

# adds the predictions for each half of the data to their respective dfs
def add_predictions_to_df(primary_df, secondary_df):
    vectorizer, model = fit_model(secondary_df)
    y_preds, y_pred_prob = predict_headlines(primary_df, vectorizer, model)
    primary_df['y_pred'], primary_df['y_pred_prob'] = y_preds, y_pred_prob
    primary_df.sort_values('y_pred_prob', ascending=False, inplace=True)
    return primary_df

# returns a headlines dataframe along with their predictions for veiwing
# this is useful so we can set appropriate upper and lower limits for the 
# slice of low probability non-risk headlines we will select later on
def view_headline_preds(df, language):
    df_1, df_2 = split_dataframes(df) 
    df_1, df_2 = add_predictions_to_df(df_1, df_2), add_predictions_to_df(df_2, df_1)
    #df_1, df_2 =  drop_non_risk_headlines(df_1, language), drop_non_risk_headlines(df_2, language)
    return df_1, df_2

spanish_df_1, spanish_df_2 = view_headline_preds(spanish_df, 'Spanish')
portuguese_df_1, portuguese_df_2 = view_headline_preds(portuguese_df, 'Portuguese')

## Find false negatives threshold

In [8]:
# this function shows the non-risk labelled headlines ordered
# from the lowest to highest probability score. An ml practitioner 
# can use this to find the appropriate threshold for which there are 
# few false negatives in the data...
def view_nonrisk_highest_rows(df, start, end):
    temp_df = df.loc[pd.isna(df.risk_type)].sort_values('y_pred_prob')
    print('Non-risk headlines: ' + str(len(temp_df)))
    print()
    selected_index_df = temp_df.loc[pd.isna(temp_df.risk_type)].iloc[start:end, :]
    for i in range(len(selected_index_df)):
        print(str(selected_index_df.index[i]) + ':   ' + selected_index_df.headline.values[i])

#view_nonrisk_highest_rows(spanish_df_2, 6950, 7000)
#view_nonrisk_highest_rows(portuguese_df_1, 1850, 1900)

## Drop high score headlines

In [10]:
# drops a number of non-risk headlines based on their prediction scores
# keeps 7,000 headlines with low predictions
# the lowest 3,000 are not included as these are mostly youtube videos and provide little variety
def drop_non_risk_headlines(df, language):        
    non_risk_df = df.loc[pd.isna(df.risk_type)]
    risk_df = df.loc[~pd.isna(df.risk_type)]
    
    if language=='Spanish':
        lower_limit, upper_limit = 0, 7000
    elif language=='Portuguese':
        lower_limit, upper_limit = 0, 1900
        
    low_score_non_risk = non_risk_df.iloc[(len(non_risk_df)-upper_limit):(len(non_risk_df)-lower_limit),:]
    return pd.concat([risk_df, low_score_non_risk])

# creates a filtered dataframe combining both halfs of the data 
# after dropping headlines with high predictions
def create_filtered_df(df_1, df_2, language):
    df_1, df_2 =  drop_non_risk_headlines(df_1, language), drop_non_risk_headlines(df_2, language)
    return pd.concat([df_1, df_2])

filtered_spanish_df = create_filtered_df(spanish_df_1, spanish_df_2, language='Spanish')
filtered_portuguese_df = create_filtered_df(portuguese_df_1, portuguese_df_1, language='Portuguese')

## Create train test split

In [11]:
from sklearn.model_selection import train_test_split

# returns a train test split
def split_data(df, test_size=0.25):
    X = df.headline
    y = [int(pd.notna(x)) for x in df.risk_type]
    return train_test_split(X, y, test_size=test_size, stratify=y)

## Evaluate model

In [12]:
from sklearn.metrics import classification_report, accuracy_score

# evaluates the model's performance and prints the results
def evaluate_model(model, X_test_tfidf, y_test):
    y_pred = model.predict(X_test_tfidf)
    y_pred_prob = model.predict_proba(X_test_tfidf)[:, 1] 
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print()

## New headlines

In [22]:
# uses totally new headlines to evaluate the data - this
# is necessary because the filtered dfs now include only 
# easier non-risk headlines with low prediction scores
def get_new_df(language):
    new_df = pd.read_csv('../Data/headlines.csv', encoding='utf-8')
    new_df['headline'] = [clean_text(x) for x in new_df['headline']]
    
    difference = len(new_df) - len(df)
    new_headlines = new_df.iloc[(len(new_df)-difference-1):len(new_df),:]

    if language=='Spanish':
        new_headlines = new_headlines.loc[new_headlines.country.isin(['Argentina', 'Colombia', 'Mexico'])]
    elif language=='Portuguese':
        new_headlines = new_headlines.loc[new_headlines.country == 'Brazil']
    
    new_headlines.reset_index(drop=True, inplace=True)
    print(str(round(len(new_headlines)/1000, 2)) + 'K never before seen headlines')
    print()
    return new_headlines

## Check results

In [23]:
# evaluates a filtered dataset against totally new headlines
def check_results(train_df, language):
    print()
    print('*** ' + language + ' ***')
    print()

    X_train, X_test, y_train, y_test = split_data(train_df, test_size=0.1)
    
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)

    new_headlines = get_new_df(language)
    
    X_test_tfidf = vectorizer.transform(new_headlines.headline)
    y_test = [int(pd.notna(x)) for x in new_headlines.risk_type]
    
    evaluate_model(model, X_test_tfidf, y_test)

check_results(filtered_spanish_df, language='Spanish')
check_results(filtered_portuguese_df, language='Portuguese')


*** Spanish ***

3.34K never before seen headlines

Accuracy: 0.842326139088729
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.84      0.90      2884
           1       0.46      0.87      0.60       452

    accuracy                           0.84      3336
   macro avg       0.72      0.85      0.75      3336
weighted avg       0.91      0.84      0.86      3336



*** Portuguese ***

0.53K never before seen headlines

Accuracy: 0.7518796992481203
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.75      0.82       401
           1       0.50      0.76      0.60       131

    accuracy                           0.75       532
   macro avg       0.70      0.76      0.71       532
weighted avg       0.81      0.75      0.77       532


