# 02) TF–IDF baseline

This notebook creates a baseline model using TF-IDF and logistic regression. A regression model is used to allow for the altering of the risk headline classification threshold. The primary business goal of this project is to create a model with a high recall. Although the overall accuracy for the Spanish and Portuguese models isn't that bad for a baseline model (see below), the recall for both of them in terms of risk headlines (1) is poor. 

## Read-in data

In [1]:
import pandas as pd
import numpy as np

# read-in data
df = pd.read_csv('../Data/original_headlines.csv', encoding='utf-8')

# include only spanish 
spanish_df = df[df.country.isin(['Argentina', 'Colombia', 'Mexico'])].reset_index(drop=True)
print(str(round(len(spanish_df)/1000, 1)) + 'K Spanish headlines')

# include only portuguese 
portuguese_df = df[df.country == 'Brazil'].reset_index(drop=True)
print(str(round(len(portuguese_df)/1000, 1)) + 'K Portuguese headlines')

61.7K Spanish headlines
12.7K Portuguese headlines


## Split data

In [2]:
from sklearn.model_selection import train_test_split

def split_data(df):
    X = df.headline
    y = [int(pd.notna(x)) for x in df.risk_type]
    return train_test_split(X, y, test_size=0.25, stratify=y)

## Clean text

In [3]:
import string
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

def clean_text(text, language='english'):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + '¡¿'))
    stop_words = set(stopwords.words(language))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jack-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Vectorize data

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_data(X_train, X_test=None, train_only=False):
    if train_only==True:
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(X_train)
        return X_train_tfidf
    else:
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_test_tfidf = vectorizer.transform(X_test)
        return X_train_tfidf, X_test_tfidf

## Fit model

In [5]:
from sklearn.linear_model import LogisticRegression

def fit_model(X_train_tfidf, y_train):
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)
    return model

## Evaluate model

In [6]:
from sklearn.metrics import classification_report, accuracy_score

def evaluate_model(model, X_test_tfidf, y_test):
    y_pred = model.predict(X_test_tfidf)
    y_pred_prob = model.predict_proba(X_test_tfidf)[:, 1] 
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

## Compare languages

In [7]:
languages = ['spanish', 'portuguese']

def clean_train_evaluate(language):
    df = eval(language + '_df')
    df['headline'] = df['headline'].apply(clean_text, language=language)
    X_train, X_test, y_train, y_test = split_data(df)
    X_train_tfidf, X_test_tfidf = vectorize_data(X_train, X_test)
    model = fit_model(X_train_tfidf, y_train)
    evaluate_model(model, X_test_tfidf, y_test)

for language in languages:
    print()
    print('*** ' + language.title() + ' ***')
    print()
    clean_train_evaluate(language)
    print()


*** Spanish ***

Accuracy: 0.9238150813719769
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96     13559
           1       0.86      0.44      0.58      1864

    accuracy                           0.92     15423
   macro avg       0.90      0.72      0.77     15423
weighted avg       0.92      0.92      0.91     15423



*** Portuguese ***

Accuracy: 0.8458149779735683
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.97      0.91      2463
           1       0.82      0.40      0.54       715

    accuracy                           0.85      3178
   macro avg       0.83      0.69      0.72      3178
weighted avg       0.84      0.85      0.82      3178




## Test headlines

In [8]:
# The following headline is deemed a risk headline that this model fails to predict...
test_headline = 'Inversiones en sistema eléctrico, insuficientes para satisfacer la creciente demanda: IMCO'

df = spanish_df
X_train, X_test, y_train, y_test = split_data(df)
X_train_tfidf, X_test_tfidf = vectorize_data(X_train, X_test)
model = fit_model(X_train_tfidf, y_train)

vectorizer = TfidfVectorizer()
vectorizer.fit_transform(X_train)
cleaned_text = list(clean_text(test_headline, language='spanish'))
test_vector = vectorizer.transform(cleaned_text)

y_pred = model.predict(test_vector)
y_pred_prob = model.predict_proba(test_vector)[:, 1] 
np.mean(y_pred_prob)

0.05605419570785658

## Save model

In [9]:
import pickle

filename = '../Models/tfidf_baseline.pkl'

with open(filename, 'wb') as file:
    pickle.dump(model, file)