# 7) Model tuning

Tuning the support vector machine model shows...

## Read-in data

Separate dataframes are read in for each language and dataset type.

In [1]:
import pandas as pd
import numpy as np

# reads one headline csv, prints its size in thousands of rows and returns the dataframe
def _load_headlines(csv_path, size_suffix):
    headlines_df = pd.read_csv(csv_path, encoding='utf-8').reset_index(drop=True)
    print(str(round(len(headlines_df)/1000, 1)) + size_suffix)
    return headlines_df

# spanish headlines: improved labels (train) followed by the put aside set
spanish_improved_labels_df = _load_headlines('../Data/spanish_improved_labels_df.csv', 'K Spanish train headlines')
spanish_put_aside_df = _load_headlines('../Data/spanish_put_aside_df.csv', 'K Spanish put aside headlines')

# portuguese headlines: improved labels (train) followed by the put aside set
portuguese_improved_labels_df = _load_headlines('../Data/portuguese_improved_labels_df.csv', 'K Portuguese train headlines')
portuguese_put_aside_df = _load_headlines('../Data/portuguese_put_aside_df.csv', 'K Portuguese put aside headlines')

24.3K Spanish train headlines
0.6K Spanish put aside headlines
6.6K Portuguese train headlines
0.6K Portuguese put aside headlines


## Create risk type dataframe

Creates a dataframe with all the headlines of a given risk type and an equal number of randomly sampled other headlines. These other headlines include both non-risk headlines and headlines of other risk types, which helps the model learn to distinguish the specified risk type from everything else.

In [2]:
import random

# returns a dataframe containing all of a specific risk type and an equal number
# of randomly sampled other risks and non risks
def risk_type_df(df, risk_type, seed=None):
    """Build a balanced binary dataframe for one risk type.

    Every row of ``df`` whose ``risk_type`` column equals ``risk_type`` is kept,
    together with a random sample of the remaining rows (other risk types and
    rows without a risk type alike). The sample is as large as the number of
    kept rows, capped at the number of remaining rows so that a majority risk
    type no longer raises ``ValueError`` from ``random.sample``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``risk_type`` column. It is not modified.
    risk_type : object
        The label compared against ``df.risk_type`` with ``==``.
    seed : int, optional
        When given, the sample is drawn from a dedicated ``random.Random(seed)``
        so the result is reproducible. When ``None`` (default) the module-level
        ``random`` generator is used, as before.

    Returns
    -------
    pandas.DataFrame
        The rows of the specified risk type followed by the sampled other rows
        (in their original order), with an added ``y_value`` column that is 1
        for the specified risk type and 0 otherwise.
    """
    # rows of the specified risk type
    is_target = df.risk_type == risk_type
    target_df = df.loc[is_target]

    # randomly sampled other rows; sampling row positions rather than index
    # labels keeps the selection correct even if index labels repeat
    other_positions = [pos for pos, hit in enumerate(is_target) if not hit]
    sample_no = min(len(target_df), len(other_positions))
    rng = random if seed is None else random.Random(seed)
    chosen_positions = rng.sample(other_positions, sample_no)
    others_df = df.iloc[sorted(chosen_positions)]

    # concatenates the specified risk type and the sampled other rows
    balanced_df = pd.concat([target_df, others_df])

    # creates a binary column to indicate whether a row is the specified risk type or not
    balanced_df['y_value'] = (balanced_df['risk_type'] == risk_type).astype(int)

    return balanced_df

## Training functions

A model is fit and evaluated on TF-IDF vectors.

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# returns a stratified train test split of the headlines and their binary labels
def split_data(df, risk_type, test_size=0.25, random_state=None):
    """Split headlines and binary labels into stratified train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``headline`` column, plus ``y_value`` when ``risk_type`` is
        given or ``risk_type`` when it is ``None``.
    risk_type : object or None
        When not ``None`` the labels are read from ``df.y_value``. When
        ``None`` the label is 1 for every row whose ``risk_type`` is not
        missing and 0 otherwise.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int, optional
        Forwarded to ``train_test_split`` so the split can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    # defines X
    X = df.headline

    # defines y (identity check: `!= None` is unreliable for objects overriding !=)
    if risk_type is not None:
        y = df.y_value
    else:
        y = [int(pd.notna(x)) for x in df.risk_type]

    # returns a split that preserves the class balance of y
    return train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)

# scores a fitted model on the test vectors: prints accuracy and the classification
# report, then returns the report as a dictionary
def evaluate_model(model, X_test_tfidf, y_test):
    """Print accuracy and the classification report, return the report as a dict.

    ``model`` must already be fitted and expose ``predict``. ``X_test_tfidf``
    is passed to ``model.predict`` and the predictions are compared against
    ``y_test``.
    """
    predictions = model.predict(X_test_tfidf)

    # accuracy rounded to two decimals, followed by a blank line
    accuracy = round(accuracy_score(y_test, predictions), 2)
    print("Accuracy:", accuracy, end="\n\n")

    # human-readable report, followed by a blank line
    text_report = classification_report(y_test, predictions)
    print("Classification Report:\n", text_report, end="\n\n")

    # same report in dictionary form for the caller
    return classification_report(y_test, predictions, output_dict=True)

# fits the given model on tf-idf vectors of the train headlines and evaluates it
# against the put aside headlines
def fit_evaluate_tfidf(model, train_df, put_aside_df, risk_type, language):
    """Fit ``model`` on TF-IDF vectors of the train headlines and score it on the put aside set.

    Parameters
    ----------
    model : estimator
        Any object with ``fit`` and ``predict``; it is fitted in place.
    train_df : pandas.DataFrame
        Training headlines with ``headline`` and ``risk_type`` columns.
    put_aside_df : pandas.DataFrame
        Held-out headlines with ``headline`` and ``risk_type`` columns.
    risk_type : object or None
        When not ``None`` the task is "this risk type vs everything else" and
        the training data is rebalanced with ``risk_type_df``. When ``None``
        the task is "any risk type vs no risk type".
    language : str
        Only used in the printed header.

    Returns
    -------
    dict
        The classification report returned by ``evaluate_model``.
    """
    # prints the language and risk type
    print()
    print('*** ' + language + ': ' + str(risk_type) + ' ***')
    print()

    # reassigns the train df to focus on a particular risk type and defines the y_test variable
    if risk_type is not None:
        train_df = risk_type_df(train_df, risk_type)
        y_test = [1 if x == risk_type else 0 for x in put_aside_df['risk_type']]
    else:
        y_test = [int(pd.notna(x)) for x in put_aside_df.risk_type]

    # shuffles the training rows and uses all of them; the previous
    # train_test_split(test_size=0.001) was only a shuffle, yet it discarded rows and
    # raised ValueError for stratified splits of 1000 rows or fewer (test part
    # smaller than the number of classes)
    train_df = train_df.sample(frac=1)
    X_train = train_df.headline
    if risk_type is not None:
        y_train = train_df.y_value
    else:
        y_train = [int(pd.notna(x)) for x in train_df.risk_type]

    # instantiate vectorizer
    vectorizer = TfidfVectorizer()

    # prepare data: vocabulary is learnt from the train headlines only
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(put_aside_df.headline)

    # fit the supplied model
    model.fit(X_train_tfidf, y_train)

    # evaluate on the put aside headlines
    print(str(len(put_aside_df)) + ' put aside headlines')
    return evaluate_model(model, X_test_tfidf, y_test)

In [5]:
# defines the risk type: the label that the functions above compare against the
# `risk_type` column (presumably passed to fit_evaluate_tfidf in a later cell; confirm)
risk_type = 'political_stability'

