<a href="https://colab.research.google.com/github/josh-millar/josh-millar/blob/main/q5_q6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk
!pip install syllables
import csv
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# QUESTION 5
I've attempted 4 different tests for Q5 which are written procedurally below. The large code block at the start is generally the same as in Q1-4 with some minor adjustments. Functions are modified for each test.

In [None]:
# copy helper functions verbatim from Q1-4 notebook
# a couple of modifications to fit in with the main 
# function below
def load_data(path):
    """Load labelled statements from a tab-separated file.

    Every row that is not blank and whose first field is not "Id" is handed
    to parse_data_line, and the outcome is stored as a (text, label) tuple.

    Args:
        path: location of the tab-separated data file.

    Returns:
        A list of (text, label) tuples in file order.
    """
    raw_data = []
    # newline='' is what the csv module asks for so the reader itself deals
    # with line endings; the explicit encoding keeps the result independent
    # of the platform default.
    with open(path, newline='', encoding='utf-8') as f:
        for line in csv.reader(f, delimiter='\t'):
            if not line:  # blank row: the reader yields an empty list
                continue
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))
    return raw_data

def split_and_preprocess_data(percentage, raw_data):
    """Partition raw_data into a training and a test portion and featurise both.

    The first int(percentage * len(raw_data)) samples form the training
    portion and the remainder forms the test portion. Every text is run
    through pre_process and then to_feature_vector.

    Returns a (train_data, test_data) pair of lists of (features, label)
    tuples.
    """
    cutoff = int(percentage * len(raw_data))

    def featurise(samples):
        return [(to_feature_vector(pre_process(text)), label)
                for (text, label) in samples]

    train_data = featurise(raw_data[:cutoff])
    test_data = featurise(raw_data[cutoff:])
    return train_data, test_data

def pre_process(text):
    """Tokenise a single statement.

    Every character that is neither a word character nor whitespace is
    dropped, the remainder is lower-cased, and the result is split on
    whitespace.

    Returns the list of tokens.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower().split()

def convert_label(label):
    """Collapse the six fine-grained labels into a binary REAL/FAKE label.

    'true', 'mostly-true' and 'half-true' map to 'REAL'; 'false',
    'barely-true' and 'pants-fire' map to 'FAKE'. Any other value raises
    KeyError.
    """
    binary_labels = dict.fromkeys(('true', 'mostly-true', 'half-true'), 'REAL')
    binary_labels.update(
        dict.fromkeys(('false', 'barely-true', 'pants-fire'), 'FAKE'))
    return binary_labels[label]

def parse_data_line(data_line):
    """Extract the binary label and the statement from one parsed row.

    The second field of the row holds the fine-grained label and the third
    field holds the statement text.

    Returns a (label, statement) tuple, where label is the output of
    convert_label for the fine-grained label.
    """
    raw_label, statement = data_line[1], data_line[2]
    return (convert_label(raw_label), statement)

global_feature_dict = {}  # corpus-wide token counts, filled in by to_feature_vector


def to_feature_vector(tokens):
    """Build a bag-of-words dictionary for one tokenised statement.

    Each occurrence of a token adds one to its count in the returned
    dictionary and one to its count in the module-level global_feature_dict.
    """
    bag_of_words = {}
    for word in tokens:
        bag_of_words[word] = bag_of_words.get(word, 0) + 1
        global_feature_dict[word] = global_feature_dict.get(word, 0) + 1
    return bag_of_words

from sklearn.metrics import classification_report

def train_classifier(data):
    """Train a default LinearSVC, wrapped for NLTK, on the given data.

    Returns whatever SklearnClassifier.train returns; callers use it as the
    trained classifier.
    """
    steps = [('svc', LinearSVC())]
    wrapped_model = SklearnClassifier(Pipeline(steps))
    return wrapped_model.train(data)
def predict_labels(samples, classifier):
    """Return the classifier's predicted label for every sample.

    samples are handed unchanged to classifier.classify_many, so they must
    already be in the form the classifier expects.
    """
    predictions = classifier.classify_many(samples)
    return predictions

def cross_validate(dataset, folds):
    """Estimate classifier performance with contiguous k-fold cross-validation.

    The dataset is cut, in order, into chunks of int(len(dataset) / folds) + 1
    samples. Each chunk is held out once while a classifier is trained on the
    remaining samples, and the macro-averaged precision, recall and f1-score
    plus the accuracy on the held-out chunk are recorded.

    Args:
        dataset: list of (features, label) tuples accepted by train_classifier.
        folds: requested number of folds; used to derive the chunk size.

    Returns:
        A dict with the keys "precision", "recall", "f1-score" and "accuracy"
        holding the mean of each metric over the folds that were actually
        evaluated, or an empty dict when dataset is empty.
    """
    metric_names = ["precision", "recall", "f1-score", "accuracy"]
    fold_size = int(len(dataset) / folds) + 1
    totals = [0.0] * len(metric_names)
    folds_run = 0

    for i in range(0, len(dataset), fold_size):
        # split the dataset into a training and cross validation set
        cv_set = dataset[i:i + fold_size]
        training_set = dataset[:i] + dataset[i + fold_size:]
        # train on everything except the held-out chunk
        classifier = train_classifier(training_set)
        # separate the held-out features from their labels
        cv_dicts = [sample[0] for sample in cv_set]
        y_true = [sample[1] for sample in cv_set]
        y_pred = predict_labels(cv_dicts, classifier)
        # get the metrics for this fold, add them to the running total
        metrics_dict = classification_report(y_true, y_pred, output_dict=True)
        this_result = [
            metrics_dict["macro avg"]["precision"],
            metrics_dict["macro avg"]["recall"],
            metrics_dict["macro avg"]["f1-score"],
            metrics_dict["accuracy"],
        ]
        totals = [total + value for total, value in zip(totals, this_result)]
        folds_run += 1

    if folds_run == 0:
        return {}
    # Average over the folds that really ran: the "+ 1" in fold_size can yield
    # fewer chunks than requested (10 samples with folds=10 gives 5 chunks),
    # and dividing by `folds` would then understate every score.
    return {name: total / folds_run
            for name, total in zip(metric_names, totals)}
# print(sorted(global_feature_dict.items(), key=lambda x: (-x[1], x[0])))
# rerun the original implementation to get a baseline

# define a helper function to drive the whole pipeline
def main():
    """Run the pipeline end to end and return the cross-validation scores.

    Loads 'fake_news.tsv', keeps the first 80% of it as the training portion
    and cross-validates on that portion with 10 requested folds.
    """
    samples = load_data('fake_news.tsv')
    training_portion, _held_out = split_and_preprocess_data(0.8, samples)
    return cross_validate(training_portion, 10)

In [None]:
# get a baseline value for the cv, and start building up a results dataframe
cv_results_baseline = main()
# One row per experiment, one column per metric. The frame is built directly
# from the result dict because DataFrame.append was removed in pandas 2.0.
all_results = pd.DataFrame([cv_results_baseline], index=["baseline"])
all_results

Unnamed: 0,precision,recall,f1-score,accuracy
baseline,0.557461,0.557597,0.557136,0.564486


Test 1: Improve pre-processing through lemmatization and stop word removal

In [None]:
# Input: a string of one statement
def pre_process(text):
    """Tokenise a statement, drop English stop words and lemmatise the rest.

    Args:
        text: one statement as a single string.

    Returns:
        A list of lower-cased, lemmatised tokens with punctuation and stop
        words removed.
    """
    punctuation = r'[^\w\s]'
    # remove punctuation (i.e. anything that isn't a word or space)
    text = re.sub(punctuation, '', text)
    # convert to lower case and split by whitespace
    text_split = text.lower().split()
    # The NLTK list contains contractions such as "don't", but the tokens have
    # already lost their apostrophes, so the stop words get the same treatment
    # before matching. A set keeps each membership test constant-time.
    sw = {re.sub(punctuation, '', w)
          for w in nltk.corpus.stopwords.words("english")}
    # remove stop words
    text_split_no_sws = [w for w in text_split if w not in sw]
    # lemmatize
    lemmatizer = nltk.stem.WordNetLemmatizer()
    text_split_lem = [lemmatizer.lemmatize(w) for w in text_split_no_sws]
    return text_split_lem

cv_results_sws = main()
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row.
all_results = pd.concat(
    [all_results, pd.DataFrame([cv_results_sws], index=["sw"])]
)
all_results

Unnamed: 0,precision,recall,f1-score,accuracy
baseline,0.557461,0.557597,0.557136,0.564486
sw,0.550129,0.550196,0.549668,0.557306


Test 2: Basic hyperparameter optimisation of the SVC, using a simple grid search over the C parameter and the loss function.

Next, modify the class weighting: first to account for the imbalance in the labels (using the 'balanced' value), then by assigning a higher weight to fake news.

In [None]:
def _make_train_classifier(c, loss, class_weight=None):
    """Return a train_classifier(data) function bound to the given LinearSVC settings.

    The same random_state and max_iter are used for every candidate so that
    the cross-validation scores of different settings are comparable.
    """
    def train_classifier(data):
        svc_model = LinearSVC(
            C=c,
            loss=loss,
            class_weight=class_weight,
            random_state=42,
            max_iter=1000)
        pipeline = Pipeline([('svc', svc_model)])
        return SklearnClassifier(pipeline).train(data)
    return train_classifier


best_f1 = -1
# define the hyperparameter space
C_parameters = [0.1, 1.0, 10, 100, 1000]
loss_parameters = ["hinge", "squared_hinge"]
# cycle through each of the parameters as a grid search:
for this_loss in loss_parameters:
    for this_c in C_parameters:
        # main() picks up the module-level train_classifier, so rebind it
        # before every evaluation
        train_classifier = _make_train_classifier(this_c, this_loss)
        results = main()
        # update the best parameters
        if results["f1-score"] > best_f1:
            best_c = this_c
            best_loss = this_loss
            best_f1 = results["f1-score"]

# NLTK's SklearnClassifier label-encodes the string labels before fitting, so
# the class_weight keys are the encoded integers: with the labels sorted,
# 'FAKE' becomes 0 and 'REAL' becomes 1. Fake news gets the higher weight.
# NOTE(review): if the dict wins here, the later cells that fit sklearn
# directly on the string labels need the weights keyed by 'FAKE'/'REAL'.
class_weights = ["balanced", {0: 4, 1: 1}]
best_class_weight = "balanced"
best_weighted_f1 = -1
for class_weight in class_weights:
    train_classifier = _make_train_classifier(best_c, best_loss, class_weight)
    # each candidate has to be evaluated, otherwise the comparison below
    # only ever sees the last score of the grid search
    results = main()
    if results["f1-score"] > best_weighted_f1:
        best_class_weight = class_weight
        best_weighted_f1 = results["f1-score"]

# redefine the train_classifier function with the best performing model
train_classifier = _make_train_classifier(best_c, best_loss, best_class_weight)
print(f"Best c: {best_c}, Best loss: {best_loss}, Best class weight: {best_class_weight}")
cv_results_hyper = main()
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row.
all_results = pd.concat(
    [all_results, pd.DataFrame([cv_results_hyper], index=["hyper"])]
)
all_results



Best c: 0.1, Best loss: hinge, Best class weight: balanced


Unnamed: 0,precision,recall,f1-score,accuracy
baseline,0.557461,0.557597,0.557136,0.564486
sw,0.550129,0.550196,0.549668,0.557306
hyper,0.590994,0.592531,0.588434,0.590333


Test 3: Include the length of the text. To incorporate stylistic features, the text now has to be handled as a sparse matrix, which requires a fair amount of refactoring. Some of this comes from lab 2, and the following notebook was also used for help with building the pipeline: 
https://www.kaggle.com/code/baghern/a-deep-dive-into-sklearn-pipelines/notebook

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
# we want split and preprocess to return lists now..
def split_and_preprocess_data(percentage, raw_data):
    """Cut raw_data into a training and a test portion, as parallel lists.

    The first int(percentage * len(raw_data)) samples are the training
    portion. No preprocessing is applied here: the first element of each
    sample goes to the text lists and the second to the label lists.

    Returns (train_text, test_text, train_labels, test_labels).
    """
    boundary = int(percentage * len(raw_data))
    train_rows, test_rows = raw_data[:boundary], raw_data[boundary:]

    def column(rows, position):
        return [row[position] for row in rows]

    return (column(train_rows, 0), column(test_rows, 0),
            column(train_rows, 1), column(test_rows, 1))

def cross_validate(text, labels, folds):
    """Estimate classifier performance with contiguous k-fold cross-validation.

    text and labels are cut, in order, into chunks of
    int(len(text) / folds) + 1 samples. Each chunk is held out once while a
    classifier is trained on the remaining samples, and the macro-averaged
    precision, recall and f1-score plus the accuracy on the held-out chunk
    are recorded.

    Args:
        text: list of samples accepted by train_classifier.
        labels: list of labels, parallel to text.
        folds: requested number of folds; used to derive the chunk size.

    Returns:
        A dict with the keys "precision", "recall", "f1-score" and "accuracy"
        holding the mean of each metric over the folds that were actually
        evaluated, or an empty dict when text is empty.
    """
    metric_names = ["precision", "recall", "f1-score", "accuracy"]
    fold_size = int(len(text) / folds) + 1
    totals = [0.0] * len(metric_names)
    folds_run = 0

    for i in range(0, len(text), fold_size):
        # split the dataset into a training and cross validation set
        cv_text = text[i:i + fold_size]
        training_text = text[:i] + text[i + fold_size:]
        cv_labels = labels[i:i + fold_size]
        training_labels = labels[:i] + labels[i + fold_size:]
        # train on everything except the held-out chunk
        classifier = train_classifier(training_text, training_labels)
        y_pred = predict_labels(cv_text, classifier)
        # get the metrics for this fold, add them to the running total
        metrics_dict = classification_report(cv_labels, y_pred, output_dict=True)
        this_result = [
            metrics_dict["macro avg"]["precision"],
            metrics_dict["macro avg"]["recall"],
            metrics_dict["macro avg"]["f1-score"],
            metrics_dict["accuracy"],
        ]
        totals = [total + value for total, value in zip(totals, this_result)]
        folds_run += 1

    if folds_run == 0:
        return {}
    # Average over the folds that really ran: the "+ 1" in fold_size can yield
    # fewer chunks than requested (10 samples with folds=10 gives 5 chunks),
    # and dividing by `folds` would then understate every score.
    return {name: total / folds_run
            for name, total in zip(metric_names, totals)}

def get_length(data):
    """Return the number of whitespace-separated tokens in each document.

    The counts come back as a column vector with one row per document.
    """
    token_counts = [len(document.split()) for document in data]
    return np.array(token_counts)[:, np.newaxis]

def train_classifier(data_text, data_label):
    """Fit a linear SVM on bag-of-words counts plus the scaled text length.

    The SVM is configured from the module-level best_c, best_loss and
    best_class_weight. Returns the result of fitting the sklearn Pipeline
    on data_text and data_label.
    """
    # token count per document, normalised with the standard scaler
    length_branch = Pipeline([
        ("count_text", FunctionTransformer(get_length, validate=False)),
        ("scale", StandardScaler()),
    ])
    bag_of_words_branch = Pipeline([
        ("text_count_vect", CountVectorizer(min_df=1)),
    ])
    combined_features = FeatureUnion([
        ("text", bag_of_words_branch),
        ("length", length_branch),
    ])
    classifier = LinearSVC(
        C=best_c, loss=best_loss, class_weight=best_class_weight)
    model = Pipeline([
        ("features", combined_features),
        ("svc", classifier),
    ])
    return model.fit(data_text, data_label)
def predict_labels(samples, classifier):
    """Return the classifier's predicted label for every sample.

    samples are handed unchanged to classifier.predict.
    """
    predictions = classifier.predict(samples)
    return predictions

def main():
    """Run the pipeline end to end and return the cross-validation scores.

    Loads 'fake_news.tsv', keeps the first 80% of it as the training portion
    and cross-validates on the training text and labels with 10 requested
    folds. The test portion is not used here.
    """
    samples = load_data('fake_news.tsv')
    portions = split_and_preprocess_data(0.8, samples)
    train_text, _test_text, train_labels, _test_labels = portions
    return cross_validate(train_text, train_labels, 10)
cv_results_len = main()
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row.
all_results = pd.concat(
    [all_results, pd.DataFrame([cv_results_len], index=["length"])]
)
all_results

Unnamed: 0,precision,recall,f1-score,accuracy
baseline,0.557461,0.557597,0.557136,0.564486
sw,0.550129,0.550196,0.549668,0.557306
hyper,0.590994,0.592531,0.588434,0.590333
length,0.594191,0.595737,0.591564,0.593526


Test 4: Compute the Gunning fog readability index. 

To the marker: this takes quite a long time to compute and doesn't really do much. Feel free to skip it - it won't mess up later cells.

In [None]:
import syllables
def get_gunning_fog(text):
    """Compute the Gunning fog readability index of each document.

    The index (p16 of Unit 4's lecture) is
    0.4 * (words / sentences + 100 * complex_words / words), where a complex
    word is one that syllables.estimate puts at more than two syllables. The
    syllable counts are taken from that library rather than computed here.

    Args:
        text: iterable of document strings.

    Returns:
        A numpy column vector with one index per document. A document
        without any words scores 0.0.
    """
    gf = []
    for line in text:
        words = line.split()
        if not words:
            # nothing to measure, and both ratios would divide by zero
            gf.append(0.0)
            continue
        no_words = len(words)
        # Count only non-empty segments: splitting "One. Two." on the
        # terminators leaves an empty string after the final full stop.
        sentences = [s for s in re.split(r'[.!?]+', line) if s.strip()]
        no_sentences = max(len(sentences), 1)
        complex_words = len([w for w in words if syllables.estimate(w) > 2])
        gf.append(
            0.4 * ((no_words / no_sentences) + 100 * (complex_words / no_words))
        )
    return np.array(gf).reshape(-1, 1)

def train_classifier(data_text, data_label):
    """Fit a linear SVM on bag-of-words counts plus the scaled Gunning fog index.

    The SVM is configured from the module-level best_c, best_loss and
    best_class_weight. Returns the result of fitting the sklearn Pipeline
    on data_text and data_label.
    """
    # readability index per document, normalised with the standard scaler
    readability_branch = Pipeline([
        ("gf_text", FunctionTransformer(get_gunning_fog, validate=False)),
        ("scale", StandardScaler()),
    ])
    bag_of_words_branch = Pipeline([
        ("text_count_vect", CountVectorizer(min_df=1)),
    ])
    # the readability branch is registered under the step name "length"
    combined_features = FeatureUnion([
        ("text", bag_of_words_branch),
        ("length", readability_branch),
    ])
    classifier = LinearSVC(
        C=best_c, loss=best_loss, class_weight=best_class_weight)
    model = Pipeline([
        ("features", combined_features),
        ("svc", classifier),
    ])
    return model.fit(data_text, data_label)
cv_results_gf = main()
# DataFrame.append was removed in pandas 2.0; pd.concat adds the new row.
all_results = pd.concat(
    [all_results, pd.DataFrame([cv_results_gf], index=["gunning-fog"])]
)
all_results

Unnamed: 0,precision,recall,f1-score,accuracy
baseline,0.557461,0.557597,0.557136,0.564486
sw,0.550129,0.550196,0.549668,0.557306
hyper,0.590994,0.592531,0.588434,0.590333
length,0.594191,0.595737,0.591564,0.593526
gunning-fog,0.594885,0.596448,0.592295,0.594258


In [None]:
# print the df for report
# print() emits the LaTeX with real line breaks, ready to paste into the
# report; a bare expression would show the escaped string repr instead.
print(all_results.to_latex())

'\\begin{tabular}{lrrrr}\n\\toprule\n{} &  precision &    recall &  f1-score &  accuracy \\\\\n\\midrule\nbaseline    &   0.557461 &  0.557597 &  0.557136 &  0.564486 \\\\\nsw          &   0.550129 &  0.550196 &  0.549668 &  0.557306 \\\\\nhyper       &   0.590994 &  0.592531 &  0.588434 &  0.590333 \\\\\nlength      &   0.594191 &  0.595737 &  0.591564 &  0.593526 \\\\\ngunning-fog &   0.595369 &  0.596955 &  0.592708 &  0.594629 \\\\\n\\bottomrule\n\\end{tabular}\n'

# QUESTION 6
Firstly, I have 

In [None]:
def load_data(path):
    """Load labelled statements from a tab-separated file.

    Every row that is not blank and whose first field is not "Id" is handed
    to parse_data_line, and the outcome is stored as a (text, label) tuple.

    Args:
        path: location of the tab-separated data file.

    Returns:
        A list of (text, label) tuples in file order.
    """
    raw_data = []
    # newline='' is what the csv module asks for so the reader itself deals
    # with line endings; the explicit encoding keeps the result independent
    # of the platform default.
    with open(path, newline='', encoding='utf-8') as f:
        for line in csv.reader(f, delimiter='\t'):
            if not line:  # blank row: the reader yields an empty list
                continue
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))
    return raw_data