# CS1671 A3 Scratch
### Jacob Emmerson

In [32]:
from collections import defaultdict
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

---

## Q1

In [44]:
#### 1. Evaluation Metrics ####

## Input: y_pred, a list of length n with the predicted labels,
## y_true, a list of length n with the true labels

## Calculates the precision of the predicted labels
def get_precision(y_pred, y_true):
    """Return the precision of binary predictions.

    Precision is tp / (tp + fp), where label 1 is the positive class.

    Args:
        y_pred: Sequence of predicted labels.
        y_true: Sequence of true labels, aligned index-by-index with
            ``y_pred`` (both are expected to have the same length).

    Returns:
        The precision as a float, or 0.0 when nothing was predicted
        positive (the ratio is undefined in that case).
    """
    tp = 0
    fp = 0
    for predicted, actual in zip(y_pred, y_true):
        if predicted != 1:  # only predicted positives count towards precision
            continue
        if actual == 1:  # correct prediction
            tp += 1
        else:
            fp += 1

    predicted_positives = tp + fp
    if predicted_positives == 0:
        # No positive predictions: avoid ZeroDivisionError.
        return 0.0
    return tp / predicted_positives
    
## Calculates the recall of the predicted labels
def get_recall(y_pred, y_true):
    """Return the recall (sensitivity) of binary predictions.

    Recall is tp / (tp + fn), where label 1 is the positive class.

    Args:
        y_pred: Sequence of predicted labels.
        y_true: Sequence of true labels, aligned index-by-index with
            ``y_pred`` (both are expected to have the same length).

    Returns:
        The recall as a float, or 0.0 when ``y_true`` contains no positive
        samples (the ratio is undefined in that case).
    """
    tp = 0
    fn = 0
    for predicted, actual in zip(y_pred, y_true):
        if actual != 1:  # only truly positive samples count towards recall
            continue
        if predicted == 1:  # correct prediction
            tp += 1
        else:
            fn += 1

    actual_positives = tp + fn
    if actual_positives == 0:
        # No positive samples: avoid ZeroDivisionError.
        return 0.0
    return tp / actual_positives

## Calculates the f-score of the predicted labels
def get_fscore(y_pred, y_true):
    """Return the F1 score (harmonic mean of precision and recall).

    Args:
        y_pred: Sequence of predicted labels.
        y_true: Sequence of true labels, aligned with ``y_pred``.

    Returns:
        2 * p * r / (p + r) as a float, or 0.0 when precision and recall
        are both zero (the harmonic mean is undefined in that case).
    """
    p = get_precision(y_pred, y_true)
    r = get_recall(y_pred, y_true)

    if p + r == 0:
        # No true positives at all: avoid ZeroDivisionError.
        return 0.0

    return 2 * ((p * r) / (p + r))

def test_predictions(y_pred, y_true):
    """Print and return precision, recall and f-score of the predictions.

    Args:
        y_pred: Sequence of predicted labels.
        y_true: Sequence of true labels, aligned with ``y_pred``.

    Returns:
        A tuple ``(precision, recall, fscore)``.
    """
    p, r = get_precision(y_pred, y_true), get_recall(y_pred, y_true)
    f = get_fscore(y_pred, y_true)

    print(f"Precision = {p}")
    print(f"Recall = {r}")
    print(f"F-Score = {f}")

    return p, r, f

---

## Q2

In [181]:
#### 2. Complex Word Identification ####

## Loads in the words and labels of one of the datasets
def load_file(data_file):
    """Load the words and labels of one of the datasets.

    The file is read as UTF-8, tab-separated text. The first line is
    treated as a header and skipped; for every other non-blank line the
    first column is the word (lower-cased) and the second is an integer
    label.

    Args:
        data_file: Path to the dataset file.

    Returns:
        A tuple ``(words, labels)`` of two parallel lists.
    """
    words = []
    labels = []
    with open(data_file, 'rt', encoding="utf8") as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no trailing newline.
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank lines
            line_split = line.split("\t")
            words.append(line_split[0].lower())
            labels.append(int(line_split[1]))
    return words, labels

### 2.1: A very simple baseline

## Makes feature matrix for all complex
def all_complex_feature(words):
    """Pair ``words`` with a prediction of 1 (complex) for every word.

    Returns:
        A two-element list ``[words, predictions]``.
    """
    always_complex = [1] * len(words)
    return [words, always_complex]

## Labels every word complex
def all_complex(data_file):
    """Evaluate the baseline that labels every word as complex.

    Loads the dataset, predicts 1 for every word, and prints the scores.

    Args:
        data_file: Path to the dataset file to evaluate on.

    Returns:
        A list ``[precision, recall, fscore]``.
    """
    # load_file returns (words, labels)
    words, true_labels = load_file(data_file)
    _, preds = all_complex_feature(words)
    precision, recall, fscore = test_predictions(preds, true_labels)
    return [precision, recall, fscore]

### 2.2: Word length thresholding

## Makes feature matrix for word_length_threshold
def length_threshold_feature(words, threshold):
    """Label each word complex (1) when its length reaches ``threshold``.

    Args:
        words: Sequence of words.
        threshold: Minimum length (inclusive) for a word to be labelled 1.

    Returns:
        A two-element list ``[words, predictions]``.
    """
    labels = [1 if len(word) >= threshold else 0 for word in words]
    return [words, labels]

## Finds the best length threshold by f-score, and uses this threshold to
## classify the training and development set
def word_length_threshold(training_file, development_file):
    """Pick the best word-length threshold and score both datasets with it.

    Every integer threshold in [0, 20] is tried on the training set; the
    first one that reaches the highest training f-score is kept and then
    used to classify the training and development sets.

    Args:
        training_file: Path to the training dataset.
        development_file: Path to the development dataset.

    Returns:
        A tuple ``(training_performance, development_performance)`` where
        each element is a list ``[precision, recall, fscore]``.
    """
    train_words, train_labels = load_file(training_file)
    dev_words, dev_labels = load_file(development_file)

    thresh = 0
    top_score = 0
    for candidate in range(21):  # try thresh [0,20]
        candidate_preds = length_threshold_feature(train_words, candidate)[1]
        score = get_fscore(candidate_preds, train_labels)
        if score > top_score:
            top_score, thresh = score, candidate

    print(f"Best Threshold Found = {thresh}")
    print('-' * 20)

    train_preds = length_threshold_feature(train_words, thresh)[1]
    dev_preds = length_threshold_feature(dev_words, thresh)[1]

    print("Training Scores:")
    training_performance = list(test_predictions(train_preds, train_labels))

    print("\nDevelopment Scores:")
    development_performance = list(test_predictions(dev_preds, dev_labels))

    return training_performance, development_performance

### 2.3: Word frequency thresholding

## Loads Google NGram counts
def load_ngram_counts(ngram_counts_file):
    """Load n-gram counts from a gzipped, tab-separated file.

    Each non-blank line must hold ``token<TAB>count``. Only tokens whose
    first character is lower-case are kept.

    Args:
        ngram_counts_file: Path to the gzip-compressed counts file.

    Returns:
        A ``defaultdict(int)`` mapping token -> count.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields, or the count is not an integer.
    """
    counts = defaultdict(int)
    # Explicit encoding so decoding does not depend on the platform default.
    with gzip.open(ngram_counts_file, 'rt', encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. at the end of the file)
            token, count = line.split('\t')
            if token[0].islower():
                counts[token] = int(count)
    return counts

## word_frequency_threshold (defined further below) finds the best frequency
## threshold by f-score, and uses this threshold to classify the training and
## development set

## Make feature matrix for word_frequency_threshold
def frequency_threshold_feature(words, threshold, counts):
    """Label each word complex (1) when its frequency is at most ``threshold``.

    Words missing from ``counts`` are treated as having frequency 0.

    Args:
        words: Sequence of words.
        threshold: Maximum frequency (inclusive) for a word to be labelled 1.
        counts: Mapping of word -> frequency. It is not modified.

    Returns:
        A two-element list ``[words, predictions]``.
    """
    # .get() instead of counts[w]: indexing a defaultdict would insert every
    # unseen word into the caller's mapping as a side effect.
    preds = [1 if counts.get(w, 0) <= threshold else 0 for w in words]
    return [words, preds]

def word_frequency_threshold(training_file, development_file, counts):
    """Pick a word-frequency threshold and score both datasets with it.

    Candidate thresholds are 100000 evenly spaced values between the
    smallest count and half of the largest count in ``counts``. The search
    stops at the first candidate that does not strictly improve the
    training f-score, so the result is the first local maximum found.

    Args:
        training_file: Path to the training dataset.
        development_file: Path to the development dataset.
        counts: Mapping of word -> frequency.

    Returns:
        A tuple ``(training_performance, development_performance)`` where
        each element is a list ``[precision, recall, fscore]``.
    """
    train_words, train_labels = load_file(training_file)
    dev_words, dev_labels = load_file(development_file)

    # Bounds for threshold optimization. Halving the upper bound is optional:
    # a threshold equal to the maximum count would match the all-complex baseline.
    l_B, u_B = min(counts.values()), max(counts.values()) // 2

    print(f"Trying Thresholds between {[l_B, u_B]}")

    thresh = 0
    top_score = 0
    for candidate in np.linspace(l_B, u_B, 100000):  # try 100000 thresholds
        candidate_preds = frequency_threshold_feature(
            train_words, threshold=candidate, counts=counts
        )[1]
        score = get_fscore(candidate_preds, train_labels)
        if not score > top_score:
            # Quick stopping: assumes the first local max is the global max.
            break
        top_score, thresh = score, candidate

    print(f"Best Threshold Found = {thresh}")
    print('-' * 20)
    train_preds = frequency_threshold_feature(train_words, threshold=thresh, counts=counts)[1]
    dev_preds = frequency_threshold_feature(dev_words, threshold=thresh, counts=counts)[1]

    print("Training Scores:")
    training_performance = list(test_predictions(train_preds, train_labels))

    print("\nDevelopment Scores:")
    development_performance = list(test_predictions(dev_preds, dev_labels))

    return training_performance, development_performance

### 2.4: Naive Bayes
        
## Trains a Naive Bayes classifier using length and frequency features
def naive_bayes(training_file, development_file, counts):
    """Placeholder for a Naive Bayes classifier on length and frequency features.

    The classifier has not been written yet. Once implemented, this is
    expected to return the development performance as a tuple
    ``(precision, recall, fscore)``.

    Args:
        training_file: Path to the training dataset.
        development_file: Path to the development dataset.
        counts: Mapping of word -> frequency.

    Raises:
        NotImplementedError: Always, until the classifier is implemented.
    """
    ## YOUR CODE HERE
    # Fail explicitly instead of with a NameError on undefined score variables.
    raise NotImplementedError("naive_bayes is not implemented yet")

### 2.5: Logistic Regression

## Trains a Logistic Regression classifier using length and frequency features
def logistic_regression(training_file, development_file, counts):
    """Placeholder for a logistic regression classifier on length and frequency features.

    The classifier has not been written yet. Once implemented, this is
    expected to return the development performance as a tuple
    ``(precision, recall, fscore)``.

    Args:
        training_file: Path to the training dataset.
        development_file: Path to the development dataset.
        counts: Mapping of word -> frequency.

    Raises:
        NotImplementedError: Always, until the classifier is implemented.
    """
    ## YOUR CODE HERE
    # Fail explicitly instead of with a NameError on undefined score variables.
    raise NotImplementedError("logistic_regression is not implemented yet")

In [11]:
# Paths to the complex-word dataset files
training_file = "data/complex_words_training.txt"
development_file = "data/complex_words_development.txt"
test_file = "data/complex_words_test_unlabeled.txt"

# (words, labels) of the training set, as returned by load_file
train_data = load_file(training_file)

# Gzipped n-gram counts, loaded into a token -> count mapping
ngram_counts_file = "ngram_counts.txt.gz"
counts = load_ngram_counts(ngram_counts_file)

In [150]:
# All-complex baseline on the training set; prints and returns [precision, recall, fscore]
all_complex(training_file)

Precision = 0.43275
Recall = 1.0
F-Score = 0.604083057058105


[0.43275, 1.0, 0.604083057058105]

In [182]:
# All-complex baseline on the development set; prints and returns [precision, recall, fscore]
all_complex(development_file)

Precision = 0.418
Recall = 1.0
F-Score = 0.5895627644569816


[0.418, 1.0, 0.5895627644569816]

In [183]:
# Word-length baseline: threshold chosen on the training set, scored on training and development sets
word_length_threshold(training_file, development_file)

Best Threshold Found = 7
--------------------
Training Scores:
Precision = 0.6007401315789473
Recall = 0.8440207972270364
F-Score = 0.7018976699495555

Development Scores:
Precision = 0.6053511705685619
Recall = 0.8660287081339713
F-Score = 0.7125984251968505


([0.6007401315789473, 0.8440207972270364, 0.7018976699495555],
 [0.6053511705685619, 0.8660287081339713, 0.7125984251968505])

In [184]:
# Word-frequency baseline: threshold chosen on the training set, scored on training and development sets
word_frequency_threshold(training_file, development_file, counts)

Trying Thresholds between [0, 23688414825]
Best Threshold Found = 10896779.787297873
--------------------
Training Scores:
Precision = 0.5999008428358948
Recall = 0.6990179087232813
F-Score = 0.6456776947705442

Development Scores:
Precision = 0.603515625
Recall = 0.7392344497607656
F-Score = 0.664516129032258


([0.5999008428358948, 0.6990179087232813, 0.6456776947705442],
 [0.603515625, 0.7392344497607656, 0.664516129032258])