In [None]:
import string
import random
from copy import deepcopy
from collections import Counter
from typing import Optional, List, Dict, Union
from itertools import combinations
from math import ceil, floor

import numpy as np
import pandas as pd

from cycler import cycler

from IPython.display import display, HTML

from matplotlib import pyplot as plt
from matplotlib.colors import to_hex

from sklearn import preprocessing, svm
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.utils.testing import ignore_warnings

from tqdm.notebook import tqdm_notebook as tqdm

tqdm.pandas()

from book_processor.Book import PROJ_ROOT


# Genres iterated over by the prediction helpers below.
NEW_GENRES = [
    "Adventure_Stories",
    "Fiction",
    "Historical_Fiction",
    "Love_Stories",
    "Mystery",
    "Poetry",
    "Science_Fiction",
    "Short_Stories",
]

# Every unordered genre pair, in the order itertools.combinations yields them.
GENRE_COMBS = list(combinations(NEW_GENRES, 2))

# Shared min-max scaler instance; process_and_scale() re-fits it on every call.
scale = preprocessing.MinMaxScaler()

#### Helper Methods

In [None]:
def display_df(df: pd.DataFrame, header: Optional[str] = None, max_rows: Optional[int] = None, 
               add_break: bool = False, formatters: Optional[Dict] = None, index: bool = False):
    """
    Render a DataFrame as an HTML table in the notebook output.

    - df: frame to render
    - header: raw HTML emitted before the table (e.g. "<h4>Title</h4>"), or None
    - max_rows: forwarded to ``DataFrame.to_html``
    - add_break: append a "<br>" after the table
    - formatters: per-column formatter mapping forwarded to ``DataFrame.to_html``
    - index: whether the index is rendered
    """
    html_parts = []
    if header is not None:
        html_parts.append(header)

    # The original branched on the builtin ``format`` (always truthy) rather than
    # on ``formatters``. ``formatters=None`` is the pandas default, so it can be
    # forwarded unconditionally.
    html_parts.append(df.to_html(index=index, max_rows=max_rows, formatters=formatters))

    if add_break:
        html_parts.append("<br>")
    display(HTML("".join(html_parts)))
    

def setup_axis(ax, xmin: Optional[Union[int, float]] = 0, xmax: Optional[Union[int, float]] = 1,
               ymin: Union[int, float] = 0, ymax: Union[int, float] = 1,
               xlabel: str = "", ylabel: str = "", **kwargs):
    """
    Configure ticks, tick-label sizes, axis labels, limits and grid lines on ``ax``.

    X ticks are handled in one of three ways:
    - ``x_ticklabels`` absent and ``xmin`` is not None: ticks are computed from
      ``xmin``/``xmax`` (see ``xmajor``/``xminor`` below).
    - ``x_ticklabels`` given: ticks are placed at ``np.arange(xmin, len(x_ticklabels))``
      and labelled with the given values.
    - neither of the above (no labels and ``xmin`` is None): x ticks are left untouched.

    Y ticks are always ``ymajor``/``yminor`` evenly spaced intervals between ``ymin`` and ``ymax``.

    Recognised keyword arguments and their defaults:
    - xmajor: int = 20 (number of major intervals when xmin >= 0; when xmin < 0 the
      default is ``xmax * 4`` and the major tick step is ``xmax / xmajor``)
    - xminor: int = 100 (when xmin < 0 the default is ``xmajor * 10``)
    - ymajor: int = 20
    - yminor: int = 100
    - x_ticklabels: List = None
    - x_ticklabel_size: Union[int, float] = 20
    - y_ticklabel_size: Union[int, float] = 24
    - xlabel_size: Union[int, float] = 28 (0 when ``xlabel`` is empty)
    - xlabel_pad: Union[int, float] = 20 (None when ``xlabel`` is empty)
    - ylabel_size: Union[int, float] = 32
    - ylabel_pad: Union[int, float] = 30
    - left: Union[int, float] = None
    - right: Union[int, float] = None
    - bottom: Union[int, float] = None
    - top: Union[int, float] = None
    - grid: str = "--" (line style of the major grid)
    - minor_grid: str = "none" (line style of the minor grid)
    """
    
    if "x_ticklabels" not in kwargs and xmin is not None:
        if xmin < 0:
            # Negative range: tick spacing is derived from xmax; the major ticks only
            # start at xmin when at least 40 major intervals are requested, else at 0.
            xmajor = kwargs.get("xmajor", xmax * 4)
            ax.set_xticks(np.arange(xmin if xmajor >= 40 else 0, xmax + 1, xmax / xmajor))
            
            xminor = kwargs.get("xminor", xmajor * 10)
            ax.set_xticks(np.arange(xmin, xmax + 1, xmax / xminor), minor=True)
        else:
            # Non-negative range: evenly spaced ticks including both end points.
            ax.set_xticks(np.linspace(xmin, xmax, kwargs.get("xmajor", 20) + 1))
            ax.set_xticks(np.linspace(xmin, xmax, kwargs.get("xminor", 100) + 1), minor=True)
    elif "x_ticklabels" in kwargs:
        # Categorical axis: one tick per supplied label, starting at xmin.
        ax.set_xticks(np.arange(xmin, len(kwargs["x_ticklabels"])))
        ax.set_xticklabels(kwargs["x_ticklabels"])
    
    ax.set_yticks(np.linspace(ymin, ymax, kwargs.get("ymajor", 20) + 1))
    ax.set_yticks(np.linspace(ymin, ymax, kwargs.get("yminor", 100) + 1), minor=True)
    
    ax.tick_params(axis="x", labelsize=kwargs.get("x_ticklabel_size", 20))
    ax.tick_params(axis="y", labelsize=kwargs.get("y_ticklabel_size", 24))
    
    # An empty x label gets font size 0 and no explicit padding unless overridden.
    xlabel_size = kwargs.get("xlabel_size", 0 if xlabel == "" else 28)
    xlabel_pad = kwargs.get("xlabel_pad", None if xlabel == "" else 20)
    ax.set_xlabel(xlabel, fontsize=xlabel_size, labelpad=xlabel_pad)
    ax.set_ylabel(ylabel, fontsize=kwargs.get("ylabel_size", 32), labelpad=kwargs.get("ylabel_pad", 30))
    
    # Axis limits are only touched for the sides that were explicitly requested.
    if "left" in kwargs:
        ax.set_xlim(left=kwargs["left"])
    if "right" in kwargs:
        ax.set_xlim(right=kwargs["right"])
    if "bottom" in kwargs:
        ax.set_ylim(bottom=kwargs["bottom"])
    if "top" in kwargs:
        ax.set_ylim(top=kwargs["top"])
    
    ax.grid(linestyle=kwargs.get("grid", "--"))
    ax.grid(linestyle=kwargs.get("minor_grid", "none"), which="minor")


def auto_label(ax, fontsize: int = 12):
    """
    Write each bar's height (3 decimals) as a text label on ``ax``.

    The label sits slightly above the bar top, or slightly below it when the bar
    fills more than 95% of the (padded) y-range. Labels whose vertical position
    is not above 0.1 are not drawn.
    """
    bars = ax.patches
    y_bottom, y_top = ax.get_ylim()
    y_height = y_top - y_bottom + 0.05

    for bar in bars:
        height = bar.get_height()
        fills_axis = (height / y_height) > 0.95
        label_position = (height - (y_height * 0.05)) if fills_axis else (height + (y_height * 0.01))

        if label_position > 0.1:
            x_center = bar.get_x() + bar.get_width() / 2.
            ax.text(x_center, label_position, "{0:.3f}".format(height),
                    ha='center', va='bottom', fontsize=fontsize)
            

def process_and_scale(data: Union[List, pd.DataFrame], n_cols: int = 5, book_nums: Optional[pd.Series] = None):
    """
    Min-max scale the feature columns of ``data`` and build a condensed preview.

    - data: list of per-book records (missing values become 0) or a DataFrame
    - n_cols: number of leading/trailing columns kept in the preview
    - book_nums: when given, inserted as the "Book #" column and the
      "_genre"/"_outcome" columns are renamed to "@Genre"/"@Outcome"

    Returns ``(scaled_df, preview_df)``; in the scaled frame "Book #" and "@Genre"
    are the first two columns and "@Outcome" is the last.
    """
    id_cols = ["Book #", "@Genre", "@Outcome"]

    frame = pd.DataFrame(data).fillna(0) if isinstance(data, List) else data.reset_index(drop=True)

    if book_nums is not None:
        frame.insert(0, "Book #", book_nums.reset_index(drop=True))
        frame = frame.rename(columns={"_genre": "@Genre", "_outcome": "@Outcome"})

    # Set the identifying columns aside so only numeric features reach the scaler.
    nominal = frame[id_cols]
    features = frame.drop(columns=id_cols)

    scaled = pd.DataFrame(scale.fit_transform(features), columns=features.columns)
    scaled["@Outcome"] = nominal["@Outcome"]
    scaled.insert(0, "@Genre", nominal["@Genre"])
    scaled.insert(0, "Book #", nominal["Book #"])

    return scaled, get_display_df(scaled, n_cols)


def get_display_df(df: pd.DataFrame, n_cols: int = 5):
    """
    Build a preview of ``df``: its first ``n_cols`` columns, a literal "..."
    separator column, then its last ``n_cols`` columns. ``df`` is not modified.
    """
    leading = df.iloc[:, :n_cols].assign(**{"...": "..."})
    trailing = df.iloc[:, -n_cols:]
    return pd.concat([leading, trailing], axis=1)


def remove_numbers(df: pd.DataFrame):
    """
    Return a copy of ``df`` without the columns whose name starts with optional
    ASCII letters immediately followed by at least one digit (e.g. "123", "a1",
    "12th"). ``df`` itself is left untouched.
    """
    # Imported locally because the notebook only imports ``re`` in a later cell.
    import re

    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    numeric_name = re.compile(r"[A-Za-z]*\d+[A-Za-z]*", re.IGNORECASE)
    drop_cols = [c for c in df.columns if numeric_name.match(c)]
    return df.drop(columns=drop_cols)


def tfi_ngram(df_temp: pd.DataFrame, uni: bool = False, bi: bool = False):
    """
    Vectorise the "first_1k" text column of ``df_temp`` into word n-gram counts.

    - uni: count single words (takes precedence when both flags are set)
    - bi: count word bigrams

    Returns a DataFrame with one row per input row and one column per n-gram,
    after ``remove_numbers`` has dropped the number-like columns.

    Raises ValueError when neither ``uni`` nor ``bi`` is set.
    """
    if uni:
        vectorizer = CountVectorizer(analyzer="word")
    elif bi:
        vectorizer = CountVectorizer(analyzer="word", ngram_range=(2, 2))
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError("tfi_ngram requires either uni=True or bi=True")

    counts = vectorizer.fit_transform(df_temp["first_1k"])

    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); use whichever this installation provides.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # toarray() yields a plain ndarray instead of the legacy np.matrix from todense().
    ngram_counts = pd.DataFrame(counts.toarray(), columns=feature_names)
    return remove_numbers(ngram_counts)


def process_weights(model_weights: Dict, display: bool = True):
    """
    Collapse the per-fold weight frames of every key into one summary frame.

    For each key the list of one-row frames is stacked, averaged column-wise and
    turned into a ["Feature", "Weight"] frame sorted by descending weight. The
    dict is updated in place and also returned.

    - display: when true, show the top 10 rows of each summary via ``display_df``
    """
    for key, fold_frames in model_weights.items():
        stacked = pd.concat(fold_frames).reset_index(drop=True)

        summary = stacked.mean(axis=0).reset_index()
        summary.columns = ["Feature", "Weight"]
        summary = summary.sort_values(by=["Weight"], ascending=False).reset_index(drop=True)
        model_weights[key] = summary

        if display:
            display_df(summary, f"<h4>{key} Feature Weights</h4>", 10, True)

    return model_weights


def predict_success(model_df: pd.DataFrame, model_name: str, genre_list: List[str] = NEW_GENRES, **kwargs):
    """
    Predict book success per genre and summarise accuracies and feature weights.

    Keyword arguments:
    - add_to_acc: Dict = None (when given, receives ``{model_name: accuracies}``
      without the "Average" row)
    - disp_acc = True (display the accuracy table)
    - disp_weights = True (display the per-genre weight tables)
    - searching = False (forwarded to the training helpers)
    - show_pbar = True (show a header and a progress bar)

    Returns ``(accuracies, weights)``: ``accuracies`` has one row per genre plus
    a final "Average" row; ``weights`` maps each genre to its summary frame.
    """
    accuracies = []
    weights = {genre: [] for genre in genre_list}

    if kwargs.get("show_pbar", True):
        display(HTML(f"<h4>Predicting book success with {model_name} data...</h4>"))
        # One tick per cross-validation fold (5 folds per genre).
        bar_length = len(genre_list) * 5
        with tqdm(total=bar_length) as pbar:
            _predict_success(model_df, model_name, accuracies, weights, genre_list, pbar, **kwargs)
    else:
        _predict_success(model_df, model_name, accuracies, weights, genre_list, **kwargs)

    accuracies = pd.DataFrame(accuracies)
    weights = process_weights(weights, display=kwargs.get("disp_weights", True))

    # The original referenced an undefined name ``add_to_acc`` here, raising a
    # NameError whenever the keyword was supplied.
    add_to_acc = kwargs.get("add_to_acc", None)
    if add_to_acc is not None:
        add_to_acc.update({model_name: accuracies})

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    average_row = pd.DataFrame([{"Genre": "Average", "Accuracy": accuracies["Accuracy"].mean()}])
    accuracies = pd.concat([accuracies, average_row], ignore_index=True)
    if kwargs.get("disp_acc", True):
        display_df(accuracies, f"<h4>{model_name} Accuracies by Genre</h4>")

    return accuracies, weights


def _predict_success(model_df: pd.DataFrame, model_name: str, accs: List, ws: Dict, genre_list: List[str] = NEW_GENRES, pbar: Optional = None, **kwargs):
    """
    Run the cross-validated "@Outcome" prediction once per genre.

    Appends ``{"Genre", "Accuracy"}`` records to ``accs`` and lets ``_train_test``
    append the per-fold weights to ``ws[genre]``.

    Keyword arguments:
    - searching = False
    """
    for genre in genre_list:
        if pbar is not None:
            pbar.set_postfix_str(f" -- {genre}")

        genre_rows = model_df.loc[model_df["@Genre"] == genre]
        fold_scores = _train_test(genre_rows, model_name, ws, genre, "@Outcome", pbar, **kwargs)

        accs.append({"Genre": genre, "Accuracy": np.array(fold_scores).mean()})


def predict_genre(model_df: pd.DataFrame, model_name: str, how: str = "one_v_one", genre_list: List = GENRE_COMBS, **kwargs):
    """
    Run binary genre prediction and summarise accuracies and feature weights.

    - how: name of the module-level strategy function to run ("one_v_one" or
      "one_v_all"); it is looked up in ``globals()``
    - genre_list: items handed to the strategy (genre pairs for "one_v_one",
      single genres for "one_v_all")

    Keyword arguments:
    - add_to_acc: Dict = None (when given, receives ``{model_name: accuracies}``
      without the "Average" row)
    - disp_acc = True
    - disp_weights = True
    - searching = False
    - show_pbar = True

    Returns ``(accuracies, weights)``; ``accuracies`` ends with an "Average" row.
    """
    accuracies = []
    weights = {genre: [] for genre in genre_list}
    strategy = globals()[how]

    if kwargs.get("show_pbar", True):
        display(HTML(f"<h4>Performing {how} binary genre prediction with {model_name} data...</h4>"))
        # One tick per cross-validation fold (5 folds per list entry).
        bar_length = len(genre_list) * 5
        with tqdm(total=bar_length) as pbar:
            strategy(model_df, model_name, accuracies, weights, genre_list, pbar, **kwargs)
    else:
        strategy(model_df, model_name, accuracies, weights, genre_list, **kwargs)

    accuracies = pd.DataFrame(accuracies)
    weights = process_weights(weights, display=kwargs.get("disp_weights", True))

    # The original referenced an undefined name ``add_to_acc`` here, raising a
    # NameError whenever the keyword was supplied.
    add_to_acc = kwargs.get("add_to_acc", None)
    if add_to_acc is not None:
        add_to_acc.update({model_name: accuracies})

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    average_row = pd.DataFrame([{"Genre": "Average", "Accuracy": accuracies["Accuracy"].mean()}])
    accuracies = pd.concat([accuracies, average_row], ignore_index=True)
    if kwargs.get("disp_acc", True):
        display_df(accuracies, f"<h4>{model_name} Accuracies by Genre</h4>")

    return accuracies, weights

def one_v_one(model_df: pd.DataFrame, model_name: str, accs: List, ws: Dict, genre_list: List, pbar: Optional = None, **kwargs):
    """
    For every genre pair in ``genre_list``, train/test a classifier that tells the
    two genres apart. Accuracy records go to ``accs``; fold weights are appended
    to ``ws[(g1, g2)]`` by ``_train_test``.
    """
    for g1, g2 in genre_list:
        if pbar is not None:
            pbar.set_postfix_str(f" -- {g1}, {g2}")

        is_first = model_df["@Genre"] == g1
        is_second = model_df["@Genre"] == g2
        pair_rows = model_df[is_first | is_second].reset_index(drop=True)

        fold_scores = _train_test(pair_rows, model_name, ws, (g1, g2), "@Genre", pbar, **kwargs)
        accs.append({"Genre": (g1, g2), "Accuracy": np.array(fold_scores).mean()})


def one_v_all(model_df: pd.DataFrame, model_name: str, accs: List, ws: Dict, genre_list: List, pbar: Optional = None, **kwargs):
    """
    For every genre in ``genre_list``, train/test a classifier separating that
    genre from a random sample of books from all other genres.

    The negative class is relabelled "not <genre>" and sampled to the size of the
    positive class (or to all remaining rows if there are fewer). Sampling uses
    the ``random`` module's global state, so results depend on its seed.
    Accuracy records go to ``accs``; fold weights go to ``ws[genre]``.
    """
    for genre in genre_list:
        if pbar is not None:
            pbar.set_postfix_str(f" -- {genre}")

        gtemp = model_df[model_df["@Genre"] == genre].reset_index(drop=True)
        not_gtemp = model_df[model_df["@Genre"] != genre].reset_index(drop=True)

        # random.sample raises ValueError when asked for more items than exist,
        # which happened whenever the genre outnumbered all remaining books.
        n_negative = min(len(gtemp), len(not_gtemp))
        sample_idx = random.sample(range(0, len(not_gtemp)), k=n_negative)

        not_gtemp = not_gtemp.iloc[sample_idx].reset_index(drop=True)
        not_gtemp["@Genre"] = f"not {genre}"

        # ignore_index avoids duplicate index labels in the combined frame.
        df_temp = pd.concat([gtemp, not_gtemp], ignore_index=True)
        mean_acc = _train_test(df_temp, model_name, ws, genre, "@Genre", pbar, **kwargs)

        accuracy = np.array(mean_acc).mean()
        accs.append({"Genre": genre, "Accuracy": accuracy})


def _train_test(temp: pd.DataFrame, model_name: str, ws: Dict, wkey: Union[str, tuple], pred_col: str, pbar: Optional = None, **kwargs):
    """
    5-fold cross-validate a classifier on ``temp`` predicting ``pred_col``.

    - temp: rows to train/test on
    - model_name: selects how features are derived (see ``get_df_by_name``)
    - ws / wkey: each fold's coefficients are appended to ``ws[wkey]`` as a
      one-row frame with one column per feature
    - pbar: optional progress bar, advanced once per fold
    - searching (kwarg) = False: forwarded to ``get_df_by_name``

    Returns the list of per-fold accuracies.
    """
    features = get_df_by_name(temp, model_name, searching=kwargs.get("searching", False))
    labels = preprocessing.LabelEncoder().fit_transform(temp[pred_col])

    # Fixed random_state keeps the fold assignment repeatable between runs.
    splitter = KFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = []
    for train_idx, test_idx in splitter.split(labels):
        preds, clf = predict(features.iloc[train_idx], features.iloc[test_idx], labels[train_idx])

        fold_weights = pd.DataFrame(clf.coef_.ravel(), index=features.columns).transpose()
        ws[wkey].append(fold_weights)

        fold_scores.append(np.mean(preds == labels[test_idx]))

        if pbar is not None:
            pbar.update(1)

    return fold_scores


def get_df_by_name(temp: pd.DataFrame, model_name: str, searching: bool = False):
    """
    Return the feature matrix for ``temp`` according to ``model_name``.

    - "Unigram" / "Bigram": n-gram counts built from the text via ``tfi_ngram``
    - "POS": ``temp`` without the identifying columns and without the
      "CD", "$", "``" and "''" tag columns
    - anything else, or any model when ``searching`` is true: ``temp`` without
      the identifying columns only
    """
    id_cols = ["Book #", "@Genre", "@Outcome"]

    if not searching:
        if model_name == "Unigram":
            return tfi_ngram(temp, uni=True)
        if model_name == "Bigram":
            return tfi_ngram(temp, bi=True)
        if model_name == "POS":
            return temp.drop(columns=id_cols + ["CD", "$", "``", "''"])

    return temp.drop(columns=id_cols)


def get_ens_str(uni: bool = False, roget: bool = False, liwc: bool = False, pos: bool = False, nrc: bool = False, wn: bool = False):
    """
    Build the underscore-joined name of an ensemble from its enabled feature sets.

    The order is always Unigram, Roget, LIWC, POS, NRC, WordNet regardless of how
    the flags were passed; with no flag set the result is an empty string.
    """
    flagged_labels = (
        (uni, "Unigram"),
        (roget, "Roget"),
        (liwc, "LIWC"),
        (pos, "POS"),
        (nrc, "NRC"),
        (wn, "WordNet"),
    )
    return "_".join(label for enabled, label in flagged_labels if enabled)


@ignore_warnings(category=ConvergenceWarning)
def predict(x_train, x_test, y_train):
    """
    Fit a default ``LinearSVC`` on the training data and predict the test rows.

    Returns ``(predictions, fitted_estimator)``. Convergence warnings raised
    while fitting are suppressed by the decorator.
    """
    classifier = svm.LinearSVC().fit(x_train, y_train)
    return classifier.predict(x_test), classifier


def create_cmap(cmap, items: List, as_dict: bool = True):
    """
    Sample ``cmap`` at ``len(items)`` evenly spaced points in [0, 1] and convert
    the colours to hex strings.

    Returns ``{item: hex}`` when ``as_dict`` is true, otherwise the list of hex
    strings in item order.
    """
    sampled = cmap(np.linspace(0, 1, len(items)))
    hex_codes = []
    for rgba in cycler("color", sampled).by_key()["color"]:
        hex_codes.append(to_hex(rgba))

    if as_dict:
        return dict(zip(items, hex_codes))
    return hex_codes

# Load and Process Data

In [None]:
import os
from zipfile import ZipFile
import re
import pickle
from roget.roget_thesaurus import RogetThesaurus

# Archive holding one pickled list of books per genre.
z = ZipFile(str(PROJ_ROOT.joinpath("data", "books_by_genre.zip")))
namelist = z.namelist()

# Map genre name -> unpickled books; the genre is the text between "all_" and
# "_books" in the member name.
# NOTE: pickle.load can execute arbitrary code -- only use archives from a trusted source.
all_books = {}
for path in namelist:
    genre_match = re.search("(?<=all_).*(?=_books)", path)
    if genre_match is None:
        # Members that do not follow the naming scheme (e.g. directory entries)
        # previously crashed with a TypeError on ``None[0]``.
        continue
    # Close each member handle as soon as its content has been unpickled.
    with z.open(path) as member:
        all_books[genre_match[0]] = pickle.load(member)

roget_thesaurus = RogetThesaurus(PROJ_ROOT.joinpath("roget", "roget_thesaurus.csv"))

# Filled by the experiments below: model/ensemble name -> accuracy frame.
ACCURACIES = {}

In [None]:
# Books skipped during extraction.
EXCLUDED_BOOK_NUMBERS = {"19513", "19640", "19678", "19782", "19836"}

# "chapter" followed by a roman numeral or a single word.
# The original call passed re.IGNORECASE as the 4th positional argument of
# re.sub, which is ``count`` -- so matching was case-sensitive and limited to
# two replacements. Compiling the pattern with the flag fixes both.
CHAPTER_HEADING = re.compile(r"chapter ([ivx]+\s+|\w+\s+?)", re.IGNORECASE)

# string.punctuation includes "_", so no separate underscore removal is needed.
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

unigram_data = []
bigram_data = []

for genre, books in all_books.items():
    print(f"Extracting data from {genre} books...")
    for book in books:
        if book.book_number in EXCLUDED_BOOK_NUMBERS:
            continue

        _outcome = book.success

        # NOTE(review): sentences are joined without a separator -- confirm each
        # sentence carries its own trailing whitespace.
        first_1k = "".join(book.first_1k_sentences)
        first_1k = first_1k.translate(PUNCTUATION_TABLE)
        first_1k = CHAPTER_HEADING.sub("", first_1k)

        # Both models start from the same record; each list gets its own dict.
        unigram_temp = {"Book #": book.book_number, "@Genre": genre, "first_1k": first_1k, "@Outcome": _outcome}
        unigram_data.append(unigram_temp)

        bigram_temp = dict(unigram_temp)
        bigram_data.append(bigram_temp)

In [None]:
# Frame of per-book text records; the text column is forced to str and only
# its first 100 characters are shown in the preview.
unigram_df = pd.DataFrame(unigram_data).astype({"first_1k": str})
display_df(unigram_df, header="<h4>Unigram Data</h4>", max_rows=6,
           formatters={"first_1k": lambda text: text[:100] + "..."})

In [None]:
# Frame of per-book text records; the text column is forced to str and only
# its first 100 characters are shown in the preview.
bigram_df = pd.DataFrame(bigram_data).astype({"first_1k": str})
display_df(bigram_df, header="<h4>Bigram Data</h4>", max_rows=6,
           formatters={"first_1k": lambda text: text[:100] + "..."})


In [None]:
def _load_feature_pickle(genre: str, feature: str):
    """Unpickle ``data/<genre>_<feature>_data.txt`` and close the file afterwards."""
    # Read-only mode is sufficient; the original opened every file with "rb+"
    # (read/write) and never closed it.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(PROJ_ROOT.joinpath("data", f"{genre}_{feature}_data.txt"), "rb") as fh:
        return pickle.load(fh)


pos_data = []
roget_data = []
wordnet_data = []
liwc_data = []
swn_data = []
nrc_data = []

# Concatenate the per-genre record lists in the iteration order of all_books.
for genre in all_books.keys():
    pos_data += _load_feature_pickle(genre, "pos")
    roget_data += _load_feature_pickle(genre, "roget")
    wordnet_data += _load_feature_pickle(genre, "wordnet")
    liwc_data += _load_feature_pickle(genre, "liwc")
    swn_data += _load_feature_pickle(genre, "swn")
    nrc_data += _load_feature_pickle(genre, "nrc")

In [None]:
# Scale the POS features and preview the leading/trailing columns.
pos_df_scaled, to_display = process_and_scale(data=pos_data, book_nums=unigram_df["Book #"])
display_df(to_display, header="<h4>POS Data</h4>", max_rows=10)

In [None]:
# Scale the Roget features and preview the leading/trailing columns.
roget_df_scaled, to_display = process_and_scale(data=roget_data, book_nums=unigram_df["Book #"])
display_df(to_display, header="<h4>Roget Data</h4>", max_rows=10)

In [None]:
# Scale the WordNet features and preview the leading/trailing columns.
wordnet_df_scaled, to_display = process_and_scale(data=wordnet_data, book_nums=unigram_df["Book #"])
display_df(to_display, header="<h4>WordNet Data</h4>", max_rows=10)

In [None]:
# Scale the LIWC features and preview the leading/trailing columns.
liwc_df_scaled, to_display = process_and_scale(data=liwc_data, book_nums=unigram_df["Book #"])
display_df(to_display, header="<h4>LIWC Data</h4>", max_rows=10)


In [None]:
# Scale the SentiWordNet features and preview the leading/trailing columns.
swn_df_scaled, to_display = process_and_scale(data=swn_data, book_nums=unigram_df["Book #"])
display_df(to_display, header="<h4>SentiWordNet Data</h4>", max_rows=10)


In [None]:
# Scale the NRC features.
# NOTE(review): unlike the sibling cells, this displays the full scaled frame
# (capped at 10 rows) and leaves `to_display` unused -- confirm that is intended.
nrc_df_scaled, to_display = process_and_scale(nrc_data, book_nums=unigram_df["Book #"])
display_df(nrc_df_scaled, "<h4>NRC Sentiment Data</h4>", 10)

#### Ensemble Method Definition

In [None]:
def _ensemble_models(genre: str, uni: bool, roget: bool, liwc: bool, pos: bool, nrc: bool, wn: bool):
    """Build the (features, encoded outcome) pair of every enabled feature set for one genre."""
    encoder = preprocessing.LabelEncoder()
    id_cols = ["Book #", "@Genre", "@Outcome"]
    models = []

    if uni:
        unigram_rows = unigram_df[unigram_df["@Genre"] == genre]
        models.append((tfi_ngram(unigram_rows, uni=True), encoder.fit_transform(unigram_rows["@Outcome"])))

    # (enabled?, prefix of the global ``<prefix>_df_scaled`` frame, extra columns to drop).
    # Frames are looked up lazily so only the enabled ones have to exist.
    scaled_sets = [
        (roget, "roget", []),
        (liwc, "liwc", []),
        (pos, "pos", ["CD", "$", "``", "''"]),
        (nrc, "nrc", []),
        (wn, "wordnet", []),
    ]
    for enabled, prefix, extra_cols in scaled_sets:
        if not enabled:
            continue
        scaled_df = globals()[f"{prefix}_df_scaled"]
        genre_rows = scaled_df[scaled_df["@Genre"] == genre]
        models.append((genre_rows.drop(columns=id_cols + extra_cols), encoder.fit_transform(genre_rows["@Outcome"])))

    return models


@ignore_warnings(category=ConvergenceWarning)
def ensemble(uni: bool = False, roget: bool = False, liwc: bool = False, pos: bool = False, nrc: bool = False, wn: bool = False):
    """
    Predict book success per genre by combining one LinearSVC per enabled feature set.

    Each flag enables one feature set (unigram counts, Roget, LIWC, POS, NRC,
    WordNet). For every genre in NEW_GENRES the models are evaluated with 5-fold
    cross-validation and their predictions are combined per test row:
    - one model: its predictions are used directly
    - two models: round(mean(pred_i * P_i(class 1))) using ``_predict_proba_lr``
    - three or more: majority vote (on a tie the label seen first wins)

    All feature sets are assumed to list the same books in the same order per
    genre; the outcome labels of the first enabled set are used for splitting
    and scoring.

    The accuracy frame (columns "Genre", "Accuracy") is stored in ACCURACIES
    under the name from ``get_ens_str``, displayed, and returned.

    Raises ValueError when no feature set is enabled.
    """
    if not any((uni, roget, liwc, pos, nrc, wn)):
        # Previously this surfaced as an IndexError deep inside the loop.
        raise ValueError("ensemble requires at least one feature set to be enabled")

    ens_acc = []
    ens_str = get_ens_str(uni, roget, liwc, pos, nrc, wn)

    for genre in NEW_GENRES:
        models = _ensemble_models(genre, uni, roget, liwc, pos, nrc, wn)
        y_data = models[0][1]

        kf = KFold(n_splits=5, shuffle=True, random_state=0)
        mean_acc = []

        for train_index, test_index in kf.split(y_data):
            preds = []
            probs = []
            for x, y in models:
                estimator = svm.LinearSVC()
                estimator.fit(x.iloc[train_index], y[train_index])

                x_test = x.iloc[test_index]
                preds.append(estimator.predict(x_test))

                # Probabilities are only needed by the two-model combination rule.
                if len(models) == 2:
                    probs.append(estimator._predict_proba_lr(x_test))

            if len(models) == 1:
                # A single model previously crashed on ``preds[1]``.
                ens = list(preds[0])
            elif len(models) == 2:
                ens = [
                    round(((pred1 * prob1[1]) + (pred2 * prob2[1])) / 2)
                    for pred1, prob1, pred2, prob2 in zip(preds[0], probs[0], preds[1], probs[1])
                ]
            else:
                ens = []
                for pred in zip(*preds):
                    counter = Counter(pred)
                    ens.append(max(counter, key=counter.get))

            score = np.mean(np.asarray(ens) == y_data[test_index])
            mean_acc.append(score)

        acc = np.array(mean_acc).mean()
        ens_acc.append({"Genre": genre, "Accuracy": acc})

    ens_acc = pd.DataFrame(ens_acc)
    ACCURACIES.update({ens_str: ens_acc})
    display_df(ens_acc, f"<h4>{ens_str} Accuracy by Genre</h4>")
    return ens_acc

#### Unigram

In [None]:
# Per-genre success prediction from unigram counts.
unigram_acc, uni_weights = predict_success(model_df=unigram_df, model_name="Unigram")

#### Bigram

In [None]:
# Per-genre success prediction from bigram counts.
bigram_acc, bigram_weights = predict_success(model_df=bigram_df, model_name="Bigram")

#### POS

In [None]:
# Per-genre success prediction from the scaled POS features.
pos_acc, pos_weights = predict_success(model_df=pos_df_scaled, model_name="POS")

#### Roget

In [None]:
# Per-genre success prediction from the scaled Roget features.
roget_acc, roget_weights = predict_success(model_df=roget_df_scaled, model_name="Roget")

#### WordNet

In [None]:
# Per-genre success prediction from the scaled WordNet features.
wordnet_acc, wordnet_weights = predict_success(model_df=wordnet_df_scaled, model_name="WordNet")

#### LIWC

In [None]:
# Per-genre success prediction from the scaled LIWC features.
liwc_acc, liwc_weights = predict_success(model_df=liwc_df_scaled, model_name="LIWC")

#### SentiWordNet

In [None]:
# Per-genre success prediction from the scaled SentiWordNet features.
swn_acc, swn_weights = predict_success(model_df=swn_df_scaled, model_name="SentiWordNet")

#### NRC Sentiment Emotion Lexicons

In [None]:
# Per-genre success prediction from the scaled NRC features.
nrc_acc, nrc_weights = predict_success(model_df=nrc_df_scaled, model_name="NRC Sentiment")

#### Context Free Grammar

In [None]:
# TODO: Implement the context-free grammar feature experiment (not done yet).

#### Unigram Roget

In [None]:
# Unigram + Roget ensemble; also stored in ACCURACIES under "Unigram_Roget".
uni_roget_acc = ensemble(uni=True, roget=True)

#### Unigram Roget WordNet

In [None]:
# Unigram + Roget + WordNet ensemble; also stored in ACCURACIES under "Unigram_Roget_WordNet".
uni_roget_wn_ens_acc = ensemble(uni=True, roget=True, wn=True)

#### Unigram Roget LIWC

In [None]:
# Unigram + Roget + LIWC ensemble; also stored in ACCURACIES under "Unigram_Roget_LIWC".
uni_roget_liwc_ens_acc = ensemble(uni=True, roget=True, liwc=True)

#### Unigram Roget LIWC Sentiment

In [None]:
# Unigram + Roget + LIWC + NRC ensemble; also stored in ACCURACIES under "Unigram_Roget_LIWC_NRC".
uni_roget_liwc_nrc_ens_acc = ensemble(uni=True, roget=True, liwc=True, nrc=True)

#### Unigram POS

In [None]:
# Unigram + POS ensemble; also stored in ACCURACIES under "Unigram_POS".
uni_pos_acc = ensemble(uni=True, pos=True)

#### Unigram POS Roget

In [None]:
# Unigram + POS + Roget ensemble. get_ens_str uses a fixed order, so the
# ACCURACIES key is "Unigram_Roget_POS".
uni_pos_roget_acc = ensemble(uni=True, pos=True, roget=True)

#### Unigram POS Roget LIWC

In [None]:
# Unigram + POS + Roget + LIWC ensemble. get_ens_str uses a fixed order, so the
# ACCURACIES key is "Unigram_Roget_LIWC_POS".
uni_pos_roget_liwc_acc = ensemble(uni=True, pos=True, roget=True, liwc=True)

#### Unigram POS Roget LIWC Sentiment

In [None]:
# Unigram + POS + Roget + LIWC + NRC ensemble. get_ens_str uses a fixed order,
# so the ACCURACIES key is "Unigram_Roget_LIWC_POS_NRC".
uni_pos_roget_liwc_nrc_acc = ensemble(uni=True, pos=True, roget=True, liwc=True, nrc=True)

#### Roget LIWC

In [None]:
# Roget + LIWC ensemble; also stored in ACCURACIES under "Roget_LIWC".
roget_liwc_acc = ensemble(roget=True, liwc=True)

#### Roget Sentiment

In [None]:
# Roget + NRC ensemble; also stored in ACCURACIES under "Roget_NRC".
roget_nrc_acc = ensemble(roget=True, nrc=True)

#### Roget LIWC Sentiment

In [None]:
# Roget + LIWC + NRC ensemble; also stored in ACCURACIES under "Roget_LIWC_NRC".
roget_liwc_nrc_acc = ensemble(roget=True, liwc=True, nrc=True)

#### LIWC Sentiment

In [None]:
# LIWC + NRC ensemble; also stored in ACCURACIES under "LIWC_NRC".
liwc_nrc_acc = ensemble(liwc=True, nrc=True)

In [None]:
def get_df_for_reduction(model_name: str, g: Optional[str] = None, g2: Optional[str] = None):
    """
    Return the full feature frame (identifying columns included) for a model,
    optionally restricted to one or two genres.

    - model_name: "Unigram" / "Bigram" build n-gram counts from ``unigram_df`` /
      ``bigram_df``; any other name is resolved to the module-level frame
      ``<model_name.lower()>_df_scaled`` ("SentiWordNet" and "NRC Sentiment"
      map to ``swn_df_scaled`` and ``nrc_df_scaled``)
    - g: None selects all genres (``g2`` is then ignored); otherwise the genre to keep
    - g2: optional second genre kept in addition to ``g``

    The result has "Book #" and "@Genre" as its first columns and "@Outcome" as
    its last. Raises KeyError when no matching ``*_df_scaled`` frame exists.
    """
    def _select(frame: pd.DataFrame) -> pd.DataFrame:
        # Restrict ``frame`` to the requested genre(s); no restriction when g is None.
        if g is None:
            return frame
        if g2 is None:
            return frame[frame["@Genre"] == g]
        return frame[(frame["@Genre"] == g) | (frame["@Genre"] == g2)]

    if model_name in ("Unigram", "Bigram"):
        is_unigram = model_name == "Unigram"
        subset = _select(unigram_df if is_unigram else bigram_df)

        # The original Bigram branch for ``g is None`` filtered on
        # ``@Genre == None`` and therefore vectorised an empty frame; both
        # n-gram models now share this single code path.
        meta = subset[["Book #", "@Genre", "@Outcome"]].copy().reset_index(drop=True)
        data = tfi_ngram(subset, uni=is_unigram, bi=not is_unigram)
        data.insert(0, "@Genre", meta["@Genre"])
        data.insert(0, "Book #", meta["Book #"])
        data["@Outcome"] = meta["@Outcome"]
        return data

    # Display names whose lower-cased form does not match the variable prefix.
    prefix_aliases = {"sentiwordnet": "swn", "nrc sentiment": "nrc"}
    prefix = model_name.lower()
    prefix = prefix_aliases.get(prefix, prefix)

    scaled_df = globals()[f"{prefix}_df_scaled"]
    return _select(scaled_df).copy()


def reduce_features(model_weights: Dict, model_name: str, model_df: Optional[pd.DataFrame] = None, max_steps: int = 10,
                    genre_list: List = NEW_GENRES, g_predict: Optional[str] = None, og_acc: Optional[pd.DataFrame] = None):
    """
    Search, per genre, for the weight threshold that yields the best accuracy.

    For each step ``s`` in 0, 0.25, ..., ``max_steps`` only the features whose
    absolute weight is at least ``mean(weight) + s * std(weight)`` are kept and
    the model is re-evaluated. The search for a genre stops once fewer than 5
    features remain.

    - model_weights: genre -> ["Feature", "Weight"] frame (see ``process_weights``)
    - model_name: model to evaluate
    - model_df: feature frame; when None it is built with ``get_df_for_reduction``
    - max_steps: largest number of standard deviations tried
    - genre_list: genres (or genre pairs for "one_v_one") to search
    - g_predict: None for success prediction, otherwise the genre-prediction
      strategy name ("one_v_one" / "one_v_all")
    - og_acc: baseline accuracies (required); recorded as step -0.25. Its rows,
      minus any "Average" row, must line up with the keys of ``model_weights``

    Returns ``(exhaustive_df, reduced_features)``: one record per evaluated
    genre/step, and per genre the weight frame of the best step (or the
    original weights when no step beat the baseline).

    Raises ValueError when ``og_acc`` is None.
    """
    if og_acc is None:
        # Previously failed with an AttributeError on ``None.copy()``.
        raise ValueError("reduce_features requires og_acc (the baseline accuracies)")

    # The original built this header (without the separating space and without
    # closing the tag in the default case) but displayed a different string.
    header = f"<h4>Performing exhaustive parameter search for feature reduction on {model_name}"
    if g_predict is not None:
        header += f" for {g_predict} Genre Prediction"
    header += "</h4>"
    display(HTML(header))

    og_copy = og_acc.copy()
    if "Average" in list(og_copy.iloc[:, 0]):
        og_copy = og_copy[og_copy[og_copy.columns[0]] != "Average"].copy()

    og_copy.insert(1, "Step", -0.25)
    og_copy["Num Features"] = [len(weights) for weights in model_weights.values()]

    exhaustive = og_copy.to_dict("records")
    reduced_features = {genre: model_weights[genre].copy() for genre in genre_list}

    steps = np.arange(0, max_steps + 0.25, 0.25)
    with tqdm(total=len(steps) * len(genre_list)) as pbar:
        for genre in genre_list:
            pbar.set_postfix_str(f" -- {genre}")
            # NOTE(review): the baseline is the mean over all rows of og_acc
            # (including an "Average" row if present), not this genre's own
            # accuracy -- confirm that is intended.
            best_acc = og_acc["Accuracy"].mean()

            if model_df is None:
                if g_predict == "one_v_one":
                    scaled_df = get_df_for_reduction(model_name, genre[0], genre[1])
                elif g_predict == "one_v_all":
                    scaled_df = get_df_for_reduction(model_name)
                else:
                    scaled_df = get_df_for_reduction(model_name, genre)
            else:
                if g_predict == "one_v_one":
                    scaled_df = model_df[(model_df["@Genre"] == genre[0]) | (model_df["@Genre"] == genre[1])].copy()
                elif g_predict == "one_v_all":
                    scaled_df = model_df.copy()
                else:
                    scaled_df = model_df[model_df["@Genre"] == genre].copy()

            # The weight statistics do not change between steps.
            genre_weights = model_weights[genre]
            avg_weight = genre_weights["Weight"].mean()
            std_dev = genre_weights["Weight"].std()

            for i, step in enumerate(steps):
                threshold = avg_weight + (step * std_dev)
                param_results = genre_weights[genre_weights["Weight"].abs() >= threshold]

                if len(param_results) < 5:
                    print(f"{genre} exhausted at {step} deviations above the mean")
                    # Account for all remaining steps of this genre at once.
                    pbar.update(len(steps) - i)
                    break

                elif step == max_steps:
                    print(f"{genre} did not exhaust, len(results) = {len(param_results)}")

                elif len(param_results) == len(genre_weights["Weight"]):
                    # Nothing was filtered out; identical to the baseline run.
                    pbar.update(1)
                    continue

                # De-duplicate while keeping the weight-sorted order; a set made the
                # column order (and potentially the fitted model) vary between runs.
                kept_features = list(dict.fromkeys(param_results["Feature"]))
                cols = ["Book #", "@Genre"] + kept_features + ["@Outcome"]
                param_results_df = scaled_df[cols]

                if g_predict is not None:
                    param_acc, param_weights = predict_genre(param_results_df, model_name, how=g_predict, searching=True,
                                                             genre_list=[genre], disp_acc=False, disp_weights=False, show_pbar=False)
                    if g_predict == "one_v_one":
                        step_acc = param_acc.loc[param_acc["Genre"] == (genre[0], genre[1]), "Accuracy"].values[0]
                    else:
                        step_acc = param_acc.loc[param_acc["Genre"] != "Average", "Accuracy"].values[0]
                else:
                    param_acc, param_weights = predict_success(param_results_df, model_name, searching=True,
                                                               genre_list=[genre], disp_acc=False, disp_weights=False, show_pbar=False)
                    step_acc = param_acc.loc[param_acc["Genre"] != "Average", "Accuracy"].values[0]

                if step_acc > best_acc:
                    best_acc = step_acc
                    reduced_features[genre] = param_weights[genre].copy()

                exhaustive.append({"Genre": genre, "Step": step, "Accuracy": step_acc, "Num Features": len(param_weights[genre])})
                pbar.update(1)

    exhaustive_df = pd.DataFrame(exhaustive)
    return exhaustive_df, reduced_features


def plot_exhausted(exh_df_: pd.DataFrame, max_steps: int = 10, markersize: int = 10, 
                   genre_list: List = NEW_GENRES, colors: Optional[Dict] = None, markers: bool = True):
    """
    Plot accuracy against reduction step for every genre and tabulate each genre's best step.

    For each genre the highest "Accuracy" is looked up and the first row reaching it supplies the
    tuned step and feature count. The resulting table, with an extra "Average" row, is shown with
    display_df before the figure is drawn.

    - exh_df_: frame with "Genre", "Step", "Accuracy" and "Num Features" columns; it is copied, not modified
    - max_steps: passed to the axis setup as the x maximum and used to pick the tick density
    - markersize: size of the point markers
    - genre_list: genres to plot, in legend order; every genre needs at least one row in exh_df_
    - colors: optional genre -> colour mapping; matplotlib picks the colours when None
    - markers: True draws a marker on every point, False draws plain lines

    Returns the tuned-parameter frame ("Genre", "Deviations", "Accuracy", "Num Features") without
    the "Average" row.
    """
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(30, 15))

    exh_df = exh_df_.copy()
    tuned_params = []

    # The shapes live under their own name: previously this list was also called `markers`,
    # which hid the boolean argument and made the marker-free branch unreachable.
    marker_shapes = ["o", "P", "s", "D", "p", "v", "H", "*"]
    # Shapes are only drawn while there are no more genres than NEW_GENRES; longer genre
    # lists get an empty marker so the lines stay readable.
    use_shapes = len(genre_list) <= len(NEW_GENRES)

    for position, genre in enumerate(genre_list):
        genre_rows = exh_df[exh_df["Genre"] == genre]
        best = genre_rows["Accuracy"].max()
        is_best = genre_rows["Accuracy"] == best
        # Ties on the best accuracy resolve to the first matching row.
        best_param = genre_rows.loc[is_best, "Step"].values[0]
        n_features = genre_rows.loc[is_best, "Num Features"].values[0]
        tuned_params.append({"Genre": genre, "Deviations": best_param, "Accuracy": best, "Num Features": n_features})

        line_style = {"linewidth": 2}
        if colors is not None:
            line_style["color"] = colors[genre]
        if markers:
            # Shapes repeat cyclically when there are more genres than shapes.
            shape = marker_shapes[position % len(marker_shapes)] if use_shapes else ""
            line_style.update(marker=shape, markersize=markersize, markeredgewidth=2, fillstyle="none")

        genre_rows[["Step", "Accuracy"]].plot(x="Step", ax=axes, rot=0, **line_style)

    tuned_params_df = pd.DataFrame(tuned_params)
    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    average_row = pd.DataFrame([{"Genre": "Average",
                                 "Deviations": tuned_params_df["Deviations"].mean(),
                                 "Accuracy": tuned_params_df["Accuracy"].mean(),
                                 "Num Features": tuned_params_df["Num Features"].mean()}])
    display_df(pd.concat([tuned_params_df, average_row], ignore_index=True, sort=False))

    # Tick density depends on how many steps have to fit on the x axis.
    if max_steps > 15:
        minor_step = 1 if max_steps > 35 else 0.5
        xmajor = floor(max_steps / 5) if max_steps > 35 else max_steps
        xminor = xmajor * 4
    elif exh_df["Step"].max() <= 6:
        xmajor = max_steps * 4
        xminor = xmajor * 5
        minor_step = 0.05
    else:
        xmajor = max_steps * 2
        xminor = xmajor * 2
        minor_step = 0.25

    setup_axis(axes, xmin=-0.25, xmax=max_steps, xmajor=xmajor, xminor=xminor,
               xlabel="Deviations above the Mean", ylabel="Accuracy",
               left=exh_df["Step"].min() - minor_step, right=exh_df["Step"].max() + minor_step,
               bottom=exh_df["Accuracy"].min() - 0.025, top=tuned_params_df["Accuracy"].max() + 0.025,
               grid="-", minor_grid=":")

    # The default genre list fits in one row above the plot; any other list goes to the right of it.
    if genre_list == NEW_GENRES:
        axes.legend(genre_list, bbox_to_anchor=(0.9915, 1.07), fontsize=19, ncol=len(genre_list))
    else:
        axes.legend(genre_list, bbox_to_anchor=(1.005, 1), loc="upper left", fontsize=19)

    plt.margins(x=0.01, y=0.05)    
    plt.show()
    return tuned_params_df

In [None]:
uni_exh, uni_rw = reduce_features(uni_weights, "Unigram", max_steps=35, og_acc=unigram_acc)

In [None]:
uni_reduced_acc = plot_exhausted(uni_exh, max_steps=35)

In [None]:
bi_exh, bi_weights_reduced = reduce_features(bigram_weights, "Bigram", max_steps=75, og_acc=bigram_acc)

In [None]:
bi_reduced_acc = plot_exhausted(bi_exh, max_steps=75)

In [None]:
pos_exh, pos_weights_reduced = reduce_features(pos_weights, "POS", og_acc=pos_acc)

In [None]:
pos_reduced_acc = plot_exhausted(pos_exh, markersize=20)

In [None]:
roget_exh, roget_rw = reduce_features(roget_weights, "Roget", og_acc=roget_acc)

In [None]:
roget_reduced_acc = plot_exhausted(roget_exh, markersize=15)

In [None]:
wn_exh, wn_rw = reduce_features(wordnet_weights, "WordNet", max_steps=15, og_acc=wordnet_acc)

In [None]:
for genre in NEW_GENRES:
    display_df(wn_rw[genre], genre, max_rows=20)

In [None]:
wn_reduced_acc = plot_exhausted(wn_exh, max_steps=15)

In [None]:
liwc_exh, liwc_weights_reduced = reduce_features(liwc_weights, "LIWC", og_acc=liwc_acc)

In [None]:
liwc_reduced_acc = plot_exhausted(liwc_exh, markersize=20)

#### Mapping WordNet to Roget

In [None]:
# Invert the Roget table: roget_df holds one row per entry with a collection of "Words";
# roget_thesaurus_df gets one row per (word, entry) carrying that entry's hierarchy labels.
roget_invert = []

display(HTML("<b>Converting Roget DataFrame to be by Word...</b>"))
with tqdm(total=len(roget_thesaurus.roget_df)) as pbar:
    for idx, row in roget_thesaurus.roget_df.iterrows():
        # set(...) so a word repeated inside one entry only yields one row for that entry
        for word in set(row["Words"]):
            roget_invert.append({"Word": word, "Category": row["Category"], "Level3": row["Level3"], "Level2": row["Level2"],
                                 "Level1": row["Level1"], "Section": row["Section"], "Class": row["Class"]})
        pbar.update(1)

# Built once from the collected records instead of growing a frame row by row.
roget_thesaurus_df = pd.DataFrame(roget_invert)

In [None]:
# WordNet feature frame: missing values become 0 and "_genre"/"_outcome" are renamed to the "@" column names.
wn_df = pd.DataFrame(wordnet_data).fillna(0).rename(columns={"_genre": "@Genre", "_outcome": "@Outcome"})
# Book numbers are copied from unigram_df by position (its index is reset first).
# NOTE(review): assumes wordnet_data and unigram_df list the books in the same order — confirm.
wn_df.insert(0, "Book #", unigram_df["Book #"].reset_index(drop=True))
# Per genre: that genre's rows, limited to the features listed in wn_rw[genre] plus the id, genre and outcome columns.
wnrf_set = {genre: wn_df[wn_df["@Genre"] == genre][["Book #", "@Genre"] + [w for w in wn_rw[genre]["Feature"]] + ["@Outcome"]] for genre in NEW_GENRES}

In [None]:
# Per genre: the "Book #", "@Genre" and "@Outcome" columns with a fresh 0..n-1 index.
# concat_map_to_roget uses this dict as the default value of its nums_outcomes argument,
# so this cell has to run before that function is defined.
NumsAndOutcomes = {genre: wn_df[wn_df["@Genre"] == genre][["Book #", "@Genre", "@Outcome"]].reset_index(drop=True) for genre in NEW_GENRES}

In [None]:
def map_to_roget(dfs_to_map: Dict, src_model: str, genre_list: List = NEW_GENRES, to_categories: bool = False, to_sections: bool = False, to_classes: bool = False):
    """
    Group the feature columns of each genre frame by the Roget label they map to.

    The mapping is looked up in the module-level roget_thesaurus_df and goes one level up:
    Word -> Category (to_categories), Category -> Section (to_sections), otherwise
    Section -> Class. The first flag that is set wins; with no flag set the
    Section -> Class mapping is used.

    - dfs_to_map: genre -> frame holding "Book #", "@Genre", "@Outcome" and the feature columns to map
    - src_model: name of the source model, only used in the progress header
    - genre_list: genres to process
    - to_categories / to_sections / to_classes: select the mapping level as described above

    Returns {genre: {target label: DataFrame of the source columns mapped to that label}}.
    A source column appears under every label it maps to; columns without a mapping are dropped.
    """
    map_from = "Word" if to_categories else "Category" if to_sections else "Section"
    map_to = "Category" if to_categories else "Section" if to_sections else "Class"
    display(HTML(f"<h4>Mapping {src_model} to Roget {map_to}...</h4>"))

    mapped_dict = {genre: {} for genre in genre_list}
    # One progress tick per feature column; the three bookkeeping columns are not mapped.
    bar_length = sum(len(dfs_to_map[genre].columns) - 3 for genre in genre_list)

    with tqdm(total=bar_length) as pbar:
        for genre in genre_list:
            pbar.set_postfix_str(f" -- {genre}")
            genre_df = dfs_to_map[genre]
            mapping_cols = genre_df.drop(columns=["Book #", "@Genre", "@Outcome"]).columns

            # Collect the source column names per target label first and slice the frame once
            # per label afterwards; concatenating inside the loop copied the growing frame
            # on every hit.
            cols_by_target = {}
            for col in mapping_cols:
                roget_map = roget_thesaurus_df[roget_thesaurus_df[map_from] == col][map_to]

                # Many thesaurus rows share a Section/Class, so collapse repeats at those levels.
                if to_sections or to_classes:
                    roget_map = roget_map.unique()

                for mapping in roget_map:
                    cols_by_target.setdefault(mapping, []).append(col)
                pbar.update(1)

            for mapping, cols in cols_by_target.items():
                mapped_dict[genre][mapping] = genre_df[cols]

    return mapped_dict


def _sum_mapped_columns(mapped_frames: Dict, ids_and_outcomes: pd.DataFrame) -> pd.DataFrame:
    """Collapse every mapped frame to one row-summed column and add the "Book #", "@Genre" and "@Outcome" columns."""
    mapped = pd.concat([pd.DataFrame({k: frame.sum(axis=1).reset_index(drop=True)}) for k, frame in mapped_frames.items()], axis=1)
    # Both sides carry a 0..n-1 index, so the bookkeeping columns line up by position.
    mapped.insert(0, "@Genre", ids_and_outcomes["@Genre"])
    mapped.insert(0, "Book #", ids_and_outcomes["Book #"])
    mapped["@Outcome"] = ids_and_outcomes["@Outcome"]
    return mapped


def concat_map_to_roget(map_to_roget_dict: Dict, src_model: str, map_to: str, genre_list: List = NEW_GENRES, nums_outcomes: Dict = NumsAndOutcomes):
    """
    Turn the output of map_to_roget into one feature frame per genre, unscaled and scaled.

    Every Roget label becomes a single column holding the row-wise sum of the source columns
    mapped to it.

    - map_to_roget_dict: {genre: {label: DataFrame}} as returned by map_to_roget
    - src_model / map_to: names used in the progress headers only
    - genre_list: genres to process
    - nums_outcomes: genre -> frame with "Book #", "@Genre" and "@Outcome" on a 0..n-1 index

    Returns (no_scale, scaled): two dicts keyed by genre. scaled[genre] is the first value
    returned by process_and_scale for that genre's unscaled frame.
    """
    no_scale = {}
    display(HTML(f"<h4>Concatenating {src_model} to Roget {map_to} -- no scaling...</h4>"))
    with tqdm(total=len(genre_list)) as pbar:
        for genre in genre_list:
            pbar.set_postfix_str(f" -- {genre}")
            no_scale[genre] = _sum_mapped_columns(map_to_roget_dict[genre], nums_outcomes[genre])
            pbar.update(1)

    scaled = {}
    display(HTML(f"<h4>Concatenating {src_model} to Roget {map_to} -- scaling by genre...</h4>"))
    with tqdm(total=len(genre_list)) as pbar:
        for genre in genre_list:
            pbar.set_postfix_str(f" -- {genre}")
            # Reuse the frame built above instead of summing everything a second time; the copy
            # keeps no_scale[genre] untouched whatever process_and_scale does with its input.
            scaled[genre], _ = process_and_scale(no_scale[genre].copy())
            pbar.update(1)

    return no_scale, scaled


def _with_average_row(acc_df: pd.DataFrame) -> pd.DataFrame:
    """Return acc_df re-indexed from 0 with a final "Average" row holding the mean of its "Accuracy" column."""
    average_row = pd.DataFrame([{"Genre": "Average", "Accuracy": acc_df["Accuracy"].mean()}])
    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    return pd.concat([acc_df, average_row], ignore_index=True, sort=False)


def test_map_to_roget(no_scale: Dict, scaled_: Dict, src_model: str, map_to: str, genre_list: List = NEW_GENRES, g_predict: Optional[str] = None):
    """
    Evaluate a mapped Roget model twice: scaled over all genres together, and scaled per genre.

    - no_scale: genre -> unscaled mapped frame; these are pooled and passed through process_and_scale
    - scaled_: genre -> frame already scaled within its genre
    - src_model / map_to: names used for the headers and the model name
    - genre_list: genres to evaluate
    - g_predict: None runs predict_success; any other value is passed to predict_genre as `how`

    Both accuracy tables are displayed with a recomputed "Average" row.

    Returns (all-genres accuracy, all-genres weights, by-genre accuracy, by-genre weights).
    """
    model_name = f"{src_model} to Roget {map_to}"

    display(HTML(f"<h4>Testing {src_model} to Roget {map_to} -- All Genres Scaled</h4>"))
    full_map_to_roget = pd.concat(list(no_scale.values())).fillna(0)
    full_map_to_roget_scaled, _ = process_and_scale(full_map_to_roget)

    if g_predict is not None:
        # Genre prediction needs all genres in one frame, so the per-genre scaled frames are pooled.
        scaled = pd.concat(list(scaled_.values())).fillna(0)
        full_map_to_roget_acc, full_map_to_roget_weights = predict_genre(full_map_to_roget_scaled, model_name,
                                                                         how=g_predict, genre_list=genre_list, disp_acc=False,
                                                                         disp_weights=False, show_pbar=False)
    else:
        full_map_to_roget_acc, full_map_to_roget_weights = predict_success(full_map_to_roget_scaled, model_name,
                                                                           genre_list=genre_list, disp_acc=False,
                                                                           disp_weights=False, show_pbar=False)

    # Drop any "Average" row the predictor returned and recompute it over the genre rows only.
    full_map_to_roget_acc = _with_average_row(full_map_to_roget_acc[full_map_to_roget_acc["Genre"] != "Average"])
    display_df(full_map_to_roget_acc)

    display(HTML(f"<h4>Testing {src_model} to Roget {map_to} -- Scaled By Genre</h4>"))
    map_to_roget_results = []
    map_to_roget_weights = {}

    for genre in genre_list:
        if g_predict is not None:
            acc, weights = predict_genre(scaled, model_name, how=g_predict, searching=True,
                                         genre_list=[genre], disp_acc=False, disp_weights=False, show_pbar=False)
        else:
            acc, weights = predict_success(scaled_[genre], model_name, searching=True,
                                           genre_list=[genre], disp_acc=False, disp_weights=False, show_pbar=False)

        map_to_roget_results.append(acc[acc["Genre"] != "Average"])
        map_to_roget_weights.update(weights)

    map_to_roget_acc = _with_average_row(pd.concat(map_to_roget_results))
    display_df(map_to_roget_acc)

    return full_map_to_roget_acc, full_map_to_roget_weights, map_to_roget_acc, map_to_roget_weights


# def test_map_to_roget(no_scale: Dict, scaled: Dict, src_model: str, map_to: str, genre_list: List = NEW_GENRES, g_predict: Optional[str] = None):
#     full_map_to_roget = pd.concat(list(no_scale.values())).fillna(0)
#     full_map_to_roget_scaled, _ = process_and_scale(full_map_to_roget)

#     if g_predict is not None:
#         full_map_to_roget_acc, full_map_to_roget_weights = predict_genre(full_map_to_roget_scaled, f"{src_model} to Roget {map_to}",
#                                                                          how=g_predict, genre_list=genre_list, add_to_acc=False,
#                                                                          disp_acc=False, disp_weights=False, show_pbar=False)
#         map_to_roget = pd.concat(list(scaled.values())).fillna(0)
#         map_to_roget_acc, map_to_roget_weights = predict_genre(map_to_roget, f"{src_model} to Roget {map_to}",
#                                                                how=g_predict, genre_list=genre_list, add_to_acc=False,
#                                                                disp_acc=False, disp_weights=False, show_pbar=False)

#     else:
#         full_map_to_roget_acc, full_map_to_roget_weights = predict_success(full_map_to_roget_scaled, f"{src_model} to Roget {map_to}",
#                                                                            genre_list=genre_list, disp_acc=False,
#                                                                            disp_weights=False, show_pbar=False)
#         map_to_roget_results = []
#         map_to_roget_weights = {}

#         for genre in genre_list:
#             acc, weights = predict_success(scaled[genre], f"{src_model} to Roget {map_to}", searching=True, add_to_acc=False,
#                                            genre_list=[genre], disp_acc=False, disp_weights=False, show_pbar=False)

#             map_to_roget_results.append(acc)
#             map_to_roget_weights.update(weights)

#         map_to_roget_acc = pd.concat(map_to_roget_results)
    
#     display_df(full_map_to_roget_acc, f"<h4>{src_model} to Roget {map_to} -- All Genres Scaled</h4>")
#     display_df(map_to_roget_acc, f"<h4>{src_model} to Roget {map_to} -- Scaled By Genre</h4>")
    
#     return full_map_to_roget_acc, full_map_to_roget_weights, map_to_roget_acc, map_to_roget_weights

In [None]:
wnrf_to_rocat = map_to_roget(wnrf_set, src_model="WordNet", to_categories=True)

In [None]:
wnrf_to_rocat_no_scale, wnrf_to_rocat_scaled = concat_map_to_roget(wnrf_to_rocat, src_model="WordNet", map_to="Category")

In [None]:
_ = test_map_to_roget(wnrf_to_rocat_no_scale, wnrf_to_rocat_scaled, src_model="WordNet", map_to="Category")

In [None]:
wnrf_to_rosect = map_to_roget(wnrf_to_rocat_no_scale, src_model="WordNet", to_sections=True)

In [None]:
wnrf_to_rosect_no_scale, wnrf_to_rosect_scaled = concat_map_to_roget(wnrf_to_rosect, src_model="WordNet", map_to="Section")

In [None]:
full_wnrf_to_rosect_acc, full_wnrf_to_rosect_weights, wnrf_to_rosect_acc, wnrf_to_rosect_weights = test_map_to_roget(wnrf_to_rosect_no_scale, wnrf_to_rosect_scaled, src_model="WordNet", map_to="Section")

In [None]:
# Pool the unscaled per-genre Section frames (a Section absent from a genre becomes 0) and
# scale them together, then run reduce_features against the all-genres weights and accuracy.
wnrf_to_rosect_df = pd.concat(list(wnrf_to_rosect_no_scale.values())).fillna(0)
wnrf_to_rosect_df_scaled, _ = process_and_scale(wnrf_to_rosect_df)
wnrf_to_rosect_exh, wnrf_to_rosect_rw = reduce_features(full_wnrf_to_rosect_weights, "WordNet to Roget Section",
                                                        model_df=wnrf_to_rosect_df_scaled, og_acc=full_wnrf_to_rosect_acc)

In [None]:
wnrf_to_rosect_reduced_acc = plot_exhausted(wnrf_to_rosect_exh, markersize=20)

In [None]:
wnrf_to_roget_class = map_to_roget(wnrf_to_rosect_no_scale, src_model="WordNet", to_classes=True)

In [None]:
wnrf_to_roget_class_no_scale, wnrf_to_roget_class_scaled = concat_map_to_roget(wnrf_to_roget_class, src_model="WordNet", map_to="Class")

In [None]:
_ = test_map_to_roget(wnrf_to_roget_class_no_scale, wnrf_to_roget_class_scaled, src_model="WordNet", map_to="Class")

In [None]:
def get_themes_by_genre(no_scale: Dict, sect_weights: Dict, genre_list: List = NEW_GENRES):
    """
    Build a long (genre, theme, |weight|) table covering every theme for every genre.

    - no_scale: genre -> frame whose columns, apart from "Book #", "@Genre" and "@Outcome", are the themes
    - sect_weights: genre -> frame with "Feature" and "Weight" columns
    - genre_list: genres to include

    A theme that has no weight for a genre gets 0. All weights are returned as absolute values.

    Returns (tbg_df, themes): the table with "Genre", "Theme" and "Weight" columns, and the
    sorted list of theme names.
    """
    full_no_scale = pd.concat(list(no_scale.values())).fillna(0)
    themes = sorted(full_no_scale.drop(columns=["Book #", "@Genre", "@Outcome"]).columns)
    bar_length = len(themes) * len(genre_list)

    themes_by_genre = []
    display(HTML("<h4>Getting themes by genre...</h4>"))
    with tqdm(total=bar_length) as pbar:
        for genre in genre_list:
            pbar.set_postfix_str(f" -- {genre}")
            # Index the weights once per genre rather than once per theme. A genre without
            # usable weights behaves as before: every theme falls back to 0.
            try:
                weights_by_feature = sect_weights[genre].set_index("Feature")
            except KeyError:
                weights_by_feature = None

            for theme in themes:
                theme_weight = 0
                if weights_by_feature is not None:
                    try:
                        theme_weight = weights_by_feature.loc[theme, "Weight"]
                    except KeyError:
                        theme_weight = 0
                themes_by_genre.append({"Genre": genre, "Theme": theme, "Weight": theme_weight})
                pbar.update(1)

    # Explicit columns keep the frame well-formed when there is nothing to report.
    tbg_df = pd.DataFrame(themes_by_genre, columns=["Genre", "Theme", "Weight"])
    if not tbg_df.empty:
        # Take the magnitude of every weight. This used to filter on the loop variable left
        # over from the loop above, so only the last theme was converted.
        tbg_df["Weight"] = tbg_df["Weight"].abs()
    # Per-theme MinMax scaling of the weights, kept disabled as in the original:
    # for theme in themes:
    #     tbg_weights_scaled = scale.fit_transform(tbg_df.loc[tbg_df["Theme"] == theme][["Weight"]].abs())
    #     tbg_df.loc[tbg_df["Theme"] == theme, "Weight"] = tbg_weights_scaled

    return tbg_df, themes


def plot_tbg(tbg_df: pd.DataFrame, themes: List, colors: Dict, sort: bool = False, scatter: bool = False):
    """
    Draw theme weights by genre on a single figure, in one of three layouts.

    - sort=True: stacked bars built from get_sorted_tbg (its "percentage", "bottoms" and "color" columns)
    - scatter=True with sort=False: one scatter series per genre, marker area scaled by "Weight"
    - neither: stacked bars of each genre's share of the per-theme "Weight" total

    - tbg_df: frame with "Genre", "Theme" and "Weight" columns
    - themes: theme names; used for the x tick labels and the per-theme normalisation when sort is False
    - colors: genre colours; looked up by genre when sort is True, otherwise paired with NEW_GENRES by position
    - sort: selects the sorted stacked-bar layout and takes precedence over scatter
    - scatter: selects the scatter layout
    """
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(30,15))

    if sort:
        sorted_tbg_df = get_sorted_tbg(tbg_df, colors)
        # One bar call per (theme, genre) row so every segment can sit on its own precomputed bottom.
        for i in range(len(sorted_tbg_df["Theme"])):
            axes.bar(sorted_tbg_df["Theme"][i], sorted_tbg_df["percentage"][i], width=0.5, color=sorted_tbg_df["color"][i],
                     bottom=sorted_tbg_df["bottoms"][i], label=sorted_tbg_df["Genre"][i])

        plt.xticks(rotation=90)

    else:
        if scatter:
            # NOTE(review): pairs NEW_GENRES with colors.values() by position — assumes colors was built in NEW_GENRES order.
            for genre, color in zip(NEW_GENRES, list(colors.values())):
                # marker area grows with the weight; 2**12 is the scale factor applied to it
                sizes = tbg_df[tbg_df["Genre"] == genre].sort_values(by=["Theme"])["Weight"]*2**12
                tbg_df[tbg_df["Genre"] == genre].sort_values(by=["Theme"]).plot(x="Theme", y="Weight", ax=axes, kind="scatter", s=sizes, rot=90, c=color, alpha=0.9)
        else:
            # Work on a copy: the weights are rewritten as shares of each theme's total.
            tbg_percentage = tbg_df.copy()
            for theme in themes:
                theme_sum = tbg_percentage.loc[tbg_percentage["Theme"] == theme, "Weight"].sum()
                tbg_percentage.loc[tbg_percentage["Theme"] == theme, "Weight"] /= theme_sum
            
            # Running top of the stack for each theme, in sorted-theme order.
            margin_bottom = np.zeros(len(tbg_percentage["Theme"].drop_duplicates()))

            for genre, color in zip(NEW_GENRES, list(colors.values())):
                values = list(tbg_percentage[tbg_percentage["Genre"] == genre].sort_values(by=["Theme"]).loc[:, "Weight"])
                tbg_percentage[tbg_percentage["Genre"] == genre].sort_values(by=["Theme"]).plot.bar(x="Theme", y="Weight", ax=axes, stacked=True, width=0.5,
                                                                                                    bottom=margin_bottom, rot=90, color=color)
                # the next genre's bars start where this genre's bars end
                margin_bottom += values

    if sort:
        setup_axis(axes, xmin=None, ymajor=10, ylabel="Weight Percentage")
    else:
        setup_axis(axes, ymajor=10, x_ticklabels=themes, ylabel="Weight Percentage" if not scatter else "Weight",
                   bottom=-0.01 if scatter else None, top=1.05 if scatter else None)

    if sort:
        axes.legend(bbox_to_anchor=(0.9915, 1.07), fontsize=19, ncol=len(NEW_GENRES))
        # Every bar carried a genre label, so labels repeat once per theme: keep one handle per
        # label and list the labels alphabetically.
        handles, labels = axes.get_legend_handles_labels()
        by_label = dict(zip(labels, handles))
        sorted_keys = sorted(by_label)
        sorted_vals = [by_label[k] for k in sorted_keys]
        by_label = dict(zip(sorted_keys, sorted_vals))
        axes.legend(by_label.values(), by_label.keys(), bbox_to_anchor=(0.9915, 1.07), fontsize=19, ncol=len(NEW_GENRES))
    elif scatter:
        legend = axes.legend(NEW_GENRES, bbox_to_anchor=(0.9915, 1.07), fontsize=19, ncol=len(NEW_GENRES))
        # Give every legend marker the same fixed size instead of a data-dependent one.
        for i in range(len(legend.legendHandles)):
            legend.legendHandles[i]._sizes = [250]
    else:
        axes.legend(NEW_GENRES, bbox_to_anchor=(0.9915, 1.07), fontsize=19, ncol=len(NEW_GENRES))
        
    plt.margins(x=0.025, y=0.05)
    plt.show()


def get_sorted_tbg(tbg_df: pd.DataFrame, colors: Dict):
    """
    Prepare the theme-by-genre table for a stacked bar chart with the largest share at the bottom.

    - tbg_df: frame with "Genre", "Theme" and "Weight" columns; it is copied, not modified
    - colors: genre -> colour; a genre missing from it raises KeyError

    Returns a copy sorted by theme and, within each theme, by descending share, with a fresh
    0..n-1 index and these extra columns:
    - "percentage": the row's weight divided by the total weight of its theme
    - "rank": 1 for the largest share within the theme, 2 for the next, ...
    - "color": the colour of the row's genre
    - "bottoms": sum of the shares ranked before this row in its theme, i.e. where its bar starts
    """
    s_tbg_df = tbg_df.copy()

    s_tbg_df["percentage"] = s_tbg_df["Weight"] / s_tbg_df.groupby("Theme")["Weight"].transform("sum")
    s_tbg_df = s_tbg_df.sort_values("percentage", ascending=False)

    # Position within the theme after the descending sort. cumcount keeps the original index,
    # unlike groupby.apply with a mutating helper, whose result index differs across pandas versions.
    s_tbg_df["rank"] = s_tbg_df.groupby("Theme").cumcount() + 1
    s_tbg_df = s_tbg_df.sort_values(["Theme", "rank"], ascending=[True, True])

    s_tbg_df["color"] = s_tbg_df["Genre"].map(lambda genre: colors[genre])

    # Running total within the theme minus the row's own share = height at which its bar begins.
    s_tbg_df["bottoms"] = s_tbg_df.groupby("Theme")["percentage"].cumsum() - s_tbg_df["percentage"]
    s_tbg_df["Theme"] = s_tbg_df["Theme"].astype(str)

    return s_tbg_df.reset_index(drop=True)


def ranker(df: pd.DataFrame):
    """
    Number the rows of df from 1 to len(df), in their current order, in a "rank" column.

    The column is written onto df itself and the same frame is returned.
    """
    n_rows = len(df)
    df["rank"] = np.arange(1, n_rows + 1)
    return df


def color_assigment(df: pd.DataFrame, colors: Dict):
    """
    Return the colour for one row: the entry of colors keyed by the row's "Genre" value.

    Written to be used with DataFrame.apply(..., axis=1), where df is a single row.
    """
    genre = df["Genre"]
    return colors[genre]

In [None]:
from matplotlib.lines import Line2D
from matplotlib.patches import Patch


def _theme_freq_means(scaled: Dict, genre, theme, g_predict: Optional[str], ova: Optional[pd.DataFrame]):
    """Return the mean of the theme column for the two groups being compared; KeyError if a needed key or column is missing."""
    if g_predict == "one_v_one":
        # genre is a pair here: compare the books of its first member with those of its second
        frame = scaled[genre]
        first_group = frame["@Genre"] == genre[0]
        second_group = frame["@Genre"] == genre[1]
    elif g_predict == "one_v_all":
        # compare the genre's books with every other book in the pooled frame
        frame = ova
        first_group = frame["@Genre"] == genre
        second_group = frame["@Genre"] != genre
    else:
        frame = scaled[genre]
        first_group = frame["@Outcome"] == "SUCCESSFUL"
        second_group = frame["@Outcome"] == "FAILURE"
    return frame.loc[first_group, theme].mean(), frame.loc[second_group, theme].mean()


def get_rosect_freq(scaled: Dict, themes: List, sect_weights: Dict, genre_list: List = NEW_GENRES, g_predict: Optional[str] = None):
    """
    For every genre and theme, pair the difference in average theme frequency between two
    groups of books with the magnitude of the theme's weight.

    - scaled: genre -> frame with "@Genre", "@Outcome" and one column per theme. A theme column
      missing from a genre's frame is added to that frame, filled with 0 (the caller's frame changes).
    - themes: theme names to evaluate
    - sect_weights: genre -> frame with "Feature" and "Weight" columns; a missing weight counts as 0
    - genre_list: genres to evaluate (pairs of genres for "one_v_one")
    - g_predict: "one_v_one" compares the two genres of the pair, "one_v_all" compares the genre
      with all other books, anything else compares "SUCCESSFUL" with "FAILURE" books of the genre

    Returns genre -> frame with "Genre", "Theme", "Frequency Difference" (first group mean minus
    second group mean) and "Weight" (absolute weight, rescaled per genre with the module-level
    scaler), sorted by theme. Themes whose difference and weight are both 0 are dropped.
    """
    map_to_rosect_wvs = {}
    one_v_all = g_predict == "one_v_all"
    # Pooled frame for one_v_all. It is rebuilt only after `scaled` has been changed, rather
    # than once per theme.
    ova = None

    display(HTML("<h4>Calculating Roget Section Frequency by Success per Genre</h4>"))
    with tqdm(total=len(genre_list)) as pbar:
        for genre in genre_list:
            pbar.set_postfix_str(f" -- {genre}")
            try:
                # Index the weights once per genre; without usable weights every theme weighs 0.
                weights_by_feature = sect_weights[genre].set_index("Feature")
            except KeyError:
                weights_by_feature = None

            rows = []
            for theme in themes:
                if one_v_all and ova is None:
                    ova = pd.concat(list(scaled.values())).fillna(0)

                try:
                    theme_avg_freq1, theme_avg_freq2 = _theme_freq_means(scaled, genre, theme, g_predict, ova)
                except KeyError:
                    # The theme is unknown for this genre: register it with frequency 0 and retry.
                    scaled[genre][theme] = 0
                    ova = pd.concat(list(scaled.values())).fillna(0) if one_v_all else None
                    theme_avg_freq1, theme_avg_freq2 = _theme_freq_means(scaled, genre, theme, g_predict, ova)

                weight = 0
                if weights_by_feature is not None:
                    try:
                        weight = abs(weights_by_feature.loc[theme, "Weight"])
                    except KeyError:
                        weight = 0

                freq_diff = theme_avg_freq1 - theme_avg_freq2
                rows.append({"Genre": genre, "Theme": theme, "Frequency Difference": freq_diff, "Weight": weight})

            # Explicit columns keep the frame sortable when there are no themes at all.
            genre_wvs = pd.DataFrame(rows, columns=["Genre", "Theme", "Frequency Difference", "Weight"]).sort_values(by=["Theme"])
            informative = (genre_wvs["Frequency Difference"] != 0) | (genre_wvs["Weight"] != 0)
            map_to_rosect_wvs[genre] = genre_wvs[informative].reset_index(drop=True)
            pbar.update(1)

    for genre in genre_list:
        genre_wvs = map_to_rosect_wvs[genre]
        if genre_wvs.empty:
            # Nothing survived the filter; the scaler cannot be fitted on zero rows.
            continue
        own_rows = genre_wvs["Genre"] == genre
        map_to_rosect_wvs_scaled = scale.fit_transform(genre_wvs.loc[own_rows][["Weight"]])
        genre_wvs.loc[own_rows, "Weight"] = map_to_rosect_wvs_scaled

    return map_to_rosect_wvs


def plot_theme_freq_diff_vs_weight(map_to_rosect_wvs: Dict, colors: Dict, other_wvs: Optional[Dict] = None, genre_list: List = NEW_GENRES, common_only: bool = False):
    """
    For each genre, plot every theme's "Frequency Difference" (y) against its "Weight" (x) on its own figure.

    - map_to_rosect_wvs: genre -> frame with "Theme", "Frequency Difference" and "Weight" columns; drawn as circles
    - colors: theme -> colour, used for the markers and the legend entries
    - other_wvs: optional second genre -> frame of the same layout; drawn as diamonds, filled when the
      theme also appears in map_to_rosect_wvs[genre] and hollow otherwise
    - genre_list: genres to plot, one figure each
    - common_only: when True, themes found only in other_wvs are left out
    """
    for genre in genre_list:
        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(30,15))
        
        themes = list(map_to_rosect_wvs[genre]["Theme"])
        
        # Unless restricted to the common themes, also show the themes that only the other model has.
        if other_wvs is not None and not common_only:
            themes = list(set(themes + list(other_wvs[genre]["Theme"])))
        
        themes.sort()
        # Proxy handles: the legend shows one round marker per theme whichever marker shape was plotted.
        legend_elems = [Line2D([0], [0], marker="o", color="white", markerfacecolor=colors[theme], label=theme, markersize=20) for theme in themes]
        
        for theme in themes:
            if theme in list(map_to_rosect_wvs[genre]["Theme"]):
                map_to_rosect_wvs[genre][map_to_rosect_wvs[genre]["Theme"] == theme].plot(x="Weight", y="Frequency Difference", ax=axes, linestyle="none",
                                                                                          marker="o", markersize=30, color=colors[theme], alpha=0.9)
            if other_wvs is not None:
                # filled diamond = theme present in both models, hollow diamond = only in other_wvs
                fill = "full" if theme in list(map_to_rosect_wvs[genre]["Theme"]) else "none"
                a = 0.9 if fill == "full" else 1.0
                other_wvs[genre][other_wvs[genre]["Theme"] == theme].plot(x="Weight", y="Frequency Difference", ax=axes, linestyle="none",
                                                                          marker="D", markeredgewidth=3, fillstyle=fill, markersize=30,
                                                                          color=colors[theme], alpha=a)
            
        axes.set_title(genre, fontsize=32)
        
        # Start from y limits that are symmetric around 0, based on the largest magnitude.
        top = max(map_to_rosect_wvs[genre]["Frequency Difference"].max(), abs(map_to_rosect_wvs[genre]["Frequency Difference"].min()))
        bottom = min(-map_to_rosect_wvs[genre]["Frequency Difference"].max(), map_to_rosect_wvs[genre]["Frequency Difference"].min())
        
        # NOTE(review): this replaces the symmetric bottom with -0.05 whenever the smallest
        # difference is above -0.05, even if `top` is large — confirm that is intended.
        if map_to_rosect_wvs[genre]["Frequency Difference"].min() > -0.05:
            bottom = -0.05
        
        if other_wvs is not None:
            top = max(top, other_wvs[genre]["Frequency Difference"].max(), abs(other_wvs[genre]["Frequency Difference"].min()))
            bottom = min(bottom, other_wvs[genre]["Frequency Difference"].min())
        
        # Denser ticks and a smaller padding when all differences are small.
        ymajor = 40 if top > 0.05 else 160
        offset = 0.02 if top > 0.05 else 0.002
        setup_axis(axes, ymin=-1, ymajor=ymajor, yminor=ymajor * 5,
                   x_ticklabel_size=22, xlabel="Weight", xlabel_size=32, xlabel_pad=30, ylabel="Avg Frequency Difference",
                   left=-0.025, right=1.025,
                   bottom=bottom - offset,
                   top=top + offset)

        # dashed zero line separating positive from negative differences
        axes.axhline(linestyle="--", linewidth=3, color="black", alpha=0.5)
        axes.legend(handles=legend_elems, bbox_to_anchor=(1.005, 0.95), loc="upper left", fontsize=22)

        plt.show()


def plot_theme_freq_diff(map_to_rosect_wvs: Dict, colors: Dict, other_wvs: Optional[Dict] = None, genre_list: List = NEW_GENRES):
    """
    For each genre, draw a bar chart of the average frequency difference per theme.

    - map_to_rosect_wvs: genre -> frame with "Genre", "Theme", "Frequency Difference" and "Weight" columns
      (labelled "Reduced Frequency Difference" in the legend)
    - colors: theme -> bar colour
    - other_wvs: optional second genre -> frame of the same layout; when given, its bars are drawn
      hatched next to the first model's bars (labelled "Full Frequency Difference")
    - genre_list: genres to plot, one figure each
    """
    for genre in genre_list:
        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(30,15))
        
        themes = list(map_to_rosect_wvs[genre]["Theme"])
        legend_elems = [Patch(facecolor="white", edgecolor="black", label="Reduced Frequency Difference")]
        
        top = map_to_rosect_wvs[genre]["Frequency Difference"].max()
        bottom = map_to_rosect_wvs[genre]["Frequency Difference"].min()
        
        if other_wvs is not None:
            themes = list(set(themes + list(other_wvs[genre]["Theme"])))
            themes.sort()
            positions = np.arange(0, len(themes))
            
            # Outer merge so a theme known to only one model still gets a pair of bars (missing side = 0).
            # NOTE(review): the bars are positioned and coloured by the sorted `themes` list, which
            # assumes the merged rows come back in that same theme order — confirm for the pandas version in use.
            merged = pd.merge(map_to_rosect_wvs[genre], other_wvs[genre], on=["Genre", "Theme"], how="outer").fillna(0)
            merged.rename(columns={"Frequency Difference_x": "Frequency Difference (Reduced)",
                                   "Frequency Difference_y": "Frequency Difference",
                                   "Weight_x": "Weight (Reduced)",
                                   "Weight_y": "Weight"}, inplace=True)
            
            # Two bars per theme, shifted left/right of the tick: plain = first model, hatched = other model.
            axes.bar(positions - 0.2, merged["Frequency Difference (Reduced)"], width=0.4, color=[colors[theme] for theme in themes])
            axes.bar(positions + 0.2, merged["Frequency Difference"], width=0.4, color=[colors[theme] for theme in themes], edgecolor="white", hatch="///")
            legend_elems.append(Patch(facecolor="white", edgecolor="black", label="Full Frequency Difference", hatch="///"))
            
            top = max(top, other_wvs[genre]["Frequency Difference"].max())
            bottom = min(bottom, other_wvs[genre]["Frequency Difference"].min())
            
            plt.xticks(rotation=90)
        
        else:
            # NOTE(review): colours follow the sorted theme list — assumes the frame's rows are already sorted by "Theme".
            themes.sort()
            map_to_rosect_wvs[genre].plot.bar(x="Theme", y="Frequency Difference", ax=axes, rot=90, width=0.5, color=[colors[theme] for theme in themes])            
            
        axes.set_title(genre, fontsize=32)
        
        # Tick density and padding shrink with the magnitude of the plotted differences.
        ymajor = 40 if (top > 0.1 or abs(bottom) > 0.1) else 80 if top > 0.05 else 160
        offset = 0.01 if top > 0.05 else 0.002
        setup_axis(axes, ymin=-1, ymajor=ymajor, yminor=ymajor * 5,
                   x_ticklabels=themes,
                   ylabel="Avg Frequency Difference",
                   bottom=bottom - offset,
                   top=top + offset,
                   grid="-", minor_grid=":")
        
        axes.grid(axis="x", linestyle="--")

        axes.legend(handles=legend_elems, loc="upper right", fontsize=18)
        plt.margins(x=0.025)
        plt.show()

In [None]:
wnrf_to_rosect_set = {genre: wnrf_to_rosect_df_scaled[wnrf_to_rosect_df_scaled["@Genre"] == genre][["Book #", "@Genre"] + [w for w in wnrf_to_rosect_rw[genre]["Feature"]] + ["@Outcome"]] for genre in NEW_GENRES}

In [None]:
wnrf_themes_by_genre_df, wnrf_tbg_themes = get_themes_by_genre(wnrf_to_rosect_set, wnrf_to_rosect_rw)
wnrf_to_rosect_wvs = get_rosect_freq(wnrf_to_rosect_set, wnrf_tbg_themes, wnrf_to_rosect_rw)
colors = create_cmap(plt.cm.tab10, NEW_GENRES)
wnrf_wvs_colors = create_cmap(plt.cm.nipy_spectral, wnrf_tbg_themes)

In [None]:
plot_tbg(wnrf_themes_by_genre_df, wnrf_tbg_themes, colors, sort=True)

In [None]:
plot_tbg(wnrf_themes_by_genre_df, wnrf_tbg_themes, colors)

In [None]:
plot_tbg(wnrf_themes_by_genre_df, wnrf_tbg_themes, colors, scatter=True)

In [None]:
plot_theme_freq_diff_vs_weight(wnrf_to_rosect_wvs, wnrf_wvs_colors)

In [None]:
wn_set = {genre: wn_df[wn_df["@Genre"] == genre] for genre in NEW_GENRES}

In [None]:
# wn_to_rocat = map_to_roget(wn_set, src_model="WordNet", to_categories=True)
# with open(str(PROJ_ROOT.joinpath("data", "wn_to_rocat.txt")), "wb+") as f:
#     try:
#         pickle.dump(wn_to_rocat, f)
#     except MemoryError:
#         print("There was a MemoryError when dumping wn_to_rocat")

# Load the cached WordNet -> Roget Category mapping written by the (commented-out) dump above.
# The file is opened read-only and closed as soon as it has been read.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a cache that
# this project produced itself.
with open(str(PROJ_ROOT.joinpath("data", "wn_to_rocat.txt")), "rb") as f:
    wn_to_rocat = pickle.load(f)

In [None]:
wn_to_rocat_no_scale, wn_to_rocat_scaled = concat_map_to_roget(wn_to_rocat, src_model="WordNet", map_to="Category")

In [None]:
wn_to_rosect = map_to_roget(wn_to_rocat_no_scale, src_model="WordNet", to_sections=True)

In [None]:
wn_to_rosect_no_scale, wn_to_rosect_scaled = concat_map_to_roget(wn_to_rosect, src_model="WordNet", map_to="Section")

In [None]:
full_wn_to_rosect_acc, full_wn_to_rosect_weights, wn_to_rosect_acc, wn_to_rosect_weights = test_map_to_roget(wn_to_rosect_no_scale, wn_to_rosect_scaled, src_model="WordNet", map_to="Section")

In [None]:
# Pool the per-genre *scaled* Section frames (a Section absent from a genre becomes 0) and run
# reduce_features against the by-genre weights and accuracy.
# NOTE(review): the reduced-feature (wnrf) run earlier pools the unscaled frames, rescales them
# together and uses the all-genres weights/accuracy instead — confirm the difference is intended.
wn_to_rosect_df_scaled = pd.concat(list(wn_to_rosect_scaled.values())).fillna(0)
wn_to_rosect_exh, wn_to_rosect_rw = reduce_features(wn_to_rosect_weights, "WordNet to Roget Section", model_df=wn_to_rosect_df_scaled, og_acc=wn_to_rosect_acc)

In [None]:
wn_to_rosect_reduced_acc = plot_exhausted(wn_to_rosect_exh, markersize=20)

In [None]:
wn_to_rosect_set = {genre: wn_to_rosect_df_scaled[wn_to_rosect_df_scaled["@Genre"] == genre][["Book #", "@Genre"] + [w for w in wn_to_rosect_rw[genre]["Feature"]] + ["@Outcome"]] for genre in NEW_GENRES}

In [None]:
wn_to_rosect_themes_by_genre_df, wn_to_rosect_tbg_themes = get_themes_by_genre(wn_to_rosect_set, wn_to_rosect_rw)
wn_to_rosect_wvs = get_rosect_freq(wn_to_rosect_set, wn_to_rosect_tbg_themes, wn_to_rosect_rw)
wn_to_rosect_wvs_colors = create_cmap(plt.cm.nipy_spectral, wn_to_rosect_tbg_themes)

In [None]:
plot_theme_freq_diff_vs_weight(wnrf_to_rosect_wvs, wn_to_rosect_wvs_colors, other_wvs=wn_to_rosect_wvs)

In [None]:
# TODO: Plot only avg freq diff as bar chart
plot_theme_freq_diff(wnrf_to_rosect_wvs, wn_to_rosect_wvs_colors, other_wvs=wn_to_rosect_wvs)

In [None]:
def get_theme_diffs(wvs1: Dict, wvs2: Dict, genre_list: List = GENRES):
    """
    Compare two theme frequency/weight tables genre by genre.

    For every genre the two frames are outer-merged on ("Genre", "Theme"), missing values are
    filled with 0, and the per-theme error is the Euclidean distance between the
    (Frequency Difference, Weight) pair of `wvs1` and the same pair of `wvs2`.

    - wvs1: Dict of genre -> DataFrame with "Genre", "Theme", "Frequency Difference" and "Weight" columns
            (labelled "(Reduced)" after the merge)
    - wvs2: Dict with the same layout, used as the comparison baseline
    - genre_list: List = GENRES, keys of both dicts to process

    Returns a Dict of genre -> DataFrame with "Genre", "Theme" and "Error" columns, sorted by theme
    and followed by one extra "Average" row holding the mean error.
    """
    theme_diffs = {}
    for genre in genre_list:
        g_diff = pd.merge(wvs1[genre], wvs2[genre], on=["Genre", "Theme"], how="outer").fillna(0)
        g_diff = g_diff.rename(columns={"Frequency Difference_x": "Frequency Difference (Reduced)",
                                        "Frequency Difference_y": "Frequency Difference",
                                        "Weight_x": "Weight (Reduced)",
                                        "Weight_y": "Weight"})

        error = distance(g_diff[["Frequency Difference (Reduced)", "Weight (Reduced)"]], g_diff[["Frequency Difference", "Weight"]])
        per_theme = pd.DataFrame({"Genre": g_diff["Genre"], "Theme": g_diff["Theme"], "Error": error}).sort_values(by=["Theme"])

        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat works on every version.
        # The explicit column list keeps the summary row in the same column order as the table.
        average_row = pd.DataFrame([{"Genre": "Average", "Theme": "Average", "Error": per_theme["Error"].mean()}],
                                   columns=per_theme.columns)
        theme_diffs[genre] = pd.concat([per_theme, average_row], ignore_index=True)

    return theme_diffs


def distance(df1: Union[pd.DataFrame, pd.Series], df2: Union[pd.DataFrame, pd.Series]):
    """
    Row-wise Euclidean (L2) distance between two tables of the same shape.

    The underlying arrays are subtracted element by element and the norm is taken
    along axis 1, giving one distance per row.
    """
    row_deltas = df1.values - df2.values
    return np.linalg.norm(row_deltas, axis=1)


def plot_theme_diffs(theme_diffs_: Dict, colors: Dict, genre_list: List = GENRES):
    """
    Draw one bar chart per genre with the per-theme error values.

    - theme_diffs_: Dict of genre -> DataFrame with "Genre", "Theme" and "Error" columns
    - colors: Dict of theme -> bar color
    - genre_list: List = GENRES, keys of `theme_diffs_` to plot (one figure each)
    """
    for genre in genre_list:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30,15))

        # Rows whose "Genre" is "Average" are summary rows, not themes -- keep them out of the chart
        genre_frame = theme_diffs_[genre]
        per_theme = genre_frame[genre_frame["Genre"] != "Average"].copy()
        theme_names = list(per_theme["Theme"])
        bar_colors = [colors[theme_name] for theme_name in theme_names]

        per_theme.plot.bar(x="Theme", y="Error", ax=ax, width=0.5, rot=90, color=bar_colors)

        ax.set_title(genre, fontsize=32)

        largest_error = per_theme["Error"].max()
        if largest_error > 0.7:
            ymajor = 10
        else:
            ymajor = 20
        setup_axis(ax, ymajor=ymajor, yminor=ymajor * 4,
                   x_ticklabels=theme_names, ylabel="No Reduction Theme Error",
                   top=largest_error + 0.02,
                   grid="-", minor_grid=":")

        # No vertical grid lines and no legend: every bar is already labelled on the x axis
        ax.grid(axis="x", linestyle="none")
        ax.get_legend().remove()

        plt.show()

In [None]:
# Per-theme error between wnrf_to_rosect_wvs and wn_to_rosect_wvs for every genre in GENRES (the default genre_list)
wn_to_rosect_theme_diffs = get_theme_diffs(wnrf_to_rosect_wvs, wn_to_rosect_wvs)

In [None]:
# Show the theme-error table of every genre
for genre in GENRES:
    display_df(wn_to_rosect_theme_diffs[genre])

In [None]:
# rcParams is global: this resolution applies to every later figure until it is changed again
plt.rcParams['figure.dpi'] = 250

# One error bar chart per genre, colored with the WordNet -> Section theme colors
plot_theme_diffs(wn_to_rosect_theme_diffs, wn_to_rosect_wvs_colors)

#### Mapping Roget to Roget Section

In [None]:
# Roget feature table: missing values set to 0, label columns renamed to the "@"-prefixed names,
# and the book ids (taken from unigram_df) inserted as the first column
roget_df = (
    pd.DataFrame(roget_data)
    .fillna(0)
    .rename(columns={"_genre": "@Genre", "_outcome": "@Outcome"})
)
roget_df.insert(0, "Book #", unigram_df["Book #"].reset_index(drop=True))

# Per genre: the rows of that genre, restricted to the id columns, the features listed in roget_rw
# and the outcome label
roget_rf_set = {
    genre: roget_df[roget_df["@Genre"] == genre][
        ["Book #", "@Genre"] + list(roget_rw[genre]["Feature"]) + ["@Outcome"]
    ]
    for genre in GENRES
}

In [None]:
# Reduced Roget features -> Roget sections, then the same concat / evaluation steps used for WordNet
roget_rf_to_rosect = map_to_roget(roget_rf_set, src_model="Roget", to_sections=True)
rosect_rf_no_scale, rosect_rf_scaled = concat_map_to_roget(roget_rf_to_rosect, src_model="Roget", map_to="Section")
# Four return values: accuracy/weights for the "full" variant, then accuracy/weights for the second variant (per the names)
full_rosect_rf_acc, full_rosect_rf_weights, rosect_rf_acc, rosect_rf_weights = test_map_to_roget(rosect_rf_no_scale, rosect_rf_scaled, src_model="Roget", map_to="Section")

In [None]:
# Stack every frame of the scaled mapping into one DataFrame; missing values become 0
rosect_rf_df_scaled = pd.concat(list(rosect_rf_scaled.values())).fillna(0)
# Feature reduction on the Roget Section weights, starting from the accuracy measured above (og_acc)
rosect_rf_exh, rosect_rf_rw = reduce_features(rosect_rf_weights, "Roget Section", model_df=rosect_rf_df_scaled, og_acc=rosect_rf_acc)

In [None]:
# Plot the reduction results and keep what plot_exhausted returns (the reduced accuracies, per the name)
rosect_rf_reduced_acc = plot_exhausted(rosect_rf_exh, markersize=20)

In [None]:
# Per genre: the rows of that genre, restricted to the id columns, the section features kept by
# the reduction step and the outcome label
rosect_rf_set = {
    genre: rosect_rf_df_scaled[rosect_rf_df_scaled["@Genre"] == genre][
        ["Book #", "@Genre"] + list(rosect_rf_rw[genre]["Feature"]) + ["@Outcome"]
    ]
    for genre in GENRES
}

In [None]:
# Theme table and theme list for the reduced Roget Section set
rosect_rf_themes_by_genre_df, rosect_rf_tbg_themes = get_themes_by_genre(rosect_rf_set, rosect_rf_rw)
# Per-genre frequency/weight data for those themes
rosect_rf_wvs = get_rosect_freq(rosect_rf_set, rosect_rf_tbg_themes, rosect_rf_rw)
# Color lookup for the themes, drawn from the nipy_spectral colormap (reused by several later plots)
rosect_rf_wvs_colors = create_cmap(plt.cm.nipy_spectral, rosect_rf_tbg_themes)

In [None]:
# Frequency difference vs. weight for the reduced Roget Section themes (no comparison set)
plot_theme_freq_diff_vs_weight(rosect_rf_wvs, rosect_rf_wvs_colors)

#### Genre Prediction - One v. One

In [None]:
# One-v-one genre prediction on the scaled WordNet features; weight tables are not displayed
wn_genre_acc, wn_genre_weights = predict_genre(wordnet_df_scaled, "WordNet", how="one_v_one", disp_weights=False)

In [None]:
# Feature reduction for every genre pair in GENRE_COMBS, limited to 15 steps
wn_genre_exh, wn_genre_rw = reduce_features(wn_genre_weights, "WordNet", max_steps=15, genre_list=GENRE_COMBS, g_predict="one_v_one", og_acc=wn_genre_acc)

In [None]:
# Color lookup for the genre pairs, reused by the later one-v-one plots
g_clf_colors = create_cmap(plt.cm.nipy_spectral, GENRE_COMBS)
# max_steps matches the value given to reduce_features for this model
wn_genre_reduced_acc = plot_exhausted(wn_genre_exh, max_steps=15, genre_list=GENRE_COMBS, colors=g_clf_colors)

In [None]:
# Per genre pair: book id, genre and outcome of every book that belongs to either genre,
# re-indexed from 0
GenresNumsOutcomes = {
    (g1, g2): wn_df[wn_df["@Genre"].isin([g1, g2])][["Book #", "@Genre", "@Outcome"]].reset_index(drop=True)
    for g1, g2 in GENRE_COMBS
}

In [None]:
# One-v-one genre prediction on the scaled Roget features; weight tables are not displayed
roget_genre_acc, roget_genre_weights = predict_genre(roget_df_scaled, "Roget", how="one_v_one", disp_weights=False)

In [None]:
# Per genre pair: every Roget row whose genre is one of the two (all columns kept)
roget_g_set = {
    (g1, g2): roget_df[roget_df["@Genre"].isin([g1, g2])]
    for g1, g2 in GENRE_COMBS
}

In [None]:
# Load the cached one-v-one Roget -> Roget Section mapping. The context manager closes the handle as
# soon as unpickling is done, and read-only binary mode is all that loading needs.
# NOTE(review): pickle can run arbitrary code while loading -- only load cache files you produced yourself.
with open(str(PROJ_ROOT.joinpath("data", "roget_g_to_rosect.txt")), "rb") as f:
    roget_g_to_rosect = pickle.load(f)

# roget_g_to_rosect = map_to_roget(roget_g_set, src_model="Roget", to_sections=True, genre_list=GENRE_COMBS)
# with open(str(PROJ_ROOT.joinpath("data", "roget_g_to_rosect.txt")), "wb+") as f:
#     try:
#         pickle.dump(roget_g_to_rosect, f)
#     except MemoryError:
#         print("There was a MemoryError when dumping roget_g_to_rosect")

In [None]:
# Concat step for the genre-pair mapping; the id/genre/outcome columns are supplied per pair via nums_outcomes
rosect_g_no_scale, rosect_g_scaled = concat_map_to_roget(roget_g_to_rosect, src_model="Roget", map_to="Section", genre_list=GENRE_COMBS, nums_outcomes=GenresNumsOutcomes)
# One-v-one evaluation of the section features for every genre pair
full_rosect_g_acc, full_rosect_g_weights, rosect_g_acc, rosect_g_weights = test_map_to_roget(rosect_g_no_scale, rosect_g_scaled,
                                                                                             src_model="Roget", map_to="Section",
                                                                                             genre_list=GENRE_COMBS, g_predict="one_v_one")

In [None]:
# CHECK WHICH IS BETTER FIRST
# Stack the un-scaled per-pair frames (missing values -> 0) and scale them together with
# process_and_scale; its second return value is not needed here
rosect_g_df = pd.concat(list(rosect_g_no_scale.values())).fillna(0)
rosect_g_df_scaled, _ = process_and_scale(rosect_g_df)

In [None]:
# Per genre pair: every scaled section row whose genre is one of the two (all columns kept)
rosect_g_set = {
    (g1, g2): rosect_g_df_scaled[rosect_g_df_scaled["@Genre"].isin([g1, g2])]
    for g1, g2 in GENRE_COMBS
}

In [None]:
# Themes and frequency/weight data per genre pair, using the "full" weights from test_map_to_roget
rosect_g_themes_by_genre_df, rosect_g_tbg_themes = get_themes_by_genre(rosect_g_set, full_rosect_g_weights, genre_list=GENRE_COMBS)
rosect_g_wvg = get_rosect_freq(rosect_g_set, rosect_g_tbg_themes, full_rosect_g_weights, genre_list=GENRE_COMBS, g_predict="one_v_one")

In [None]:
# Feature reduction on the one-v-one Roget weights for every genre pair in GENRE_COMBS
roget_genre_exh, roget_genre_rw = reduce_features(roget_genre_weights, "Roget", genre_list=GENRE_COMBS, g_predict="one_v_one", og_acc=roget_genre_acc)

In [None]:
# Plot the one-v-one reduction results with the genre-pair colors; keep the returned reduced accuracies (per the name)
roget_genre_reduced_acc = plot_exhausted(roget_genre_exh, genre_list=GENRE_COMBS, colors=g_clf_colors)

In [None]:
# Per genre pair: the Roget rows of either genre, restricted to the id columns, the features kept
# by the one-v-one reduction for that pair and the outcome label
roget_grf_set = {
    (g1, g2): roget_df[roget_df["@Genre"].isin([g1, g2])][
        ["Book #", "@Genre"] + list(roget_genre_rw[(g1, g2)]["Feature"]) + ["@Outcome"]
    ]
    for g1, g2 in GENRE_COMBS
}

In [None]:
# Map the reduced one-v-one Roget features on to Roget sections for every genre pair
roget_grf_to_rosect = map_to_roget(roget_grf_set, src_model="Roget", to_sections=True, genre_list=GENRE_COMBS)

In [None]:
# Concat step for the reduced genre-pair mapping; id/genre/outcome columns come from GenresNumsOutcomes
rosect_grf_no_scale, rosect_grf_scaled = concat_map_to_roget(roget_grf_to_rosect, src_model="Roget", map_to="Section", genre_list=GENRE_COMBS, nums_outcomes=GenresNumsOutcomes)
# One-v-one evaluation of the reduced section features for every genre pair
full_rosect_grf_acc, full_rosect_grf_weights, rosect_grf_acc, rosect_grf_weights = test_map_to_roget(rosect_grf_no_scale, rosect_grf_scaled,
                                                                                                     src_model="Roget", map_to="Section",
                                                                                                     genre_list=GENRE_COMBS, g_predict="one_v_one")

In [None]:
# Stack the un-scaled per-pair frames (missing values -> 0) and scale them together;
# the second return value of process_and_scale is not needed here
rosect_grf_df = pd.concat(list(rosect_grf_no_scale.values())).fillna(0)
rosect_grf_df_scaled, _ = process_and_scale(rosect_grf_df)

In [None]:
# Second reduction pass, now on the section-level features; uses the "full" weights/accuracy from test_map_to_roget
rosect_grf_exh, rosect_grf_rw = reduce_features(full_rosect_grf_weights, "Roget Section", model_df=rosect_grf_df_scaled,
                                                genre_list=GENRE_COMBS, g_predict="one_v_one", og_acc=full_rosect_grf_acc)

In [None]:
# Plot the section-level reduction results per genre pair, with markers switched off
rosect_grf_reduced_acc = plot_exhausted(rosect_grf_exh, genre_list=GENRE_COMBS, colors=g_clf_colors, markers=False)

In [None]:
# Per genre pair: the scaled section rows of either genre, restricted to the id columns, the
# section features kept by the reduction for that pair and the outcome label
rosect_grf_set = {
    (g1, g2): rosect_grf_df_scaled[rosect_grf_df_scaled["@Genre"].isin([g1, g2])][
        ["Book #", "@Genre"] + list(rosect_grf_rw[(g1, g2)]["Feature"]) + ["@Outcome"]
    ]
    for g1, g2 in GENRE_COMBS
}

In [None]:
# Themes and frequency/weight data per genre pair for the reduced section features
rosect_grf_themes_by_genre_df, rosect_grf_tbg_themes = get_themes_by_genre(rosect_grf_set, rosect_grf_rw, genre_list=GENRE_COMBS)
rosect_grf_wvg = get_rosect_freq(rosect_grf_set, rosect_grf_tbg_themes, rosect_grf_rw, genre_list=GENRE_COMBS, g_predict="one_v_one")

In [None]:
# Lower resolution for this plot (rcParams is global and stays in effect for later figures)
plt.rcParams['figure.dpi'] = 100

# Reduced vs. full (other_wvs) section themes per genre pair.
# NOTE(review): reuses rosect_rf_wvs_colors, the lookup built from the per-genre Roget Section themes.
plot_theme_freq_diff_vs_weight(rosect_grf_wvg, rosect_rf_wvs_colors, genre_list=GENRE_COMBS, other_wvs=rosect_g_wvg)

In [None]:
# Per-theme error between the reduced and the full section themes for every genre pair
rosect_genre_theme_diffs = get_theme_diffs(rosect_grf_wvg, rosect_g_wvg, genre_list=GENRE_COMBS)
# genre_similarities = []
# for comb in GENRE_COMBS:
#     similarity = rosect_grf_wvg[comb]["Frequency Difference"].mean()
#     genre_similarities.append({"Genre": comb, "Similarity": abs(similarity)})

# genre_sims_df = pd.DataFrame(genre_similarities).sort_values(by=["Similarity"])

In [None]:
# Higher resolution for the per-pair error charts (rcParams is global)
plt.rcParams['figure.dpi'] = 250

# One error bar chart per genre pair
plot_theme_diffs(rosect_genre_theme_diffs, rosect_rf_wvs_colors, genre_list=GENRE_COMBS)

In [None]:
# rosect_genre_wvgs = get_rosect_freq(rosect_grf_set, rosect_grf_tbg_themes, rosect_genre_rw, genre_list=GENRE_COMBS, g_predict=True, g_success=True)
# plot_theme_freq_diff_vs_weight(rosect_genre_wvgs, rosect_rf_wvs_colors, genre_list=GENRE_COMBS)

# Genre Prediction - One v. All

### Roget

In [None]:
# TODO: For each genre, do genre prediction of selected genre vs. not the selected genre
#           - Pick n random books from selected genre, and n random books from all other genres --> use kfold
#           - Predict if book is of selected genre or not

# Per genre: all Roget rows of that genre (all columns kept)
roget_set = {genre: roget_df[roget_df["@Genre"] == genre] for genre in GENRES}

In [None]:
# One-v-all genre prediction on the scaled Roget features; weight tables are not displayed
roget_ova_acc, roget_ova_weights = predict_genre(roget_df_scaled, "Roget", how="one_v_all", genre_list=GENRES, disp_weights=False)

In [None]:
# Feature reduction on the one-v-all Roget weights (default genre list)
roget_ova_exh, roget_ova_rw = reduce_features(roget_ova_weights, "Roget", g_predict="one_v_all", og_acc=roget_ova_acc)

In [None]:
# Stored under its own name so the one-v-all result does not overwrite the one-v-one
# `roget_genre_reduced_acc` computed earlier in the notebook (WordNet likewise keeps a separate
# `wn_ova_reduced_acc`).
roget_ova_reduced_acc = plot_exhausted(roget_ova_exh, markersize=20)

In [None]:
# Load the cached per-genre Roget -> Roget Section mapping. The context manager closes the handle as
# soon as unpickling is done, and read-only binary mode is all that loading needs.
# NOTE(review): pickle can run arbitrary code while loading -- only load cache files you produced yourself.
with open(str(PROJ_ROOT.joinpath("data", "roget_to_rosect.txt")), "rb") as f:
    roget_to_rosect = pickle.load(f)
# roget_to_rosect = map_to_roget(roget_set, src_model="Roget", to_sections=True)
# with open(str(PROJ_ROOT.joinpath("data", "roget_to_rosect.txt")), "wb+") as f:
#     try:
#         pickle.dump(roget_to_rosect, f)
#     except MemoryError:
#         print("There was a MemoryError when dumping roget_to_rosect")

In [None]:
# Concat step for the full Roget -> Section mapping, then one-v-all evaluation of the section features
rosect_ova_no_scale, rosect_ova_scaled = concat_map_to_roget(roget_to_rosect, src_model="Roget", map_to="Section")
full_rosect_ova_acc, full_rosect_ova_weights, rosect_ova_acc, rosect_ova_weights = test_map_to_roget(rosect_ova_no_scale, rosect_ova_scaled,
                                                                                                     src_model="Roget", map_to="Section",
                                                                                                     g_predict="one_v_all")

# TODO: Is scaling each genre independently cheating? Why does rosect scaled by genre perform better than reduced roget? (same goes for WordNet)

In [None]:
# Stack the un-scaled per-genre frames (missing values -> 0) and scale them together;
# the second return value of process_and_scale is not needed here
rosect_ova_df = pd.concat(list(rosect_ova_no_scale.values())).fillna(0)
rosect_ova_df_scaled, _ = process_and_scale(rosect_ova_df)

In [None]:
# Per genre: all scaled section rows of that genre (all columns kept)
rosect_ova_set = {genre: rosect_ova_df_scaled[rosect_ova_df_scaled["@Genre"] == genre] for genre in GENRES}

In [None]:
# Themes and frequency/weight data per genre, using the "full" one-v-all weights
rosect_ova_themes_by_genre_df, rosect_ova_tbg_themes = get_themes_by_genre(rosect_ova_set, full_rosect_ova_weights)
rosect_ova_wvg = get_rosect_freq(rosect_ova_set, rosect_ova_tbg_themes, full_rosect_ova_weights, g_predict="one_v_all")

In [None]:
# Per genre: the Roget rows of that genre, restricted to the id columns, the features kept by the
# one-v-all reduction and the outcome label
roget_ovarf_set = {
    genre: roget_df[roget_df["@Genre"] == genre][
        ["Book #", "@Genre"] + list(roget_ova_rw[genre]["Feature"]) + ["@Outcome"]
    ]
    for genre in GENRES
}

In [None]:
# Map the reduced one-v-all Roget features on to Roget sections
roget_ovarf_to_rosect = map_to_roget(roget_ovarf_set, src_model="Roget", to_sections=True)

In [None]:
# Concat step for the reduced mapping, then one-v-all evaluation of the section features
rosect_ovarf_no_scale, rosect_ovarf_scaled = concat_map_to_roget(roget_ovarf_to_rosect, src_model="Roget", map_to="Section")
full_rosect_ovarf_acc, full_rosect_ovarf_weights, rosect_ovarf_acc, rosect_ovarf_weights = test_map_to_roget(rosect_ovarf_no_scale, rosect_ovarf_scaled,
                                                                                                             src_model="Roget", map_to="Section",
                                                                                                             g_predict="one_v_all")

In [None]:
# Stack the un-scaled per-genre frames (missing values -> 0) and scale them together;
# the second return value of process_and_scale is not needed here
rosect_ovarf_df = pd.concat(list(rosect_ovarf_no_scale.values())).fillna(0)
rosect_ovarf_df_scaled, _ = process_and_scale(rosect_ovarf_df)

In [None]:
# Second reduction pass on the section-level one-v-all features
rosect_ovarf_exh, rosect_ovarf_rw = reduce_features(rosect_ovarf_weights, "Roget Section", model_df=rosect_ovarf_df_scaled,
                                                    g_predict="one_v_all", og_acc=rosect_ovarf_acc)

In [None]:
# Plot the section-level one-v-all reduction results; keep the returned reduced accuracies (per the name)
rosect_ovarf_reduced_acc = plot_exhausted(rosect_ovarf_exh, markersize=20)

In [None]:
# Per genre: the scaled section rows of that genre, restricted to the id columns, the section
# features kept by the reduction and the outcome label
rosect_ovarf_set = {
    genre: rosect_ovarf_df_scaled[rosect_ovarf_df_scaled["@Genre"] == genre][
        ["Book #", "@Genre"] + list(rosect_ovarf_rw[genre]["Feature"]) + ["@Outcome"]
    ]
    for genre in GENRES
}

In [None]:
# Themes and frequency/weight data per genre for the reduced one-v-all section features
rosect_ovarf_themes_by_genre_df, rosect_ovarf_tbg_themes = get_themes_by_genre(rosect_ovarf_set, rosect_ovarf_rw)
rosect_ovarf_wvg = get_rosect_freq(rosect_ovarf_set, rosect_ovarf_tbg_themes, rosect_ovarf_rw, g_predict="one_v_all")

In [None]:
# Lower resolution for this plot (rcParams is global and stays in effect for later figures)
plt.rcParams['figure.dpi'] = 100
# Reduced vs. full (other_wvs) one-v-all section themes
plot_theme_freq_diff(rosect_ovarf_wvg, rosect_rf_wvs_colors, other_wvs=rosect_ova_wvg)

### WordNet

In [None]:
# One-v-all genre prediction on the scaled WordNet features; weight tables are not displayed
wn_ova_acc, wn_ova_weights = predict_genre(wordnet_df_scaled, "WordNet", how="one_v_all", genre_list=GENRES, disp_weights=False)

In [None]:
# Feature reduction on the one-v-all WordNet weights, limited to 15 steps
wn_ova_exh, wn_ova_rw = reduce_features(wn_ova_weights, "WordNet", max_steps=15, g_predict="one_v_all", og_acc=wn_ova_acc)

In [None]:
# max_steps matches the value given to reduce_features for this model
wn_ova_reduced_acc = plot_exhausted(wn_ova_exh, max_steps=15, markersize=20)

In [None]:
# One-v-all evaluation of the WordNet -> Roget Section features built earlier in the notebook
full_wn_rosect_ova_acc, full_wn_rosect_ova_weights, wn_rosect_ova_acc, wn_rosect_ova_weights = test_map_to_roget(wn_to_rosect_no_scale, wn_to_rosect_scaled,
                                                                                                                 src_model="WordNet", map_to="Section",
                                                                                                                 g_predict="one_v_all")

In [None]:
# Stack the un-scaled per-genre frames (missing values -> 0) and scale them together;
# the second return value of process_and_scale is not needed here
wn_rosect_ova_df = pd.concat(list(wn_to_rosect_no_scale.values())).fillna(0)
wn_rosect_ova_df_scaled, _ = process_and_scale(wn_rosect_ova_df)

In [None]:
# Per genre: all scaled WordNet -> Section rows of that genre (all columns kept)
wn_rosect_ova_set = {genre: wn_rosect_ova_df_scaled[wn_rosect_ova_df_scaled["@Genre"] == genre] for genre in GENRES}

# Themes and frequency/weight data per genre.
# NOTE(review): the Roget counterpart passes the "full" weights here; this cell passes wn_rosect_ova_weights -- confirm that is intended.
wn_rosect_ova_themes_by_genre_df, wn_rosect_ova_tbg_themes = get_themes_by_genre(wn_rosect_ova_set, wn_rosect_ova_weights)
wn_rosect_ova_wvg = get_rosect_freq(wn_rosect_ova_set, wn_rosect_ova_tbg_themes, wn_rosect_ova_weights, g_predict="one_v_all")

In [None]:
def plot_avg_freq_diff_by_genre(map_to_rosect_wvs: Dict, colors: Dict, other_wvs: Optional[Dict] = None, genre_list: List = GENRES, **kwargs):
    """
    Bar chart of the mean absolute "Frequency Difference" of the themes of every genre.

    - map_to_rosect_wvs: Dict of genre -> DataFrame with a "Frequency Difference" column
                         (labelled "Reduced" in the legend)
    - colors: Dict of genre -> bar color
    - other_wvs: Optional Dict with the same layout (labelled "Full" in the legend); when given,
                 its values are drawn as hatched bars next to the first set
    - genre_list: List = GENRES, genres to plot, in bar order
    - title (kwarg): str = ""

    The plotted values, followed by an extra "Average" row, are also shown as a table via display_df.
    """
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(30,15))

    avg_freq_diffs = pd.DataFrame([{"Genre": genre, "Average Frequency Difference": map_to_rosect_wvs[genre]["Frequency Difference"].abs().mean()} for genre in genre_list])

    legend_elems = [Patch(facecolor="white", edgecolor="black", label="Reduced Average Frequency Difference")]

    top = avg_freq_diffs["Average Frequency Difference"].max()
    bottom = avg_freq_diffs["Average Frequency Difference"].min()

    if other_wvs is not None:
        other_avg_freq_diffs = pd.DataFrame([{"Genre": genre, "Average Frequency Difference": other_wvs[genre]["Frequency Difference"].abs().mean()} for genre in genre_list])
        positions = np.arange(0, len(genre_list))

        merged = pd.merge(avg_freq_diffs, other_avg_freq_diffs, on="Genre", how="outer").fillna(0)
        merged = merged.rename(columns={"Average Frequency Difference_x": "Average Frequency Difference (Reduced)",
                                        "Average Frequency Difference_y": "Average Frequency Difference"})

        # Two bars per genre: solid on the left for the first set, hatched on the right for other_wvs
        bar_colors = [colors[genre] for genre in genre_list]
        axes.bar(positions - 0.2, merged["Average Frequency Difference (Reduced)"], width=0.4, color=bar_colors)
        axes.bar(positions + 0.2, merged["Average Frequency Difference"], width=0.4, color=bar_colors, edgecolor="white", hatch="///")
        legend_elems.append(Patch(facecolor="white", edgecolor="black", label="Full Average Frequency Difference", hatch="///"))

        top = max(top, other_avg_freq_diffs["Average Frequency Difference"].max())
        bottom = min(bottom, other_avg_freq_diffs["Average Frequency Difference"].min())

        plt.xticks(rotation=90)
        summary = merged
        average_row = {"Genre": "Average",
                       "Average Frequency Difference (Reduced)": merged["Average Frequency Difference (Reduced)"].mean(),
                       "Average Frequency Difference": merged["Average Frequency Difference"].mean()}

    else:
        avg_freq_diffs.plot.bar(x="Genre", y="Average Frequency Difference", ax=axes, rot=90, width=0.5, color=[colors[genre] for genre in genre_list])
        # Without a comparison set there is no merged table; summarize the single set instead
        summary = avg_freq_diffs
        average_row = {"Genre": "Average", "Average Frequency Difference": avg_freq_diffs["Average Frequency Difference"].mean()}

    # pd.concat instead of DataFrame.append (removed in pandas 2.0); the explicit column list keeps
    # the summary row in the same column order as the table
    display_df(pd.concat([summary, pd.DataFrame([average_row], columns=summary.columns)], ignore_index=True))

    ymajor = 40 if (top > 0.1 or abs(bottom) > 0.1) else 80 if top > 0.05 else 160
    offset = 0.01 if top > 0.05 else 0.002
    setup_axis(axes, ymin=-1, ymajor=ymajor, yminor=ymajor * 5,
               x_ticklabels=genre_list,
               ylabel="Magnitude of Avg Frequency Difference",
               bottom=bottom - offset,
               top=top + offset,
               grid="-", minor_grid=":")

    axes.set_title(kwargs.get("title", ""), fontsize=32)
    axes.grid(axis="x", linestyle="--")

    axes.legend(handles=legend_elems, loc="upper right", fontsize=18)
    plt.margins(x=0.025)
    plt.show()

In [None]:
# TODO: What makes the reduced word list special? How is it able to achieve such high performance?
#           - Magnitude of the avg freq diff should be larger for reduced feature set

# NOTE(review): `colors` is not defined in this part of the notebook; the function indexes it by genre -- confirm it is a genre -> color mapping.
plot_avg_freq_diff_by_genre(wnrf_to_rosect_wvs, colors, other_wvs=wn_to_rosect_wvs, title="Magnitude of Average Freq. Diff. of Themes by Genre - Success Prediction")

In [None]:
# Same chart for the one-v-all genre-prediction themes: reduced set vs. full set (other_wvs)
plot_avg_freq_diff_by_genre(rosect_ovarf_wvg, colors, other_wvs=rosect_ova_wvg, title="Magnitude of Average Freq. Diff. of Themes by Genre - Genre Prediction")

In [None]:
# rcParams is global: every figure drawn after this cell uses 500 dpi until the value is changed again
plt.rcParams['figure.dpi'] = 500

def score_books(model_dict: Dict, model_weights: Dict, weight_name: str):
    """
    Score and rank the books of every genre in GENRES by the weighted sum of their features.

    - model_dict: Dict of genre -> DataFrame with "Book #", "@Genre", "@Outcome" and one column per feature
    - model_weights: Dict of genre -> DataFrame with "Feature" and "Weight" columns; every feature
                     column of `model_dict[genre]` must have a row here
    - weight_name: str, only used in the progress heading

    Returns a Dict of genre -> DataFrame with "Book #", "Genre", "Rank", "WordNet Score" and "Outcome"
    columns, sorted by score (highest first, rank 0). Scores are rescaled per genre with the
    module-level `scale`. The score column is always called "WordNet Score", whatever `weight_name` is.

    The frames in `model_dict` are not modified.
    """
    meta_cols = ["Book #", "@Genre", "@Outcome"]
    scores = {}
    bar_length = sum(len(model_dict[genre].columns) - 3 for genre in GENRES)

    display(HTML(f"<h4>Scoring books with {weight_name} Feature Weights...</h4>"))
    with tqdm(total=bar_length) as pbar:
        for genre in GENRES:
            pbar.set_postfix_str(f" -- {genre}")
            frame = model_dict[genre]
            genre_weights = model_weights[genre]

            # drop() returns a new frame, so the weights are applied to a private copy. Weighting the
            # caller's frame in place would apply the weights a second time if the cell were re-run.
            weighted = frame.drop(columns=meta_cols)
            for col in weighted.columns:
                weight = genre_weights[genre_weights["Feature"] == col]["Weight"].values[0]
                weighted[col] *= weight
                pbar.update(1)

            g_scores = weighted.sum(axis=1).reset_index(drop=True)
            genre_scores = pd.DataFrame({"Book #": frame["Book #"].reset_index(drop=True), "Genre": frame["@Genre"].reset_index(drop=True), "WordNet Score": g_scores, "Outcome": frame["@Outcome"].reset_index(drop=True)})
            genre_scores["WordNet Score"] = scale.fit_transform(genre_scores[["WordNet Score"]])
            genre_scores = genre_scores.sort_values(by=["WordNet Score"], ascending=False)
            # After sorting, the fresh 0..n-1 index is the rank
            scores[genre] = genre_scores.reset_index(drop=True).reset_index().rename(columns={"index": "Rank"})[["Book #", "Genre", "Rank", "WordNet Score", "Outcome"]]

    return scores


def get_precision(scores: Dict, display_scores: bool = False, genre_list: Optional[List] = None):
    """
    Precision of the ranked score tables, per genre.

    For each genre, k is the number of books whose "Outcome" is "SUCCESSFUL"; the precision is the
    share of successful books among the first k rows of that genre's table (the tables are expected
    to be sorted best-first already).

    - scores: Dict of genre -> DataFrame with an "Outcome" column
    - display_scores: bool = False, also show each genre's table (at most 16 rows)
    - genre_list: Optional[List] = None, genres to evaluate; None means the module-level GENRES

    Returns a DataFrame with "Genre" and "Pre" columns. "Pre" is NaN for a genre without any
    successful book.
    """
    genres = GENRES if genre_list is None else genre_list

    score_pre = []
    for genre in genres:
        genre_scores = scores[genre]
        if display_scores:
            display(HTML(f"<b>{genre} Scores</b>"))
            display_df(genre_scores, max_rows=16)

        num_success = int((genre_scores["Outcome"] == "SUCCESSFUL").sum())
        top = genre_scores.head(num_success)
        hits = int((top["Outcome"] == "SUCCESSFUL").sum())
        # Precision is undefined when nothing is successful; report NaN instead of dividing by zero
        pre = hits / num_success if num_success else float("nan")
        score_pre.append({"Genre": genre, "Pre": pre})

    return pd.DataFrame(score_pre)

In [None]:
# Per genre: the scaled WordNet rows of that genre, restricted to the id columns, the features
# listed in wn_rw and the outcome label
wn_rf_scaled = {
    genre: wordnet_df_scaled[wordnet_df_scaled["@Genre"] == genre][
        ["Book #", "@Genre"] + list(wn_rw[genre]["Feature"]) + ["@Outcome"]
    ]
    for genre in GENRES
}

In [None]:
# Rank the books of every genre by their weighted reduced-WordNet feature sum
wn_scores = score_books(wn_rf_scaled, wn_rw, "Reduced WordNet")

In [None]:
# Per-genre precision of the ranking; display_scores=True also shows each genre's score table
wn_score_pre_df = get_precision(wn_scores, display_scores=True)

In [None]:
# Show the per-genre precision table
display_df(wn_score_pre_df)

In [None]:
# TODO: Compare reduced word lists of each genre --> how are they similar/different?

from functools import reduce


def get_genre_intersections(model_weights: Dict, display: bool = False):
    """
    Find the features two genres have in common, for every genre pair in GENRE_COMBS.

    - model_weights: Dict of genre -> DataFrame with a "Feature" column and one weight column
    - display: bool = False, show each pair's table (at most 10 rows) with the pair and the number
               of shared features as the header

    Returns a Dict of (genre1, genre2) -> DataFrame with a "Feature" column, one column per genre
    holding that genre's absolute weight rescaled with the module-level `scale`, and a "Difference"
    column (genre1 minus genre2). Rows are sorted by the genre1 column, highest first.
    """
    intersections = {}

    for g1, g2 in GENRE_COMBS:
        # The inner merge keeps only features present in both genres; name the weight columns after the genres
        shared = pd.merge(model_weights[g1], model_weights[g2], on="Feature")
        shared.columns = ["Feature", g1, g2]

        # Rescale each genre's absolute weights separately (the scaler is re-fitted for every column)
        for g in (g1, g2):
            shared[g] = scale.fit_transform(shared[[g]].abs())

        shared["Difference"] = shared[g1] - shared[g2]
        shared = shared.sort_values(by=[g1], ascending=False)
        intersections[(g1, g2)] = shared

        if display:
            display_df(shared, f"<b>{g1}, {g2} -- {len(shared)}</b>", max_rows=10)

    return intersections

In [None]:
# Shared reduced-WordNet features for every genre pair; display=True also shows each pair's table
wn_intersections = get_genre_intersections(wn_rw, display=True)

#### Accuracy Variables

In [None]:
# Per-genre accuracies for UNI, BI and POS (presumably the unigram, bigram and part-of-speech
# models -- TODO confirm). The values are written as percentages and stored as fractions of 1.
UNI = {genre: percent / 100 for genre, percent in {
    'Adventure_Stories': 84.0,
    'Fiction': 75.0,
    'Historical_Fiction': 60.0,
    'Love_Stories': 82.0,
    'Mystery': 73.0,
    'Poetry': 71.0,
    'Science_Fiction': 61.0,
    'Short_Stories': 57.0,
}.items()}

BI = {genre: percent / 100 for genre, percent in {
    'Adventure_Stories': 81.0,
    'Fiction': 75.0,
    'Historical_Fiction': 51.0,
    'Love_Stories': 72.0,
    'Mystery': 73.0,
    'Poetry': 72.0,
    'Science_Fiction': 59.0,
    'Short_Stories': 57.0,
}.items()}

POS = {genre: percent / 100 for genre, percent in {
    'Adventure_Stories': 74.0,
    'Fiction': 72.0,
    'Historical_Fiction': 47.0,
    'Love_Stories': 65.9,
    'Mystery': 63.9,
    'Poetry': 63.0,
    'Science_Fiction': 63.0,
    'Short_Stories': 67.0,
}.items()}

#### Plotting

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(30,15))

all_acc_colors = create_cmap(plt.cm.tab20, ACCURACIES, as_dict=False) # [to_hex(c) for c in cycler("color", plt.cm.tab20(np.linspace(0, 1, len(ACCURACIES)))).by_key()["color"]]

# One accuracy line per model
for (name, accuracies), color in zip(ACCURACIES.items(), all_acc_colors):
    # rename() without inplace works on a copy. Renaming the shared ACCURACIES frames in place would
    # remove their "Accuracy" column for every cell that runs afterwards.
    accuracies.rename(columns={"Accuracy": name}).plot(x="Genre", y=name, ax=axes, rot=0, color=color, linewidth=3)

axes.set_xticks(np.linspace(0, len(GENRES), len(GENRES) * 4 + 1), minor=True)    
axes.set_xticks(np.linspace(0, len(GENRES), len(GENRES) * 2 + 1))
# The major ticks set last win: one labelled tick per genre
axes.set_xticks(np.arange(0, len(GENRES)))
axes.set_xticklabels(GENRES)

axes.set_yticks(np.linspace(0, 1, 101), minor=True)
axes.set_yticks(np.linspace(0, 1, 21))

axes.tick_params(axis="x", labelsize=20)
axes.tick_params(axis="y", labelsize=24)
axes.set_xlabel("Genre", fontsize=28, labelpad=20)
axes.set_ylabel("Accuracy", fontsize=32, labelpad=30)
# axes.set_ylim(bottom=0.4, top=0.8)
axes.grid(linestyle="--")
axes.grid(axis="y", linestyle=":", which="minor")


legend = axes.legend(ACCURACIES.keys(), bbox_to_anchor=(1, 0.95), fontsize=22)
plt.margins(x=0.01, y=0.05)
plt.show()

In [None]:
# Average accuracy of every model, best first
avg_rows = []
for k, v in ACCURACIES.items():
    # Plotting cells in this notebook rename "Accuracy" to the model name in place, so accept either column
    acc_col = "Accuracy" if "Accuracy" in v.columns else k
    # .loc with a boolean mask returns a Series; take the value of the "Average" row as a scalar
    # so the "Avg" column is numeric and can be sorted
    average = v.loc[v["Genre"] == "Average", acc_col].iloc[0]
    avg_rows.append({"Model": k, "Avg": average})

avg_df = pd.DataFrame(avg_rows).sort_values(by=["Avg"], ascending=False).reset_index(drop=True)
display_df(avg_df)

In [None]:
reduced_accs = {"Unigram Reduced": uni_reduced_acc, "Bigram Reduced": bi_reduced_acc, "POS Reduced": pos_reduced_acc, 
                "Roget Reduced": roget_reduced_acc, "WordNet Reduced": wn_reduced_acc, "LIWC Reduced": liwc_reduced_acc}

comparison_colors = create_cmap(plt.cm.tab10, reduced_accs, as_dict=False) # [to_hex(c) for c in cycler("color", plt.cm.tab10(np.linspace(0, len(reduced_accs)))).by_key()["color"]]

# "Unigram Reduced" -> "Unigram": the un-reduced accuracies of the same model
originals = {k.split()[0]: ACCURACIES[k.split()[0]] for k in reduced_accs.keys()}

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(30,15))

# Reduced models: full-strength lines
for (name, accuracies), m, c in zip(reduced_accs.items(), markers[:len(reduced_accs)], comparison_colors):
    accuracies.plot(x="Genre", y="Accuracy", ax=axes, rot=0, color=c, marker=m, markersize=20, markeredgewidth=3, fillstyle="none", linewidth=3, label=name)

# Original models: same marker and color as their reduced counterpart, drawn thinner and half transparent
for (name, accuracies), m, c in zip(originals.items(), markers[:len(originals)], comparison_colors):
    # rename() without inplace works on a copy. Renaming the shared ACCURACIES frames in place would
    # remove their "Accuracy" column for every cell that runs afterwards.
    accuracies.rename(columns={"Accuracy": name}).plot(x="Genre", y=name, ax=axes, rot=0, color=c, marker=m, markersize=20, markeredgewidth=2, fillstyle="none", linewidth=2, alpha=0.5)

axes.set_xticks(np.linspace(0, len(GENRES), len(GENRES) * 4 + 1), minor=True)
axes.set_xticks(np.linspace(0, len(GENRES), len(GENRES) * 2 + 1))
# The major ticks set last win: one labelled tick per genre
axes.set_xticks(np.arange(0, len(GENRES)))
axes.set_xticklabels(GENRES)

axes.set_yticks(np.linspace(0, 1, 101), minor=True)
axes.set_yticks(np.linspace(0, 1, 21))

axes.tick_params(axis="x", labelsize=20)
axes.tick_params(axis="y", labelsize=24)
axes.set_xlabel("Genre", fontsize=28, labelpad=20)
axes.set_ylabel("Accuracy", fontsize=32, labelpad=30)
axes.grid(linestyle="--")

legend = axes.legend(list(reduced_accs.keys()) + list(originals.keys()), bbox_to_anchor=(1, 1.27), fontsize=22, ncol=2)
plt.margins(x=0.01, y=0.05)
plt.show()

In [None]:
# Average reduced accuracy per model, best first.
# NOTE(review): the divisor 8 is hard-coded -- presumably the number of genres; confirm.
reduced_avg_df = pd.DataFrame(
    [{"Model": name, "Avg": accuracies["Accuracy"].sum() / 8} for name, accuracies in reduced_accs.items()]
)
reduced_avg_df = reduced_avg_df.sort_values(by=["Avg"], ascending=False).reset_index(drop=True)
display_df(reduced_avg_df)

In [None]:
# For every genre: weights of the features shared by the unigram and WordNet models, their
# correlation, and a CSV export of the table
uni_wn_weights = {}
uni_wn_corr = {}
for genre in GENRES:
    merged = uni_weights[genre].merge(wordnet_weights[genre], how="inner", on="Feature")
    merged = merged.rename(columns={"Weight_x": "Unigram", "Weight_y": "WordNet"})
    merged = merged.sort_values(by=["Unigram"], ascending=False)
    uni_wn_weights[genre] = merged

    # Keep the numeric correlation per genre; only the heading uses the rounded text
    uni_wn_corr[genre] = merged["Unigram"].corr(merged["WordNet"])
    corr = "{0:.3f}".format(uni_wn_corr[genre])
    display_df(merged, f"<h4>Unigram-WordNet Intersecting Feature Weights - {genre}</h4>"
                       f"{genre} Correlation: {corr}", 10, True)

    # The context manager closes the file even if to_csv raises; the handle is no longer named
    # `csv`, which would shadow the standard-library module of that name
    with open(str(PROJ_ROOT.joinpath("data", f"{genre}_uni_wn_weights.csv")), "w+", newline="") as csv_file:
        merged.to_csv(csv_file, index=False)

In [None]:
# Stack the WordNet weights of every genre (without the "Feature" column), tagging each row
# with the genre it came from
per_genre_frames = []
for genre in GENRES:
    wordnet_genre_weights = wordnet_weights[genre].drop(columns=["Feature"]).assign(Genre=genre)
    per_genre_frames.append(wordnet_genre_weights)

wordnet_all_weights = pd.concat(per_genre_frames)

# Replace the genre names with integer labels
le = preprocessing.LabelEncoder()
wordnet_all_weights["Genre"] = le.fit_transform(wordnet_all_weights["Genre"])

In [None]:
WEIGHTS = {"Unigram": uni_weights,
           "Roget": roget_weights,
           "POS": pos_weights,
           "WordNet": wordnet_weights,
           "SentiWordNet": swn_weights,
           "NRC Sentiment Emotion Lexicons": nrc_weights}

fig, axes = plt.subplots(nrows=1, ncols=len(WEIGHTS), figsize=(20, 10))

# One panel per model: the first 10 rows of the first weight table stored for that model.
# The loop runs over WEIGHTS alone; zipping it with GENRES (whose value was never used) would
# silently drop panels if there were fewer genres than models.
for i, (name, weights) in enumerate(WEIGHTS.items()):
    # csv = open(PROJ_ROOT.joinpath("data", f"{name}_weights.csv"), "w+", newline="")
    # list(weights.values())[0].to_csv(csv, header=False)
    next(iter(weights.values())).head(10).plot(ax=axes[i], title=name, kind="bar", rot=60, width=0.7, colormap=plt.cm.tab20)

axes[0].set_ylabel("Weight", labelpad=20, fontsize=16)

# Apply the shared axis styling to every panel, however many models WEIGHTS holds
for weight_axis in axes:
    setup(weight_axis)

plt.show()

In [None]:
# Line plot of the values stored in roget_acc, with one x tick per genre.
# NOTE(review): roget_acc.values() is called, so roget_acc must be a dict-like here -- confirm.
# NOTE(review): `_genre` is read as an attribute of roget_df_scaled, while roget_df has that column
# renamed to "@Genre" earlier in the notebook -- confirm roget_df_scaled still carries `_genre`.
# NOTE(review): range(8) hard-codes the number of ticks.
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
plt.plot(list(roget_acc.values()), label = 'roget section accuracy',linewidth=4)
ax.legend(loc='best')
plt.xticks(range(8), roget_df_scaled._genre.unique())
plt.show()

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim and draws the same figure again --
# it can probably be deleted.
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(111)
plt.plot(list(roget_acc.values()), label = 'roget section accuracy',linewidth=4)
ax.legend(loc='best')
plt.xticks(range(8), roget_df_scaled._genre.unique())
plt.show()