# Feature Engineering: Point of View

## import

In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import date
import json
from tqdm.notebook import tqdm

import re
from collections import Counter, defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import Phrases

In [None]:
# English stop-word list taken from the NLTK stopwords corpus.
# NOTE(review): `sw` is not referenced anywhere else in the visible part of
# this notebook — confirm it is still needed.
sw = stopwords.words("english")

In [None]:
def _unpickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; only load files you trust.
metascripts = _unpickle('../data/metascripts_repetition_df')
pos_df = _unpickle('../data/pos_props_df.pickle')
docs_pos_counts_words = _unpickle('../data/docs_pos_counts_words_dict.pickle')

## prepare the data

In [None]:
# Pull the descriptions and transcripts out of `metascripts` as parallel lists.
# Each list follows the iteration order of its own mapping, so the two only
# line up if both mappings share the same key order.
# NOTE(review): presumably both are keyed by the same script ids — confirm.
descriptions = [*metascripts['description'].values()]
scripts = [*metascripts['transcript'].values()]

# NOTE(review): despite its name this is a one-shot zip iterator of
# (description, transcript) pairs, not a dict; it is empty after one pass.
scripts_dict = zip(descriptions, scripts)

In [None]:
# Split every transcript into word tokens with the regular expression below.
tok_scripts = [
    regexp_tokenize(transcript, r"\b[a-zA-Z'\w\-\*]+\b")
    for transcript in scripts
]

# Lower-case every token so that later counting is case-insensitive.
tok_scripts_lc = [list(map(str.lower, tokens)) for tokens in tok_scripts]

# One Counter of token frequencies per script.
word_counts = list(map(Counter, tok_scripts_lc))

# Build the gensim vocabulary, then encode each script as a bag of words
# (a list of (token id, count) pairs).
dictionary = Dictionary(tok_scripts_lc)
corpus = [dictionary.doc2bow(tokens) for tokens in tok_scripts_lc]

In [None]:
# Point-of-view pronouns, lower-cased to match the lower-cased script tokens.
# Every list carries the subject, object, possessive and reflexive forms so
# that the three points of view are measured on the same footing (the
# reflexives were previously present for the first person only).
# NOTE(review): the tokenizer pattern keeps apostrophes inside a token, so
# contractions such as "i'm" or "you're" are separate tokens and are not
# matched by these lists — confirm whether they should be counted too.
first_pron = ["i", "me", "my", "mine", "myself",
            "we", "us", "our", "ours", "ourselves"]

second_pron = ["you", "your", "yours",
            "yourself", "yourselves"]

third_pron = ["he", "she", "it",
            "him", "her",
            "his", "its", "hers",
            "they", "them", "their", "theirs",
            "himself", "herself", "itself", "themselves"]

## point-of-view frequencies and proportions

In [None]:
def count_pov(word_dict, pos_dict, pronouns=None):
    """
    Count point-of-view usage for a single document.

    Pronouns are matched by word. Nouns and proper nouns (the 'NOUN' and
    'PROPN' part-of-speech counts, spaCy tag names) are counted as third
    person along with the usual 3rd-person pronouns.

    Parameters
    ----------
    word_dict : mapping of str -> int
        Word frequencies for the document; words are compared as-is, so they
        must already be in the same case as the pronoun lists.
    pos_dict : mapping of str -> int
        Part-of-speech tag frequencies for the document. Missing 'NOUN' or
        'PROPN' entries are treated as zero.
    pronouns : mapping of str -> iterable of str, optional
        Category name -> words belonging to it. Defaults to the module-level
        ``first_pron``, ``second_pron`` and ``third_pron`` lists under the
        keys 'first_person', 'second_person' and 'third_person'.

    Returns
    -------
    dict
        Category name -> count. Every category is always present (zero when
        unused), so all documents produce the same set of keys.
    """
    if pronouns is None:
        pronouns = {
            'first_person': first_pron,
            'second_person': second_pron,
            'third_person': third_pron,
        }

    # Invert to word -> category for constant-time lookups. setdefault keeps
    # the first category a word is listed under, which mirrors the priority
    # of the original first/second/third if-elif chain.
    category_of = {}
    for category, words in pronouns.items():
        for word in words:
            category_of.setdefault(word, category)

    # Start every category at zero so an unused point of view shows up as 0
    # rather than as a missing key (which becomes NaN in a DataFrame).
    pov_counts = dict.fromkeys(pronouns, 0)
    pov_counts.setdefault('third_person', 0)

    for word, count in word_dict.items():
        category = category_of.get(word)
        if category is not None:
            pov_counts[category] += count

    # .get() tolerates documents whose tag counts lack nouns or proper nouns.
    pov_counts['third_person'] += pos_dict.get('NOUN', 0) + pos_dict.get('PROPN', 0)
    return pov_counts

In [None]:
# Translate each bag-of-words document from (token id, count) pairs into a
# {word: count} dictionary that can be read and searched by word.
word_counts_dicts = [
    {dictionary[token_id]: freq for token_id, freq in bow}
    for bow in corpus
]

In [None]:
def _proportions(counts, total):
    """Return each count divided by *total*; all 0.0 when *total* is zero."""
    if not total:
        # An empty script (or one with no point-of-view hits) has no
        # meaningful share; report 0.0 instead of raising ZeroDivisionError.
        return {pov: 0.0 for pov in counts}
    return {pov: count / total for pov, count in counts.items()}


# Point-of-view counts per script: one dict per script, pairing each script's
# word counts with its part-of-speech counts by position.
pov_counts = [
    count_pov(word_count_dict, pos_count_dict)
    for word_count_dict, pos_count_dict in zip(word_counts_dicts, docs_pos_counts_words)
]

# Share of each point of view relative to all words in the script
# (hence "overall"). The word total is computed once per script.
pov_props_overall = [
    _proportions(pov_count_dict, sum(word_count_dict.values()))
    for pov_count_dict, word_count_dict in zip(pov_counts, word_counts_dicts)
]

# Share of each point of view relative to the script's total point-of-view
# count (the third-person count also includes nouns and proper nouns).
pov_props_relative = [
    _proportions(pov_count_dict, sum(pov_count_dict.values()))
    for pov_count_dict in pov_counts
]

In [None]:
# Helper that turns one list of per-script dictionaries into a DataFrame.
def make_pov_df(pov_dict):
    """
    Build a DataFrame with one row per description.

    `pov_dict` is, despite its name, a sequence of dictionaries that is paired
    by position with the module-level `descriptions` list. The dictionary keys
    become columns and the description is exposed as a 'description' column.
    Pairing goes through a dict, so entries that share a description collapse
    into a single row (the last one wins).
    """
    rows_by_description = dict(zip(descriptions, pov_dict))
    frame = pd.DataFrame.from_dict(rows_by_description, orient='index')
    frame = frame.reset_index()
    return frame.rename(columns={'index': 'description'})

# One DataFrame per measure: raw counts, share of all words, and share among
# the point-of-view counts.
pov_counts_df, pov_props_overall_df, pov_props_relative_df = map(
    make_pov_df,
    (pov_counts, pov_props_overall, pov_props_relative),
)

## pickle point-of-view dfs

In [None]:
# Write each point-of-view DataFrame to its own pickle file, in the same
# order as before: counts, overall proportions, relative proportions.
for out_path, frame in (
    ('../data/pov_counts_df.pickle', pov_counts_df),
    ('../data/pov_props_overall_df.pickle', pov_props_overall_df),
    ('../data/pov_props_relative_df.pickle', pov_props_relative_df),
):
    with open(out_path, 'wb') as file:
        pickle.dump(frame, file)