# Feature Engineering: Parts of Speech

## imports

In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import date
import json
from tqdm.notebook import tqdm

import re
from collections import Counter, defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import Phrases

import spacy

In [None]:
# English stop-word list from NLTK's stopwords corpus.
# NOTE(review): `sw` is not referenced anywhere else in the visible source — confirm it is still needed.
sw = stopwords.words("english")

In [None]:
# Load the pickled `metascripts` object from the data directory.
# NOTE(review): pickle.load can execute arbitrary code stored in the file — only load pickles
# that this project wrote itself.
# Later cells index the object by 'description' and 'transcript' and call .values() on each,
# so it is presumably a dict of dicts rather than a DataFrame (despite "_df" in the file name)
# — TODO confirm.
with open('../data/metascripts_df_profanity', 'rb') as file:
    metascripts = pickle.load(file)

## prepare the data

In [None]:
# Pull the descriptions and transcripts out as two parallel lists
# (assumes both inner mappings share the same key order — TODO confirm).
descriptions = list(metascripts['description'].values())
scripts = list(metascripts['transcript'].values())
# NOTE(review): despite its name this is a one-shot zip iterator of (description, transcript)
# pairs, not a dict, and it is not used again in the visible source.
scripts_dict = zip(descriptions, scripts)

## part-of-speech frequencies and proportions

In [None]:
# create chunker to manage memory usage
def chunker(iterable, chunksize):
    """Yield successive chunks of at most ``chunksize`` items from ``iterable``.

    Parameters
    ----------
    iterable : iterable
        Data to split. Objects that support ``len()`` are sliced, so every
        chunk has the same type as the input (list -> list, str -> str).
        Objects without a length (generators, other one-shot iterators) are
        consumed lazily and each chunk is yielded as a list.
    chunksize : int
        Maximum number of items per chunk; must be at least 1.

    Yields
    ------
    Consecutive, non-overlapping chunks in the original order. The last chunk
    may be shorter than ``chunksize``.

    Raises
    ------
    ValueError
        If ``chunksize`` is smaller than 1 (raised when iteration starts).
    """
    if chunksize < 1:
        raise ValueError("chunksize must be a positive integer")

    try:
        n_items = len(iterable)
    except TypeError:
        # No length available: batch lazily instead of materialising the
        # whole input, which would also exhaust a generator before slicing.
        from itertools import islice

        iterator = iter(iterable)
        # iter(callable, sentinel) stops at the first empty batch.
        for chunk in iter(lambda: list(islice(iterator, chunksize)), []):
            yield chunk
        return

    for start in range(0, n_items, chunksize):
        yield iterable[start:start + chunksize]

# create function that builds a dictionary from pos counts
def get_doc_pos_count(doc):
    """Count how often each part-of-speech tag occurs in ``doc``.

    Parameters
    ----------
    doc : iterable
        Tokens that expose a ``pos_`` attribute (for example a spaCy ``Doc``).

    Returns
    -------
    dict
        Maps every ``pos_`` value seen in ``doc`` to its number of
        occurrences, with keys in first-seen order. Empty input gives ``{}``.
    """
    # Counter (imported at the top of the file) does the tallying; convert
    # back to a plain dict so callers get the same type as before.
    return dict(Counter(token.pos_ for token in doc))

In [None]:
# instantiate the English model: nlp
nlp = spacy.load('en_core_web_md')

# number of scripts handed to nlp.pipe at a time; defined once so the chunker call and the
# progress-bar total cannot drift apart
CHUNK_SIZE = 10

# np.ceil returns a float; cast to int so the progress bar gets a whole-number total
n_chunks = int(np.ceil(len(scripts) / CHUNK_SIZE))

# stream scripts in chunks through the nlp pipe, make pos counts dictionaries, and append to complete list
docs_pos_counts = []
for scripts_subset in tqdm(chunker(scripts, CHUNK_SIZE), total=n_chunks):
    docs_pos_counts.extend(get_doc_pos_count(doc) for doc in nlp.pipe(scripts_subset))

In [None]:
def _keep_word_tags(tag_counts, excluded=('PUNCT', 'SPACE', 'X')):
    """Return a copy of ``tag_counts`` without the tags listed in ``excluded``."""
    return {tag: n for tag, n in tag_counts.items() if tag not in excluded}


def _as_proportions(tag_counts):
    """Return each count in ``tag_counts`` divided by the sum of all counts."""
    total = sum(tag_counts.values())
    # An empty mapping never reaches the division, so total == 0 is harmless.
    return {tag: n / total for tag, n in tag_counts.items()}


# per-document counts restricted to actual words (PUNCT, SPACE and X removed)
docs_pos_counts_words = [_keep_word_tags(tag_counts) for tag_counts in docs_pos_counts]

# pair every description with its word-only counts
show_pos_counts = dict(zip(descriptions, docs_pos_counts_words))

# create dictionary of dictionaries of pos proportions by show
show_pos_props = {
    description: _as_proportions(tag_counts)
    for description, tag_counts in show_pos_counts.items()
}

In [None]:
# create pos proportion dataframe, pos_df
# outer keys (descriptions) become the row index, inner keys (pos tags) the columns
pos_df = pd.DataFrame.from_dict(show_pos_props, orient='index')
# a tag missing from a show's dictionary shows up as NaN; store it as 0 instead
pos_df = pos_df.fillna(0)
# move the descriptions out of the index into an ordinary column named 'description'
pos_df = pos_df.reset_index()
pos_df = pos_df.rename(columns={'index': 'description'})

## pickle pos_df and docs_pos_counts_words for future use

In [None]:
# pickle pos_df for future use
with open('../data/pos_props_df.pickle', 'wb') as file:
    pickle.dump(pos_df, file)

# pickle the word-only pos counts; this file must receive the counts, not a second copy of pos_df
# NOTE(review): docs_pos_counts_words is a list ordered like `descriptions`; if the "_dict" in the
# file name means the description-keyed mapping is wanted, dump show_pos_counts instead — confirm.
with open('../data/docs_pos_counts_words_dict.pickle', 'wb') as file:
    pickle.dump(docs_pos_counts_words, file)

## explore

In [None]:
# Box plot of one part-of-speech column of pos_df, drawing every row as a point and
# showing its description on hover.
# NOTE(review): `px` is never imported in the visible source — presumably
# `import plotly.express as px`; without that import this cell raises NameError. Confirm and
# add it to the imports cell.
pos_col = 'VERB'
px.box(pos_df, x = pos_col, hover_data = ['description'], points = 'all')