# Feature Engineering: Repetition and Phrases

I've found two good ways to get ngrams:

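   1. Using Gensim's [Phrases model](https://radimrehurek.com/gensim_3.8.3/models/phrases.html) iteratively across the corpus, where each iteration is trained on (and applied to) the output of the previous one, so that longer n-grams are built up from shorter ones (the first iteration creates bigrams)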
   2. Using one of SKLearn's text feature extraction modules [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) or [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer), which is equivalent to the CountVectorizer followed by the [TfidfTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer)

## imports

In [1]:
import pickle
import numpy as np
import pandas as pd
from datetime import date
import json
from tqdm.notebook import tqdm

import re
from collections import Counter, defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import Phrases

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# English stop-word list from NLTK's stopwords corpus.
sw = stopwords.words("english")

In [3]:
# Load the pickled `metascripts` object that the rest of the notebook works on.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# open pickle files that come from a trusted source.
with open('../data/metascripts_df_profanity.pickle', 'rb') as file:
    metascripts = pickle.load(file)

## prepare the data

In [4]:
# Pull the description and transcript columns out as two parallel lists.
descriptions, scripts = (
    list(metascripts[column].values) for column in ('description', 'transcript')
)
# Map each description to its transcript; if a description occurs more than
# once, only the last transcript paired with it is kept.
scripts_dict = {description: script for description, script in zip(descriptions, scripts)}

## use sklearn to get repetition of phrases up to 7grams

In [83]:
# Get ngram term frequencies (1-grams through 7-grams) using a count vectorizer.
# A token must start with an ASCII letter and may continue with letters,
# hyphens, or apostrophes. The leading class is deliberately `[a-zA-Z]`:
# writing the range as `A-z` would also accept `[`, `\`, `]`, `^`, `_`, and a
# backtick as the first character of a token.
ct_vectorizer = CountVectorizer(
    lowercase=True,
    token_pattern=r"\b[a-zA-Z][a-zA-Z\-']*\b",
    ngram_range=(1, 7),
    stop_words="english",
)
scripts_tf = ct_vectorizer.fit_transform(scripts)

# Get ngram "term frequency-inverse document frequencies" using a tf-idf
# vectorizer configured with the same parameters as the count vectorizer.
tfidf_vectorizer = TfidfVectorizer(**ct_vectorizer.get_params())
scripts_tfidf = tfidf_vectorizer.fit_transform(scripts)

# Silence every warning raised from this point on.
import warnings
warnings.filterwarnings('ignore')

In [88]:
# Number of distinct features learned by the tf-idf vectorizer. The fitted
# vocabulary maps each feature to its column index, so its length is the
# feature count and no array of feature names has to be built to get it.
len(tfidf_vectorizer.vocabulary_)

7159922

In [85]:
# For each show (one row of scripts_tf): the number of distinct phrases that
# occur at least three times, and that number as a share of all distinct
# phrases appearing in the show.
repeated_per_show = np.asarray((scripts_tf >= 3).sum(axis=1)).ravel()
distinct_per_show = np.asarray((scripts_tf > 0).sum(axis=1)).ravel()

threepeat_counts = repeated_per_show.tolist()
threepeat_props = (repeated_per_show / distinct_per_show).tolist()

# Store both measures as new columns on metascripts.
metascripts['threepeat counts'] = threepeat_counts
metascripts['threepeat proportions'] = threepeat_props

In [None]:
import plotly.express as px

# Box plot of the 'threepeat proportions' column with every row drawn as a
# point; hovering a point also shows that row's description.
box_options = {
    'x': 'threepeat proportions',
    'hover_data': ['description'],
    'points': 'all',
}
px.box(metascripts, **box_options)

## pickle results

In [87]:
# Persist metascripts, including the repetition columns, as a pickle file.
output_path = '../data/metascripts_repetition_df.pickle'
with open(output_path, 'wb') as out_file:
    pickle.dump(metascripts, out_file)

## appendix:
### with gensim

In [75]:
# WordNet lemmatizer instance used by the tokenization and n-gram cells below.
lemmatizer = WordNetLemmatizer()

In [76]:
# Split every transcript into runs of word characters, apostrophes, and hyphens.
token_pattern = r"['\-\w]+"
tok_scripts = [regexp_tokenize(transcript, token_pattern) for transcript in scripts_dict.values()]

# Lower-cased copies of each tokenized transcript, first as-is and then with
# every token passed through the WordNet lemmatizer.
docs_no_lem = [[tok.lower() for tok in doc] for doc in tok_scripts]
docs_lem = [[lemmatizer.lemmatize(tok) for tok in doc] for doc in docs_no_lem]

In [77]:
# Phrase models must be chained to get anything longer than a bigram: by
# default a single Phrases pass only joins adjacent pairs of its *input*
# tokens, so the model for each n has to be trained on, and applied to,
# documents that were already merged by every lower-order model. Lowering the
# Phrases threshold does not help if only the last model is applied to raw
# tokens.

def _phrase_tokens(tokens):
    """Return, in order, the tokens that contain the '_' phrase delimiter."""
    return [token for token in tokens if '_' in token]


def append_ngrams(docs, ngram):
    """Append the phrases `ngram` finds in each document to that document.

    Args:
        docs: list of token lists. Each inner list is extended in place.
        ngram: object for which `ngram[doc]` yields the phrase-merged tokens
            of `doc`, e.g. a fitted gensim `Phrases` model.

    Returns:
        The same `docs` list that was passed in, after mutation.
    """
    for doc in docs:
        # The phrases are collected into a list before the document is
        # extended, so the document is never modified while the transform's
        # output is still being read (which matters if that output is lazy).
        doc.extend(_phrase_tokens(ngram[doc]))
    return docs


def make_ngrams(tok_corpus, with_dict = False, lemmatize = True, max_n = 2, min_count = 5, **kwargs):
    """Lower-case a tokenized corpus and append the phrases found in it.

    One `Phrases` model is fitted per n in 2..max_n. The model for n is fitted
    on the corpus as merged by the models for 2..n-1, and the phrases appended
    to each document come from running the document through the whole chain.

    Args:
        tok_corpus: iterable of token lists (one per document).
        with_dict: if true, also return the fitted models.
        lemmatize: if true, lemmatize each lower-cased token with the
            module-level `lemmatizer`.
        max_n: highest model to fit. Values below 2 fit no model and append
            no phrases.
        min_count: forwarded to every `Phrases` model.
        **kwargs: further keyword arguments forwarded to every `Phrases`
            model (e.g. `threshold`).

    Returns:
        The list of processed documents, each followed by its '_'-joined
        phrases; or `(docs, ngram_dict)` when `with_dict` is true, where
        `ngram_dict` maps '2grams', '3grams', ... to the fitted models.
    """
    if lemmatize:
        docs = [[lemmatizer.lemmatize(tok.lower()) for tok in transcript] for transcript in tok_corpus]
    else:
        docs = [[tok.lower() for tok in transcript] for transcript in tok_corpus]

    ngram_dict = {}
    merged = docs
    for n in range(2, max_n + 1):
        model = Phrases(merged, min_count = min_count, **kwargs)
        ngram_dict[f'{n}grams'] = model
        # Feed the next model the output of this one, so that already-joined
        # phrases can be joined again into longer ones.
        merged = [model[doc] for doc in merged]

    # With no fitted model `merged` is `docs` itself; skip the append so a
    # raw token that happens to contain '_' is not duplicated.
    if ngram_dict:
        for doc, merged_doc in zip(docs, merged):
            doc.extend(_phrase_tokens(merged_doc))

    if with_dict:
        return docs, ngram_dict
    return docs

In [None]:
# Build lemmatized documents with their phrases appended and keep the fitted
# models, requesting models up to '4grams' with min_count and threshold both
# set to 1.
docs, ngram_dict = make_ngrams(tok_scripts, with_dict = True, lemmatize = True, max_n = 4, min_count = 1, threshold = 1)

In [None]:
# Display the dictionary of fitted phrase models returned by make_ngrams.
ngram_dict

In [None]:
# For every document that has any, print the counts of its tokens that contain
# at least two '_'-terminated segments (presumably phrases joining three or
# more words).
multi_join = re.compile(r"(.+_){2}")  # compiled once, outside the loops
for doc in docs:
    c = Counter(tok for tok in doc if multi_join.search(tok))
    if c:
        print(c)

In [None]:
# (description, occurrence count) for every show whose lower-cased transcript
# contains "what is that"; entries 15 through 18 of that list are displayed.
# The phrase is a plain literal, so str.count gives the same non-overlapping
# count as a regex search, and each transcript is scanned only once.
phrase = "what is that"
phrase_counts = (
    (description, script.lower().count(phrase))
    for description, script in zip(descriptions, scripts)
)
[(description, hits) for description, hits in phrase_counts if hits > 0][15:19]