# Utility notebook for dataframe prep

This notebook outlines the process for building a dataset from a prepared CSV of authors and posts. This CSV can be built using the functions in the `parse_file.py` script. The outputs of this notebook are a pickled NumPy embedding matrix of GloVe vectors and a pickled dataframe containing the sequence-encoded posts.

The procedure used to prepare the dataframe can be summarized by:
- Filter out posts with low # of words
- Filter out long outliers
- Select top n occurring authors
- Create a vocabulary and a mapping of vocab -> GloVe vector
- Create embedding matrix using the vocabulary
- Encode each post to create new dataframe

In [None]:
import csv
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm
tqdm.pandas()

In [None]:
# Path of the GloVe text file that build_embeddings() reads.
GLOVE_FILE = "glove.twitter.27B.50d.txt"
# Label for the GloVe variant; only used to name the output pickle files.
GLOVE_TYPE = "twitter"
# Number of columns of the embedding matrix.
EMBEDDING_VECTOR_SIZE = 50 # should match glove file
# Posts must have strictly more than this many whitespace-separated words
# to be kept by prepare_df().
MIN_POST_LENGTH = 250 # maybe fillers not included
# Length every encoded post is padded/truncated to (pad_sequences maxlen).
INPUT_LENGTH = 200
# Number of most-represented authors kept by prepare_df().
N_AUTHORS = 25
# Tokenizer vocabulary cap; also the number of rows of the embedding matrix.
MAX_NUM_WORDS=5000
# CSV of posts that the dataframe is loaded from.
POST_FILE = "files/posts.csv"


## Run initial prep on the dataframe
- Get length of each post and filter on post length
- Remove extremely long posts (likely copy/pasted articles)
- Select n most represented authors

In [None]:
def get_length(x):
    """Return the number of whitespace-separated words in ``x``.

    Values that are not text (for example ``None`` or the ``NaN`` floats
    pandas uses for missing cells) count as zero words instead of raising.
    """
    try:
        return len(x.split())
    except (AttributeError, TypeError):
        # Only the "not a string-like value" failures are swallowed; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 0


def prepare_df(df, min_post_length, n_authors):
    """Filter posts by length and keep only the most prolific authors.

    Args:
        df: dataframe with a ``post`` column (text) and an ``author`` column.
        min_post_length: posts must contain strictly more words than this.
        n_authors: number of most-represented authors to keep.

    Returns:
        A new, independent dataframe (the input is not modified) containing
        the surviving rows plus a ``length`` column with each post's word
        count.
    """
    # assign() builds a new frame, so the caller's dataframe does not
    # silently gain a "length" column.
    with_length = df.assign(length=df["post"].apply(get_length))

    # filter posts by post length
    length_filtered_df = with_length[with_length["length"] > min_post_length]

    # remove outlier lengths: drop everything at or above the 95th percentile
    # of the remaining lengths
    length_cutoff = length_filtered_df["length"].quantile(.95)
    length_filtered_df = length_filtered_df[length_filtered_df["length"] < length_cutoff]

    # author counts (value_counts sorts by descending frequency)
    author_counts = length_filtered_df["author"].value_counts()

    # select n highest authors; iloc keeps the slice positional whatever the
    # author labels look like
    top_authors = author_counts.iloc[:n_authors].index
    author_filtered_df = length_filtered_df[length_filtered_df["author"].isin(top_authors)]

    # Return a copy rather than a slice of the input so callers can add
    # columns to the result without pandas chained-assignment warnings.
    return author_filtered_df.copy()

## Set up utility functions for extracting embeddings from GloVe file and building the embedding matrix

In [None]:
def build_embeddings(glove_file):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to look like ``word v1 v2 ... vN`` with
    the fields separated by spaces.

    Args:
        glove_file: path to the GloVe ``.txt`` file.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array of its
        embedding components.

    Raises:
        ValueError: if a vector component cannot be parsed as a number.
    """
    embeddings_dict = {}

    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on plain spaces only: a bare split() would also split on
            # any Unicode whitespace character that is part of a token.
            fields = line.rstrip().split(" ")
            word = fields[0]
            coefs = [field for field in fields[1:] if field]
            if not coefs:
                # blank line or a token without a vector
                continue
            # Parse once here; numeric arrays are far more compact than
            # lists of strings.
            embeddings_dict[word] = np.asarray(coefs, dtype="float32")

    return embeddings_dict

In [None]:
def build_embedding_matrix(tokenizer, glove_dim, embeddings_dict, max_words):
    """Build the ``(max_words, glove_dim)`` embedding matrix for a tokenizer.

    Row ``i`` holds the GloVe vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. Rows for words without a GloVe entry
    stay all-zero.

    Args:
        tokenizer: object exposing a ``word_index`` mapping of word -> int.
        glove_dim: length of each embedding vector.
        embeddings_dict: mapping of word -> embedding vector.
        max_words: number of rows in the matrix; words whose index is not
            below this value are skipped.

    Returns:
        numpy array of shape ``(max_words, glove_dim)``.
    """
    embedding_matrix = np.zeros((max_words, glove_dim))
    # Use the tokenizer that was passed in, not a notebook-level global.
    for word, i in tokenizer.word_index.items():
        if i >= max_words:
            # word_index can hold more entries than the matrix has rows
            # (Keras keeps the full vocabulary there even when num_words is
            # set), so out-of-range indices are ignored instead of raising.
            continue
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = np.asarray(embedding_vector, dtype=embedding_matrix.dtype)

    return embedding_matrix

Note: this is kind of cyclical and dumb because we're originally filtering on the post length; but, I don't want to compile a full vocabulary and then pare it down so I'm using the post length redundantly for both the initial filter and encoding filter. Might change later...

In [None]:
# Create dataframe from the original post file
# (prepare_df below reads its "post" and "author" columns)
df = pd.read_csv(POST_FILE)
# bare expression so the notebook displays the loaded frame
df

### Run dataframe preparation...

In [None]:
# Keep posts longer than MIN_POST_LENGTH words, drop the longest outliers,
# and restrict to the N_AUTHORS most represented authors.
prepared_df = prepare_df(df, MIN_POST_LENGTH, N_AUTHORS)
# bare expression so the notebook displays the filtered frame
prepared_df

### Build embedding dictionary...

In [None]:
# Parse the GloVe file into a word -> vector dictionary.
embedding_dict = build_embeddings(GLOVE_FILE)

### Prepare tokenizer and integer encode the docs...

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# prepare tokenizer
# max_num_words has to be higher than actual to account for
# vocab loss in GloVe representation
t = Tokenizer(num_words=MAX_NUM_WORDS)

t.fit_on_texts(prepared_df["post"])

# Work on an independent copy: a plain `output_df = prepared_df` is only an
# alias, so adding columns to output_df would also modify prepared_df.
output_df = prepared_df.copy()

# integer encode the documents
encoded_docs = t.texts_to_sequences(prepared_df["post"])
# pad (at the end) / truncate every sequence to exactly INPUT_LENGTH tokens
padded = pad_sequences(encoded_docs, maxlen=INPUT_LENGTH, padding='post')

### Convert all sequences from NumPy arrays to Python lists...
This conversion is necessary for downstream use of `df.values`.
Without the conversion, the result is a nested NumPy array.

In [None]:
# One padded sequence (a 1-D numpy row of `padded`) per dataframe row.
output_df["input"] = list(padded)

# temporary fix for numpy df entries 
# (turn each cell's numpy array into a plain Python list; progress_apply is
# the tqdm-enabled variant of apply and shows a progress bar)
output_df["input"] = output_df.input.progress_apply(lambda r: r.tolist())

### Create the embedding matrix

In [None]:
# embedding_dict was already built from GLOVE_FILE in the
# "Build embedding dictionary" cell; reuse it rather than parsing the whole
# GloVe file a second time.
embedding_matrix = build_embedding_matrix(t, EMBEDDING_VECTOR_SIZE, embedding_dict, MAX_NUM_WORDS)

### Save the embedding matrix and dataframe

In [None]:
# drop null values
# dropna() returns a new dataframe instead of modifying in place, so the
# result has to be bound back to output_df for the rows to actually go away.
output_df = output_df.dropna(axis=0)
# bare expression so the notebook still displays the cleaned frame
output_df

In [None]:
# Save the encoded dataframe; the file name records the GloVe type, author
# count, embedding size and input length used to build it.
output_df.to_pickle(f"files/data/{GLOVE_TYPE}_{N_AUTHORS}_{EMBEDDING_VECTOR_SIZE}_{INPUT_LENGTH}_df.pickle")

In [None]:
# save the embedding matrix
import pickle
matrix_filename = f"files/data/{GLOVE_TYPE}_{N_AUTHORS}_{EMBEDDING_VECTOR_SIZE}_{INPUT_LENGTH}_embedding.pickle"
with open(matrix_filename,'wb') as f:
    # asarray avoids copying when embedding_matrix is already a numpy array
    pickle.dump(np.asarray(embedding_matrix), f)

# The dataframe itself is pickled in the previous cell, so it is not
# written a second time here.

### Get the vocab size

In [None]:
# Number of rows of the embedding matrix, i.e. the max_words value it was
# built with (not the number of words that actually have a GloVe vector).
vocab_size = len(embedding_matrix)