# Preprocessing Text Modality

In this section, we begin with the text modality data, which has been converted from GloVe embeddings back to words (`text_modality.pkl`, see `preprocessing_transform_from_glove_to_words.ipynb`). We then preprocess the text as follows:

- Tokenization: Using the [spaCy](https://pypi.org/project/spacy/) package, we tokenize each text into individual words.
- Stop word removal: Common stop words are removed.
- Rare word filtering: Rare words are removed to reduce sparsity in the dataset.
- Word counts by sentence: Finally, the data is transformed into word count representations, segmented by sentences.


## Packages & functions

In [1]:
import pickle
from tqdm import tqdm
import numpy as np

import spacy
nlp = spacy.load("en_core_web_trf")

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def transform_to_input(how_long_sentences):
    """Split each tokenized text into fixed-size word chunks and count words.

    The function reads the following module-level names, which must be set
    by the surrounding cells before it is called: ``N``,
    ``text_modality_token``, ``my_stops``, ``word_dictionary_vectorizer``
    and ``min_df0``.

    For every observation, the words that are not in ``my_stops`` and that
    are present in ``word_dictionary_vectorizer`` are grouped into
    consecutive chunks ("sentences") of ``how_long_sentences`` words; the
    last chunk of an observation may be shorter. A ``CountVectorizer`` with
    ``min_df=min_df0`` is then fitted on all chunks.

    Parameters
    ----------
    how_long_sentences : int
        Number of words per chunk; must be at least 1.

    Returns
    -------
    data_text : list
        One entry per observation: the rows of the count matrix belonging
        to that observation with all-zero rows removed, or ``[]`` when no
        non-zero row exists.
    text_not_empty_ind : list of bool
        ``True`` where the matching ``data_text`` entry is non-empty.
    index_text_sentence : list of int
        Observation index of every chunk.
    text_modality_sentence : list of str
        The chunks themselves, words joined by a single space.
    X : numpy.ndarray
        Dense chunk-by-word count matrix.
    word_dictionary : list of str
        Vocabulary of the fitted vectorizer (column order of ``X``).

    Raises
    ------
    ValueError
        If ``how_long_sentences`` is smaller than 1.
    """
    if how_long_sentences < 1:
        raise ValueError('how_long_sentences must be a positive integer')

    # Sets give O(1) membership tests; the originals are sequences.
    stop_set = set(my_stops)
    vocabulary = set(word_dictionary_vectorizer)

    index_text_sentence = []
    text_modality_sentence = []
    # (start, stop) row range of every observation inside the chunk list.
    # Chunks are appended observation by observation, so the rows that
    # belong to one observation are always contiguous.
    row_bounds = []

    for i in tqdm(range(N)):
        kept_words = [
            word for word in text_modality_token[i].split()
            if word not in stop_set and word in vocabulary
        ]
        first_row = len(text_modality_sentence)
        # Slicing truncates at the end, so the last chunk needs no special case.
        for offset in range(0, len(kept_words), how_long_sentences):
            index_text_sentence.append(i)
            text_modality_sentence.append(
                ' '.join(kept_words[offset:offset + how_long_sentences])
            )
        row_bounds.append((first_row, len(text_modality_sentence)))

    print('How many observations in, how many sentences', len(np.unique(np.array(index_text_sentence)))/N, len(index_text_sentence))

    vectorizer = CountVectorizer(min_df=min_df0)
    X = vectorizer.fit_transform(text_modality_sentence)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old versions that lack get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        word_dictionary = vectorizer.get_feature_names_out().tolist()
    else:
        word_dictionary = vectorizer.get_feature_names()
    X = X.toarray()

    data_text = []
    text_not_empty_ind = []
    for first_row, last_row in tqdm(row_bounds):
        X_tmp = X[first_row:last_row, :]
        # Drop chunks whose words were all removed by the min_df filter;
        # boolean indexing returns a copy, so X itself is not aliased.
        X_tmp = X_tmp[np.sum(X_tmp, axis=1) > 0, :]

        if X_tmp.shape[0] > 0:
            data_text.append(X_tmp)
            text_not_empty_ind.append(True)
        else:
            data_text.append([])
            text_not_empty_ind.append(False)

    return data_text, text_not_empty_ind, index_text_sentence, text_modality_sentence, X, word_dictionary

## MOSI

In [164]:
# Load the MOSI texts (already converted back from GloVe embeddings to words).
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with trusted, locally produced pickles.
with open('data_transformed/MOSI/text_modality.pkl', "rb") as input_file:
    text_modality = pickle.load(input_file)
# Number of observations; read as a global by the cells below and by
# transform_to_input.
N = len(text_modality)

### Tokenization

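We tokenize each observed text with spaCy, drop tokens whose part-of-speech tag marks them as function words (determiners, pronouns, conjunctions, adpositions), and keep the lemma of every remaining token.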

In [166]:
# Coarse part-of-speech tags (spaCy `Token.pos_`) treated as stop words:
# tokens carrying one of these tags are dropped during tokenization.
# Tag overview:
# https://ashutoshtripathi.com/2020/04/13/parts-of-speech-tagging-and-dependency-parsing-using-spacy-nlp/
stop_words_part_of_speech = ['DET', 'PRON', 'CONJ', 'CCONJ', 'ADP']

In [167]:
# Filled by the tokenization loop with one space-joined lemma string per
# observation. Re-run this cell before re-running the loop, otherwise the
# loop appends a second copy of every entry.
text_modality_token = []

In [None]:
for n in tqdm(range(N)):
    # Run the spaCy pipeline on the n-th text.
    parsed_text = nlp(text_modality[n])
    # Keep the lemma of every token whose POS tag is not on the stop list.
    kept_lemmas = [
        token.lemma_
        for token in parsed_text
        if token.pos_ not in stop_words_part_of_speech
    ]
    text_modality_token.append(' '.join(kept_lemmas))

### Split into sentences

In [169]:
# Minimum number of documents a word must occur in to enter the vocabulary;
# also read as a global by transform_to_input.
min_df0 = 5
vectorizer = CountVectorizer(min_df=min_df0)
X = vectorizer.fit_transform(text_modality_token)
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when available and keep the result a plain list of str either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    word_dictionary_vectorizer = vectorizer.get_feature_names_out().tolist()
else:
    word_dictionary_vectorizer = vectorizer.get_feature_names()
X = X.toarray()

In [176]:
# Additional lemmas that transform_to_input drops from every text (read there
# as a global), on top of the POS-based filtering done during tokenization.
my_stops = ['be', 'do', 'have']

In [None]:
# Build the word-count representation with chunks of one word each.
# Note that this rebinds X to the chunk-level count matrix.
data_text, text_not_empty_ind, index_text_sentence, text_modality_sentence, X, word_dictionary = transform_to_input(1)

In [None]:
# Sanity check: data_text holds one entry per observation.
len(data_text)

In [183]:
# with open('data_transformed/MOSI/text_modality_array_sentence1word.pkl', 'wb') as f:  
#     pickle.dump([data_text, text_not_empty_ind], f, protocol=5)

## MOSEI

In [5]:
# First of three MOSEI text pickles.
with open('data_transformed/MOSEI/text_modality_v7619.pkl', "rb") as input_file:
    text_modality_0 = pickle.load(input_file)
# Only the first element of the pickled object is used.
# NOTE(review): presumably the list of texts; confirm what the other elements hold.
text_modality_0 = text_modality_0[0]

In [92]:
# Second of three MOSEI text pickles.
with open('data_transformed/MOSEI/text_modality_v15239.pkl', "rb") as input_file:
    text_modality_1 = pickle.load(input_file)
# Only the first element of the pickled object is used.
# NOTE(review): presumably the list of texts; confirm what the other elements hold.
text_modality_1 = text_modality_1[0]

In [94]:
# Third of three MOSEI text pickles.
with open('data_transformed/MOSEI/text_modality_v22859.pkl', "rb") as input_file:
    text_modality_2 = pickle.load(input_file)
# Only the first element of the pickled object is used.
# NOTE(review): presumably the list of texts; confirm what the other elements hold.
text_modality_2 = text_modality_2[0]

In [121]:
# Concatenate the three parts, in file order, into one collection of texts.
text_modality = text_modality_0 + text_modality_1 + text_modality_2

In [123]:
# Number of MOSEI observations (rebinds the global N used for MOSI above).
N = len(text_modality)

### Tokenization

Computed in `preprocessing_text_modality_MOSEI.py`

### Split into sentences

In [6]:
# Load the pre-tokenized MOSEI texts, stored in three parts.
# These assignments rebind text_modality_0/1/2, replacing the raw texts
# loaded under the same names earlier in the notebook.
with open('data_transformed/MOSEI/text_modality_token_v0.pkl', "rb") as input_file:
    text_modality_0 = pickle.load(input_file)
with open('data_transformed/MOSEI/text_modality_token_v1.pkl', "rb") as input_file:
    text_modality_1 = pickle.load(input_file)
with open('data_transformed/MOSEI/text_modality_token_v2.pkl', "rb") as input_file:
    text_modality_2 = pickle.load(input_file)

In [7]:
# Concatenate the three token parts in file order; N is the number of
# tokenized observations and is read as a global by transform_to_input.
text_modality_token = text_modality_0 + text_modality_1 + text_modality_2
N = len(text_modality_token)

In [None]:
for n in tqdm(range(N)):
    # Discard every whitespace-separated token that contains the scraped URL
    # fragment or the Unicode replacement character, then re-join the rest.
    kept_tokens = []
    for token in text_modality_token[n].split():
        if 'youngentrepreneur.com' in token or '�' in token:
            continue
        kept_tokens.append(token)
    text_modality_token[n] = ' '.join(kept_tokens)

In [10]:
# Minimum number of documents a word must occur in to enter the vocabulary;
# also read as a global by transform_to_input.
min_df0 = 50
vectorizer = CountVectorizer(min_df=min_df0)
X = vectorizer.fit_transform(text_modality_token)
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when available and keep the result a plain list of str either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    word_dictionary_vectorizer = vectorizer.get_feature_names_out().tolist()
else:
    word_dictionary_vectorizer = vectorizer.get_feature_names()
X = X.toarray()

In [16]:
# Additional lemmas that transform_to_input drops from every text (read there
# as a global).
my_stops = ['be', 'do', 'have']

In [None]:
# Build the word-count representation with chunks of one word each.
# Note that this rebinds X to the chunk-level count matrix.
data_text, text_not_empty_ind, index_text_sentence, text_modality_sentence, X, word_dictionary = transform_to_input(1)

In [18]:
# Persist the per-observation count matrices together with the non-empty
# indicator. Pickle protocol 5 requires Python 3.8 or newer.
with open('data_transformed/MOSEI/text_modality_array_sentence1words.pkl', 'wb') as f:  
    pickle.dump([data_text, text_not_empty_ind], f, protocol=5)