# 2-Representation

## Import

In [14]:
import pickle

import numpy as np
from numpy.random import shuffle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from text_to_tokens import load, to_words_text, remove_stop_words_text,\
    stem_text

## Tokenization

I'll tokenize each document by partitioning it on whitespace. In addition, I'll remove English stop words and stem all the words that remain.

In [None]:
# Dataset under analysis: the IMDB movie reviews.
ds_name = 'imdb'
docs_all = load(ds_name)

# Pieces of the pickle filenames; later cells rebuild the same paths from them.
# NOTE: `dir` shadows the Python builtin of the same name. It is kept because
# the cells that read the pickles back refer to it under this name.
dir = '../data'
tag = 'tokenized'
suffix = 'pkl'

# For each entry of docs_all: split into words, drop stop words, stem,
# then persist the result to its own pickle file.
df_tokenized = {}
for (key, docs) in docs_all.items():
    df_tokenized[key] = stem_text(
        remove_stop_words_text(
            to_words_text(docs)
        )
    )

    out_path = f'{dir}/{ds_name}-{key}-{tag}.{suffix}'
    with open(out_path, 'wb') as out_file:
        pickle.dump(df_tokenized[key], out_file)


## Test

As a test of the above code, I'll read in both pickled files and give them a cursory inspection.

In [4]:
# Read the tokenized train and test pickles back from disk.
train_path = f'{dir}/{ds_name}-train-{tag}.{suffix}'
test_path = f'{dir}/{ds_name}-test-{tag}.{suffix}'

with open(train_path, 'rb') as train_file:
    train = pickle.load(train_file)

with open(test_path, 'rb') as test_file:
    test = pickle.load(test_file)

# Report (number of text entries, number of keys) for each split.
for split_name, split in (('train', train), ('test', test)):
    n_texts = len(split["text"])
    n_keys = len(split.keys())
    print(f'Shape of {split_name}: ({n_texts}, {n_keys})')


Shape of train: (25000, 2)
Shape of test: (25000, 2)


I'll print out the first 11 tokens (words) of an arbitrary text sample from each of train and test.

In [5]:
# Show the leading tokens of one arbitrarily chosen training document.
# NOTE(review): the displayed output contains 'sawbr' and 'br', which looks
# like residue of HTML line breaks surviving tokenization -- worth confirming.
train_sample_tokens = train['text'][1000]
train_sample_tokens[:11]

['admit',
 'laugh',
 'watch',
 'movi',
 'few',
 'comedi',
 'sawbr',
 'br',
 'budget',
 'have',
 'consist']

In [6]:
# Show the leading tokens of one arbitrarily chosen test document.
test_sample_tokens = test['text'][1000]
test_sample_tokens[:11]

['film',
 'about',
 'struggl',
 'actor',
 'tri',
 'satisfact',
 'life',
 'especi',
 'love',
 'he',
 'tast']

## Vectorize

I'll use TF-IDF to vectorize the collection of text tokens to get ready for a classification analysis using `RandomForestClassifier`.

In [17]:
# Re-join each document's tokens into a single space-separated string, so
# every corpus is a list with one string per document.
train_corpus = [' '.join(tokens) for tokens in train['text']]
test_corpus = [' '.join(tokens) for tokens in test['text']]

# Shuffle with a fixed seed so that "Restart & Run All" yields the same
# samples every time. The permutations are kept in train_order / test_order
# so that any other per-document field of train / test (still in the
# original order) can be aligned with the shuffled corpora by indexing it
# with the same order.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
train_order = rng.permutation(len(train_corpus))
test_order = rng.permutation(len(test_corpus))
train_corpus = [train_corpus[i] for i in train_order]
test_corpus = [test_corpus[i] for i in test_order]

# Keep only the first n_samples documents of each shuffled corpus.
n_samples = 10
small_train_corpus = train_corpus[:n_samples]
small_test_corpus = test_corpus[:n_samples]


In [20]:
# TF-IDF settings: max_df / min_df bound the document frequency of the terms
# that are kept, and max_features caps the size of the vocabulary.
tfidf_options = {
    'max_df': 0.95,
    'min_df': 2,
    'max_features': 100,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training sample only; the test sample is transformed with the
# already-fitted vectorizer.
X_train = vectorizer.fit_transform(small_train_corpus)
X_test = vectorizer.transform(small_test_corpus)


## Train a Random Forest