# 2-Representation

## Import

In [95]:
import numpy as np
from numpy.random import choice
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score,\
    recall_score, f1_score

from text_to_tokens import load, to_words_text, remove_stop_words_text,\
    stem_text

## Tokenization

I'll tokenize each document by paritioning it on whitespace. In addition, I'll remove English stop words and stem all the words that remain.

In [None]:
# I'm working with the IMDB movie reviews dataset
ds_name = 'imdb'
docs_all = load(ds_name)

# strings needed to specify input/output filenames
dir = '../data'
tag = 'tokenized'
suffix = 'pkl'

df_tokenized = {}
for (key, docs) in docs_all.items():
    new_dict = docs
    new_dict = to_words_text(new_dict)
    new_dict = remove_stop_words_text(new_dict)
    new_dict = stem_text(new_dict)        
    df_tokenized[key] = new_dict
        
    filename = f'{dir}/{ds_name}-{key}-{tag}.{suffix}'
    with open(filename, 'wb') as f:
        pickle.dump(df_tokenized[key], f)


## Test

As a test of the above code, I'll read in both pickled files and give them a cursory inspection.

In [4]:
with open(f'{dir}/{ds_name}-train-{tag}.{suffix}', 'rb') as f:
    train = pickle.load(f)

with open(f'{dir}/{ds_name}-test-{tag}.{suffix}', 'rb') as f:
    test = pickle.load(f)

print(f'Shape of train: ({len(train["text"])}, {len(train.keys())})')
print(f'Shape of test: ({len(test["text"])}, {len(test.keys())})')


Shape of train: (25000, 2)
Shape of test: (25000, 2)


I'll print out the first 10 tokens (words) of an arbitrary text sample from each of train and test.

In [34]:
train['text'][1000][:11]

['admit',
 'laugh',
 'watch',
 'movi',
 'few',
 'comedi',
 'sawbr',
 'br',
 'budget',
 'have',
 'consist']

In [35]:
test['text'][1000][:11]

['film',
 'about',
 'struggl',
 'actor',
 'tri',
 'satisfact',
 'life',
 'especi',
 'love',
 'he',
 'tast']

## Vectorize

I'll use TF-IDF to vectorize the collection of text tokens to get ready for a classification analysis using `RandomForestClassifier`.

In [58]:
# For both train and test sets, join the tokens together to form a corpus.
train_df = pd.DataFrame(train)
train_df['text'] = train_df['text'].apply(lambda x: ' '.join(x))

test_df = pd.DataFrame(test)
test_df['text'] = test_df['text'].apply(lambda x: ' '.join(x))

test_df.head()

Unnamed: 0,text,label
0,love scifi am will put lot scifi moviestv usua...,0
1,worth entertain valu a rental especi like acti...,0
2,a total averag film a semialright action seque...,0
3,star rate saturday night friday night friday m...,0
4,off let say havent enjoy van damm movi bloodsp...,0


In [85]:
# Consider a subset of each (train and test) shuffled corpus.
n_samples = 150

train_indices = choice(np.arange(train_df.shape[0]), n_samples)
test_indices = choice(np.arange(test_df.shape[0]), n_samples)

small_train_df = train_df.loc[train_indices, :] 
small_test_df = test_df.loc[test_indices, :]


In [86]:
X_train = small_train_df['text']
X_test = small_test_df['text']
y_train = small_train_df['label']
y_test = small_test_df['label']

vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=150
)

X_train_ft = np.array(vectorizer.fit_transform(X_train).todense())
X_test_t = np.array(vectorizer.transform(X_test).todense())

print(f'Shape of X_train_ft: {X_train_ft.shape}')
print(f'Shape of X_test_t: {X_test_t.shape}')


Shape of X_train_ft: (150, 150)
Shape of X_test_t: (150, 150)


## Train a Random Forest Classifier

In [98]:
rfc = RandomForestClassifier()

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 5, 6]
}

search = GridSearchCV(
    rfc,
    param_grid=param_grid,
    n_jobs=-1,
    refit=True,
    cv=None,
    verbose=1
)

results = search.fit(X_train_ft, y_train)
print(f'Best Parameters: {results.best_params_}\n')


Fitting 5 folds for each of 6 candidates, totalling 30 fits


I'll score the results:

In [97]:
model = results.best_estimator_
y_train_pred = model.predict(X_train_ft)
y_test_pred = model.predict(X_test_t)

print('Training scores:')
print(f'Accuracy: {accuracy_score(y_train, y_train_pred)}')
print(f'Precision: {precision_score(y_train, y_train_pred)}')
print(f'Recall: {recall_score(y_train, y_train_pred)}')
print(f'F1: {f1_score(y_train, y_train_pred)}\n')

print('Test scores:')
print(f'Accuracy: {accuracy_score(y_test, y_test_pred)}')
print(f'Precision: {precision_score(y_test, y_test_pred)}')
print(f'Recall: {recall_score(y_test, y_test_pred)}')
print(f'F1: {f1_score(y_test, y_test_pred)}\n')


Best Parameters: {'max_depth': 5, 'n_estimators': 100}
Training scores:
Accuracy: 0.98
Precision: 0.9642857142857143
Recall: 1.0
F1: 0.9818181818181818

Test scores:
Accuracy: 0.6466666666666666
Precision: 0.57
Recall: 0.8507462686567164
F1: 0.6826347305389222

