This notebook performs feature extraction. For the first prototype I am going to perform the following transformations:
1. word tokenize
2. decapitalize
3. remove stopwords

This is the tokenization sequence applied to each input string.

Once I have a suitable set of tokens, I am going to compute TF-IDF on the training set. The TF-IDF vectorizer fitted on the training set will be reused later to transform the validation and test sets.

In [1]:
# build a tokenizer
import os
import sys

from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Locate the project root so the project's `src` directory can be put on sys.path.
# NOTE: '__file__' is a string literal here, not the __file__ variable:
# os.path.abspath() resolves it against the current working directory, so taking
# dirname twice yields the parent of the directory the notebook is run from.
get_dir = os.path.dirname
PROJ_ROOT = get_dir(get_dir(os.path.abspath('__file__')))
print(PROJ_ROOT)

# Only extend sys.path once, so re-running this cell does not pile up duplicates.
SRC_DIR = os.path.join(PROJ_ROOT, 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data import dataset

# English stop-word list from NLTK; tokenize() filters tokens against it.
stop_words = stopwords.words('english')
 
def tokenize(text):
    """Turn a single sentence into a list of cleaned tokens.

    Arguments:
        text: string representing a single sentence.

    Output:
        List of tokens

    The sentence is split with nltk's word_tokenize and every token is
    lowercased. Tokens that appear in the module-level stop_words list,
    or that consist only of digits, are dropped; the remaining tokens
    are returned in their original order.
    """
    kept_tokens = []
    for raw_token in word_tokenize(text):
        token = raw_token.lower()
        # Skip stop words and purely numeric tokens.
        if token in stop_words or token.isdigit():
            continue
        kept_tokens.append(token)
    return kept_tokens

# Sanity check: run the tokenizer on a known sentence and compare with the expected tokens.
test_sentence = 'Mouse is a small rodent'
expected_output = ['mouse', 'small', 'rodent']
actual_output = tokenize(test_sentence)
assert actual_output == expected_output
print(actual_output)

C:\Users\48519\Professional Stuff\various\machine_learning\mouse_disambiguation
['mouse', 'small', 'rodent']


In [2]:
# With the tokenizer in place, load the training split and fit the vectorizer on it.

READ_DIRECTORY = dataset.DEFAULT_PROCESSED_TEXT_DATA_DIRECTORY
train_dataset_filepath = os.path.join(READ_DIRECTORY, 'train.csv')
train_dataset = pd.read_csv(train_dataset_filepath, sep=';')
# NOTE: this is not the last expression of the cell, so the preview is not rendered.
train_dataset.head()

# Settings referenced by the commented-out plain scikit-learn vectorizer below.
vectorization_parameters = dict(ngrams=(1, 3), min_df=2, max_df=0.5)

# Kept for reference: a plain TfidfVectorizer built from the settings above.
# vectorizer = TfidfVectorizer(
#     analyzer='word',
#     tokenizer=tokenize,
#     max_df=vectorization_parameters['max_df'],
#     min_df=vectorization_parameters['min_df'],
#     ngram_range=vectorization_parameters['ngrams']
# )

vectorizer = dataset.TextLabelsVectorizer(dataset.DEFAULT_VECTORIZER_SETTINGS)

# The first CSV column is passed as the inputs and the second as the labels
# (presumably raw text and class name -- verify against the dataset module).
train_inputs = train_dataset.iloc[:, 0]
train_targets = train_dataset.iloc[:, 1]

vectorizer.fit(train_inputs, train_targets)
features, classes = vectorizer.transform(train_inputs, train_targets)
inverse_classes = vectorizer.get_classes_name(classes)
print(inverse_classes[:5], classes[:5])
print(vectorizer.get_params())
# print(type(features))
# print(features[0:2])

[('device', 160), ('animal', 54)]
['animal', 'device', 'device', 'device', 'animal'] [1 0 0 0 1]
{'label_encoder': {'labels': {'device': 0, 'animal': 1}, 'inverse_mapping': {'0': 'device', '1': 'animal'}}, 'vectorizer': {'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 0.5, 'max_features': None, 'min_df': 2, 'ngram_range': (1, 3), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': <bound method TextLabelsVectorizer.tokenize of <data.dataset.TextLabelsVectorizer object at 0x056865F0>>, 'use_idf': True, 'vocabulary': None}}


What is left now is to save the generated features as well as the parameters of the vectorizer.
They will be used at evaluation time to transform the validation set.

In [4]:
# 'features' subdirectory of the project's processed-data directory
# (presumably where the extracted features get written -- verify against the rest of this cell).
FEATURES_SAVE_DIRECTORY = os.path.join(dataset.DEFAULT_PROCESSED_DATA_DIRECTORY, 'features')