In this notebook the feature extraction will be performed. For the first prototype I am going to performs following transformations:
1. word tokenize
2. decapitalize
3. remove stopwords

This is my tokenization sequence performed on input string

Once I have a suitable set of tokens, I am going to compute tfidf for the training set. I want to use the tfidf generated on train set on validation and test sets later. 

In [37]:
# build a tokenizer
import os
import sys

from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

get_dir = os.path.dirname
PROJ_ROOT = get_dir(get_dir(os.path.abspath('__file__')))
print(PROJ_ROOT)
sys.path.append(os.path.join(PROJ_ROOT, 'src'))

from data import dataset
 
stop_words = stopwords.words('english')
 
def tokenize(text):
    """Perform tokenization of input sentence
    
    Arguments:
        text: string representing a single sentence.
    
    Output:
        List of tokens
        
    First, using nltk word_tokenize splits the sentence into tokens
    Then, lowercases all tokens
    Finally, removes stopwords tokens and digits
    return a list of valid tokens
    """
    words = word_tokenize(text)
    words = [w.lower() for w in words]
    return [w for w in words if w not in stop_words and not w.isdigit()]

# let's test our function
test_sentence = 'Mouse is a small rodent'
expected_output = ['mouse', 'small', 'rodent']
assert expected_output == tokenize(test_sentence)
print(tokenize(test_sentence))

C:\Users\48519\Professional Stuff\various\machine_learning\mouse_disambiguation
['mouse', 'small', 'rodent']


In [41]:
# Now, since the tokenizer is ready we can build our vectorizer.

READ_DIRECTORY = dataset.DEFAULT_PROCESSED_TEXT_DATA_DIRECTORY
train_dataset_filepath = os.path.join(READ_DIRECTORY, 'train.csv')
train_dataset = pd.read_csv(train_dataset_filepath, sep=';')
train_dataset.head()

vectorization_parameters ={
    'ngrams': (1, 3),
    'min_df': 2,
    'max_df': 0.5
}

vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    max_df=vectorization_parameters['max_df'],
    min_df=vectorization_parameters['min_df'],
    ngram_range=vectorization_parameters['ngrams']
)

vectorizer.fit(train_dataset.iloc[:,0]) # train on variables from train set (dont pass classes)
features = vectorizer.transform(train_dataset.iloc[:,0].copy())
print(type(features))
print(len(vectorizer.idf_))
print(features[0:2])
print(vectorizer.get_feature_names())

<class 'scipy.sparse.csr.csr_matrix'>
553
  (0, 349)	0.4123504493344321
  (0, 295)	0.4123504493344321
  (0, 187)	0.4123504493344321
  (0, 186)	0.3702289282010234
  (0, 185)	0.3702289282010234
  (0, 161)	0.464502316729765
  (1, 535)	0.45457813663961644
  (1, 315)	0.5416259942887288
  (1, 235)	0.45457813663961644
  (1, 64)	0.5416259942887288
['1980s', '3byte', '3byte packets', 'ability', 'able', 'absolute', 'acceleration', 'accessory', 'actions', 'adjustable', 'aiming', 'allow', 'almost', 'along', 'also', 'alternative', 'alternative actions', 'alto', 'america', 'amiga', 'amiga atari', 'amiga atari st', 'apodemus', 'apparently', 'applications', 'arid', 'around', 'array', 'atari', 'atari st', 'available', 'axes', 'axis', 'backward', 'ball', 'ball mouse', 'baney', 'based', 'batteries', 'battery', 'beams', 'became', 'become', 'bill', 'bill english', 'birds', 'birds prey', 'breeding', 'bring', 'burrows', 'bus', 'button', 'button click', 'button mouse', 'buttons', 'called', 'came', 'cats', 'ca

what is left now is to save the generated features as well as to save the parameters of the vectorizer. 
It will be used at evaluation to tranform the validation set

In [42]:
FEATURES_SAVE_DIRECTORY = os.path.join(dataset.DEFAULT_PROCESSED_DATA_DIRECTORY, 'features')

def name_from_configuration(configuration):
    return 'ngrams_{ngrams}_maxdf_{max_df}_min_df_{min_df}'.format(**configuration)

# we can perform the vectorization with different parameters, so it is important to keep track of it somehow
transformation_name = name_from_configuration(vectorization_parameters)
transformation_name

FEATURES_SAVE_DIRECTORY = os.path.join(FEATURES_SAVE_DIRECTORY, transformation_name)
DATA_MODEL_SAVE_DIRECTORY = os.path.join(PROJ_ROOT, 'models', 'data', transformation_name)

from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
classes_binarized = LE.fit_transform(train_dataset.iloc[:,1])
print(classes_binarized)
train_dataset = np.hstack(features.todense())

features = pd.DataFrame(
    features.todense(),
    columns=vectorizer.get_feature_names(),
)



[0 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1
 0 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1
 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0
 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 1]
