# Combining features
Feature union and pipelines will be discussed as a mean to combining lexical, syntactic and semantic features.
FeatureUnion is built by using a list of (key, value) pair, where key is the name you want to give to a transformation (i.e. arbitrary string serving as identifier) and value is an estimator object.

We will use the dataest from the IMDB movie reviews

# # Load dataset

In [24]:
import pandas as pd
df_orig = pd.read_csv('all/training_set_rel3.tsv', encoding='ISO-8859-1', delimiter='\t', header=0)
df_orig[0:4]

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,


In [25]:
df_orig.shape

(12976, 28)

In [26]:
# Filter data of the essay_set number 1, and keep only two columns for example
df = df_orig[df_orig['essay_set']==1][['essay_id', 'essay', 'domain1_score']].copy()
df.shape

(1783, 3)

In [27]:
df[0:5]

Unnamed: 0,essay_id,essay,domain1_score
0,1,"Dear local newspaper, I think effects computer...",8
1,2,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,"Dear @LOCATION1, I know having computers has a...",8


In [5]:
# define X and Y
X = df['essay'].values
y = df['domain1_score'].values

# Transformer
In order to extract the features that we may be interested in, we implement a transformer for each feature we want to extract. A transformer will receive data, make some changes and return the data. Methods to implement:
    Fit method->Learning and training
    Transform method->Apply transformation to unseen data

In [28]:
# Generic transformer
from sklearn.base import BaseEstimator, TransformerMixin

class GenericTransformer(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None):
        return do_something_to(X, self.vars)
    
    def fit(self, X, y=None):
        return self

# Lexical features
Character features (i.e. number of exclamation marks) are omitted 

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import sent_tokenize, word_tokenize

class LexicalStats (BaseEstimator, TransformerMixin):
    """Extract lexical features from each document"""
    
    def number_sentences(self, doc):
        sentences = sent_tokenize(doc, language = 'english')
        return len(sentences)
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, docs):
        return [{'length':len(doc),
                'num_sentences':self.number_sentences(doc)}
               for doc in docs]
    

In [34]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

def custom_tokenizer (words):
    """Preprocess tokens as seen in lexical notebooks"""
    tokens = word_tokenize(words.lower())
    porter = PorterStemmer()
    lemmas = [porter.stem(t) for t in tokens]
    stoplist = stopwords.words('english')
    lemmas_clean = [w for w in lemmas if w not in stoplist]
    punctuation = set(string.punctuation)
    lemmas_punct = [w for w in lemmas_clean if w not in punctuation]
    return lemmas_punct

# Syntactic features
(example)

In [35]:



class PosStats(BaseEstimator, TransformerMixin):
    """Obtain number of tokens with POS categories"""

    def stats(self, doc):
        tokens = custom_tokenizer(doc)
        tagged = pos_tag(tokens, tagset='universal')
        counts = Counter(tag for word,tag in tagged)
        total = sum(counts.values())
        #copy tags so that we return always the same number of features
        pos_features = {'NOUN': 0, 'ADJ': 0, 'VERB': 0, 'ADV': 0, 'CONJ': 0, 
                        'ADP': 0, 'PRON':0, 'NUM': 0}
        
        pos_dic = dict((tag, float(count)/total) for tag,count in counts.items())
        for k in pos_dic:
            if k in pos_features:
                pos_features[k] = pos_dic[k]
        return pos_features
    
    def transform(self, docs, y=None):
        return [self.stats(doc) for doc in docs]
    
    def fit(self, docs, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

# Feature extraction Pipelines

By using pipelines, we will extract desired properties.

In [36]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer


ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (1, 3), encoding = 'ISO-8859-1', 
                                        tokenizer=custom_tokenizer)),
  ('tfidf_transformer', TfidfTransformer())
])

# Feature union pipeline
Pipelines consist of sequencial steps: One step works on the next step
Feature unions consist of parallel tasks whose result is grouped when all have finished

In [38]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation



## All the steps of the Pipeline should end with a sparse vector as the input data

pipeline = Pipeline([
       ('features', FeatureUnion([
                    ('lexical_stats', Pipeline([
                                ('stats', LexicalStats()),
                                ('vectors', DictVectorizer())
                            ])),
                    ('words', TfidfVectorizer(tokenizer=custom_tokenizer)),
                    ('ngrams', ngrams_featurizer),
                    ('pos_stats', Pipeline([
                                ('pos_stats', PosStats()), # Gets number of tokens
                                ('vectors', DictVectorizer())
                            ])),
                    ('lda', Pipeline([ 
                                ('count', CountVectorizer(tokenizer=custom_tokenizer)),
                                ('lda',  LatentDirichletAllocation(n_topics=4, max_iter=5,
                                                       learning_method='online', 
                                                       learning_offset=50.,
                                                      random_state=0))
                            ])),
                ])),
       
        ('clf', MultinomialNB(alpha=.01))  # classifier
    ])

# Using KFold validation

cv = KFold(X.shape[0], 2, shuffle=True, random_state=33)
scores = cross_val_score(pipeline, X, y, cv=cv)
print("Scores in every iteration", scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



Scores in every iteration [ 0.39125561  0.43097643]
Accuracy: 0.41 (+/- 0.04)
