# Advanced Feature Engineering

This worksheet goes through advanced feature engineering, specifically for text.

## Latent Semantic Analysis (LSA)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def latent_semantic_analysis(docs, n_components=100, random_state=None):
    """Vectorize text documents with Latent Semantic Analysis (LSA).

    The documents are converted to TF-IDF vectors using a
    ``TfidfVectorizer`` with default parameters, and the resulting matrix
    is then reduced with ``TruncatedSVD``.

    Parameters
    ----------
    docs : iterable of str
        Raw text documents to vectorize.
    n_components : int, default=100
        Number of SVD components to keep, i.e. the dimensionality of the
        returned document vectors.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``TruncatedSVD``. Pass an int to make the
        decomposition reproducible across runs.

    Returns
    -------
    array
        One row per document, one column per retained SVD component.
    """
    tfidf = TfidfVectorizer()  # using default params
    # Learn the vocabulary and vectorize the documents in a single pass.
    tfidf_vecs = tfidf.fit_transform(docs)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    # Fit the SVD and project the TF-IDF vectors onto its components.
    return svd.fit_transform(tfidf_vecs)

In [4]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the training split of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
# Keep the call as the cell's last expression so the notebook displays
# the resulting LSA document vectors.
latent_semantic_analysis(newsgroups_train.data)

array([[ 0.24972705, -0.06943193, -0.01310763, ..., -0.05284011,
         0.01323889,  0.00471858],
       [ 0.1399918 , -0.07671546, -0.0397579 , ...,  0.01248961,
        -0.03032798,  0.01511617],
       [ 0.37184255, -0.04142754, -0.06709132, ...,  0.03632207,
         0.00969969,  0.0215647 ],
       ..., 
       [ 0.18476811, -0.00611375, -0.08039057, ...,  0.01452309,
         0.00263139, -0.00114293],
       [ 0.18795807, -0.06606888,  0.04158112, ...,  0.02202557,
         0.00278947,  0.04899025],
       [ 0.08231697, -0.09080567,  0.00372905, ...,  0.00476824,
         0.02563993,  0.00800737]])

# Latent Dirichlet Analysis (LDA)