# Advanced Feature Engineering

This worksheet goes through advanced feature engineering, specifically for text.

## Latent Semantic Analysis (LSA)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def latent_semantic_analysis(docs):
    tfidf = TfidfVectorizer() # using default params
    tfidf.fit(docs) # Creating dictionary
    vecs = tfidf.transform(docs) # Using dictionary to vectorize documents
    svd = TruncatedSVD(n_components=100) # Generating top 100 components
    svd.fit(vecs) # create SVD matrices
    return svd.transform(vecs) # Use LSA to vectorize documents

In [2]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train =fetch_20newsgroups(subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
latent_semantic_analysis(newsgroups_train.data)

array([[  2.49727052e-01,  -6.94289473e-02,  -1.31116562e-02, ...,
          6.60643460e-03,  -1.53582571e-02,  -3.92809110e-03],
       [  1.39991804e-01,  -7.67136144e-02,  -3.97582007e-02, ...,
          1.93868388e-02,   4.74816098e-02,  -2.61638745e-02],
       [  3.71842551e-01,  -4.14278553e-02,  -6.70927815e-02, ...,
         -1.85374081e-03,   1.31141932e-02,   2.60122555e-02],
       ..., 
       [  1.84768111e-01,  -6.11303637e-03,  -8.03916601e-02, ...,
          8.02409181e-03,   2.09608120e-02,  -8.68951618e-03],
       [  1.87958069e-01,  -6.60668898e-02,   4.15762044e-02, ...,
          2.52229408e-02,   3.03755239e-02,   4.08418272e-02],
       [  8.23169690e-02,  -9.08059518e-02,   3.72872349e-03, ...,
          7.94479273e-03,  -3.14019836e-02,  -5.84739850e-05]])

# Latent Dirichlet Analysis (LDA)

We attempt to run LDA on the newsgroups_train data:

In [6]:
data = newsgroups_train.data

In [11]:
import nltk
from nltk.corpus import stopwords
from snowballstemmer import EnglishStemmer

nltk.download('stopwords')
stemmer = EnglishStemmer()
stop = stopwords.words('english')
stop.extend(['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within']+list(ascii_lowercase))
stoplist = stemmer.stemWords(stop)
stoplist = set(stoplist)
stop = set(sorted(stop + list(stoplist))) 

data.replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',' ',inplace=True,regex=True)
wordlist = filter(None, " ".join(list(set(list(itertools.chain(*data['text_data'].str.split(' ')))))).split(" "))
data = [' '.join(filter(None,filter(lambda word: word not in stop, line))) for line in data.str.lower().str.split(' ')]

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


NameError: name 'ascii_lowercase' is not defined

In [3]:
from gensim.models.ldamodel import LdaModel

In [19]:
from gensim import corpora
gensim_docs = [d.split(" ") for d in newsgroups_train.data]
dictionary = corpora.Dictionary(gensim_docs)

In [20]:
print(dictionary)

Dictionary(427021 unique tokens: ['', '\nall', "(where's", '----', '----\n\n\n\n\n']...)


In [17]:
lda = LdaModel(gensim_corpus, num_topics=20)

ValueError: too many values to unpack (expected 2)