# Advanced Feature Engineering

This worksheet goes through advanced feature engineering, specifically for text.

## Latent Semantic Analysis (LSA)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def latent_semantic_analysis(docs, n_components=100, random_state=None):
    """Embed raw text documents with Latent Semantic Analysis.

    The documents are turned into TF-IDF vectors and then projected onto the
    leading components of a truncated SVD fitted on those same vectors.

    Parameters
    ----------
    docs : iterable of str
        Raw documents. They are vectorized with a default ``TfidfVectorizer``.
    n_components : int, default=100
        Number of SVD components (output dimensions) to keep.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``TruncatedSVD``; pass an int for repeatable results.

    Returns
    -------
    The documents projected onto the fitted SVD components (the value
    returned by ``TruncatedSVD.transform``).
    """
    tfidf = TfidfVectorizer()  # using default params
    # Build the vocabulary and vectorize in one pass, so `docs` is only
    # iterated once (a generator would be exhausted by a separate fit()).
    vecs = tfidf.fit_transform(docs)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(vecs)  # create SVD matrices
    return svd.transform(vecs)  # use LSA to vectorize documents

In [1]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')

In [5]:
# Embed every training post with LSA; the bare expression displays the resulting array.
latent_semantic_analysis(newsgroups_train.data)

array([[  2.49727052e-01,  -6.94289473e-02,  -1.31116562e-02, ...,
          6.60643460e-03,  -1.53582571e-02,  -3.92809110e-03],
       [  1.39991804e-01,  -7.67136144e-02,  -3.97582007e-02, ...,
          1.93868388e-02,   4.74816098e-02,  -2.61638745e-02],
       [  3.71842551e-01,  -4.14278553e-02,  -6.70927815e-02, ...,
         -1.85374081e-03,   1.31141932e-02,   2.60122555e-02],
       ..., 
       [  1.84768111e-01,  -6.11303637e-03,  -8.03916601e-02, ...,
          8.02409181e-03,   2.09608120e-02,  -8.68951618e-03],
       [  1.87958069e-01,  -6.60668898e-02,   4.15762044e-02, ...,
          2.52229408e-02,   3.03755239e-02,   4.08418272e-02],
       [  8.23169690e-02,  -9.08059518e-02,   3.72872349e-03, ...,
          7.94479273e-03,  -3.14019836e-02,  -5.84739850e-05]])

# Latent Dirichlet Allocation (LDA)

We run LDA on the first 100 documents of the `newsgroups_train` data:

In [4]:
# Work with only the first 100 posts of the training split.
data = newsgroups_train.data[:100]
# Peek at the first four raw posts (headers included).
print(data[:4])

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n", "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 1

## Cleaning the data

We first tokenize each document into words and convert every token to lowercase:

In [21]:
import nltk
from nltk import word_tokenize
# Split every document into a list of tokens (one token list per document).
# NOTE(review): word_tokenize presumably needs NLTK's Punkt tokenizer data,
# which this notebook never downloads -- confirm on a fresh kernel.
doc_words = list(map(word_tokenize, data))

def to_lower(doc):
    """Return a new list holding the lowercase form of every token in ``doc``."""
    lowered = []
    for token in doc:
        lowered.append(token.lower())
    return lowered

# Lowercase the tokens of every document, then show the first document.
doc_words = [to_lower(tokens) for tokens in doc_words]
print(doc_words[0])

['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of', 'production', ',', 'where', 'this', 'car', 'is', 'made', ',', 'history', ',', 'or', 'whatever', 'inf

Next, we remove every token that is not purely alphabetic (punctuation, numbers, e-mail addresses, hyphenated words):

In [22]:
def remove_nonwords(arr):
    """Return the tokens of ``arr`` that consist solely of alphabetic characters.

    Tokens containing digits, punctuation or any other non-letter character
    (and empty strings) are dropped; the order of the kept tokens is preserved.
    """
    kept = []
    for token in arr:
        if token.isalpha():
            kept.append(token)
    return kept

# Keep only the purely alphabetic tokens of every document.
doc_words = [remove_nonwords(tokens) for tokens in doc_words]

print(doc_words[0])

['from', 'lerxst', 'where', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', 'it', 'was', 'a', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'a', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'i', 'know', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


Next, we remove stop words, which are common English words. We also run a stemmer over the stop-word list and add the stemmed forms to it.

In [23]:
import nltk
from nltk.corpus import stopwords
from snowballstemmer import EnglishStemmer

nltk.download('stopwords')  # make sure the NLTK stop-word corpus is available
stemmer = EnglishStemmer()

# Start from NLTK's English stop words and add a few extra filler words.
stop = stopwords.words('english')
stop.extend(['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within'])

# Stemmed form of every stop word.
stoplist = set(stemmer.stemWords(stop))

# Merge the plain and stemmed words into one set for fast membership tests.
# No sorting is needed here: a set does not keep any ordering.
stop = set(stop) | stoplist
print("stop words:")
print(stop)

def remove_stopwords(doc):
    """Return the tokens of ``doc`` that are not in the module-level ``stop`` collection.

    The order of the remaining tokens is preserved.
    """
    kept = []
    for token in doc:
        if token not in stop:
            kept.append(token)
    return kept

# Drop the stop words from every document, then show the first document.
doc_words = [remove_stopwords(tokens) for tokens in doc_words]
print(doc_words[0])



[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
stop words:
{'they', 'abov', 'few', "hadn't", 'against', 'what', 'be', 'himself', 'she', 'but', 'again', 'for', 'him', "mustn't", 'myself', 'through', 'seven', 'whi', 'your', "it's", "shan't", 'too', "didn't", "couldn't", 'why', 'own', 'was', 'aren', 'll', "haven't", "she's", 'hers', 'than', 'the', 'theirs', 'until', 'yourself', 'eight', 'once', 'while', 'both', "you'd", "you'r", 'yourselves', 'may', 'ourselv', 'from', 'did', 'wasn', 'o', 'one', 'very', 'no', 'who', 'and', 'as', 'won', 'his', 'do', 'if', 'between', 'me', "should'v", "don't", 'into', 'its', "you're", 'their', 'six', 'zero', "won't", 'that', 's', 'same', 'y', "should've", 'has', 'further', 'are', 'shouldn', 'any', "shouldn't", 'so', 've', 'am', 'now', 'we', 'down', 'ten', 'then', 'being', 'a', 'can', 'other', 'herself', 'about', 'beside', 'it', "aren't", 'in', 'ourselves', 't', 'two', 'ain', 'must

In [24]:
from gensim.models.ldamodel import LdaModel

In [25]:
from gensim import corpora
# Build the gensim Dictionary (the vocabulary) from the cleaned, tokenized documents.
dictionary = corpora.Dictionary(doc_words)

In [26]:
# Print the dictionary summary: the number of unique tokens and a few sample tokens.
print(dictionary)

Dictionary(5296 unique tokens: ['addition', 'anyone', 'body', 'bricklin', 'brought']...)


Next, we convert the tokenized documents into a bag-of-words document-term matrix:

In [28]:
# Convert each tokenized document to its bag-of-words representation via the dictionary.
corpus = list(map(dictionary.doc2bow, doc_words))

In [38]:
# Train a 30-topic LDA model on the bag-of-words corpus.
# A fixed random_state makes the learned topics repeatable between runs.
lda = LdaModel(corpus, num_topics=30, id2word=dictionary, random_state=42)

In [39]:
# Show 10 of the learned topics, each summarized by its 5 highest-weighted words.
print(lda.print_topics(num_topics=10, num_words=5))

[(1, '0.010*"lines" + 0.010*"subject" + 0.009*"organization" + 0.007*"car" + 0.006*"really"'), (18, '0.006*"know" + 0.006*"subject" + 0.006*"lines" + 0.005*"organization" + 0.005*"people"'), (7, '0.009*"plants" + 0.009*"nuclear" + 0.008*"water" + 0.005*"acne" + 0.005*"get"'), (6, '0.008*"subject" + 0.007*"lines" + 0.006*"organization" + 0.005*"university" + 0.005*"font"'), (9, '0.005*"lines" + 0.004*"organization" + 0.004*"suresh" + 0.003*"power" + 0.003*"option"'), (23, '0.004*"people" + 0.003*"rushdie" + 0.003*"root" + 0.003*"lines" + 0.003*"fatwa"'), (2, '0.008*"disease" + 0.007*"buy" + 0.007*"health" + 0.007*"people" + 0.006*"article"'), (22, '0.005*"armenian" + 0.004*"people" + 0.004*"conference" + 0.003*"russian" + 0.003*"jews"'), (24, '0.005*"subject" + 0.005*"lines" + 0.004*"armenians" + 0.004*"years" + 0.004*"would"'), (11, '0.007*"scsi" + 0.007*"subject" + 0.006*"lines" + 0.006*"chip" + 0.006*"things"')]
