# Lab 7 - Textual Data Analytics
Complete the code with TODO tag.
## 1. Feature Engineering
In this exercise we will understand the functioning of TF/IDF ranking. Implement the feature engineering and its application, based on the code framework provided below.

First we use textual data from Twitter.

In [37]:
import numpy as np
import pandas as pd
data = pd.read_csv('elonmusk_tweets.csv')
print(len(data))
data.head()

2819


Unnamed: 0,id,created_at,text
0,849636868052275200,2017-04-05 14:56:29,b'And so the robots spared humanity ... https:...
1,848988730585096192,2017-04-03 20:01:01,"b""@ForIn2020 @waltmossberg @mims @defcon_5 Exa..."
2,848943072423497728,2017-04-03 16:59:35,"b'@waltmossberg @mims @defcon_5 Et tu, Walt?'"
3,848935705057280001,2017-04-03 16:30:19,b'Stormy weather in Shortville ...'
4,848416049573658624,2017-04-02 06:05:23,"b""@DaveLeeBBC @verge Coal is dying due to nat ..."


### 1.1. Text Normalization
Now we need to normalize text by stemming, tokenizing, and removing stopwords.

In [38]:
from __future__ import print_function, division
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
nltk.download('punkt')
import string
from nltk.corpus import stopwords
import math
from collections import Counter
nltk.download('stopwords')
import pprint 
pp = pprint.PrettyPrinter(indent=4)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pube\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pube\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
from nltk.tokenize import word_tokenize

def normalize(document):
    tokens = word_tokenize(document)
    
    text = [word for word in tokens if word.isalpha()]
    
    stemmer = PorterStemmer()
    ret = ""
    for word in text:
        ret = ret+stemmer.stem(word)+" "
    return ret

print("Stemmed Data:")
original_documents = [x.strip() for x in data['text']] 
documents = [normalize(d).split() for d in original_documents]
documents[0]

Stemmed Data:


['so', 'the', 'robot', 'spare', 'human', 'http']

As you can see that the normalization is still not perfect. Please feel free to improve upon (OPTIONAL), e.g. https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/

### 1.2. Implement TF-IDF
Now you need to implement TF-IDF, including creating the vocabulary, computing term frequency, and normalizing by tf-idf weights.

In [45]:
# Flatten all the documents
flat_list = [word for doc in documents for word in doc]
# TODO: remove stop words from the vocabulary
stop_words = set(stopwords.words('english'))

filtered = [w for w in flat_list if not w in stop_words] 

# TODO: we take the 500 most common words only
counts = Counter(filtered)
vocabulary = [w for w in counts.most_common(500)]
print([x for x in vocabulary if x[0] == 'tesla'])
vocabulary = [x[0] for x in vocabulary]
assert len(vocabulary) == 500

#vocabulary.sort()
vocabulary[:10]

[('tesla', 288)]


['b',
 'http',
 'tesla',
 'I',
 'spacex',
 'model',
 'thi',
 'amp',
 'car',
 'teslamotor']

In [41]:
def tf(vocabulary, documents):
    matrix = [0] * len(documents)
    for i, document in enumerate(documents):
        counts = Counter(document)
        matrix[i] = [0] * len(vocabulary)
        for j, term in enumerate(vocabulary):
            matrix[i][j] = counts[term]
    return matrix

tf = tf(vocabulary, documents)
np.array(vocabulary)[np.where(np.array(tf[1]) > 0)], np.array(tf[1])[np.where(np.array(tf[1]) > 0)]

(array(['b', 'http', 'tesla', 'exactli', 'base'], dtype='<U15'),
 array([1, 1, 1, 1, 1]))

In [51]:
def idf(vocabulary, documents):
    idf = {}
    total_docs_len = len(documents)
    for word in vocabulary:
        x = 0
        for document in documents:
            if word in document:
                x += 1
        idf[word] = math.log(total_docs_len/x)
    return idf

idf_obj = idf(vocabulary, documents)

In [53]:
def vectorize(document, vocabulary, idf):
    vector = [0]*len(vocabulary)
    counts = Counter(document)
    print(counts)
    for i,term in enumerate(vocabulary):
        vector[i] = idf[term] * counts[term]
    return vector

document_vectors = [vectorize(s, vocabulary, idf_obj) for s in documents]
np.array(vocabulary)[np.where(np.array(document_vectors[1]) > 0)], np.array(document_vectors[1])[np.where(np.array(document_vectors[1]) > 0)]

Counter({'so': 1, 'the': 1, 'robot': 1, 'spare': 1, 'human': 1, 'http': 1})
Counter({'b': 1, 'waltmossberg': 1, 'mim': 1, 'exactli': 1, 'tesla': 1, 'is': 1, 'absurdli': 1, 'overvalu': 1, 'if': 1, 'base': 1, 'on': 1, 'the': 1, 'past': 1, 'but': 1, 'that': 1, 'http': 1})
Counter({'b': 1, 'waltmossberg': 1, 'mim': 1, 'Et': 1, 'tu': 1, 'walt': 1})
Counter({'weather': 1, 'in': 1, 'shortvil': 1})
Counter({'b': 1, 'daveleebbc': 1, 'verg': 1, 'coal': 1, 'is': 1, 'die': 1, 'due': 1, 'to': 1, 'nat': 1, 'ga': 1, 'frack': 1, 'It': 1, 'basic': 1, 'dead': 1})
Counter({'helicopt': 2, 'b': 1, 'lexxxzi': 1, 'It': 1, 'just': 1, 'a': 1, 'in': 1, 'cloth': 1})
Counter({'b': 1, 'verg': 1, 'It': 1, 'wo': 1, 'matter': 1})
Counter({'b': 1, 'supercoolcub': 1, 'pretti': 1, 'good': 1})
Counter({'you': 2, 'b': 1, 'whi': 1, 'did': 1, 'we': 1, 'wast': 1, 'so': 1, 'much': 1, 'time': 1, 'develop': 1, 'silli': 1, 'rocket': 1, 'damn': 1, 'alien': 1, 'So': 1, 'obtus': 1, 'have': 1, 'all': 1, 'thi': 1, 'crazi': 1, 'tech':

Counter({'in': 1, 'space': 1, 'new': 1, 'much': 1, 'appreci': 1, 'http': 1})
Counter({'spacex': 1, 'If': 1, 'you': 1, 'have': 1, 'audio': 1, 'photo': 1, 'or': 1, 'video': 1, 'of': 1, 'our': 1, 'anomali': 1, 'last': 1, 'week': 1, 'pleas': 1, 'send': 1, 'to': 1, 'report': 1, 'materi': 1, 'may': 1, 'be': 1, 'use': 1, 'for': 1})
Counter({'b': 1, 'abadclich': 1, 'most': 1, 'like': 1, 'true': 1, 'but': 1, 'we': 1, 'ca': 1, 'yet': 1, 'find': 1, 'it': 1, 'on': 1, 'ani': 1, 'vehicl': 1, 'sensor': 1})
Counter({'get': 1, 'back': 1, 'to': 1, 'autopilot': 1, 'updat': 1, 'blog': 1, 'tomorrow': 1})
Counter({'b': 1, 'nasa': 1, 'faa': 1, 'afpaa': 1, 'We': 1, 'have': 1, 'not': 1, 'rule': 1, 'that': 1, 'out': 1})
Counter({'b': 1, 'lewischandlerdn': 1, 'nope': 1, 'it': 1, 'wa': 1, 'me': 1})
Counter({'the': 2, 'tri': 1, 'to': 1, 'understand': 1, 'quieter': 1, 'bang': 1, 'sound': 1, 'a': 1, 'few': 1, 'second': 1, 'befor': 1, 'firebal': 1, 'goe': 1, 'off': 1, 'may': 1, 'come': 1, 'from': 1, 'rocket': 1, 'or'

Counter({'a': 2, 'lot': 2, 'reentri': 1, 'is': 1, 'faster': 1, 'and': 1, 'hotter': 1, 'than': 1, 'last': 1, 'time': 1, 'so': 1, 'odd': 1, 'of': 1, 'make': 1, 'it': 1, 'are': 1, 'mayb': 1, 'even': 1, 'but': 1, 'we': 1, 'should': 1, 'learn': 1, 'either': 1, 'way': 1})
Counter({'http': 2, 'spacex': 1, 'falcon': 1, 'and': 1, 'vertic': 1, 'on': 1, 'pad': 1, 'in': 1, 'advanc': 1, 'of': 1, 'ET': 1, 'launch': 1, 'attempt': 1})
Counter({'as': 2, 'b': 1, 'tesla': 1, 'is': 1, 'increas': 1, 'the': 1, 'product': 1, 'ramp': 1, 'fast': 1, 'possibl': 1, 'but': 1, 'I': 1, 'recommend': 1, 'order': 1, 'a': 1, 'model': 1, 'soon': 1, 'if': 1, 'you': 1, 'want': 1, 'deliveri': 1})
Counter({'b': 1, 'ye': 1})
Counter({'chamath': 1, 'I': 1, 'just': 1, 'publish': 1, 'jerkwat': 1, 'Do': 1, 'your': 1, 'math': 1, 'on': 1, 'http': 1, 'cc': 1, 'elonmusk': 1})
Counter({'teslamotor': 1, 'cool': 1, 'thing': 1, 'about': 1, 'model': 1, 'X': 1, 'edmund': 1, 'http': 1})
Counter({'to': 3, 'particul': 1, 'from': 1, 'air': 1, 

Counter({'a': 3, 'b': 1, 'RT': 1, 'mikebloomberg': 1, 'elonmusk': 1, 'deliv': 1, 'simpl': 1, 'messag': 1, 'tax': 1, 'will': 1, 'lead': 1, 'us': 1, 'to': 1, 'sustain': 1, 'futur': 1, 'that': 1, 'an': 1, 'idea': 1, 'mayor': 1})
Counter({'b': 1, 'mikebloomberg': 1, 'nice': 1, 'ride': 1, 'look': 1, 'forward': 1, 'to': 1, 'see': 1, 'you': 1, 'tomorrow': 1})
Counter({'summari': 1, 'of': 1, 'my': 1, 'talk': 1, 'in': 1, 'pari': 1, 'on': 1, 'climat': 1, 'chang': 1, 'http': 1})
Counter({'to': 1, 'give': 1, 'a': 1, 'talk': 1, 'at': 1, 'the': 1, 'sorbonn': 1, 'http': 1})
Counter({'b': 1, 'RT': 1, 'chamath': 1, 'founder': 1, 'edit': 1, 'arriv': 1, 'yesterday': 1, 'It': 1, 'unbeliev': 1, 'sickest': 1, 'car': 1, 'on': 1, 'the': 1, 'planet': 1, 'Cc': 1, 'elonmusk': 1, 'http': 1})
Counter({'b': 1, 'yep': 1, 'good': 1, 'point': 1, 'I': 1, 'hope': 1, 'you': 1, 'get': 1, 'back': 1, 'into': 1, 'space': 1, 'some': 1, 'day': 1})
Counter({'for': 2, 'credit': 1, 'reusabl': 1, 'suborbit': 1, 'rocket': 1, 'goe':

Counter({'land': 2, 'like': 1, 'falcon': 1, 'fine': 1, 'but': 1, 'excess': 1, 'later': 1, 'veloc': 1, 'caus': 1, 'it': 1, 'to': 1, 'tip': 1, 'over': 1, 'post': 1, 'http': 1})
Counter({'success': 1, 'dragon': 1, 'enrout': 1, 'to': 1, 'space': 1, 'station': 1, 'rocket': 1, 'land': 1, 'on': 1, 'droneship': 1, 'but': 1, 'too': 1, 'hard': 1, 'for': 1, 'surviv': 1})
Counter({'b': 1, 'min': 1, 'to': 1, 'liftoff': 1})
Counter({'http': 2, 'spacex': 1, 'weather': 1, 'go': 1, 'for': 1, 'launch': 1, 'today': 1, 'ET': 1, 'still': 1, 'a': 1, 'concern': 1})
Counter({'window': 1, 'alway': 1, 'tight': 1, 'when': 1, 'orbit': 1, 'synch': 1, 'need': 1, 'In': 1, 'thi': 1, 'case': 1, 'space': 1, 'station': 1, 'for': 1, 'graviti': 1, 'null': 1, 'point': 1, 'moon': 1, 'wa': 1, 'in': 1, 'the': 1, 'way': 1})
Counter({'postpon': 1, 'due': 1, 'to': 1, 'lightn': 1, 'from': 1, 'an': 1, 'approach': 1, 'anvil': 1, 'cloud': 1})
Counter({'of': 2, 'rocket': 1, 'land': 1, 'success': 1, 'today': 1, 'are': 1, 'still': 1, '

Counter({'http': 2, 'theonion': 1, 'more': 1, 'offic': 1, 'worker': 1, 'switch': 1, 'To': 1, 'fetal': 1, 'posit': 1, 'desk': 1})
Counter({'arstechnica': 1, 'supercomput': 1, 'not': 1, 'to': 1, 'beat': 1, 'the': 1, 'ture': 1, 'test': 1, 'http': 1, 'by': 1, 'nathanmattis': 1})
Counter({'sloth': 2, 'but': 1, 'the': 1, 'kinda': 1, 'had': 1, 'it': 1, 'come': 1, 'human': 1, 'blame': 1, 'for': 1, 'extinct': 1, 'of': 1, 'mammoth': 1, 'amp': 1, 'giant': 1, 'http': 1})
Counter({'oevaorg': 1, 'second': 1, 'summari': 1, 'of': 1, 'teslamotor': 1, 'annual': 1, 'sharehold': 1, 'meet': 1, 'http': 1, 'elonmusk': 1})
Counter({'of': 1, 'dragon': 1, 'unveil': 1, 'at': 1, 'http': 1})
Counter({'repair': 1, 'job': 1, 'of': 1, 'falcon': 1, 'ocean': 1, 'land': 1, 'vid': 1, 'by': 1, 'nasaspaceflight': 1, 'forum': 1, 'now': 1, 'show': 1, 'leg': 1, 'deploy': 1, 'http': 1})
Counter({'S': 1, 'in': 1, 'drag': 1, 'queen': 1, 'competit': 1, 'caranddriv': 1, 'http': 1})
Counter({'the': 2, 'albert': 1, 'amp': 1, 'prince

Counter({'fire': 1, 'of': 1, 'falcon': 1, 'advanc': 1, 'prototyp': 1, 'rocket': 1, 'over': 1, 'lb': 1, 'thrust': 1, 'enough': 1, 'to': 1, 'lift': 1, 'skyscrap': 1, 'http': 1})
Counter({'of': 2, 'expans': 1, 'tesla': 1, 'supercharg': 1, 'network': 1, 'underway': 1, 'will': 1, 'cover': 1, 'LA': 1, 'to': 1, 'NY': 1, 'by': 1, 'end': 1, 'year': 1, 'http': 1})
Counter({'TV': 1, 'featur': 1, 'debat': 1, 'of': 1, 'andreasjam': 1, 'vs': 1, 'some': 1, 'guy': 1, 'pick': 1, 'hi': 1, 'nose': 1, 'http': 1})
Counter({'b': 1, 'robertgaristo': 1, 'V': 1, 'much': 1, 'agre': 1, 'moreov': 1, 'there': 1, 'is': 1, 'risk': 1, 'of': 1, 'sever': 1, 'in': 1, 'feedback': 1, 'loop': 1, 'caus': 1, 'sudden': 1, 'warm': 1})
Counter({'a': 2, 'amp': 2, 'b': 1, 'RT': 1, 'robertgaristo': 1, 'elonmusk': 1, 'I': 1, 'physicist': 1, 'seem': 1, 'rare': 1, 'to': 1, 'find': 1, 'scientist': 1, 'not': 1, 'convinc': 1, 'by': 1, 'evid': 1, 'think': 1, 'there': 1})
Counter({'A': 2, 'can': 1, 'fit': 1, 'lot': 1, 'Of': 1, 'kindergart

Counter({'to': 2, 'auto': 1, 'dealer': 1, 'tri': 1, 'pass': 1, 'legisl': 1, 'block': 1, 'tesla': 1, 'store': 1, 'bill': 1, 'wa': 1, 'just': 1, 'defeat': 1, 'in': 1, 'senat': 1, 'thank': 1, 'MN': 1})
Counter({'b': 1, 'asherlaw': 1, 'spacex': 1, 'is': 1, 'expand': 1, 'launch': 1, 'op': 1, 'at': 1, 'canaver': 1, 'too': 1, 'need': 1, 'locat': 1, 'to': 1, 'handl': 1, 'flight': 1, 'rate': 1, 'and': 1, 'avoid': 1, 'weather': 1, 'risk': 1})
Counter({'austin': 1, 'talk': 1, 'with': 1, 'TX': 1, 'hous': 1, 'about': 1, 'creat': 1, 'an': 1, 'orbit': 1, 'launch': 1, 'complex': 1, 'near': 1, 'brownsvil': 1, 'then': 1, 'sxsw': 1, 'keynot': 1, 'tmrw': 1})
Counter({'b': 1, 'navidob': 1, 'pretti': 1, 'much': 1, 'could': 1, 'leav': 1, 'the': 1, 'countri': 1, 'of': 1, 'ibsen': 1, 'without': 1, 'do': 1, 'someth': 1, 'cultur': 1})
Counter({'a': 1, 'play': 1, 'about': 1, 'stalin': 1, 'in': 1, 'norwegian': 1, 'like': 1, 'watch': 1, 'mime': 1, 'with': 1, 'emoticon': 1, 'do': 1, 'solzhenitsyn': 1, 'http': 1})
Co

Counter({'b': 1, 'bentobewild': 1, 'ye': 1, 'we': 1, 'will': 1, 'unveil': 1, 'a': 1, 'fulli': 1, 'function': 1, 'advanc': 1, 'prototyp': 1, 'of': 1, 'the': 1, 'model': 1, 'X': 1, 'almost': 1, 'ident': 1, 'to': 1, 'product': 1})
Counter({'sheer': 1, 'size': 1, 'of': 1, 'falcon': 1, 'heavi': 1, 'is': 1, 'It': 1, 'could': 1, 'liter': 1, 'send': 1, 'a': 1, 'fulli': 1, 'load': 1, 'london': 1, 'bu': 1, 'to': 1, 'the': 1, 'moon': 1})
Counter({'london': 1, 'sunday': 1, 'time': 1, 'creat': 1, 'a': 1, 'chart': 1, 'show': 1, 'how': 1, 'falcon': 1, 'compar': 1, 'to': 1, 'the': 1, 'saturn': 1, 'V': 1, 'and': 1, 'shuttl': 1, 'http': 1})
Counter({'post': 1, 'a': 1, 'photo': 1, 'http': 1})
Counter({'we': 2, 'see': 2, 'b': 1, 'macdevi': 1, 'yeah': 1, 'will': 1, 'stream': 1, 'the': 1, 'whole': 1, 'mission': 1, 'realtim': 1, 'with': 1, 'no': 1, 'buffer': 1, 'you': 1, 'what': 1})
Counter({'like': 2, 'with': 1, 'space': 1, 'station': 1, 'now': 1, 'to': 1, 'happen': 1, 'in': 1, 'april': 1, 'hope': 1, 'will'

(array(['b', 'http', 'tesla', 'exactli', 'base'], dtype='<U15'),
 array([0.77094575, 1.00585301, 2.29516325, 4.61193298, 5.6415524 ]))

### 1.3. Compare the results with the reference implementation of scikit-learn library.

Now we use the scikit-learn library. As you can see that, the way we do text normalization affects the result. Feel free to further improve upon (OPTIONAL), e.g. https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn

In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english', max_features=500)

features = tfidf.fit(original_documents)
corpus_tf_idf = tfidf.transform(original_documents) 

sum_words = corpus_tf_idf.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf.vocabulary_.items()]
print(sorted(words_freq, key = lambda x: x[1], reverse=True)[:5])
print('testla', corpus_tf_idf[1, features.vocabulary_['tesla']])

[('http', 163.54366542841234), ('https', 151.85039944652075), ('rt', 112.61998731390989), ('tesla', 95.96401470715628), ('xe2', 88.20944486346477)]
testla 0.3495243100660956


### 1.4.  Apply TF-IDF for information retrieval
We can use the vector representation of documents to implement an information retrieval system. We test with the query $Q$ = "tesla nasa"

In [68]:
def cosine_similarity(v1,v2):
    return 1 - spatial.distance.cosine(v1, v2)

def search_vec(query, k, vocabulary, stemmer, document_vectors, original_documents):
    q = query.split()
    q = [stemmer.stem(w) for w in q]
    query_vector = vectorize(q, vocabulary, idf_obj)
    
    # TODO: rank the documents by cosine similarity
    scores = []
    index = 0
    for document_vector in document_vectors:
        cur_score = []
        cur_score.append(index)
        index = index + 1
        for v in document_vector:
            cur_score.append(v)
        scores.append(cur_score)

    print('Top-{0} documents'.format(k))
    for i in range(k):
        print(i, original_documents[scores[int(i)][0]])

query = "tesla nasa"
stemmer = PorterStemmer()
search_vec(query, 5, vocabulary, stemmer, document_vectors, original_documents)

Counter({'tesla': 1, 'nasa': 1})
Top-5 documents
0 b'And so the robots spared humanity ... https://t.co/v7JUJQWfCv'
1 b"@ForIn2020 @waltmossberg @mims @defcon_5 Exactly. Tesla is absurdly overvalued if based on the past, but that's irr\xe2\x80\xa6 https://t.co/qQcTqkzgMl"
2 b'@waltmossberg @mims @defcon_5 Et tu, Walt?'
3 b'Stormy weather in Shortville ...'
4 b"@DaveLeeBBC @verge Coal is dying due to nat gas fracking. It's basically dead."


We can also use the scikit-learn library to do the retrieval.

In [25]:
new_features = tfidf.transform([query])

cosine_similarities = linear_kernel(new_features, corpus_tf_idf).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]

topk = 5
print('Top-{0} documents'.format(topk))
for i in range(topk):
    print(i, original_documents[related_docs_indices[i]])

Top-5 documents
0 b'@ashwin7002 @NASA @faa @AFPAA We have not ruled that out.'
1 b"SpaceX could not do this without NASA. Can't express enough appreciation. https://t.co/uQpI60zAV7"
2 b'@NASA launched a rocket into the northern lights http://t.co/tR2cSeMV'
3 b'Whatever happens today, we could not have done it without @NASA, but errors are ours alone and me most of all.'
4 b'RT @NASA: Updated @SpaceX #Dragon #ISS rendezvous times: NASA TV coverage begins Sunday at 3:30amET: http://t.co/qrm0Dz4jPE. Grapple at  ...'
