### Lab 09: Converting word lists to feature vectors
DS4400 / Rachlin

In [1]:
# Finding unique words in text

phrase = "to be or not to be that is the question"
# Lower-case first so the same word in different cases collapses to one token,
# then tokenise on whitespace.
words = phrase.lower().split()
# A set removes the repeats; sorting yields an alphabetical vocabulary.
unique = sorted(set(words))
unique

['be', 'is', 'not', 'or', 'question', 'that', 'the', 'to']

In [2]:
# Counting words in text
from collections import Counter

# Tally how many times each token occurs in the phrase.
freq = Counter(words)
# Display the tallies as a plain dict of word -> count.
{word: count for word, count in freq.items()}

{'to': 2,
 'be': 2,
 'or': 1,
 'not': 1,
 'that': 1,
 'is': 1,
 'the': 1,
 'question': 1}

In [3]:
# List the four most frequent words and their associated frequency,
# as (word, count) pairs ordered from highest count to lowest.
freq.most_common(4)

[('to', 2), ('be', 2), ('or', 1), ('not', 1)]

In [4]:
# Extract just these words into a sorted list
# (keep the word from each (word, count) pair and discard the count).
common = sorted(word for word, _count in freq.most_common(4))
common

['be', 'not', 'or', 'to']

In [5]:
# New phrase
new_phrase = "i'm not going to be a dentist i want to be a data scientist and study the sea currents"
new_words = new_phrase.split()

# A vectorized phrase, one component per vocabulary word:
# 1 = word appears in our phrase
# 0 = word does not appear in our phrase
# Vectorize against the full vocabulary first, then the most common words.
for vocab in (unique, common):
    print(vocab)
    print([1 if word in new_words else 0 for word in vocab])




['be', 'is', 'not', 'or', 'question', 'that', 'the', 'to']
[1, 0, 1, 0, 0, 0, 1, 1]
['be', 'not', 'or', 'to']
[1, 1, 0, 1]


In [6]:
# Repeat vectorization, but this time store frequency of word in our phrase
# n = word appears in our phrase n times
# 0 = word does not appear in our phrase

# New phrase
new_phrase = "i'm not going to be a dentist i want to be a data scientist and study the sea currents"
new_words = new_phrase.split()
# Tally the tokens once; looking up a word that never occurred yields 0.
new_count = Counter(new_words)

# Vectorize against the full vocabulary first, then the most common words.
for vocab in (unique, common):
    print(vocab)
    print([new_count[word] for word in vocab])




['be', 'is', 'not', 'or', 'question', 'that', 'the', 'to']
[2, 0, 1, 0, 0, 0, 1, 2]
['be', 'not', 'or', 'to']
[2, 1, 0, 2]


In [7]:
# Reusable function
from collections import Counter

def word_vector(words, word_list, use_frequency=False):
    """Convert a list of words to a vector by comparing with words in word_list.

    words: An iterable of words (tokens) which we convert to a vector.
        It is read exactly once, so a generator or iterator works as well
        as a list.
    word_list: The chosen words against which we compare. Duplicates are
        dropped and the remaining words are sorted, so component i of the
        result corresponds to sorted(set(word_list))[i].
    use_frequency: if False, vector components are 1/0 (word present/absent),
        else n = # of occurrences of the word in words.

    Returns a list of ints with one component per distinct word in word_list.
    """
    vocabulary = sorted(set(word_list))

    if use_frequency:
        # Counter yields 0 for words it never saw, so absent words need no
        # special handling.
        count = Counter(words)
        return [count[w] for w in vocabulary]

    # Materialise the tokens as a set once. Testing membership against the
    # raw argument would rescan a list for every vocabulary word, and would
    # exhaust an iterator on the first test so later words looked absent.
    present = set(words)
    return [int(w in present) for w in vocabulary]
        
    
    

In [12]:
# Vectorize the new phrase against the full vocabulary in 1/0 (presence) mode.
new_phrase = "i'm not going to be a dentist i want to be a data scientist and study the sea currents"
word_vector(words=new_phrase.split(), word_list=unique, use_frequency=False)

[1, 0, 1, 0, 0, 0, 1, 1]