# Develop Deep Learning Models for Natural Language in Python

## Chapter 6 - Learn How to Prepare Data with Scikit-learn

### 6.2 - Word Counts with CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Single-document corpus to tokenise and count.
text = ['The quick brown fox jumed over the lazy dog.']

# Build the bag-of-words transform, then learn the vocabulary and encode
# the corpus in one pass.
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(text)

# Report the token -> column-index mapping, the matrix shape, and the
# dense per-token counts (the encoded result is sparse, hence toarray()).
print(f'Vocabulary: {vectorizer.vocabulary_}')
print(f'Vector shape: {vector.shape}')
print(f'Array: {vector.toarray()}')

Vocabulary: {'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumed': 3, 'over': 5, 'lazy': 4, 'dog': 1}
Vector shape: (1, 8)
Array: [[1 1 1 1 1 1 1 2]]


### 6.3 - Word Frequencies with TfidfVectorizer

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three documents; each one becomes a row of the encoded matrix.
text = [
    'The quick brown fox jumed over the lazy dog.',
    'The dog',
    'The fox',
]

# Build the TF-IDF transform, then fit it on the corpus and encode every
# document in the same call.
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(text)

# Report the token -> column-index mapping, the matrix shape, and the
# dense weight matrix.
print(f'Vocabulary: {vectorizer.vocabulary_}')
print(f'Vector shape: {vector.shape}')
print(f'Array: {vector.toarray()}')

Vocabulary: {'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumed': 3, 'over': 5, 'lazy': 4, 'dog': 1}
Vector shape: (3, 8)
Array: [[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]
 [0.         0.78980693 0.         0.         0.         0.
  0.         0.61335554]
 [0.         0.         0.78980693 0.         0.         0.
  0.         0.61335554]]


### 6.4 - Hashing with HashingVectorizer

In [16]:
from sklearn.feature_extraction.text import HashingVectorizer

# Single-document corpus to encode.
text = ['The quick brown fox jumed over the lazy dog.']

# Create the transform with a fixed output width of 20 columns.
vectorizer = HashingVectorizer(n_features=20)

# HashingVectorizer is stateless: it hashes tokens directly to column
# indices and learns nothing from the data, so no fit step is needed.
# Calling transform() alone yields the same result as fit_transform().
vector = vectorizer.transform(text)

# Report the matrix shape and the dense encoded values.
print(f'Vector shape: {vector.shape}')
print(f'Array: {vector.toarray()}')

Vector shape: (1, 20)
Array: [[ 0.          0.          0.          0.          0.30151134  0.30151134
   0.         -0.30151134  0.30151134  0.          0.          0.30151134
   0.         -0.30151134  0.         -0.30151134  0.          0.
  -0.60302269  0.        ]]
