In [1]:
# %pip install scikit-learn  # the PyPI name is "scikit-learn"; the "sklearn" alias is deprecated

In [2]:
import pandas as pd

# Tokenizing with scikit-learn

In [3]:
# Toy corpus: four short documents, one per row, held in a single
# 'sentence' column so the vectorizers below can consume it directly.
speeches = pd.DataFrame({
    'sentence': [
        "Foxes are the most majestic animal. Very few animals can eat foxes",
        "Foxes live in the praries of England. Sometimes foxes get into people's back yards.",
        "The foxes love to eat meat. If a fox smells meat, it will eat meat.",
        "Ducks are nice animals too. Ducks eat bread",
    ],
})
speeches

Unnamed: 0,sentence
0,Foxes are the most majestic animal. Very few a...
1,Foxes live in the praries of England. Sometime...
2,The foxes love to eat meat. If a fox smells me...
3,Ducks are nice animals too. Ducks eat bread


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

## YOU CAN EDIT THESE
# NOTE(review): y_columns is not referenced by any cell visible here; it is
# kept in case other cells rely on it -- confirm and remove if unused.
y_columns = ['Party', 'Trifecta']
BINARY = False        # passed as `binary=`: True records presence (0/1) instead of counts
NGRAM_RANGE = (1, 1)  # passed as `ngram_range=`: (min_n, max_n); (1, 1) means single words only
# An integer min_df is an absolute document count and must be >= 1; recent
# scikit-learn releases reject the integer 0 when fitting. A value of 1 keeps
# every term (same vocabulary as the old 0). Use a float in [0.0, 1.0] for a
# proportion of documents instead.
MIN_DF = 1

# Bag-of-words vectorizer driven by the settings above.
vectorizer = CountVectorizer(
    stop_words='english',  # 'english' if not custom list
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
)


In [5]:
# Fit the vectorizer on the corpus and build the sparse document-term matrix
# in one call; the bare name on the last line displays its summary.
X = vectorizer.fit_transform(speeches.sentence)
X

<4x17 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [6]:
# Densify the sparse matrix and label each column with its vocabulary term
# so the table can be read against the sentences.
word_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Echo the source sentences, one per line, above the table. A single print
# call avoids building a throwaway list purely for its side effects.
print(*speeches.sentence, sep="\n")
word_vectors.round(2)

Foxes are the most majestic animal. Very few animals can eat foxes
Foxes live in the praries of England. Sometimes foxes get into people's back yards.
The foxes love to eat meat. If a fox smells meat, it will eat meat.
Ducks are nice animals too. Ducks eat bread


Unnamed: 0,animal,animals,bread,ducks,eat,england,fox,foxes,live,love,majestic,meat,nice,people,praries,smells,yards
0,1,1,0,0,1,0,0,2,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,1,0,2,1,0,0,0,0,1,1,0,1
2,0,0,0,0,2,0,1,1,0,1,0,3,0,0,0,1,0
3,0,1,1,2,1,0,0,0,0,0,0,0,1,0,0,0,0


# TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same tokenisation settings as the count vectorizer (NGRAM_RANGE, BINARY and
# MIN_DF come from the configuration cell), but each cell of the matrix holds
# a TF-IDF weight instead of a raw count.
# NOTE: this rebinds `vectorizer`, `X` and `word_vectors`; re-running the
# count-based cells afterwards would operate on the TF-IDF objects.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=NGRAM_RANGE,
    binary=BINARY,
    min_df=MIN_DF,
)
X = vectorizer.fit_transform(speeches['sentence'])
word_vectors = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Echo the source sentences, one per line, above the table. A single print
# call avoids building a throwaway list purely for its side effects.
print(*speeches.sentence, sep="\n")
word_vectors.round(2)

Foxes are the most majestic animal. Very few animals can eat foxes
Foxes live in the praries of England. Sometimes foxes get into people's back yards.
The foxes love to eat meat. If a fox smells meat, it will eat meat.
Ducks are nice animals too. Ducks eat bread


Unnamed: 0,animal,animals,bread,ducks,eat,england,fox,foxes,live,love,majestic,meat,nice,people,praries,smells,yards
0,0.46,0.37,0.0,0.0,0.3,0.0,0.0,0.59,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.39,0.0,0.5,0.39,0.0,0.0,0.0,0.0,0.39,0.39,0.0,0.39
2,0.0,0.0,0.0,0.0,0.34,0.0,0.27,0.17,0.0,0.27,0.0,0.8,0.0,0.0,0.0,0.27,0.0
3,0.0,0.3,0.38,0.75,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38,0.0,0.0,0.0,0.0
