# Sklearn

## Count Vectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

class Category:
  BOOKS = 'BOOKS'
  CLOTHING = 'CLOTHING'

X_train = [
      'i love the book',
      'this is a great book',
      'the fit is great',
      'i love the shoes'
]

y_train = [
      Category.BOOKS,
      Category.BOOKS,
      Category.CLOTHING,
      Category.CLOTHING,
]

X_test = [
      'i like the book',
      'you should ware warm clothes',
      'say you read a novel'
]

count_vect = CountVectorizer(
    lowercase = True,
    # stop_words = 'english',
    analyzer = 'word',
    # ngram_range = (1, 3)
    binary = True
)

X_train_count_vect = count_vect.fit_transform(X_train)
X_test_count_vect = count_vect.transform(X_test)

print(count_vect.get_feature_names())
print(X_train_count_vect.toarray())

print('SVC:', svc_model(X_train_count_vect, y_train, X_test_count_vect))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['book', 'fit', 'great', 'is', 'love', 'shoes', 'the', 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]
SVC: ['BOOKS' 'CLOTHING' 'CLOTHING']


## TFIDf

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()

X_tfidf_vect = tfidf_vect.fit_transform(X_train)

X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

print(tfidf_vect.get_feature_names())
print(X_tfidf_vect.toarray())

print('SVC:', svc_model(X_train_count_vect, y_train, X_test_count_vect))

['book', 'fit', 'great', 'is', 'love', 'shoes', 'the', 'this']
[[0.61366674 0.         0.         0.         0.61366674 0.
  0.49681612 0.        ]
 [0.46580855 0.         0.46580855 0.46580855 0.         0.
  0.         0.59081908]
 [0.         0.61422608 0.4842629  0.4842629  0.         0.
  0.39205255 0.        ]
 [0.         0.         0.         0.         0.55349232 0.70203482
  0.44809973 0.        ]]
SVC: ['BOOKS' 'CLOTHING' 'CLOTHING']


# Spacy

In [24]:
from spacy.lang.en import English

nlp = English()

doc = nlp('Hello World!')

for token in doc:
  print(token, end = ', ')

token = doc[1]

print(token.text)

span = doc[1:3]

print(span.text)

doc = nlp('It costs $5.')

print('Index:', [token.i for token in doc])
print('Text:', [token for token in doc])
print('Is Puntuation:', [token.is_punct for token in doc])

Hello, World, !, World
World!
Index: [0, 1, 2, 3, 4]
Text: [It, costs, $, 5, .]
Is Puntuation: [False, False, False, False, True]


In [34]:
import spacy

nlp = spacy.load('en_core_web_sm')

doc = nlp('She ate the pizza')

for token in doc:
  print(token.text, token.pos_, token.dep_, token.head.text, sep = '\t')

print('\n')

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
  print(ent.text, ent.label_, sep = '\t')

She	PRON	nsubj	ate
ate	VERB	ROOT	ate
the	DET	det	pizza
pizza	NOUN	dobj	ate


Apple	ORG
U.K.	GPE
$1 billion	MONEY


In [38]:
import spacy

nlp = spacy.load('en_core_web_sm')

text = 'Nearly 40% of Marines have declined Covid-19 vaccine'

doc = nlp(text)

for ent in doc.ents:
  print(ent.text, ent.label_)

Nearly 40% PERCENT
Marines ORG


In [58]:
import spacy

nlp = spacy.load('en_core_web_sm')

text = 'Apple is looking forward buying U.K. startup for $2 Billion.'

doc = nlp(text)

for token in doc:
  print(token.text, token.lemma, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop, sep = '\t')

print('\n')

for ent in doc.ents:
  print(ent.text, ent.start_char, ent.end_char, ent.label_, sep = '\t')

print('\n')

for token in doc:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov, sep = '\t')

Apple	6418411030699964375	PROPN	NNP	nsubj	Xxxxx	True	False
is	10382539506755952630	AUX	VBZ	aux	xx	True	True
looking	16096726548953279178	VERB	VBG	ROOT	xxxx	True	False
forward	17319973514577326793	ADV	RB	advmod	xxxx	True	False
buying	9457496526477982497	VERB	VBG	advcl	xxxx	True	False
U.K.	14409890634315022856	PROPN	NNP	compound	X.X.	False	False
startup	7622488711881293715	NOUN	NN	dobj	xxxx	True	False
for	16037325823156266367	ADP	IN	prep	xxx	True	True
$	11283501755624150392	SYM	$	quantmod	$	False	False
2	15180167692696242062	NUM	CD	compound	d	False	False
Billion	1231493654637052630	NUM	CD	pobj	Xxxxx	True	False
.	12646065887601541794	PUNCT	.	punct	.	False	False


Apple	0	5	ORG
U.K.	32	36	GPE
$2 Billion	49	59	MONEY


Apple	True	21.877224	True
is	True	23.891558	True
looking	True	24.650583	True
forward	True	22.551302	True
buying	True	20.931828	True
U.K.	True	22.256153	True
startup	True	20.861286	True
for	True	21.47685	True
$	True	21.279455	True
2	True	22.103958	True
Billion	True	21.820871	Tr

# Models

## SVC

In [6]:
from sklearn.svm import SVC

def svc_model(X_train, y_train, X_test):

  svc = SVC()
  svc.fit(X_train, y_train)
  return svc.predict(X_test)