In [2]:
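# spaCy is object-oriented: processing text returns Doc/Token objects with attributes
# NLTK is string-oriented: its functions take strings and return strings or lists of strings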

In [3]:
import spacy


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/opt/anaconda3/lib/python3.12/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/anaconda3/lib/python3.12/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/sit

In [4]:
# Load the small pre-trained English pipeline and run it over a two-sentence sample.
nlp = spacy.load("en_core_web_sm")
sample = "The dr. told me so and I love myself. It was quite challenging, but I did it!"
doc = nlp(sample)

# doc.sents yields one span per detected sentence; the period in "dr." is not
# treated as a sentence boundary, so exactly two sentences are printed.
print(*doc.sents, sep="\n")

The dr. told me so and I love myself.
It was quite challenging, but I did it!


In [5]:
# Word tokenization in spaCy: every sentence is itself a sequence of tokens,
# so flattening the sentences gives the document's tokens in order.
words = (word for sentence in doc.sents for word in sentence)
for word in words:
    print(word)

The
dr
.
told
me
so
and
I
love
myself
.
It
was
quite
challenging
,
but
I
did
it
!


- spaCy provides the most efficient NLP algorithm for a given task. If you care about the end result, go with spaCy.
- NLTK provides access to many algorithms. If you care about specific algorithms and customization, choose NLTK.

In [6]:
import nltk

from nltk.tokenize import sent_tokenize

# NLTK's sentence tokenizer takes a plain string and returns a list of sentence strings.
sample_text = "The dr. told me so and I love myself. It was quite challenging, but I did it!"
sent_tokenize(sample_text)

['The dr. told me so and I love myself.',
 'It was quite challenging, but I did it!']

In [7]:
# Create a blank language object for Turkish ("tr"); "en" or another language
# code can be passed instead. This rebinds `nlp` for the cells that follow.
nlp = spacy.blank("tr")

turkish_sample = "Merhabalar! Benim adım Hasan Can Bıyık. Ben harikayım :)"
doc = nlp(turkish_sample)

# Iterating a Doc yields its tokens one by one; print each on its own line.
print(*doc, sep="\n")

Merhabalar
!
Benim
adım
Hasan
Can
Bıyık
.
Ben
harikayım
:
)


In [10]:
# Integer indexing on a Doc returns a single token; index 0 is the first one.
doc[0]

Merhabalar

In [12]:
# Tokenize an English sentence wrapped in quotes.
# NOTE(review): `nlp` is presumably still the blank "tr" pipeline created earlier —
# confirm this is intended for English text.
doc = nlp("""'Let's go to N.Y.!'""")

# The loop variable `token` stays bound after the loop (to the last token);
# the later `type(token)` cell relies on that.
for token in doc:
    print(token)

'
Let
's
go
to
N.Y.
!
'


In [14]:
# `token` is presumably the variable left over from the last executed `for token in doc` loop.
# Each element of a Doc is a spaCy Token object, not a plain string.
type(token)

spacy.tokens.token.Token

In [15]:
# Slicing selects the tokens at positions 1 through 4 (the end index is exclusive).
doc[1:5]

Let's go to

In [17]:
# Keep a handle on one token for the attribute checks in the following cells.
# Note: despite the name `token0`, this is the token at index 1 ("Let"), not index 0.
token0 = doc[1]
token0

Let

In [18]:
# dir() lists every attribute name available on the object — methods and
# data attributes alike, including the double-underscore ones.
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [19]:
# Confirm that `token0` is a spaCy Token object.
type(token0)

spacy.tokens.token.Token

In [23]:
# True here: the token "Let" is made up of alphabetic characters only.
token0.is_alpha

True

In [22]:
# False here: the token "Let" does not look like a number.
token0.like_num

False

# spaCy is more convenient than regular expressions for grabbing emails

with open(path) as f:
    text = f.readlines() # reads all the lines into a list

In [30]:
# Location of the sample roster. NOTE: this is an absolute, machine-specific path —
# point it at your own copy of students.txt before running.
STUDENTS_PATH = "/Users/hasancan/Downloads/students.txt"

# Pass the encoding explicitly so the result does not depend on the platform's
# default locale encoding (assumes the file is UTF-8 — TODO confirm).
with open(STUDENTS_PATH, encoding="utf-8") as f:
    text = f.readlines()  # one string per line, trailing "\n" kept
text  # a list of lines, not a single string

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [31]:
# spaCy processes one string at a time, so merge the list of lines into a single
# text, using one space as the delimiter between lines.
# Guard on the type so the cell is safe to re-run: joining a value that is already
# a string would insert a space between every character.
if not isinstance(text, str):
    text = ' '.join(text)
text  # one single string



In [33]:
doc = nlp(text)

# Collect the text of every token that spaCy flags as email-like.
emails = [tok.text for tok in doc if tok.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

In [40]:
# Fresh blank Turkish pipeline, then inspect a few boolean flags on every token.
nlp = spacy.blank('tr')
turkish_text = "Merhaba! Ben Dr. Bıyık ve bugün size ben yardımcı olacağım, eğiteceğim!!"
doc = nlp(turkish_text)

for token in doc:
    # Label/value pairs, printed after the token itself on a single line.
    report = (
        'a currency: ', token.is_currency,
        '/a number: ', token.like_num,
        '/a word: ', token.is_alpha,
    )
    print(token, *report)

Merhaba a currency:  False /a number:  False /a word:  True
! a currency:  False /a number:  False /a word:  False
Ben a currency:  False /a number:  False /a word:  True
Dr. a currency:  False /a number:  False /a word:  False
Bıyık a currency:  False /a number:  False /a word:  True
ve a currency:  False /a number:  False /a word:  True
bugün a currency:  False /a number:  False /a word:  True
size a currency:  False /a number:  False /a word:  True
ben a currency:  False /a number:  False /a word:  True
yardımcı a currency:  False /a number:  False /a word:  True
olacağım a currency:  False /a number:  False /a word:  True
, a currency:  False /a number:  False /a word:  False
eğiteceğim a currency:  False /a number:  False /a word:  True
! a currency:  False /a number:  False /a word:  False
! a currency:  False /a number:  False /a word:  False


In [44]:
from spacy.symbols import ORTH

# Teach the tokenizer to split the slang word "gimme" into two tokens, "gim" + "me".
gimme_parts = [{ORTH: 'gim'}, {ORTH: 'me'}]
nlp.tokenizer.add_special_case('gimme', gimme_parts)

doc = nlp("gimme double cheese extra large pizza")
tokens = [piece.text for piece in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'pizza']

In [46]:
# An empty list: the blank pipeline has no named components, yet it still
# tokenizes text (as the previous cell shows).
nlp.pipe_names

[]

In [49]:
# Instead of a blank pipeline, load a pre-trained one by name.
# This rebinds `nlp`, replacing the blank Turkish pipeline used in the cells above.

nlp = spacy.load("en_core_web_sm")

In [50]:
# A list of (name, component) pairs for the loaded pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x3271ca630>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x3271caed0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x3271dc5f0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x3172c6090>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x16c954b50>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x3271dc660>)]

In [52]:
doc = nlp("captain america ate $100 of burgers, then he said he can do this all day long!")

# One row per token: the token, its part-of-speech tag, and its lemma
# (e.g. "ate" is reported with the lemma "eat").
BAR = " | "
for token in doc:
    print(token, BAR, token.pos_, BAR, token.lemma_)

captain  |  PROPN  |  captain
america  |  PROPN  |  america
ate  |  VERB  |  eat
$  |  SYM  |  $
100  |  NUM  |  100
of  |  ADP  |  of
burgers  |  NOUN  |  burger
,  |  PUNCT  |  ,
then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
he  |  PRON  |  he
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day
long  |  ADV  |  long
!  |  PUNCT  |  !
