STRINGS TO HASHES

In [1]:
import spacy 
nlp = spacy.load("en_core_web_sm")

In [5]:
doc = nlp("I have a dog")

In [7]:
#find hash for the word "cat"
dog_hash = nlp.vocab.strings['dog']
dog_hash

7562983679033046312

In [8]:
#find cat_hash variable to the the sting 
dog_string = nlp.vocab.strings[dog_hash]
dog_string

'dog'

In [9]:
#lookup labeled "person" to get hash
words = nlp("David Bowie is a PERSON")

In [10]:
person_hash = nlp.vocab.strings['PERSON']
person_hash

380

In [11]:
person_string = nlp.vocab.strings[person_hash]
person_string

'PERSON'

DOC | SPAN | TOKEN

##### Creating a document

In [13]:
#create doc from scratch 
import spacy 
from spacy.tokens import Doc
nlp = spacy.load("en_core_web_sm")

In [14]:
#sent should equal "spaCy is cool!"
words = ["spaCy", "is", 'cool', '!']
spaces = [True, True, False, False]

#create a Doc from words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
doc.text

'spaCy is cool!'

In [15]:
words_ = ['Go', ',', 'get', 'started', '!']
spaces_ = [False, True, True, False, False]

doc_ = Doc(nlp.vocab, words=words_, spaces=spaces_)
doc_.text

'Go, get started!'

In [16]:
sent = ['Oh', ',', 'really', '?']
space = [False, True, False, False]

docu = Doc(nlp.vocab, words=sent, spaces=space)
docu.text

'Oh, really?'

##### Creating docs & SPANS mannually

In [17]:
#DOC &  SPAN 
from spacy.lang.en import English
from spacy.tokens import Doc, Span

In [18]:
NLP = English()

words3 = ['I', 'like', 'David', 'Bowie']
spaces3 = [True, True, True, False]

In [19]:
#create a doc
doc3 = Doc(nlp.vocab, words=words3, spaces=spaces3)
doc3.text

'I like David Bowie'

In [21]:
#create span for "David bowie" and assign label PERSON
span = Span(doc3, 2, 4, label='PERSON')
print(span.text, span.label_)

David Bowie PERSON


In [22]:
#add the span to the doc
doc3.ents = [span]

In [23]:
print([(ent.text, ent.label) for ent in doc3.ents])

[('David Bowie', 380)]


WORD VECTORS

In [24]:
import spacy 
from spacy.lang.en import English
nlp = spacy.load('en_core_web_md')

In [25]:
#process text
document = nlp("Two bananas in pyjamas")

#get vector for the token = "bananas"
bananas_vector = document[1].vector
bananas_vector

array([-2.2009e-01, -3.0322e-02, -7.9859e-02, -4.6279e-01, -3.8600e-01,
        3.6962e-01, -7.7178e-01, -1.1529e-01,  3.3601e-02,  5.6573e-01,
       -2.4001e-01,  4.1833e-01,  1.5049e-01,  3.5621e-01, -2.1508e-01,
       -4.2743e-01,  8.1400e-02,  3.3916e-01,  2.1637e-01,  1.4792e-01,
        4.5811e-01,  2.0966e-01, -3.5706e-01,  2.3800e-01,  2.7971e-02,
       -8.4538e-01,  4.1917e-01, -3.9181e-01,  4.0434e-04, -1.0662e+00,
        1.4591e-01,  1.4643e-03,  5.1277e-01,  2.6072e-01,  8.3785e-02,
        3.0340e-01,  1.8579e-01,  5.9999e-02, -4.0270e-01,  5.0888e-01,
       -1.1358e-01, -2.8854e-01, -2.7068e-01,  1.1017e-02, -2.2217e-01,
        6.9076e-01,  3.6459e-02,  3.0394e-01,  5.6989e-02,  2.2733e-01,
       -9.9473e-02,  1.5165e-01,  1.3540e-01, -2.4965e-01,  9.8078e-01,
       -8.0492e-01,  1.9326e-01,  3.1128e-01,  5.5390e-02, -4.2423e-01,
       -1.4082e-02,  1.2708e-01,  1.8868e-01,  5.9777e-02, -2.2215e-01,
       -8.3950e-01,  9.1987e-02,  1.0180e-01, -3.1299e-01,  5.50

COMPARING SIMILARITIES 

In [35]:
summer1 = nlp("It's a warm summer day")
summer2 = nlp("cold and rainy")

In [36]:
#get the similarity of the two docs 
sims = summer1.similarity(summer2)
sims

0.7787822684387955

In [37]:
reg_a = nlp('''Regulation A is an exemption from registration for public offerings. Regulation A has two offering tiers: Tier 1, for offerings of up to $20 million in a 
12-month period; and Tier 2, for offerings of up to $50 million in a 12-month period. For offerings of up to $20 million, companies can elect to proceed under 
the requirements for either Tier 1 or Tier 2. There are certain basic requirements applicable to both Tier 1 and Tier 2 offerings, including company eligibility 
requirements, bad actor disqualification provisions, disclosure, and other matters. Additional requirements apply to Tier 2 offerings, including limitations on 
the amount of money a non-accredited investor may invest in a Tier 2 offering, requirements for audited financial statements and the filing of ongoing reports. 
Issuers in Tier 2 offerings are not required to register or qualify their offerings with state securities regulators.''')

reg_d = nlp('''FESTUS, Miss. (FOX 13) - One café put a major twist on a classic ice cream sundae by introduced: pickle splits. The Pine Mountain Country Coffee House in Festus, 
Missouri simply replaced the banana in a banana split with a pickle. The owner of the café said she first tried the combination back when she was younger. It started 
as a “dare” but she ended up liking it. CONTINUE READING BELOW It wasn’t until her husband teased her into putting it on the menu that her pickle passion was made 
public. The dish comes with vanilla, strawberry and chocolate ice cream. All topped with whipped cream, cherries and, of course, pickle spears.''')

In [38]:
#ge the similarity of doc_one and doc_two 
similarity = reg_a.similarity(reg_d)
similarity

0.8085691837631346

##### Token Similarities

In [43]:
tokes = nlp("keyboard and pizza")
token1, token2 = tokes[0], tokes[2]

#get similarity of tokens
similarities = token1.similarity(token2)
similarities

0.1346367

##### SPAN similarities

In [47]:
spansent = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

#create spans
span1 = spansent[3:5]
span2 = spansent[12:15]

#get similarity 
span_sims = span1.similarity(span2)
span_sims

0.7517392

DEBUGGING PATTERNS

In [50]:
from spacy.matcher import Matcher

In [48]:
phrase = "Twitch Prime, the perks program for Amazon Prime members offering free loot, games and other benefits, is ditching one of its best features: ad-free viewing. According to an email sent out to Amazon Prime members today, ad-free viewing will no longer be included as a part of Twitch Prime for new members, beginning on September 14. However, members with existing annual subscriptions will be able to continue to enjoy ad-free viewing until their subscription comes up for renewal. Those with monthly subscriptions will have access to ad-free viewing until October 15."

In [53]:
doc = nlp(phrase)
# Create the match patterns
pattern1 = [{'LOWER': 'amazon'}, {'IS_TITLE': True, 'POS': 'PROPN'}]
pattern2 = [{'LOWER': 'ad'},{"TEXT": '-'}, {"LOWER": "free"}, {'POS': 'NOUN'}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add('PATTERN1', None, pattern1)
matcher.add('PATTERN2', None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)


PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


EFFICIENT PHRASE MATCHING

In [56]:
#list of countries is loaded below as COUNTRIES

In [59]:
COUNTRIES = """Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'United States Minor Outlying Islands',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cabo Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of the)',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Falkland Islands (Malvinas)',
 'Faroe Islands',
 'Fiji',
 'Finland',
 'France',
 'French Guiana',
 'French Polynesia',
 'French Southern Territories',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Gibraltar',
 'Greece',
 'Greenland',
 'Grenada',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guernsey',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Heard Island and McDonald Islands',
 'Holy See',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 "Côte d'Ivoire",
 'Iran (Islamic Republic of)',
 'Iraq',
 'Ireland',
 'Isle of Man',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jersey',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kuwait',
 'Kyrgyzstan',
 "Lao People's Democratic Republic",
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Macao',
 'Macedonia (the former Yugoslav Republic of)',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Malta',
 'Marshall Islands',
 'Martinique',
 'Mauritania',
 'Mauritius',
 'Mayotte',
 'Mexico',
 'Micronesia (Federated States of)',
 'Moldova (Republic of)',
 'Monaco',
 'Mongolia',
 'Montenegro',
 'Montserrat',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nauru',
 'Nepal',
 'Netherlands',
 'New Caledonia',
 'New Zealand',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Niue',
 'Norfolk Island',
 "Korea (Democratic People's Republic of)",
 'Northern Mariana Islands',
 'Norway',
 'Oman',
 'Pakistan',
 'Palau',
 'Palestine, State of',
 'Panama',
 'Papua New Guinea',
 'Paraguay',
 'Peru',
 'Philippines',
 'Pitcairn',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Republic of Kosovo',
 'Réunion',
 'Romania',
 'Russian Federation',
 'Rwanda',
 'Saint Barthélemy',
 'Saint Helena, Ascension and Tristan da Cunha',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Martin (French part)',
 'Saint Pierre and Miquelon',
 'Saint Vincent and the Grenadines',
 'Samoa',
 'San Marino',
 'Sao Tome and Principe',
 'Saudi Arabia',
 'Senegal',
 'Serbia',
 'Seychelles',
 'Sierra Leone',
 'Singapore',
 'Sint Maarten (Dutch part)',
 'Slovakia',
 'Slovenia',
 'Solomon Islands',
 'Somalia',
 'South Africa',
 'South Georgia and the South Sandwich Islands',
 'Korea (Republic of)',
 'South Sudan',
 'Spain',
 'Sri Lanka',
 'Sudan',
 'Suriname',
 'Svalbard and Jan Mayen',
 'Swaziland',
 'Sweden',
 'Switzerland',
 'Syrian Arab Republic',
 'Taiwan',
 'Tajikistan',
 'Tanzania, United Republic of',
 'Thailand',
 'Timor-Leste',
 'Togo',
 'Tokelau',
 'Tonga',
 'Trinidad and Tobago',
 'Tunisia',
 'Turkey',
 'Turkmenistan',
 'Turks and Caicos Islands',
 'Tuvalu',
 'Uganda',
 'Ukraine',
 'United Arab Emirates',
 'United Kingdom of Great Britain and Northern Ireland',
 'United States of America',
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Venezuela (Bolivarian Republic of)',
 'Viet Nam',
 'Wallis and Futuna',
 'Western Sahara',
 'Yemen',
 'Zambia',
 'Zimbabwe'"""

In [70]:
doc = nlp("Czech Republic may help Slovakia protect its airspace")

In [71]:
#import phrase matcher 
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [72]:
patterns = list(nlp.pipe(COUNTRIES))

In [73]:
matcher.add('COUNTRY', None, *patterns)

In [74]:
#call matcher on test doc and print results
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])

[]


In [1]:
from spacy import displacy

In [5]:
displacy.render(reg_d, style='ent')