In [1]:
# Detect whether the notebook runs on Google Colab: the google.colab module
# is only present in sys.modules there.
import sys, os
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    # On Colab, download this chapter's setup script from the book repository.
    GIT_ROOT = 'https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master'
    os.system(f'wget {GIT_ROOT}/ch04/setup.py')

# Run setup.py in the interactive namespace (-i) so the names it defines stay
# available to later cells (presumably including BASE_DIR -- confirm in setup.py).
%run -i setup.py

You are working on Google Colab.
Files will be downloaded to "/content".
Downloading required files ...
!wget -P /content https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/settings.py
!wget -P /content/packages/blueprints https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/packages/blueprints/__init__.py
!wget -P /content/packages/blueprints https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/packages/blueprints/exploration.py
!wget -P /content/packages/blueprints https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/packages/blueprints/preparation.py
!wget -P /content/data/reddit-selfposts https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/reddit-selfposts/rspct_autos.tsv.gz
!wget -P /content/data/reddit-selfposts https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/reddit-selfposts/subreddit

In [2]:
# Load the shared notebook settings; BASE_DIR must already exist in the
# namespace (presumably defined by setup.py in the first cell -- confirm).
%run "$BASE_DIR/settings.py"

# Automatically reload edited modules before code is executed.
%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'png'

# to print output of all statements and not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# otherwise text between $ signs will be interpreted as formula and printed in italic
# NOTE(review): this notebook imports pandas only in a later cell, so `pd` has
# to be provided by setup.py/settings.py -- confirm, or import pandas here.
pd.set_option('display.html.use_mathjax', False)

# path to import blueprints packages
sys.path.append(BASE_DIR + '/packages')

In [3]:
import pandas as pd

# Load the self-posts. The first assignment holds only the bare file name and
# is immediately replaced by the actual path below BASE_DIR.
posts_file = "rspct.tsv.gz"
posts_file = f"{BASE_DIR}/data/reddit-selfposts/rspct_autos.tsv.gz" ### real location
posts_df = pd.read_csv(posts_file, sep='\t')

# Load the subreddit metadata the same way and index it by subreddit name.
subred_file = "subreddit_info.csv.gz"
subred_file = f"{BASE_DIR}/data/reddit-selfposts/subreddit_info.csv.gz" ### real location
subred_df = pd.read_csv(subred_file).set_index(['subreddit'])

# Attach the subreddit metadata to every post via its 'subreddit' column.
df = posts_df.join(subred_df, on='subreddit')

In [4]:
# Maps each source column to the name used from here on.
# A target of None marks a column that is dropped.
column_mapping = {
    'id': 'id',
    'subreddit': 'subreddit',
    'title': 'title',
    'selftext': 'text',
    'category_1': 'category',
    'category_2': 'subcategory',
    'category_3': None, # no data
    'in_data': None, # not needed
    'reason_for_exclusion': None # not needed
}

# define remaining columns: every source column that has a target name
# (identity test against None instead of `!= None`)
columns = [source for source, target in column_mapping.items()
           if target is not None]

# select and rename those columns
df = df[columns].rename(columns=column_mapping)

In [5]:
# Keep only the posts of the 'autos' category. The explicit copy makes the
# result an independent frame, so the column assignments in later cells
# (df['impurity'] = ..., df['clean_text'] = ...) do not act on a slice of the
# unfiltered frame and do not raise SettingWithCopyWarning.
df = df[df['category'] == 'autos'].copy()

In [6]:
# Show one randomly drawn post, transposed so each column becomes a row.
# No random_state is given, so a different post appears on every run.
df.sample(1).T

Unnamed: 0,3040
id,7svmwi
subreddit,Mustang
title,2018 Mustang - Thoughts on options?
text,hey everyone I’m going to be ordering a new mustang soon and thinking about getting it with the following:<lb><lb>- GT Performance Package (For Sure)<lb><lb>- Shaker HD 12 speaker upgraded audio s...
category,autos
subcategory,ford


# 정규 표현식으로 노이즈 식별

In [7]:
# Lift the column-width limit for this one display (fixed seed, so always the
# same post), then set the limit to 200 characters.
pd.set_option('display.max_colwidth', None)
df.sample(1, random_state=7).T
pd.set_option('display.max_colwidth', 200)

Unnamed: 0,14356
id,7jc2k4
subreddit,volt
title,Dashcam for 2017 volt
text,Hello.<lb>I'm looking into getting a dashcam. <lb>Does anyone have any recommendations? <lb><lb>I'm generally looking for a rechargeable one so that I don't have to route wires down to the cigarette lighter. <lb>Unless there are instructions on how to wire it properly without wires showing. <lb><lb><lb>Thanks!
category,autos
subcategory,chevrolet


In [8]:
# Hand-made sample post containing the noise handled in the following cells:
# a markdown link, <lb> tags and empty markdown links such as [](/sp).
text = """
After viewing the [PINKIEPOOL Trailer](https://www.youtu.be/watch?v=ieHRoHUg)
it got me thinking about the best match ups.
<lb>Here's my take:<lb><lb>[](/sp)[](/ppseesyou) Deadpool<lb>[](/sp)[](/ajsly)
Captain America<lb>"""

In [9]:
import re

# Characters that hint at leftover markup or encoding noise: & # < > { } [ ] \
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity(text, min_len=10):
    """Return the share of suspicious characters in a text.

    Parameters
    ----------
    text : str or None
        Text to score. Anything that is not a string (None, or the float NaN
        pandas uses for missing values) is scored as 0 instead of raising.
    min_len : int, default 10
        Texts shorter than this are scored as 0.

    Returns
    -------
    int or float
        0 for missing, empty or too-short input, otherwise the number of
        characters matched by RE_SUSPICIOUS divided by the text length.
    """
    # `not text` also covers the empty string, which would otherwise divide
    # by zero when min_len is 0.
    if not isinstance(text, str) or not text or len(text) < min_len:
        return 0
    return len(RE_SUSPICIOUS.findall(text)) / len(text)

# Impurity score of the sample post stored in `text`.
print(impurity(text))

0.09009009009009009


In [10]:
# Score every raw post; min_len is forwarded by apply() to impurity().
df['impurity'] = df['text'].apply(impurity, min_len=10)

In [11]:
# Show the three posts with the highest impurity, then set the display width
# for text columns to 300 characters for the following cells.
df[['text','impurity']].sort_values(by='impurity',ascending=False).head(3)
pd.options.display.max_colwidth = 300 ###

Unnamed: 0,text,impurity
19682,"Looking at buying a 335i with 39k miles and 11 months left on the CPO warranty. I asked the dealer if the HPFP had been replaced, and for a copy of the service history. He said it had been replace...",0.21
12357,"I'm looking to lease an a4 premium plus automatic with the nav package.<lb><lb>Vehicle Price:<tab><tab>$49,150.00<tab> <lb> <tab>AutoNation Savings:<tab>-<tab>$3,867.00<tab> <lb> <tab>AutoNation P...",0.17
2730,"Breakdown below:<lb><lb>Elantra GT<lb><lb>2.0L 4-cylinder<lb><lb>6-speed Manual Transmission<lb><lb>$19,350<lb><lb>Elantra GT<lb><lb>2.0L 4-cylinder<lb><lb>6-speed Automatic Transmission w/ SHIFTR...",0.14


In [14]:
## Tags such as <lb> (line break) and <tab> are frequent. Use a regular
## expression to check whether any other tags occur besides these.
from blueprints.exploration import count_words
count_words(df, column='text', preprocess=lambda t: re.findall(r'<[\w/]*>', t))

  0%|          | 0/20000 [00:00<?, ?it/s]

Unnamed: 0_level_0,freq
token,Unnamed: 1_level_1
<lb>,100729
<tab>,642


# 정규 표현식으로 노이즈 제거

In [15]:
import html

def clean(text):
    """Remove markup noise from a raw post and return whitespace-normalized text.

    Steps, in order: decode HTML escapes, blank out tags, reduce markdown links
    to their link text, blank out bracketed text, blank out standalone runs of
    special characters, collapse whitespace and strip both ends.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # convert html escapes like &amp; to characters
    text = html.unescape(text)
    # replace tags like <tab> with a space
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....): keep only the link text
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # replace text or code in brackets like [0] with a space
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials: matches "&#" but not "#cool".
    # Lookarounds are used instead of consuming the neighbouring whitespace,
    # so that several standalone runs in a row ("a & # b") are all removed.
    text = re.sub(r'(?<!\S)[&#<>{}\[\]+|\\:-]+(?!\S)', ' ', text)
    # standalone sequences of hyphens, equals or plus signs like --- or ==
    text = re.sub(r'(?<!\S)[\-=\+]{2,}(?!\S)', ' ', text)
    # collapse every run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [16]:
# Clean the sample post stored in `text` and show the result.
clean_text = clean(text)
print(clean_text)

After viewing the PINKIEPOOL Trailer it got me thinking about the best match ups. Here's my take: Deadpool Captain America


In [17]:
# Impurity of the cleaned sample post.
print("Impurity:", impurity(clean_text))

Impurity: 0.0


In [18]:
# Clean every post; the raw text stays available in the 'text' column.
df['clean_text'] = df['text'].apply(clean)

In [19]:
# Overwrite the impurity column with the score of the cleaned text (texts
# shorter than 20 characters score 0) and show the three worst records.
df['impurity'] = df['clean_text'].apply(impurity, min_len=20)
df[['clean_text','impurity']].sort_values(by='impurity', ascending=False).head(3)

Unnamed: 0,clean_text,impurity
14058,"Mustang 2018, 2019, or 2020? Must Haves!! 1. Have a Credit score of 780\+ for the best low interest rates! 2. Join a Credit Union to finance the vehicle! 3. Or Find a Lender to finance the vehicle! 4. Downpayment of 20&#37; to the actual vehicle cost! 5. Be 25 years old for a lower price for ins...",0.03
18934,"At the dealership, they offered an option for foot-well illumination, but I cannot find any reference to this online. Has anyone gotten it? How does it look? Anyone have pictures. Not sure if this link will work, but here goes: https://www.mazdausa.com/shopping-tools/build-and-price/mazda3-hatch...",0.03
16505,"I am looking at four Caymans, all are in a similar price range. The major differences are the miles, the years, and one isn’t a S. https://www.cargurus.com/Cars/inventorylisting/viewDetailsFilterViewInventoryListing.action?sourceContext=usedPaidSearchNoZip&newSearchFromOverviewPage=true&entitySe...",0.02


# textacy를 사용한 문자 정규화

In [20]:
# Sample with accented letters, typographic quotes and a word hyphenated across a line break.
text = "The café “Saint-Raphaël” is loca-\nted on Côte dʼAzur."

In [21]:
import textacy.preprocessing as tprep

def normalize(text):
    """Run ``text`` through a fixed sequence of textacy preprocessing steps.

    The steps are applied in this order, each one receiving the output of the
    previous one: ``normalize.hyphenated_words``, ``normalize.quotation_marks``,
    ``normalize.unicode`` and ``remove.accents``. The result of the last step
    is returned.
    """
    steps = (
        tprep.normalize.hyphenated_words,
        tprep.normalize.quotation_marks,
        tprep.normalize.unicode,
        tprep.remove.accents,
    )
    for step in steps:
        text = step(text)
    return text

In [22]:
# Normalized form of the sample stored in `text`.
print(normalize(text))

The cafe "Saint-Raphael" is located on Cote d'Azur.


# textacy를 사용한 패턴 기반 데이터 마스킹

In [23]:
from textacy.preprocessing.resources import RE_URL

# Count the matches of textacy's RE_URL pattern in the cleaned texts and show the top three.
count_words(df, column='clean_text', preprocess=RE_URL.findall).head(3)

  0%|          | 0/20000 [00:00<?, ?it/s]

Unnamed: 0_level_0,freq
token,Unnamed: 1_level_1
www.getlowered.com,3
http://www.ecolamautomotive.com/#!2/kv7fq,2
https://www.reddit.com/r/Jeep/comments/4ux232/just_ordered_an_android_head_unit_joying_jeep/,2


In [24]:
import re

text = "Check out https://spacy.io/usage/spacy-101"

# Mask every http:// or https:// URL (up to the next whitespace) with a placeholder
url_pattern = re.compile(r'http[s]?://\S+')
cleaned_text = url_pattern.sub('_URL_', text)
print(cleaned_text)

Check out _URL_


In [25]:
# Apply the character normalization to every cleaned post, replacing the column in place.
df['clean_text'] = df['clean_text'].map(normalize)

In [26]:
# Display the prepared frame. NOTE(review): df.head() would keep the stored output small.
df

Unnamed: 0,id,subreddit,title,text,category,subcategory,impurity,clean_text
0,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. This was before I knew anything about motorcycling whatsoever. Me and some college buddies would always go out on the strip to the dance clubs. We always ended up at a bar called Hogs &amp; Heifers. It's worth noting the females working there can outd...,autos,harley davidson,0.00,Funny story. I went to college in Las Vegas. This was before I knew anything about motorcycling whatsoever. Me and some college buddies would always go out on the strip to the dance clubs. We always ended up at a bar called Hogs Heifers. It's worth noting the females working there can outdrink A...
1,5s0q8r,Mustang,Roush vs Shleby GT500,"I am trying to determine which is faster, and I've seen the dealership video with the two racing(Roush won 2/3). But I was wondering if it was just because of the bigass supercharger in the Roush. <lb><lb><lb>Also I can't find the same specs on any two websites, what are some trustworthy sources...",autos,ford,0.00,"I am trying to determine which is faster, and I've seen the dealership video with the two racing(Roush won 2/3). But I was wondering if it was just because of the bigass supercharger in the Roush. Also I can't find the same specs on any two websites, what are some trustworthy sources for this ki..."
2,5z3405,Volkswagen,2001 Golf Wagon looking for some insight,"Hello! <lb><lb>Trying to find some information on replacing a 2001 Golf Wagon starter (gas).... mine's gone out and going to the dealership is quite out of the range right now. Nor is it in the biggest of rushes. <lb><lb>I live in Japan, and I'm wondering if anyone had any international shipping...",autos,VW,0.00,"Hello! Trying to find some information on replacing a 2001 Golf Wagon starter (gas).... mine's gone out and going to the dealership is quite out of the range right now. Nor is it in the biggest of rushes. I live in Japan, and I'm wondering if anyone had any international shipping websites they c..."
3,7df18v,Lexus,IS 250 Coolant Flush/Change,https://www.cars.com/articles/how-often-should-i-change-engine-coolant-1420680853669/<lb><lb>I have a IS 250 AWD from 2006. About 73K miles on it. I've never touched the engine radiator coolant and can't find anything on when to change this in the book. It just says 'long life 100k Toyota coolan...,autos,lexus,0.00,https://www.cars.com/articles/how-often-should-i-change-engine-coolant-1420680853669/ I have a IS 250 AWD from 2006. About 73K miles on it. I've never touched the engine radiator coolant and can't find anything on when to change this in the book. It just says 'long life 100k Toyota coolant.' Doe...
4,5tpve8,volt,Gen1 mpg w/ dead battery?,"Hi, new to this subreddit. I'm considering buying a Gen1 Volt, but I can't find any straight answers as to what kind of mpg it gets after the battery is completely dead (say I take a 300 mile trip). What kind of highway mpg does the Gen1 volt get after the battery is depleted?",autos,chevrolet,0.00,"Hi, new to this subreddit. I'm considering buying a Gen1 Volt, but I can't find any straight answers as to what kind of mpg it gets after the battery is completely dead (say I take a 300 mile trip). What kind of highway mpg does the Gen1 volt get after the battery is depleted?"
...,...,...,...,...,...,...,...,...
19995,7i2k6y,4Runner,Bilstein Shocks,"I read a lot Forums and people recommend getting TUNDRA Bilstein Shocks for a 3rd gen 4 runner, what is the difference? and why do they recommend that? I bought Springs tundra Springs for the front and 1997 landcruiser springs fro the rear now I just need shocks. <lb>Thank you",autos,toyota,0.00,"I read a lot Forums and people recommend getting TUNDRA Bilstein Shocks for a 3rd gen 4 runner, what is the difference? and why do they recommend that? I bought Springs tundra Springs for the front and 1997 landcruiser springs fro the rear now I just need shocks. Thank you"
19996,83p2kv,Harley,Question on potential purchase of crashed bike.,"I am thinking about buying a 2010 Harley Sportster 1200 custom for $6k with 7k miles. It has Vance radius pipes, t bars, screaming eagle filters, sissy bar, new seat and either a new tank or a new paint job as it does not say Harley Davidson on the tank anywhere but has pin stripes.. the title ...",autos,harley davidson,0.00,"I am thinking about buying a 2010 Harley Sportster 1200 custom for $6k with 7k miles. It has Vance radius pipes, t bars, screaming eagle filters, sissy bar, new seat and either a new tank or a new paint job as it does not say Harley Davidson on the tank anywhere but has pin stripes.. the title i..."
19997,7x722h,volt,Got our first warning light on our dash,"My husband and I were headed somewhere and I was cold. So, I figured I could turn the car on before he unplugged it. When it turned on it was fine. Then he got into the drivers seat ten seconds later and we got a ""check charging system"" any suggestions? I figure I messed it up by turning it on b...",autos,chevrolet,0.00,"My husband and I were headed somewhere and I was cold. So, I figured I could turn the car on before he unplugged it. When it turned on it was fine. Then he got into the drivers seat ten seconds later and we got a ""check charging system"" any suggestions? I figure I messed it up by turning it on b..."
19998,7v2xmg,Lexus,Any IS models to avoid?,"I am looking at getting a used Lexus IS (2014 model year and newer). Are there any trim levels that I should avoid? Thinking about getting a 250, however, I might spend a little bit more and get a 350 if there is a significant performance increase, but I wanted to know if there is anything I sho...",autos,lexus,0.00,"I am looking at getting a used Lexus IS (2014 model year and newer). Are there any trim levels that I should avoid? Thinking about getting a 250, however, I might spend a little bit more and get a 350 if there is a significant performance increase, but I wanted to know if there is anything I sho..."


## 정규표현식을 사용한 토큰화

In [27]:
# Sample for the tokenizer comparison: date/time, @mentions, contractions,
# hyphenated words, a negative number, a hashtag, an emoticon and emojis.
text = """
2019-08-10 23:32: @pete/@louis - I don't have a well-designed
solution for today's problem. The code of module AC68 should be -1.
Have to think a bit... #goodnight ;-) 😩😬"""

In [28]:
# Naive tokenization: every run of at least two word characters is a token.
tokens = re.findall(r'\w\w+', text)
print(*tokens, sep='|')
# All special characters and emoticons are gone (and so are one-character words).

2019|08|10|23|32|pete|louis|don|have|well|designed|solution|for|today|problem|The|code|of|module|AC68|should|be|Have|to|think|bit|goodnight


In [29]:
# Verbose pattern with three alternatives inside one capturing group:
#   1. words, hashtags and e-mail-like tokens (optional '#', then word
#      characters and @ ' ’ . - : , ending in a word character)
#   2. a broad pattern for basic text emoticons such as ;-) or :(
#   3. a broad Unicode range for emojis
_TOKEN_PATTERN = r"""
               ( [#]?[@\w'’\.\-\:]*\w     # 단어, 해시테그, 이메일 주소
               | [:;<]\-?[\)\(3]          # 폭넓게 설정한 기본 텍스트 이미지의 패턴
               | [\U0001F100-\U0001FFFF]  # 폭넓게 설정한 이모지의 유니코드 코드 범위
               )
               """
RE_TOKEN = re.compile(_TOKEN_PATTERN, flags=re.VERBOSE)

def tokenize(text):
    """Return all tokens RE_TOKEN finds in ``text``, in order of appearance."""
    # the pattern has exactly one group, which spans the whole match
    return [match.group(1) for match in RE_TOKEN.finditer(text)]

# Tokenize the sample stored in `text` with the custom pattern.
tokens = tokenize(text)
print(*tokens, sep='|')

2019-08-10|23:32|@pete|@louis|I|don't|have|a|well-designed|solution|for|today's|problem|The|code|of|module|AC68|should|be|-1|Have|to|think|a|bit|#goodnight|;-)|😩|😬


## NLTK를 사용한 토큰화
 - NLTK의 표준 토크나이저 `word_tokenize`는 PunktSentenceTokenizer와 함께 TreebankWordTokenizer를 사용하지만, 아래 예제는 소셜 미디어 텍스트에 맞춘 `casual_tokenize`(TweetTokenizer)를 사용

In [30]:
import nltk

# casual_tokenize is NLTK's convenience wrapper around its TweetTokenizer,
# not the Punkt/Treebank based word_tokenize.
tokens = nltk.tokenize.casual_tokenize(text)
print(*tokens, sep='|')

2019-08-|10|23:32|:|@pete|/|@louis|-|I|don't|have|a|well-designed|solution|for|today's|problem|.|The|code|of|module|AC68|should|be|-|1|.|Have|to|think|a|bit|...|#goodnight|;-)|😩|😬


# 스페이시를 사용한 언어 처리
  - 토크나이저, 품사 태거, 의존성 구문 분석기, 개체명 인식기 같은 처리 구성 요소의 통합 파이프라인을 제공

  - 텍스트 -> 토크나이저 -> 품사 태거 -> 의존성 구문 분석기 -> 개체명 인식기

In [31]:
import spacy
# Load the English pipeline package named 'en_core_web_sm'.
nlp = spacy.load('en_core_web_sm')

In [32]:
nlp.pipeline # tagger, parser, named-entity recognizer (among others)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f1830525a20>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f1830526740>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f1830649850>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f1831f55a80>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f1830605240>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f1830648f20>)]

In [40]:
## Process a text: calling nlp() on a string returns the processed doc.
nlp = spacy.load('en_core_web_sm')
text = "My_ best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

In [41]:
# Print the tokens of the doc, each followed by a '|' separator.
for token in doc:
  print(token, end="|")

My|_|best|friend|Ryan|Peters|likes|fancy|adventure|games|.|

In [42]:
def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must provide ``text``, ``lemma_``, ``is_stop``, ``is_alpha``,
        ``pos_``, ``dep_``, ``ent_type_``, ``ent_iob_`` and ``is_punct``.
    include_punct : bool, default False
        If False, tokens whose ``is_punct`` is true are left out.

    Returns
    -------
    pandas.DataFrame
        One row per kept token, indexed by the token's position in ``doc``
        (unnamed index), with one column per attribute listed above except
        ``is_punct``. The frame is empty, but has all columns, when no token
        is kept.
    """
    columns = ['token', 'text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    rows = [
        {'token': i, 'text': t.text, 'lemma_': t.lemma_,
         'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
         'pos_': t.pos_, 'dep_': t.dep_,
         'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
        for i, t in enumerate(doc)
        if not t.is_punct or include_punct
    ]

    # Naming the columns explicitly keeps 'token' available for set_index even
    # when `rows` is empty (empty doc, or nothing but punctuation); without it
    # set_index raises KeyError.
    table = pd.DataFrame(rows, columns=columns).set_index('token')
    table.index.name = None
    return table

In [43]:
# Token table for the current doc; punctuation tokens are left out by default.
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
2,best,good,False,True,ADJ,amod,,O
3,friend,friend,False,True,NOUN,compound,,O
4,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
5,Peters,Peters,False,True,PROPN,nsubj,PERSON,I
6,likes,like,False,True,VERB,ROOT,,O
7,fancy,fancy,False,True,ADJ,amod,,O
8,adventure,adventure,False,True,NOUN,compound,,O
9,games,game,False,True,NOUN,dobj,,O


In [44]:
import re ###
import spacy ###
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, \
                       compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    """Build a spaCy Tokenizer from the pipeline defaults with fewer split rules.

    Compared with the defaults of ``nlp``:
    * the prefix patterns '-', '_' and '#' are removed,
    * the suffix pattern '_' is removed,
    * every infix pattern that matches somewhere in 'xx-xx' is removed, i.e.
      the rules that would split words at an inner hyphen.

    Tokenizer exceptions, ``token_match`` and ``url_match`` are taken over
    unchanged from ``nlp.Defaults``.

    Parameters
    ----------
    nlp : spacy.language.Language
        Loaded pipeline supplying the vocab and the default rules.

    Returns
    -------
    spacy.tokenizer.Tokenizer
        Tokenizer that can be assigned to ``nlp.tokenizer``.
    """
    # default prefix/suffix patterns except the listed literal ones
    prefixes = [pattern for pattern in nlp.Defaults.prefixes
                if pattern not in ['-', '_', '#']]
    suffixes = [pattern for pattern in nlp.Defaults.suffixes
                if pattern not in ['_']]
    # use default infix patterns except the ones matched by re.search
    infixes  = [pattern for pattern in nlp.Defaults.infixes
                if not re.search(pattern, 'xx-xx')]

    return Tokenizer(vocab          = nlp.vocab,
                     rules          = nlp.Defaults.tokenizer_exceptions,
                     prefix_search  = compile_prefix_regex(prefixes).search,
                     suffix_search  = compile_suffix_regex(suffixes).search,
                     infix_finditer = compile_infix_regex(infixes).finditer,
                     token_match    = nlp.Defaults.token_match,
                     # Pass the default URL matcher through as well; leaving it
                     # out would let the remaining infix rules split URLs that
                     # the stock tokenizer keeps as one token.
                     url_match      = nlp.Defaults.url_match)

# Reload the pipeline and swap in the custom tokenizer.
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = custom_tokenizer(nlp)

# `text` is whatever an earlier cell assigned last (here the "My_ best friend ..." sentence).
doc = nlp(text)
for token in doc:
  print(token, end="|")

My_|best|friend|Ryan|Peters|likes|fancy|adventure|games|.|

## 불용어 제거

In [46]:
# Reloading the pipeline also discards the custom tokenizer assigned earlier.
nlp = spacy.load('en_core_web_sm') ###
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

# Keep the tokens that are neither stop words nor punctuation.
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


## 품사 기반 원형 추출
(원형 복원(lemmatization)은 단어를 굴절되지 않은 기본형, 즉 표제어(lemma)에 매핑한다.)

In [47]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Print the lemma of every token, punctuation included.
print(*[t.lemma_ for t in doc], sep='|')

my|good|friend|Ryan|Peters|like|fancy|adventure|game|.


In [48]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Keep only the tokens tagged as common noun (NOUN) or proper noun (PROPN).
nouns = [t for t in doc if t.pos_ in ['NOUN','PROPN']]
print(nouns)

[friend, Ryan, Peters, adventure, games]


In [49]:
import textacy

tokens = textacy.extract.words(doc,
            filter_stops = True,           # default True: drop stop words
            filter_punct = True,           # default True: drop punctuation
            filter_nums = True,            # default False: set to True here to drop number-like tokens
            include_pos = ['ADJ', 'NOUN'], # keep only adjectives and nouns
            exclude_pos = None,            # default None: exclude no part of speech
            min_freq = 1)                  # minimum number of occurrences of a word in the doc

print(*[t for t in tokens], sep='|')

best|friend|fancy|adventure|games


In [50]:
def extract_lemmas(doc, **kwargs):
  """Return the lemmas of the words ``textacy.extract.words`` selects from ``doc``.

  All keyword arguments are forwarded unchanged to ``textacy.extract.words``.
  """
  selected = textacy.extract.words(doc, **kwargs)
  return [word.lemma_ for word in selected]

# Lemmas of the adjectives and nouns in the current doc.
lemmas = extract_lemmas(doc, include_pos=['ADJ','NOUN'])
print(*lemmas, sep='|')

good|friend|fancy|adventure|game


## 명사구 추출(n-gram)

In [52]:
import spacy
import textacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Input text
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Define the pattern: an adjective directly followed by a noun
patterns = [{"POS": "ADJ"}, {"POS": "NOUN"}]

# Create the Matcher object
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
matcher.add("ADJ_NOUN_PATTERN", [patterns])  # pattern name and list of patterns

# Collect the matches as (match_id, start, end) triples
matches = matcher(doc)

# Print the lemmatized text of every matched span
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.lemma_)


good friend
fancy adventure


In [53]:
# Print the doc's noun chunks, separated by '}'.
print(*doc.noun_chunks, sep='}')

My best friend}Ryan Peters}fancy adventure games


In [55]:
import spacy
from spacy.matcher import Matcher

def extract_noun_phrases(doc, preceding_pos=['NOUN'], sep='_'):
    """Return two-token phrases of the form ``<POS> NOUN`` found in ``doc``.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        Processed document to search.
    preceding_pos : iterable of str, default ['NOUN']
        Part-of-speech tags allowed for the token directly before the noun.
        One matcher pattern is registered per tag.
    sep : str, default '_'
        String used to join the lemmas of each matched phrase.

    Returns
    -------
    list of str
        One entry per match: the lemmas of the matched tokens joined by ``sep``.
    """
    phrase_matcher = Matcher(doc.vocab)

    # one two-token pattern per allowed leading part of speech
    for leading in preceding_pos:
        phrase_matcher.add(f"{leading}_NOUN_PATTERN",
                           [[{"POS": leading}, {"POS": "NOUN"}]])

    # join the lemmas of every matched span
    return [sep.join(tok.lemma_ for tok in doc[begin:stop])
            for _, begin, stop in phrase_matcher(doc)]

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Call the function: adjective+noun and noun+noun phrases, lemmas joined by '|'
print(*extract_noun_phrases(doc, ['ADJ', 'NOUN'], sep='|'))


good|friend fancy|adventure adventure|game
