# 第6章: 英語テキストの処理

## 50. 文区切り

In [15]:
import re

fname = 'data/nlp.txt'

def nlp_lines():
    
    with open(fname) as lines:
        
        pattern = re.compile(r'''
            ( 
              
                ^
                .*?
                [\.|\;|\:|\?|\!]
            )
            \s
            (
                [A-Z].*
            )
        ''', re.MULTILINE  + re.VERBOSE + re.DOTALL)
    
        for line in lines:
        
            line = line.strip()
        
            while line:
                match = pattern.match(line)
                if match:
                
                    yield match.group(1)
                    line = match.group(2)
                
                else:
                    yield line
                    line = ''


In [16]:
for line in nlp_lines():
    print(line)

Natural language processing
From Wikipedia, the free encyclopedia
Natural language processing (NLP) is a field of computer science, artificial intelligence, and linguistics concerned with the interactions between computers and human (natural) languages.
As such, NLP is related to the area of humani-computer interaction.
Many challenges in NLP involve natural language understanding, that is, enabling computers to derive meaning from human or natural language input, and others involve natural language generation.
History
The history of NLP generally starts in the 1950s, although work can be found from earlier periods.
In 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
The Georgetown experiment in 1954 involved fully automatic translation of more than sixty Russian sentences into English.
The authors claimed that within three or five years, machine translation would be a 

## 51. 単語の切り出し


In [17]:
def nlp_words():
    for line in nlp_lines():
        words = line.split(' ')
        
        for word in words:
            yield word
        yield ''

In [18]:
for word in nlp_words():
    print(word)

Natural
language
processing

From
Wikipedia,
the
free
encyclopedia

Natural
language
processing
(NLP)
is
a
field
of
computer
science,
artificial
intelligence,
and
linguistics
concerned
with
the
interactions
between
computers
and
human
(natural)
languages.

As
such,
NLP
is
related
to
the
area
of
humani-computer
interaction.

Many
challenges
in
NLP
involve
natural
language
understanding,
that
is,
enabling
computers
to
derive
meaning
from
human
or
natural
language
input,
and
others
involve
natural
language
generation.

History

The
history
of
NLP
generally
starts
in
the
1950s,
although
work
can
be
found
from
earlier
periods.

In
1950,
Alan
Turing
published
an
article
titled
"Computing
Machinery
and
Intelligence"
which
proposed
what
is
now
called
the
Turing
test
as
a
criterion
of
intelligence.

The
Georgetown
experiment
in
1954
involved
fully
automatic
translation
of
more
than
sixty
Russian
sentences
into
English.

The
authors
claimed
that
within
three
or
five
years,
machine
translation
wo

## 52. ステミング

In [19]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

for word in nlp_words():
    word = word.strip(',.:;(){}[]')
    stem = stemmer.stem(word)
    print(('{}\t{}').format(word, stem))


Natural	natur
language	languag
processing	process
	
From	from
Wikipedia	wikipedia
the	the
free	free
encyclopedia	encyclopedia
	
Natural	natur
language	languag
processing	process
NLP	nlp
is	is
a	a
field	field
of	of
computer	comput
science	scienc
artificial	artifici
intelligence	intellig
and	and
linguistics	linguist
concerned	concern
with	with
the	the
interactions	interact
between	between
computers	comput
and	and
human	human
natural	natur
languages	languag
	
As	As
such	such
NLP	nlp
is	is
related	relat
to	to
the	the
area	area
of	of
humani-computer	humani-comput
interaction	interact
	
Many	mani
challenges	challeng
in	in
NLP	nlp
involve	involv
natural	natur
language	languag
understanding	understand
that	that
is	is
enabling	enabl
computers	comput
to	to
derive	deriv
meaning	mean
from	from
human	human
or	or
natural	natur
language	languag
input	input
and	and
others	other
involve	involv
natural	natur
language	languag
generation	gener
	
History	histori
	
The	the
history	histori
of	of
NLP	nlp
gene

## 53. Tokenization

In [20]:
from pycorenlp import StanfordCoreNLP

In [21]:
nlp = StanfordCoreNLP('http://localhost:9000')

In [22]:
with open('data/nlp.txt', 'r') as file:
    text = file.read()

In [23]:
text

'Natural language processing\nFrom Wikipedia, the free encyclopedia\n\nNatural language processing (NLP) is a field of computer science, artificial intelligence, and linguistics concerned with the interactions between computers and human (natural) languages. As such, NLP is related to the area of humani-computer interaction. Many challenges in NLP involve natural language understanding, that is, enabling computers to derive meaning from human or natural language input, and others involve natural language generation.\n\nHistory\n\nThe history of NLP generally starts in the 1950s, although work can be found from earlier periods. In 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.\n\nThe Georgetown experiment in 1954 involved fully automatic translation of more than sixty Russian sentences into English. The authors claimed that within three or five years, machine translati

In [30]:
output = nlp.annotate(text, properties={
    'annotators' : 'tokenize,ssplit,pos,depparse,parse,lemma,ner,dcoref',
    'outputFormat': 'xml'
})

In [31]:
import xml.etree.ElementTree as ET
root = ET.fromstring(output)

In [32]:
for word in root.iter('word'):
    print(word.text)

Natural
language
processing
From
Wikipedia
,
the
free
encyclopedia
Natural
language
processing
-LRB-
NLP
-RRB-
is
a
field
of
computer
science
,
artificial
intelligence
,
and
linguistics
concerned
with
the
interactions
between
computers
and
human
-LRB-
natural
-RRB-
languages
.
As
such
,
NLP
is
related
to
the
area
of
humani-computer
interaction
.
Many
challenges
in
NLP
involve
natural
language
understanding
,
that
is
,
enabling
computers
to
derive
meaning
from
human
or
natural
language
input
,
and
others
involve
natural
language
generation
.
History
The
history
of
NLP
generally
starts
in
the
1950s
,
although
work
can
be
found
from
earlier
periods
.
In
1950
,
Alan
Turing
published
an
article
titled
``
Computing
Machinery
and
Intelligence
''
which
proposed
what
is
now
called
the
Turing
test
as
a
criterion
of
intelligence
.
The
Georgetown
experiment
in
1954
involved
fully
automatic
translation
of
more
than
sixty
Russian
sentences
into
English
.
The
authors
claimed
that
within
three
or
five

## 54. 品詞タグ付け

In [33]:
def indent(elem, level=0):
    i = "\n" + level*"  "
    j = "\n" + (level-1)*"  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for subelem in elem:
            indent(subelem, level+1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = j
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = j
    return elem        

In [34]:
indent(root)
ET.dump(root)

<root>
  <document>
    <sentences>
      <sentence id="1">
        <tokens>
          <token id="1">
            <word>Natural</word>
          <lemma>natural</lemma>
          <CharacterOffsetBegin>0</CharacterOffsetBegin>
          <CharacterOffsetEnd>7</CharacterOffsetEnd>
          <POS>JJ</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="2">
            <word>language</word>
          <lemma>language</lemma>
          <CharacterOffsetBegin>8</CharacterOffsetBegin>
          <CharacterOffsetEnd>16</CharacterOffsetEnd>
          <POS>NN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="3">
            <word>processing</word>
          <lemma>processing</lemma>
          <CharacterOffsetBegin>17</CharacterOffsetBegin>
          <CharacterOffsetEnd>27</CharacterOffsetEnd>
          <POS>NN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <toke

          <CharacterOffsetEnd>370</CharacterOffsetEnd>
          <POS>NN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="8">
            <word>understanding</word>
          <lemma>understanding</lemma>
          <CharacterOffsetBegin>371</CharacterOffsetBegin>
          <CharacterOffsetEnd>384</CharacterOffsetEnd>
          <POS>NN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="9">
            <word>,</word>
          <lemma>,</lemma>
          <CharacterOffsetBegin>384</CharacterOffsetBegin>
          <CharacterOffsetEnd>385</CharacterOffsetEnd>
          <POS>,</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="10">
            <word>that</word>
          <lemma>that</lemma>
          <CharacterOffsetBegin>386</CharacterOffsetBegin>
          <CharacterOffsetEnd>390</CharacterOffsetEnd>
          <POS>WDT</POS>
          <NER>O<

          <POS>NN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="9">
            <word>titled</word>
          <lemma>title</lemma>
          <CharacterOffsetBegin>669</CharacterOffsetBegin>
          <CharacterOffsetEnd>675</CharacterOffsetEnd>
          <POS>VBN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="10">
            <word>``</word>
          <lemma>``</lemma>
          <CharacterOffsetBegin>676</CharacterOffsetBegin>
          <CharacterOffsetEnd>677</CharacterOffsetEnd>
          <POS>``</POS>
          <NER>O</NER>
          </token>
        <token id="11">
            <word>Computing</word>
          <lemma>Computing</lemma>
          <CharacterOffsetBegin>677</CharacterOffsetBegin>
          <CharacterOffsetEnd>686</CharacterOffsetEnd>
          <POS>NNP</POS>
          <NER>O</NER>
          <Speaker>24</Speaker>
          </token>
        <token id="12">
        

        <dep type="mwe">
            <governor idx="11">more</governor>
          <dependent idx="12">than</dependent>
          </dep>
        <dep type="nummod">
            <governor idx="15">sentences</governor>
          <dependent idx="13">sixty</dependent>
          </dep>
        <dep type="amod">
            <governor idx="15">sentences</governor>
          <dependent idx="14">Russian</dependent>
          </dep>
        <dep type="nmod:of">
            <governor idx="9">translation</governor>
          <dependent idx="15">sentences</dependent>
          </dep>
        <dep type="case">
            <governor idx="17">English</governor>
          <dependent idx="16">into</dependent>
          </dep>
        <dep type="nmod:into">
            <governor idx="6">involved</governor>
          <dependent idx="17">English</dependent>
          </dep>
        <dep type="punct">
            <governor idx="6">involved</governor>
          <dependent idx="18">.</dependent>
          </de

        <dep type="punct">
            <governor idx="13">report</governor>
          <dependent idx="16">,</dependent>
          </dep>
        <dep type="ref">
            <governor idx="13">report</governor>
          <dependent idx="17">which</dependent>
          </dep>
        <dep type="acl:relcl">
            <governor idx="13">report</governor>
          <dependent idx="18">found</dependent>
          </dep>
        <dep type="mark">
            <governor idx="25">failed</governor>
          <dependent idx="19">that</dependent>
          </dep>
        <dep type="nummod">
            <governor idx="21">year</governor>
          <dependent idx="20">ten</dependent>
          </dep>
        <dep type="nmod:tmod">
            <governor idx="25">failed</governor>
          <dependent idx="21">year</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="23">research</governor>
          <dependent idx="22">long</dependent>
          </dep>
        <dep ty

          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="45">
            <word>1966</word>
          <lemma>1966</lemma>
          <CharacterOffsetBegin>1655</CharacterOffsetBegin>
          <CharacterOffsetEnd>1659</CharacterOffsetEnd>
          <POS>CD</POS>
          <NER>DATE</NER>
          <NormalizedNER>1966</NormalizedNER>
          <Speaker>PER0</Speaker>
          <Timex tid="t11" type="DATE">1966</Timex>
          </token>
        <token id="46">
            <word>.</word>
          <lemma>.</lemma>
          <CharacterOffsetBegin>1659</CharacterOffsetBegin>
          <CharacterOffsetEnd>1660</CharacterOffsetEnd>
          <POS>.</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        </tokens>
      <parse>(ROOT
  (S
    (S
      (NP
        (NP (DT Some) (RB notably) (JJ successful) (NN NLP) (NNS systems))
        (VP (VBN developed)
          (PP (IN in)
            (NP (DT the) (NNS 1960s)))))
     

          <dependent idx="15">startlingly</dependent>
          </dep>
        <dep type="amod">
            <governor idx="17">interaction</governor>
          <dependent idx="16">human-like</dependent>
          </dep>
        <dep type="dobj">
            <governor idx="13">provided</governor>
          <dependent idx="17">interaction</dependent>
          </dep>
        <dep type="punct">
            <governor idx="13">provided</governor>
          <dependent idx="18">.</dependent>
          </dep>
        </dependencies>
      <dependencies type="enhanced-plus-plus-dependencies">
          <dep type="root">
            <governor idx="0">ROOT</governor>
          <dependent idx="13">provided</dependent>
          </dep>
        <dep type="advcl">
            <governor idx="13">provided</governor>
          <dependent idx="1">Using</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="4">information</governor>
          <dependent idx="2">almost</depend

        <dep type="nsubj">
            <governor idx="15">structured</governor>
          <dependent idx="14">which</dependent>
          </dep>
        <dep type="acl:relcl">
            <governor idx="10">conceptual</governor>
          <dependent idx="15">structured</dependent>
          </dep>
        <dep type="amod">
            <governor idx="17">information</governor>
          <dependent idx="16">real-world</dependent>
          </dep>
        <dep type="dobj">
            <governor idx="15">structured</governor>
          <dependent idx="17">information</dependent>
          </dep>
        <dep type="case">
            <governor idx="20">data</governor>
          <dependent idx="18">into</dependent>
          </dep>
        <dep type="amod">
            <governor idx="20">data</governor>
          <dependent idx="19">computer-understandable</dependent>
          </dep>
        <dep type="nmod">
            <governor idx="15">structured</governor>
          <dependent idx="20"

            <governor idx="3">MARGIE</governor>
          <dependent idx="52">.</dependent>
          </dep>
        </dependencies>
      <dependencies type="enhanced-dependencies">
          <dep type="root">
            <governor idx="0">ROOT</governor>
          <dependent idx="3">MARGIE</dependent>
          </dep>
        <dep type="nsubj">
            <governor idx="3">MARGIE</governor>
          <dependent idx="1">Examples</dependent>
          </dep>
        <dep type="cop">
            <governor idx="3">MARGIE</governor>
          <dependent idx="2">are</dependent>
          </dep>
        <dep type="punct">
            <governor idx="5">Schank</governor>
          <dependent idx="4">-LRB-</dependent>
          </dep>
        <dep type="appos">
            <governor idx="3">MARGIE</governor>
          <dependent idx="5">Schank</dependent>
          </dep>
        <dep type="punct">
            <governor idx="5">Schank</governor>
          <dependent idx="6">,</dependent>
    

          <dependent idx="4">1980s</dependent>
          </dep>
        <dep type="punct">
            <governor idx="10">based</governor>
          <dependent idx="5">,</dependent>
          </dep>
        <dep type="amod">
            <governor idx="8">systems</governor>
          <dependent idx="6">most</dependent>
          </dep>
        <dep type="compound">
            <governor idx="8">systems</governor>
          <dependent idx="7">NLP</dependent>
          </dep>
        <dep type="nsubjpass">
            <governor idx="10">based</governor>
          <dependent idx="8">systems</dependent>
          </dep>
        <dep type="auxpass">
            <governor idx="10">based</governor>
          <dependent idx="9">were</dependent>
          </dep>
        <dep type="case">
            <governor idx="13">sets</governor>
          <dependent idx="11">on</dependent>
          </dep>
        <dep type="amod">
            <governor idx="13">sets</governor>
          <dependent idx="12"

            <governor idx="37">underpinnings</governor>
          <dependent idx="36">theoretical</dependent>
          </dep>
        <dep type="nsubj">
            <governor idx="38">discouraged</governor>
          <dependent idx="37">underpinnings</dependent>
          </dep>
        <dep type="dep">
            <governor idx="19">gradual</governor>
          <dependent idx="38">discouraged</dependent>
          </dep>
        <dep type="det">
            <governor idx="40">sort</governor>
          <dependent idx="39">the</dependent>
          </dep>
        <dep type="dobj">
            <governor idx="38">discouraged</governor>
          <dependent idx="40">sort</dependent>
          </dep>
        <dep type="case">
            <governor idx="43">linguistics</governor>
          <dependent idx="41">of</dependent>
          </dep>
        <dep type="compound">
            <governor idx="43">linguistics</governor>
          <dependent idx="42">corpus</dependent>
          </dep>
  

          <dependent idx="11">decision</dependent>
          </dep>
        <dep type="nmod:such_as">
            <governor idx="7">algorithms</governor>
          <dependent idx="12">trees</dependent>
          </dep>
        <dep type="punct">
            <governor idx="7">algorithms</governor>
          <dependent idx="13">,</dependent>
          </dep>
        <dep type="dobj">
            <governor idx="14">produced</governor>
          <dependent idx="15">systems</dependent>
          </dep>
        <dep type="case">
            <governor idx="19">rules</governor>
          <dependent idx="16">of</dependent>
          </dep>
        <dep type="amod">
            <governor idx="19">rules</governor>
          <dependent idx="17">hard</dependent>
          </dep>
        <dep type="amod">
            <governor idx="19">rules</governor>
          <dependent idx="18">if-then</dependent>
          </dep>
        <dep type="nmod:of">
            <governor idx="15">systems</governor>
   

          <dependent idx="21">has</dependent>
          </dep>
        <dep type="conj:and">
            <governor idx="6">tagging</governor>
          <dependent idx="22">focused</dependent>
          </dep>
        <dep type="case">
            <governor idx="25">models</governor>
          <dependent idx="23">on</dependent>
          </dep>
        <dep type="amod">
            <governor idx="25">models</governor>
          <dependent idx="24">statistical</dependent>
          </dep>
        <dep type="nmod:on">
            <governor idx="22">focused</governor>
          <dependent idx="25">models</dependent>
          </dep>
        <dep extra="true" type="nsubj">
            <governor idx="28">make</governor>
          <dependent idx="25">models</dependent>
          </dep>
        <dep type="punct">
            <governor idx="25">models</governor>
          <dependent idx="26">,</dependent>
          </dep>
        <dep type="ref">
            <governor idx="25">models</governor>

          <dep type="root">
            <governor idx="0">ROOT</governor>
          <dependent idx="6">robust</dependent>
          </dep>
        <dep type="amod">
            <governor idx="2">models</governor>
          <dependent idx="1">Such</dependent>
          </dep>
        <dep type="nsubj">
            <governor idx="6">robust</governor>
          <dependent idx="2">models</dependent>
          </dep>
        <dep type="cop">
            <governor idx="6">robust</governor>
          <dependent idx="3">are</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="6">robust</governor>
          <dependent idx="4">generally</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="6">robust</governor>
          <dependent idx="5">more</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="8">given</governor>
          <dependent idx="7">when</dependent>
          </dep>
        <dep type="ccomp">
 

          </token>
        <token id="7">
            <word>advantage</word>
          <lemma>advantage</lemma>
          <CharacterOffsetBegin>3991</CharacterOffsetBegin>
          <CharacterOffsetEnd>4000</CharacterOffsetEnd>
          <POS>NN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="8">
            <word>of</word>
          <lemma>of</lemma>
          <CharacterOffsetBegin>4001</CharacterOffsetBegin>
          <CharacterOffsetEnd>4003</CharacterOffsetEnd>
          <POS>IN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="9">
            <word>existing</word>
          <lemma>exist</lemma>
          <CharacterOffsetBegin>4004</CharacterOffsetBegin>
          <CharacterOffsetEnd>4012</CharacterOffsetEnd>
          <POS>VBG</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="10">
            <word>multilingual</word>
          

            <governor idx="46">systems</governor>
          <dependent idx="48">government</dependent>
          </dep>
        <dep type="punct">
            <governor idx="4">able</governor>
          <dependent idx="49">.</dependent>
          </dep>
        </dependencies>
      </sentence>
    <sentence id="26">
        <tokens>
          <token id="1">
            <word>However</word>
          <lemma>however</lemma>
          <CharacterOffsetBegin>4268</CharacterOffsetBegin>
          <CharacterOffsetEnd>4275</CharacterOffsetEnd>
          <POS>RB</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="2">
            <word>,</word>
          <lemma>,</lemma>
          <CharacterOffsetBegin>4275</CharacterOffsetBegin>
          <CharacterOffsetEnd>4276</CharacterOffsetEnd>
          <POS>,</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="3">
            <word>most</word>
          <l

        <dep type="case">
            <governor idx="15">more</governor>
          <dependent idx="14">of</dependent>
          </dep>
        <dep type="nmod:of">
            <governor idx="13">methods</governor>
          <dependent idx="15">more</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="17">learning</governor>
          <dependent idx="16">effectively</dependent>
          </dep>
        <dep type="acl">
            <governor idx="15">more</governor>
          <dependent idx="17">learning</dependent>
          </dep>
        <dep type="case">
            <governor idx="20">amounts</governor>
          <dependent idx="18">from</dependent>
          </dep>
        <dep type="amod">
            <governor idx="20">amounts</governor>
          <dependent idx="19">limited</dependent>
          </dep>
        <dep type="nmod:from">
            <governor idx="17">learning</governor>
          <dependent idx="20">amounts</dependent>
          </dep>


          <lemma>than</lemma>
          <CharacterOffsetBegin>4898</CharacterOffsetBegin>
          <CharacterOffsetEnd>4902</CharacterOffsetEnd>
          <POS>IN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="10">
            <word>supervised</word>
          <lemma>supervised</lemma>
          <CharacterOffsetBegin>4903</CharacterOffsetBegin>
          <CharacterOffsetEnd>4913</CharacterOffsetEnd>
          <POS>JJ</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="11">
            <word>learning</word>
          <lemma>learning</lemma>
          <CharacterOffsetBegin>4914</CharacterOffsetBegin>
          <CharacterOffsetEnd>4922</CharacterOffsetEnd>
          <POS>NN</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="12">
            <word>,</word>
          <lemma>,</lemma>
          <CharacterOffsetBegin>4922</CharacterOffsetBeg

          <dependent idx="37">results</dependent>
          </dep>
        <dep type="punct">
            <governor idx="4">is</governor>
          <dependent idx="38">.</dependent>
          </dep>
        </dependencies>
      <dependencies type="collapsed-ccprocessed-dependencies">
          <dep type="root">
            <governor idx="0">ROOT</governor>
          <dependent idx="4">is</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="4">is</governor>
          <dependent idx="1">However</dependent>
          </dep>
        <dep type="punct">
            <governor idx="4">is</governor>
          <dependent idx="2">,</dependent>
          </dep>
        <dep type="expl">
            <governor idx="4">is</governor>
          <dependent idx="3">there</dependent>
          </dep>
        <dep type="det">
            <governor idx="7">amount</governor>
          <dependent idx="5">an</dependent>
          </dep>
        <dep type="amod">
            <gov

          <dependent idx="6">is</dependent>
          </dep>
        <dep type="case">
            <governor idx="9">that</governor>
          <dependent idx="8">from</dependent>
          </dep>
        <dep type="nmod:from">
            <governor idx="7">different</governor>
          <dependent idx="9">that</dependent>
          </dep>
        <dep type="case">
            <governor idx="13">attempts</governor>
          <dependent idx="10">of</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="13">attempts</governor>
          <dependent idx="11">most</dependent>
          </dep>
        <dep type="amod">
            <governor idx="13">attempts</governor>
          <dependent idx="12">prior</dependent>
          </dep>
        <dep type="nmod:of">
            <governor idx="9">that</governor>
          <dependent idx="13">attempts</dependent>
          </dep>
        <dep type="case">
            <governor idx="16">processing</governor>
          <de

          <dependent idx="9">learning</dependent>
          </dep>
        <dep type="dobj">
            <governor idx="7">using</governor>
          <dependent idx="10">algorithms</dependent>
          </dep>
        <dep type="punct">
            <governor idx="10">algorithms</governor>
          <dependent idx="11">-</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="10">algorithms</governor>
          <dependent idx="12">often</dependent>
          </dep>
        <dep type="punct">
            <governor idx="15">not</governor>
          <dependent idx="13">,</dependent>
          </dep>
        <dep type="mark">
            <governor idx="15">not</governor>
          <dependent idx="14">although</dependent>
          </dep>
        <dep type="parataxis">
            <governor idx="7">using</governor>
          <dependent idx="15">not</dependent>
          </dep>
        <dep type="dep">
            <governor idx="15">not</governor>
          <depend

          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="12">
            <word>NLP</word>
          <lemma>nlp</lemma>
          <CharacterOffsetBegin>6025</CharacterOffsetBegin>
          <CharacterOffsetEnd>6028</CharacterOffsetEnd>
          <POS>NN</POS>
          <NER>ORGANIZATION</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="13">
            <word>tasks</word>
          <lemma>task</lemma>
          <CharacterOffsetBegin>6029</CharacterOffsetBegin>
          <CharacterOffsetEnd>6034</CharacterOffsetEnd>
          <POS>NNS</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        <token id="14">
            <word>.</word>
          <lemma>.</lemma>
          <CharacterOffsetBegin>6034</CharacterOffsetBegin>
          <CharacterOffsetEnd>6035</CharacterOffsetEnd>
          <POS>.</POS>
          <NER>O</NER>
          <Speaker>PER0</Speaker>
          </token>
        </tokens>
  

            <governor idx="5">algorithms</governor>
          <dependent idx="4">earliest-used</dependent>
          </dep>
        <dep type="nmod:of">
            <governor idx="1">Some</governor>
          <dependent idx="5">algorithms</dependent>
          </dep>
        <dep type="punct">
            <governor idx="1">Some</governor>
          <dependent idx="6">,</dependent>
          </dep>
        <dep type="case">
            <governor idx="10">trees</governor>
          <dependent idx="7">such</dependent>
          </dep>
        <dep type="mwe">
            <governor idx="7">such</governor>
          <dependent idx="8">as</dependent>
          </dep>
        <dep type="compound">
            <governor idx="10">trees</governor>
          <dependent idx="9">decision</dependent>
          </dep>
        <dep type="nmod:such_as">
            <governor idx="1">Some</governor>
          <dependent idx="10">trees</dependent>
          </dep>
        <dep type="punct">
            <

          <dependent idx="39">system</dependent>
          </dep>
        <dep type="punct">
            <governor idx="3">have</governor>
          <dependent idx="40">.</dependent>
          </dep>
        </dependencies>
      <dependencies type="collapsed-dependencies">
          <dep type="root">
            <governor idx="0">ROOT</governor>
          <dependent idx="3">have</dependent>
          </dep>
        <dep type="amod">
            <governor idx="2">models</governor>
          <dependent idx="1">Such</dependent>
          </dep>
        <dep type="nsubj">
            <governor idx="3">have</governor>
          <dependent idx="2">models</dependent>
          </dep>
        <dep type="det">
            <governor idx="5">advantage</governor>
          <dependent idx="4">the</dependent>
          </dep>
        <dep type="dobj">
            <governor idx="3">have</governor>
          <dependent idx="5">advantage</dependent>
          </dep>
        <dep type="mark">
         

        <dep type="det">
            <governor idx="15">procedures</governor>
          <dependent idx="13">The</dependent>
          </dep>
        <dep type="amod">
            <governor idx="15">procedures</governor>
          <dependent idx="14">learning</dependent>
          </dep>
        <dep type="nsubj">
            <governor idx="21">focus</governor>
          <dependent idx="15">procedures</dependent>
          </dep>
        <dep type="acl">
            <governor idx="15">procedures</governor>
          <dependent idx="16">used</dependent>
          </dep>
        <dep type="case">
            <governor idx="19">learning</governor>
          <dependent idx="17">during</dependent>
          </dep>
        <dep type="compound">
            <governor idx="19">learning</governor>
          <dependent idx="18">machine</dependent>
          </dep>
        <dep type="nmod:during">
            <governor idx="16">used</governor>
          <dependent idx="19">learning</dependent>
   

          <dependent idx="8">statistical</dependent>
          </dep>
        <dep type="compound">
            <governor idx="10">algorithms</governor>
          <dependent idx="9">inference</dependent>
          </dep>
        <dep type="nmod:of">
            <governor idx="6">use</governor>
          <dependent idx="10">algorithms</dependent>
          </dep>
        <dep type="mark">
            <governor idx="12">produce</governor>
          <dependent idx="11">to</dependent>
          </dep>
        <dep type="advcl">
            <governor idx="5">make</governor>
          <dependent idx="12">produce</dependent>
          </dep>
        <dep type="dobj">
            <governor idx="12">produce</governor>
          <dependent idx="13">models</dependent>
          </dep>
        <dep type="nsubj">
            <governor idx="16">robust</governor>
          <dependent idx="14">that</dependent>
          </dep>
        <dep extra="true" type="nsubj">
            <governor copy="1" idx=

        <dep type="advmod">
            <governor idx="11">or</governor>
          <dependent idx="12">more</dependent>
          </dep>
        <dep type="advmod">
            <governor idx="3">handling</governor>
          <dependent idx="13">generally</dependent>
          </dep>
        <dep type="punct">
            <governor idx="3">handling</governor>
          <dependent idx="14">,</dependent>
          </dep>
        <dep type="parataxis">
            <governor idx="3">handling</governor>
          <dependent idx="15">creating</dependent>
          </dep>
        <dep type="dobj">
            <governor idx="15">creating</governor>
          <dependent idx="16">systems</dependent>
          </dep>
        <dep extra="true" type="nsubj">
            <governor idx="21">make</governor>
          <dependent idx="16">systems</dependent>
          </dep>
        <dep type="case">
            <governor idx="19">rules</governor>
          <dependent idx="17">of</dependent>
          </

          <dependent idx="1">However</dependent>
          </dep>
        <dep type="punct">
            <governor idx="11">made</governor>
          <dependent idx="2">,</dependent>
          </dep>
        <dep type="nsubjpass">
            <governor idx="11">made</governor>
          <dependent idx="3">systems</dependent>
          </dep>
        <dep type="case">
            <governor idx="7">rules</governor>
          <dependent idx="4">based</dependent>
          </dep>
        <dep type="mwe">
            <governor idx="4">based</governor>
          <dependent idx="5">on</dependent>
          </dep>
        <dep type="amod">
            <governor idx="7">rules</governor>
          <dependent idx="6">hand-written</dependent>
          </dep>
        <dep type="nmod:based_on">
            <governor idx="3">systems</governor>
          <dependent idx="7">rules</dependent>
          </dep>
        <dep type="aux">
            <governor idx="11">made</governor>
          <dependent i

          </dep>
        <dep type="nmod">
            <governor idx="12">requires</governor>
          <dependent idx="26">increases</dependent>
          </dep>
        <dep type="case">
            <governor idx="29">complexity</governor>
          <dependent idx="27">in</dependent>
          </dep>
        <dep type="det">
            <governor idx="29">complexity</governor>
          <dependent idx="28">the</dependent>
          </dep>
        <dep type="nmod">
            <governor idx="26">increases</governor>
          <dependent idx="29">complexity</dependent>
          </dep>
        <dep type="case">
            <governor idx="33">process</governor>
          <dependent idx="30">of</dependent>
          </dep>
        <dep type="det">
            <governor idx="33">process</governor>
          <dependent idx="31">the</dependent>
          </dep>
        <dep type="compound">
            <governor idx="33">process</governor>
          <dependent idx="32">annotation</dependent

          <dependent idx="20">conference</dependent>
          </dep>
        <dep type="dep">
            <governor idx="25">SIGNLL</governor>
          <dependent idx="21">CoNLL</dependent>
          </dep>
        <dep type="cc">
            <governor idx="21">CoNLL</governor>
          <dependent idx="22">and</dependent>
          </dep>
        <dep type="compound">
            <governor idx="24">body</governor>
          <dependent idx="23">peak</dependent>
          </dep>
        <dep type="conj:and">
            <governor idx="21">CoNLL</governor>
          <dependent idx="24">body</dependent>
          </dep>
        <dep extra="true" type="dep">
            <governor idx="25">SIGNLL</governor>
          <dependent idx="24">body</dependent>
          </dep>
        <dep extra="true" type="dep">
            <governor idx="7">learning</governor>
          <dependent idx="25">SIGNLL</dependent>
          </dep>
        <dep type="conj:and">
            <governor idx="10">known</

        <head>31</head>
        <text>rules</text>
        </mention>
      </coreference>
    <coreference>
        <mention representative="true">
          <sentence>21</sentence>
        <start>24</start>
        <end>26</end>
        <head>25</head>
        <text>statistical models</text>
        </mention>
      <mention>
          <sentence>21</sentence>
        <start>24</start>
        <end>46</end>
        <head>25</head>
        <text>statistical models , which make soft , probabilistic decisions based on attaching real-valued weights to the features making up the input data</text>
        </mention>
      <mention>
          <sentence>40</sentence>
        <start>9</start>
        <end>27</end>
        <head>10</head>
        <text>statistical models , which make soft , probabilistic decisions based on attaching real-valued weights to each input feature</text>
        </mention>
      <mention>
          <sentence>40</sentence>
        <start>9</start>
        <end>11</end>

In [35]:
for token in root.iter('token'):
    word = token[0].text
    lemma = token[1].text
    pos = token[4].text
    
    print('{}\t{}\t{}'.format(word, lemma, pos))

Natural	natural	JJ
language	language	NN
processing	processing	NN
From	from	IN
Wikipedia	Wikipedia	NNP
,	,	,
the	the	DT
free	free	JJ
encyclopedia	encyclopedia	NN
Natural	natural	JJ
language	language	NN
processing	processing	NN
-LRB-	-lrb-	-LRB-
NLP	nlp	NN
-RRB-	-rrb-	-RRB-
is	be	VBZ
a	a	DT
field	field	NN
of	of	IN
computer	computer	NN
science	science	NN
,	,	,
artificial	artificial	JJ
intelligence	intelligence	NN
,	,	,
and	and	CC
linguistics	linguistics	NNS
concerned	concern	VBN
with	with	IN
the	the	DT
interactions	interaction	NNS
between	between	IN
computers	computer	NNS
and	and	CC
human	human	JJ
-LRB-	-lrb-	-LRB-
natural	natural	JJ
-RRB-	-rrb-	-RRB-
languages	language	NNS
.	.	.
As	as	IN
such	such	JJ
,	,	,
NLP	nlp	NN
is	be	VBZ
related	relate	VBN
to	to	TO
the	the	DT
area	area	NN
of	of	IN
humani-computer	humani-computer	JJ
interaction	interaction	NN
.	.	.
Many	many	JJ
challenges	challenge	NNS
in	in	IN
NLP	nlp	NN
involve	involve	VBP
natural	natural	JJ
language	language	NN
understanding	unde

## 55. 固有表現抽出

In [36]:
for token in root.iter('token'):
    word = token[0].text
    ner = token[5].text
    
    if ner == 'PERSON':
        print(word)

Alan
Turing
Joseph
Weizenbaum
MARGIE
Schank
Wilensky
Meehan
Lehnert
Carbonell
Lehnert
Racter
Jabberwacky
Moore


## 56. 共参照解析

### 構造体のまま置換するのは大変なので、文字列の置換とする。

In [40]:
# coreferenceの取得

corefs = []
for coref in root.iter('coreference'):
    corefs.append(coref)

In [120]:
# sentenceの取得
sents = {}

for sent in root.iter('sentence'):
    
    try:
        sent_id = sent.attrib['id']
    except KeyError:
        continue
    
    tokens = []
    for token in sent.iter('token'):
          tokens.append(token[0].text)
    
    sents[sent_id] = tokens
        

In [129]:
for coref in corefs:
    for mention in coref.iter('mention'):
        rep = ''
        
        # 代表参照表現を取得
        try: 
            if mention.attrib['representative'] == 'true':
                rep_text = mention[4].text
        
        # 参照表現を代表参照表現で置き換える
        except KeyError:
            sent_id = mention[0].text
            start = mention[1].text
            end = mention[2].text
            text = mention[4].text
            
            #　記法： 参照表現 （ 代表参照表現 ) 
            new_text = '{} ( {} )'.format(text, rep_text) 
            new_text = new_text.split(' ')
            
            sent = sents[sent_id]
            new_sent = sent[:int(start)-1] + new_text + sent[int(end):]
            
            print(new_sent)
                                    
                
                

['Natural', 'language', 'processing', 'From', 'Wikipedia', ',', 'the', 'free', 'encyclopedia', 'Natural', 'language', 'processing', '-LRB-', 'NLP', '-RRB-', 'is', 'a', 'field', 'of', 'computer', 'science', '(', 'the', 'free', 'encyclopedia', 'Natural', 'language', 'processing', '-LRB-', 'NLP', '-RRB-', ')', 'artificial', 'intelligence', ',', 'and', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '-LRB-', 'natural', '-RRB-', 'languages', '.']
['Starting', 'in', 'the', 'late', '1980s', ',', 'however', ',', 'there', 'was', 'a', 'revolution', 'in', 'NLP', 'with', 'the', 'introduction', 'of', 'machine', 'learning', 'algorithms', 'for', 'language', 'processing', '(', 'the', 'free', 'encyclopedia', 'Natural', 'language', 'processing', '-LRB-', 'NLP', '-RRB-', ')']
['This', 'was', 'due', 'to', 'both', 'the', 'steady', 'increase', 'in', 'computational', 'power', 'resulting', 'from', 'Moore', "'s", 'Law', 'and', 'the', 'gradual', 'lessening', 'o

## 57. 係り受け解析

## 58. タプルの抽出

## 59. S式の解析

# 第7章: データベース

## 60. KVSの構築

In [146]:
!head  data/artist.json

{"name": "WIK▲N", "tags": [{"count": 1, "value": "sillyname"}], "sort_name": "WIK▲N", "ended": true, "gid": "8972b1c1-6482-4750-b51f-596d2edea8b1", "id": 805192}
{"name": "Gustav Ruppke", "sort_name": "Gustav Ruppke", "ended": true, "gid": "b4f76788-7e6f-41b7-ac7b-dfb67f66282e", "type": "Person", "id": 578352}
{"name": "Pete Moutso", "sort_name": "Moutso, Pete", "ended": true, "gid": "49add228-eac5-4de8-836c-d75cde7369c3", "type": "Person", "id": 371203}
{"ended": true, "gid": "c112a400-af49-4665-8bba-741531d962a1", "sort_name": "Zachary", "id": 273232, "name": "Zachary"}
{"name": "The High Level Ranters", "sort_name": "High Level Ranters, The", "ended": true, "gid": "c42eed94-e233-44e2-82b8-3ed6dd9bf318", "type": "Group", "id": 153193}
{"begin": {"year": 1956}, "end": {"year": 1993}, "name": "The Silhouettes", "area": "United States", "sort_name": "Silhouettes, The", "ended": true, "gid": "ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b", "type": "Group", "id": 101060, "aliases": [{"name": 

In [1]:
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)

In [33]:
import json

with open('data/artist.json') as file: 
    cnt  = 0
    for line in file:
        data = json.loads(line)
        if ('name' in data.keys() and 'area' in data.keys()):
            r.set(data['name'], data['area'])

## 61. KVSの検索

In [35]:
r.get('Pete Moutso')

In [36]:
r.get('Al Street')

b'United States'

## 62. KVS内の反復処理

In [49]:
cnt = 0
for key in r.keys():
    if r.get(key) == b'Japan':
        cnt += 1

In [50]:
print(cnt)

22128


## 63. オブジェクトを値に格納したKVS

In [75]:
import json

# アーティスト名に対してのタグを格納
with open('data/artist.json') as file: 
    for line in file:
        data = json.loads(line)
        if 'tags' in data.keys():
            r.hmset(data['name'], {'tags': data['tags']})

In [89]:
r.keys()

[b'Edgars Naglis',
 b'\xe3\x82\xb9\xe3\x83\x97\xe3\x83\xbc\xe3\x83\xb3\xe3\x82\xbf\xe3\x83\x83\xe3\x83\x97',
 b'I Am the Hamster',
 b'Scoundrels',
 b'\xe4\xbc\x8a\xe8\x97\xa4\xe5\xa4\x9a\xe8\xb3\x80\xe4\xb9\x8b',
 b'Luniz',
 b'Teflon Brothers',
 b'Mikko Karmila',
 b'Ain\xc4\x81rs Majors',
 b'Lena Katina',
 b'Steve Brookstein',
 b'Kenny Garrett',
 b'Lamb',
 b'Broken Bells',
 b'Z.O.O.I.',
 b'Tadday',
 b'\xd0\xad\xd0\xbc\xd0\xb8\xd0\xbb\xd1\x8c \xd0\x93\xd1\x80\xd0\xb8\xd0\xb3\xd0\xbe\xd1\x80\xd1\x8c\xd0\xb5\xd0\xb2\xd0\xb8\xd1\x87 \xd0\x93\xd0\xb8\xd0\xbb\xd0\xb5\xd0\xbb\xd1\x8c\xd1\x81',
 b'Clinic',
 b'Tereskova',
 b'Big Drill Car',
 b'The Impalers',
 b'Automatics',
 b'Poetrip',
 b'The Shirelles',
 b'Michael French',
 b'Upset Tummy',
 b'Nicole Kidman',
 b'ALLY & DIAZ',
 b'Julian Soule',
 b'The Cundeez',
 b'Andr\xc3\xa9 Ceccarelli',
 b'Jaa9 & OnklP',
 b'Alex Aris',
 b'Juris Valainis',
 b'Le Fiasko',
 b'ADS',
 b'Herry Monster',
 b'Netsky',
 b'Maximum Boycott',
 b'TLC',
 b'Bombay Talkie',


In [90]:
r.hmget(b'Edgars Naglis', 'tags')

[b"[{'value': 'latvian', 'count': 1}, {'value': 'unknown year', 'count': 1}]"]

## 64. MongoDBの構築

In [92]:
from pymongo import MongoClient
client = MongoClient('localhost', 27017) # if empty, connect default host and port

# getting a database
db = client['nlp100']

# getting a connection
artists = db['artists']

In [111]:
import json

with open('data/artist.json') as file:
    for line in file:
        artist = json.loads(line)
        
        # insert
        artist_id = artists.insert_one(artist).inserted_id


In [112]:
# check db connections
db.collection_names(include_system_collections=False)

['artists']

In [113]:
artists.count()

1924995

In [114]:
import pprint
pprint.pprint(artists.find_one())

{'_id': ObjectId('5a646993a6640c1835232a75'),
 'ended': True,
 'gid': '8972b1c1-6482-4750-b51f-596d2edea8b1',
 'id': 805192,
 'name': 'WIK▲N',
 'sort_name': 'WIK▲N',
 'tags': [{'count': 1, 'value': 'sillyname'}]}


### create index ( name, aliases.name, tags.value, rating.value )

In [116]:
import pymongo

In [117]:
result = db.profiles.create_index([('name', pymongo.ASCENDING)], unique=True)

In [142]:
sorted(list(db.profiles.index_information()))

['_id_', 'aliases.name_1', 'name_1', 'rating.value_1', 'tags.value_1']

In [138]:
result = db.profiles.create_index([('name', pymongo.ASCENDING)], unique=True)

In [139]:
result2 = db.profiles.create_index([('aliases.name', pymongo.ASCENDING)], unique=False)

In [140]:
result3 = db.profiles.create_index([('tags.value', pymongo.ASCENDING)], unique=False)

In [141]:
result4 = db.profiles.create_index([('rating.value', pymongo.ASCENDING)], unique=False)

### drop index 

In [136]:
db.profiles.drop_index("tags.value_1")

## 65. MongoDBの検索

in mongo interactive shell,

### select database
> use nlp100

switched to db nlp100

### count records
> db.artists.count()

1924995

### search for "Queen"
> db.artists.find({"name": "Queen"})

{ "_id" : ObjectId("5a646bbfa6640c18353bb224"), "gid" : "420ca290-76c5-41af-999e-564d7c71f1a7", "ended" : true, "id" : 701492, "name" : "Queen", "tags" : [ { "value" : "kamen rider w", "count" : 1 }, { "value" : "related-akb48", "count" : 1 } ], "sort_name" : "Queen", "aliases" : [ { "name" : "Queen", "sort_name" : "Queen" } ], "area" : "Japan", "type" : "Character", "gender" : "Female" }
{ "_id" : ObjectId("5a646bcda6640c18353c78d0"), "ended" : true, "name" : "Queen", "rating" : { "value" : 92, "count" : 24 }, "sort_name" : "Queen", "tags" : [ { "value" : "hard rock", "count" : 2 }, { "value" : "70s", "count" : 1 }, { "value" : "queen family", "count" : 1 }, { "value" : "90s", "count" : 1 }, { "value" : "80s", "count" : 1 }, { "value" : "glam rock", "count" : 1 }, { "value" : "british", "count" : 4 }, { "value" : "english", "count" : 1 }, { "value" : "uk", "count" : 2 }, { "value" : "pop/rock", "count" : 1 }, { "value" : "pop-rock", "count" : 1 }, { "value" : "britannique", "count" : 1 }, { "value" : "classic pop and rock", "count" : 1 }, { "value" : "queen", "count" : 1 }, { "value" : "united kingdom", "count" : 1 }, { "value" : "langham 1 studio bbc", "count" : 1 }, { "value" : "kind of magic", "count" : 1 }, { "value" : "band", "count" : 1 }, { "value" : "rock", "count" : 6 }, { "value" : "platinum", "count" : 1 } ], "type" : "Group", "begin" : { "year" : 1970, "month" : 6, "date" : 27 }, "area" : "United Kingdom", "gid" : "0383dadf-2a4e-4d10-a46a-e9e041da8eb3", "id" : 192, "aliases" : [ { "name" : "女王", "sort_name" : "女王" } ] }
{ "_id" : ObjectId("5a646beda6640c18353e3328"), "gid" : "5eecaf18-02ec-47af-a4f2-7831db373419", "ended" : true, "id" : 992994, "name" : "Queen", "sort_name" : "Queen" }

In [5]:
from pymongo import MongoClient
client = MongoClient('localhost', 27017) # if empty, connect default host and port

# getting a database
db = client['nlp100']

# getting a connection
artists = db['artists']

In [6]:
artists.count()

1924995

In [8]:
cursor = db.artists.find({"name": "Queen"})

for document in cursor:
    print(document)

{'aliases': [{'name': 'Queen', 'sort_name': 'Queen'}], 'tags': [{'count': 1, 'value': 'kamen rider w'}, {'count': 1, 'value': 'related-akb48'}], 'gender': 'Female', 'name': 'Queen', 'id': 701492, 'area': 'Japan', 'sort_name': 'Queen', '_id': ObjectId('5a646bbfa6640c18353bb224'), 'ended': True, 'type': 'Character', 'gid': '420ca290-76c5-41af-999e-564d7c71f1a7'}
{'rating': {'count': 24, 'value': 92}, 'sort_name': 'Queen', 'ended': True, 'gid': '0383dadf-2a4e-4d10-a46a-e9e041da8eb3', 'tags': [{'count': 2, 'value': 'hard rock'}, {'count': 1, 'value': '70s'}, {'count': 1, 'value': 'queen family'}, {'count': 1, 'value': '90s'}, {'count': 1, 'value': '80s'}, {'count': 1, 'value': 'glam rock'}, {'count': 4, 'value': 'british'}, {'count': 1, 'value': 'english'}, {'count': 2, 'value': 'uk'}, {'count': 1, 'value': 'pop/rock'}, {'count': 1, 'value': 'pop-rock'}, {'count': 1, 'value': 'britannique'}, {'count': 1, 'value': 'classic pop and rock'}, {'count': 1, 'value': 'queen'}, {'count': 1, 'value'

In [9]:
cursor.count()

3

## 66. 検索件数の取得

in mongo shell

### search and count
> db.artists.find({"area" : "Japan"}).count()

32975

## 67. 複数のドキュメントの取得 

In [25]:
cursol = artists.find({"aliases.name" : "Queen"})

for doc in cursol:
    print(doc)

{'aliases': [{'name': 'Queen', 'sort_name': 'Queen'}], 'tags': [{'count': 1, 'value': 'kamen rider w'}, {'count': 1, 'value': 'related-akb48'}], 'gender': 'Female', 'name': 'Queen', 'id': 701492, 'area': 'Japan', 'sort_name': 'Queen', '_id': ObjectId('5a646bbfa6640c18353bb224'), 'ended': True, 'type': 'Character', 'gid': '420ca290-76c5-41af-999e-564d7c71f1a7'}


## 68. ソート

In [45]:
import pymongo
cursol = artists.find({"tags.value" : "dance"}).sort([("rating.count", pymongo.DESCENDING)])

for doc in cursol[:10]:
    print('{}\t{}'.format(doc['name'],  doc.get("rating")))

Madonna	{'count': 26, 'value': 88}
Björk	{'count': 23, 'value': 84}
The Prodigy	{'count': 23, 'value': 90}
Rihanna	{'count': 15, 'value': 68}
Britney Spears	{'count': 13, 'value': 83}
Britney Spears	{'count': 13, 'value': 83}
Britney Spears	{'count': 13, 'value': 83}
Maroon 5	{'count': 11, 'value': 60}
Adam Lambert	{'count': 7, 'value': 100}
Adam Lambert	{'count': 7, 'value': 100}


## 69. Webアプリケーションの作成

# 第8章: 機械学習