# Session 3 - Introduction to `spaCy`

In [1]:
# One-time setup: install the dependencies by typing this into a terminal:
#     pip install spacy pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting pandas
  Downloading pandas-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp310-cp310-manylinux_2_5_x8

In [3]:
import spacy

In [None]:
# One-time setup: download the en_core_web_md pipeline (the one this notebook
# loads with spacy.load) by typing this into a terminal:
#     python -m spacy download en_core_web_md

In [4]:
# Load the installed English pipeline; `nlp` is later called on raw text to produce a Doc.
nlp = spacy.load("en_core_web_md")

In [5]:
# Check what kind of object spacy.load returned.
type(nlp)

spacy.lang.en.English

In [6]:
# Sample sentence that is passed to the pipeline.
text = "My name is Ida and I study at Aarhus University."

In [7]:
# Run the loaded pipeline on the sample text; the result is a spaCy Doc.
doc = nlp(text)

In [8]:
# Check the class of the object returned by nlp(text).
type(doc)

spacy.tokens.doc.Doc

In [9]:
# A Doc prints as the text it was built from.
print(doc.text)

My name is Ida and I study at Aarhus University.


In [10]:
# Iterating over a Doc yields its tokens in order; print each token's text.
for tok in doc:
    print(tok.text)

My
name
is
Ida
and
I
study
at
Aarhus
University
.


In [15]:
# Per-token annotations: index, text, part of speech, dependency label,
# and morphological information.
# Overview of the available attributes: https://spacy.io/api/attributes
for tok in doc:
    print(f"{tok.i} {tok.text} {tok.pos_} {tok.dep_} {tok.morph}")

0 My PRON poss Number=Sing|Person=1|Poss=Yes|PronType=Prs
1 name NOUN nsubj Number=Sing
2 is AUX ROOT Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
3 Ida PROPN attr Number=Sing
4 and CCONJ cc ConjType=Cmp
5 I PRON nsubj Case=Nom|Number=Sing|Person=1|PronType=Prs
6 study VERB conj Tense=Pres|VerbForm=Fin
7 at ADP prep 
8 Aarhus PROPN compound Number=Sing
9 University PROPN pobj Number=Sing
10 . PUNCT punct PunctType=Peri


In [16]:
# Named entity recognition (NER): print each entity's text and its label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Ida PERSON
Aarhus University ORG


# Working with `pandas`

In [17]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [21]:
# Collect one [text, part-of-speech, dependency label] row per token in the Doc.
annotations = [[token.text, token.pos_, token.dep_] for token in doc]

In [26]:
# Tabulate the collected rows; the column names follow the order of the
# items inside each annotation row.
column_names = ["text", "pos", "dep"]
df = pd.DataFrame(annotations, columns=column_names)

In [27]:
# Display the annotation table.
df

Unnamed: 0,text,pos,dep
0,My,PRON,poss
1,name,NOUN,nsubj
2,is,AUX,ROOT
3,Ida,PROPN,attr
4,and,CCONJ,cc
5,I,PRON,nsubj
6,study,VERB,conj
7,at,ADP,prep
8,Aarhus,PROPN,compound
9,University,PROPN,pobj


In [33]:
# Select only the "pos" column and count how often each distinct value occurs.
df["pos"].value_counts()

pos
PROPN    3
PRON     2
NOUN     1
AUX      1
CCONJ    1
VERB     1
ADP      1
PUNCT    1
Name: count, dtype: int64

In [37]:
# Save the table to annotations.csv; index=False leaves the row index out of the file.
df.to_csv("annotations.csv", index=False)

In [38]:
# Read the saved annotations back from disk.
# keep_default_na=False stops pandas from turning token strings such as
# "NA", "null", "None" or "nan" into missing values (NaN), so the text
# column comes back exactly as it was written.
input_df = pd.read_csv("annotations.csv", keep_default_na=False)

In [39]:
# Display the table that was read back from annotations.csv.
input_df

Unnamed: 0,text,pos,dep
0,My,PRON,poss
1,name,NOUN,nsubj
2,is,AUX,ROOT
3,Ida,PROPN,attr
4,and,CCONJ,cc
5,I,PRON,nsubj
6,study,VERB,conj
7,at,ADP,prep
8,Aarhus,PROPN,compound
9,University,PROPN,pobj
