### Christmas Story Analysis with spaCy

## Imports

In [71]:
import pandas as pd
import requests

In [76]:
# Install and import spacy and plotly.
!pip install spaCy
!pip install plotly
!pip install nbformat==5.1.2




In [77]:
# Import spacy
import spacy

# Install English language model
!spacy download en_core_web_sm

# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

2023-12-13 15:51:17.568062: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-13 15:51:17.568151: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-13 15:51:17.568195: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now l

## Uploads

In [73]:
response1 = requests.get("https://www.gutenberg.org/cache/epub/6670/pg6670.txt")
response2 = requests.get("https://www.gutenberg.org/cache/epub/20453/pg20453.txt")
response3 = requests.get("https://www.gutenberg.org/cache/epub/14508/pg14508.txt")


In [75]:
text1 = response1.text
text2 = response2.text
text3 = response3.text

##Preprossesing

In [78]:
# Create empty lists for file names and contents
texts = [text1, text2, text3]
file_names = ["Christmas Eve", "The Christmas Child", "The Christmas Dinner"]
print(os.getcwd())

/content


In [79]:
# Create dictionary object associating each file name with its text
d = {'Filename':file_names,'Text':texts}

In [80]:
# Turn dictionary into a dataframe
xmas_story_df = pd.DataFrame(d)

In [81]:
xmas_story_df.head()

Unnamed: 0,Filename,Text
0,Christmas Eve,﻿The Project Gutenberg eBook of Christmas Eve\...
1,The Christmas Child,﻿The Project Gutenberg eBook of The Christmas ...
2,The Christmas Dinner,﻿The Project Gutenberg eBook of The Christmas ...


In [82]:
# Remove extra spaces from papers
xmas_story_df['Text'] = xmas_story_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
xmas_story_df.head()

Unnamed: 0,Filename,Text
0,Christmas Eve,﻿The Project Gutenberg eBook of Christmas Eve ...
1,The Christmas Child,﻿The Project Gutenberg eBook of The Christmas ...
2,The Christmas Dinner,﻿The Project Gutenberg eBook of The Christmas ...


In [83]:
# Remove .txt from title of each paper
xmas_story_df['Filename'] = xmas_story_df['Filename'].str.replace('.txt', '', regex=True)

# Rename column from paper ID to Title
xmas_story_df.rename(columns={"PAPER ID": "Filename"}, inplace=True)

In [84]:
# Print DataFrame
xmas_story_df.head()

Unnamed: 0,Filename,Text
0,Christmas Eve,﻿The Project Gutenberg eBook of Christmas Eve ...
1,The Christmas Child,﻿The Project Gutenberg eBook of The Christmas ...
2,The Christmas Dinner,﻿The Project Gutenberg eBook of The Christmas ...


##Creating Doc objects

In [85]:
# Load nlp pipeline
nlp = spacy.load('en_core_web_sm')

# Check what functions it performs
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [86]:
# Define a function that runs the nlp pipeline on any given input text
def process_text(text):
    return nlp(text)

In [87]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on each student essay
xmas_story_df['Doc'] = xmas_story_df['Text'].apply(process_text)

##Text reduction

In [88]:
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    return [(token.text) for token in doc]

In [89]:
# Run the token retrieval function on the doc objects in the dataframe
xmas_story_df['Tokens'] = xmas_story_df['Doc'].apply(get_token)
xmas_story_df.head()

Unnamed: 0,Filename,Text,Doc,Tokens
0,Christmas Eve,﻿The Project Gutenberg eBook of Christmas Eve ...,"(﻿The, Project, Gutenberg, eBook, of, Christma...","[﻿The, Project, Gutenberg, eBook, of, Christma..."
1,The Christmas Child,﻿The Project Gutenberg eBook of The Christmas ...,"(﻿The, Project, Gutenberg, eBook, of, The, Chr...","[﻿The, Project, Gutenberg, eBook, of, The, Chr..."
2,The Christmas Dinner,﻿The Project Gutenberg eBook of The Christmas ...,"(﻿The, Project, Gutenberg, eBook, of, The, Chr...","[﻿The, Project, Gutenberg, eBook, of, The, Chr..."


In [90]:
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    return [(token.lemma_) for token in doc]

# Run the lemma retrieval function on the doc objects in the dataframe
xmas_story_df['Lemmas'] = xmas_story_df['Doc'].apply(get_lemma)

##Text Annotation

In [91]:
# Define a function to retrieve lemmas from a doc object
def get_pos(doc):
    #Return the coarse- and fine-grained part of speech text for each token in the doc
    return [(token.pos_, token.tag_) for token in doc]

# Define a function to retrieve parts of speech from a doc object
xmas_story_df['POS'] = xmas_story_df['Doc'].apply(get_pos)

In [92]:
# Create a list of part of speech tags
list(xmas_story_df['POS'])

[[('NOUN', 'NN'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('ADP', 'IN'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('AUX', 'VBZ'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('PRON', 'NN'),
  ('ADV', 'RB'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('PROPN', 'NNP'),
  ('PROPN', 'NNP'),
  ('CCONJ', 'CC'),
  ('ADJ', 'JJS'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NNS'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('CCONJ', 'CC'),
  ('ADP', 'IN'),
  ('ADV', 'RB'),
  ('PRON', 'DT'),
  ('NOUN', 'NNS'),
  ('ADV', 'RB'),
  ('PUNCT', '.'),
  ('PRON', 'PRP'),
  ('AUX', 'MD'),
  ('VERB', 'VB'),
  ('PRON', 'PRP'),
  ('PUNCT', ','),
  ('VERB', 'VB'),
  ('PRON', 'PRP'),
  ('ADV', 'RB'),
  ('CCONJ', 'CC'),
  ('VERB', 'VB'),
  ('VERB', 'VB'),
  ('VERB', 'VB'),
  ('PRON', 'PRP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NNS'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('PROP

In [93]:
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    return [token.text for token in doc if token.pos_ == 'PROPN']

# Apply function to Doc column and store resulting proper nouns in new column
xmas_story_df['Proper_Nouns'] = xmas_story_df['Doc'].apply(extract_proper_nouns)

In [94]:
list(xmas_story_df.loc[[0,2], 'Proper_Nouns'])

[['Project',
  'Gutenberg',
  'eBook',
  'Christmas',
  'Eve',
  'United',
  'States',
  'Project',
  'Gutenberg',
  'License',
  'United',
  'States',
  'eBook',
  'Christmas',
  'Eve',
  'Robert',
  'Browning',
  'Release',
  'October',
  'March',
  'English',
  'Juliet',
  'Sutherland',
  'Charles',
  'Franks',
  'Online',
  'Distributed',
  'Proofreading',
  'Team',
  'Al',
  'Haines',
  'PROJECT',
  'GUTENBERG',
  'CHRISTMAS',
  'EVE',
  'Juliet',
  'Sutherland',
  'Charles',
  'Franks',
  'Online',
  'Distributed',
  'Proofreading',
  'Team',
  'Al',
  'Haines',
  'CHRISTMAS',
  'EVE',
  'BROWNING',
  'Heaven',
  'inside--',
  'Lending',
  'Mount',
  'Zion',
  'Lot',
  'Gomorrah',
  'Hinge',
  'Lay',
  'Penitent',
  'Thief',
  'Saint',
  'John',
  'Candlestick',
  'Rev.',
  'Grand',
  'Inquisitor',
  'Seven',
  'Churches',
  'God',
  'ploughshares,--',
  'Founder',
  'Mine',
  'Supposing',
  'Testament',
  'wet',
  'apron',
  'wound',
  '--To',
  'wit',
  'Zion',
  'Chapel',
  'M

In [95]:
# Get all NE labels and assign to variable
labels = nlp.get_pipe("ner").labels

# Print each label and its description
for label in labels:
    print(label + ' : ' + spacy.explain(label))

CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


In [96]:
# Define function to extract named entities from doc objects
def extract_named_entities(doc):
    return [ent.label_ for ent in doc.ents]

# Apply function to Doc column and store resulting named entities in new column
xmas_story_df['Named_Entities'] = xmas_story_df['Doc'].apply(extract_named_entities)
xmas_story_df['Named_Entities']

0    [PERSON, GPE, ORG, GPE, PRODUCT, PERSON, DATE,...
1    [PERSON, GPE, ORG, GPE, PRODUCT, ORG, DATE, DA...
2    [PERSON, ORG, GPE, ORG, GPE, PRODUCT, DATE, DA...
Name: Named_Entities, dtype: object

In [97]:
# Define function to extract text tagged with named entities from doc objects
def extract_named_entities(doc):
    return [ent for ent in doc.ents]

# Apply function to Doc column and store resulting text in new column
xmas_story_df['NE_Words'] = xmas_story_df['Doc'].apply(extract_named_entities)
xmas_story_df['NE_Words']

0    [(Project, Gutenberg, eBook), (the, United, St...
1    [(Project, Gutenberg, eBook), (the, United, St...
2    [(Project, Gutenberg, eBook), (The, Christmas,...
Name: NE_Words, dtype: object

In [98]:
# Extract the first Doc object
doc = xmas_story_df['Doc'][1]

# Visualize named entity tagging in a single paper
displacy.render(doc, style='ent', jupyter=True)

##Download datasets

In [100]:
# Save DataFrame as csv (in Google Drive)
# Use this step only to save  csv to your computer's working directory
xmas_story_df.to_csv('xmas_story_spaCy_tags.csv')