# *Document Processing and Visualisation using spaCy*

In [70]:
import os
import numpy as np
import pandas as pd

import spacy
from spacy import displacy

import torch
import torch.nn as nn
import torch.nn.functional as tfunc

import matplotlib.pyplot as plt

<hr style="height:2px;border:none;color:#333;background-color:#333;" />

## Taking a document and dividing into sentences

In [71]:
text_path='/kaggle/input/bbc-full-text-document-classification/bbc/tech/001.txt'
with open(text_path,'r',encoding='utf-8') as f:
    text=f.read()

nlp=spacy.load('en_core_web_sm')
doc=nlp(text)

for sent in doc.sents:
    print(sent.text)

Ink helps drive democracy in Asia

The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and ultraviolet readers in the country's elections as part of a drive to prevent multiple voting.


This new technology is causing both worries and guarded optimism among different sectors of the population.
In an effort to live up to its reputation in the 1990s as "an island of democracy", the Kyrgyz President, Askar Akaev, pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections.
The US government agreed to fund all expenses associated with this decision.


The Kyrgyz Republic is seen by many experts as backsliding from the high point it reached in the mid-1990s with a hastily pushed through referendum in 2003, reducing the legislative branch to one chamber with 75 deputies.
The use of ink is only one part of a general effort to show commitment towards more open elections - the German Embassy, the 

In [72]:
doc[1].morph  #Morphology

Number=Sing|Person=3|Tense=Pres|VerbForm=Fin

# *Visualising Dependencies*

In [73]:
displacy.render(doc,style='dep',jupyter=True)

# *Named Entitiy Visualisation*

In [74]:
displacy.render(doc,style='ent',jupyter=True)

<hr style="height:8px;border:none;color:#333;background-color:#333;" />

In [75]:
for token in doc:
    print(token.text,token.lemma_,token.pos_,sep=" | ")

Ink | ink | NOUN
helps | helps | AUX
drive | drive | VERB
democracy | democracy | NOUN
in | in | ADP
Asia | Asia | PROPN


 | 

 | SPACE
The | the | DET
Kyrgyz | Kyrgyz | PROPN
Republic | Republic | PROPN
, | , | PUNCT
a | a | DET
small | small | ADJ
, | , | PUNCT
mountainous | mountainous | ADJ
state | state | NOUN
of | of | ADP
the | the | DET
former | former | ADJ
Soviet | soviet | ADJ
republic | republic | NOUN
, | , | PUNCT
is | be | AUX
using | use | VERB
invisible | invisible | ADJ
ink | ink | NOUN
and | and | CCONJ
ultraviolet | ultraviolet | NOUN
readers | reader | NOUN
in | in | ADP
the | the | DET
country | country | NOUN
's | 's | PART
elections | election | NOUN
as | as | ADP
part | part | NOUN
of | of | ADP
a | a | DET
drive | drive | NOUN
to | to | PART
prevent | prevent | VERB
multiple | multiple | ADJ
voting | voting | NOUN
. | . | PUNCT


 | 

 | SPACE
This | this | DET
new | new | ADJ
technology | technology | NOUN
is | be | AUX
causing | cause | VERB
both | both | D

In [76]:
words=[token.text for token in doc]
vectors=[token.vector for token in doc]

from sklearn.decomposition import PCA

pca=PCA(n_components=2)
reduced_vectors=pca.fit_transform(vectors)

"""
plt.figure(figsize=(8,6))
for word,coord in zip(words,reduced_vectors):
    x,y=coord
    plt.scatter(x,y)
    plt.text(x+0.01,y+0.01,word,fontsize=12)

plt.title('Word Vector Visualisation')
plt.show()
"""

"\nplt.figure(figsize=(8,6))\nfor word,coord in zip(words,reduced_vectors):\n    x,y=coord\n    plt.scatter(x,y)\n    plt.text(x+0.01,y+0.01,word,fontsize=12)\n\nplt.title('Word Vector Visualisation')\nplt.show()\n"

<hr style="height:8px;border:none;color:#333;background-color:#333;" />

# *Word Vector Space Graph Visualisation using(t-distributed stochastic neighbour embedding)*

In [77]:
from sklearn.manifold import TSNE

vectors = np.array(vectors) 
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

In [78]:
import plotly.express as px
df=pd.DataFrame(reduced_vectors,columns=['x','y'])
df['word']=words
fig=px.scatter(df,x='x',y='y',hover_data=['word'],title='Word Vectors')
fig.update_traces(marker=dict(size=10,opacity=0.8))
fig.update_layout(showlegend=False)
fig.show()

<hr style="height:8px;border:none;color:#333;background-color:#333;" />

<div style="border: 2px solid #4CAF50; border-radius: 10px; padding: 15px; background-color: #F0FFF0; color: black;">
  <h3> Dependency Parse Components</h3>

  <ul>
    <li><b>token.dep_</b> → dependency relation of that word</li>
    <li><b>token.head.text</b> → the head word this token depends on</li>
    <li><b>token.head.pos_</b> → the part-of-speech of the head word</li>
    <li><b>token.children</b> → the list of tokens that depend on this token</li>
  </ul>
</div>

In [79]:
from IPython.display import HTML

#->In pandas tabular format

rows = []
for token in doc:
    rows.append({
        "Token": token.text,
        "Dep": token.dep_,
        "Head": token.head.text,
        "Head POS": token.head.pos_,
        "Children": ', '.join(child.text for child in token.children)
    })

df = pd.DataFrame(rows)
display(HTML(df.to_html(index=False)))

Token,Dep,Head,Head POS,Children
Ink,nsubj,helps,AUX,
helps,aux,using,VERB,"Ink, drive, ,"
drive,xcomp,helps,AUX,"democracy, in"
democracy,dobj,drive,VERB,
in,prep,drive,VERB,Republic
Asia,compound,Republic,PROPN,\n\n
\n\n,dep,Asia,PROPN,
The,det,Republic,PROPN,
Kyrgyz,compound,Republic,PROPN,
Republic,pobj,in,ADP,"Asia, The, Kyrgyz, ,, state"


<hr style="height:8px;border:none;color:#333;background-color:#333;" />

## *A few tokens need to be retokenized in the corpus example.*

In [80]:

for i in range(0,len(doc)):
    if doc[i].text=="The Kyrgyz Republic":
        print(i)
#->Use entity ruler or phrase matcher        
#->token.i if iterating token over doc
print(doc[7:10])

with doc.retokenize() as retokenizer:
    attrs = {"LEMMA": "Kyrgyz Republic", "POS": "PROPN", "TAG": "NNP"}
    retokenizer.merge(doc[7:10], attrs=attrs)

print(doc[7])

The Kyrgyz Republic
The Kyrgyz Republic
