In [1]:
!pip install spacy nltk pandas beautifulsoup4
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
import requests
from bs4 import BeautifulSoup
url = "https://en.wikipedia.org/wiki/Artificial_intelligence"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
text = ' '.join([p.text for p in soup.find_all('p')])
with open('wiki_article.txt', 'w') as f:
    f.write(text)

In [4]:
import pandas as pd
# For Kaggle dataset
df = pd.read_csv('wiki.csv')[:10]  # Use 10 articles for speed
# OR for scraped article
with open('wiki_article.txt') as f:
    text = f.read()

FileNotFoundError: [Errno 2] No such file or directory: 'wiki.csv'

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
pos_tags = [(token.text, token.pos_) for token in doc]
print(pos_tags[:10])

[('\n ', 'SPACE'), ('Artificial', 'PROPN'), ('intelligence', 'NOUN'), ('(', 'PUNCT'), ('AI', 'PROPN'), (')', 'PUNCT'), ('is', 'AUX'), ('the', 'DET'), ('capability', 'NOUN'), ('of', 'ADP')]


In [6]:
doc = nlp(text)
entities = [(ent.text, ent.label_) for ent in doc.ents]
print(entities[:10])  # First 10 entities

[('AI', 'GPE'), ('Google Search', 'ORG'), ('YouTube', 'ORG'), ('Amazon', 'ORG'), ('Netflix', 'GPE'), ('Google', 'ORG'), ('Alexa', 'ORG'), ('Waymo', 'WORK_OF_ART'), ('AI', 'GPE'), ('AI', 'GPE')]


In [7]:
from spacy import displacy
doc = nlp(text[:200])  # Short sentence for visualization
displacy.render(doc, style="dep", jupyter=True)  # Use jupyter=False if not in Colab

In [8]:
import spacy
import pandas as pd
import re
from spacy import displacy

# Load SpaCy
nlp = spacy.load("en_core_web_sm")

# Load data
try:
    df = pd.read_csv('wiki.csv')[:10]
    texts = df['text']
except:
    with open('wiki_article.txt') as f:
        texts = [f.read()[:1000]]  # Limit for speed

# Preprocessing and extraction
data = []
for text in texts:
    cleaned = re.sub(r'http\S+|[^\x00-\x7F]+|[.,!?]', '', text.lower())
    doc = nlp(cleaned[:1000])  # Limit for speed
    pos_tags = [(token.text, token.pos_) for token in doc]
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    data.append([text[:100], pos_tags[:10], entities[:5]])  # Truncate for CSV

# Save to CSV
pd.DataFrame(data, columns=['text', 'pos_tags', 'entities']).to_csv('wiki_entities.csv')

# Visualize dependency tree
doc = nlp(texts[0][:200])  # First sentence
displacy.render(doc, style="dep", options={"compact": True}, page=True, minify=True)
with open('dep_tree.html', 'w') as f:
    f.write(displacy.render(doc, style="dep", page=True))

TypeError: write() argument must be str, not None