<a href="https://colab.research.google.com/github/jgumtau/datascience15/blob/main/Copy_of_NLP_Description_for_Students.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing



This project will give you practical experience using Natural Language Processing techniques. This project is in three parts:
- in part 1) you will use a traditional dataset in a CSV file
- in part 2) you will use the Wikipedia API to directly access content
on Wikipedia.
- in part 3) you will make your notebook interactive


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
%%capture
# Install Textblob
!pip install -U textblob

In [4]:
from textblob import TextBlob

In [5]:
%%capture
# Download Corpora
!python -m textblob.download_corpora

In [6]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

### Part 1)



- The CSV file is available at https://ddc-datascience.s3.amazonaws.com/Projects/Project.5-NLP/Data/NLP.csv
- The file contains a list of famous people and a brief overview.
- The goal of part 1) is provide the capability to
  - Take one person from the list as input and output the 10 other people who's overview are "closest" to the person in a Natural Language Processing sense
  - Also output the sentiment of the overview of the person



In [7]:
url = 'https://ddc-datascience.s3.amazonaws.com/Projects/Project.5-NLP/Data/NLP.csv'
!curl -s $url | wc -l

42786


In [8]:
# look at the first five rows
df = pd.read_csv(url)
df.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [9]:
df.tail()

Unnamed: 0,URI,name,text
42781,<http://dbpedia.org/resource/Motoaki_Takenouchi>,Motoaki Takenouchi,motoaki takenouchi born july 8 1967 saitama pr...
42782,<http://dbpedia.org/resource/Alan_Judge_(footb...,"Alan Judge (footballer, born 1960)",alan graham judge born 14 may 1960 is a retire...
42783,<http://dbpedia.org/resource/Eduardo_Lara>,Eduardo Lara,eduardo lara lozano born 4 september 1959 in c...
42784,<http://dbpedia.org/resource/Tatiana_Faberg%C3...,Tatiana Faberg%C3%A9,tatiana faberg is an author and faberg scholar...
42785,<http://dbpedia.org/resource/Kenneth_Thomas>,Kenneth Thomas,kenneth thomas born february 24 1938 was chief...


In [10]:
df.columns

Index(['URI', 'name', 'text'], dtype='object')

In [11]:
df.shape

(42786, 3)

In [12]:
# pick a person
person = 'Harpdog Brown'

# find index of the person
df[df['name'] == person].index

Index([2], dtype='int64')

In [13]:
# check the sentiment of the overview for harpdog brown
blob = TextBlob(df[df['name'] == person]['text'].values[0])
blob_sent = blob.sentiment

In [14]:
# find which people are closest in sentiment to harpdog brown
# exclude harpdog brown
df[df['name'] != person]['text'].apply(lambda x: TextBlob(x).sentiment)
sentiment_scores = []
for i in df['name']:
  blob = TextBlob(df[df['name'] == person]['text'].values[0])
  sentiment_scores.append(blob.sentiment)

In [15]:
sentiment_scores

[Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.247

In [16]:
import heapq

def find_closest_values(lst, target, num_closest=10):
    """
    Finds the N closest values in a list to a target value,
    considering the polarity of Sentiment objects.

    Args:
        lst: A list of values, which may include Sentiment objects.
        target: The target value to compare against. Should be a Sentiment object.
        num_closest: The number of closest values to return (default: 10).

    Returns:
        A list of the N closest values in the input list.
    """
    closest_values = heapq.nsmallest(
        num_closest, lst, key=lambda x: abs(x.polarity - target.polarity)
    )
    return closest_values

In [17]:
find_closest_values(sentiment_scores, blob_sent)

[Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098),
 Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098)]

In [18]:
closest_sentiments = find_closest_values(sentiment_scores, blob_sent, num_closest=11)
print(closest_sentiments)

[Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098), Sentiment(polarity=0.24754901960784317, subjectivity=0.3892156862745098)]


In [19]:
# add a sentiment column to the dataframe
df['sentiment'] = sentiment_scores

In [20]:
# Get the target person's sentiment
target_sentiment = df[df['name'] == person]['sentiment'].values[0]

# Calculate sentiment differences and store in a list of tuples (difference, name)
sentiment_differences = [
    (abs(target_sentiment.polarity - sentiment.polarity), name)
    for name, sentiment in zip(df['name'], df['sentiment'])
    if name != person  # Exclude the target person
]

# Find the 10 closest people using nsmallest
closest_people = heapq.nsmallest(10, sentiment_differences, key=lambda x: x[0])

# Extract the names of the closest people
closest_names = [person[1] for person in closest_people]

# Update other_people DataFrame
other_people = df[df['name'].isin(closest_names)]

In [21]:
other_people

Unnamed: 0,URI,name,text,sentiment
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"(0.24754901960784317, 0.3892156862745098)"
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"(0.24754901960784317, 0.3892156862745098)"
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"(0.24754901960784317, 0.3892156862745098)"
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"(0.24754901960784317, 0.3892156862745098)"
5,<http://dbpedia.org/resource/Sam_Henderson>,Sam Henderson,sam henderson born october 18 1969 is an ameri...,"(0.24754901960784317, 0.3892156862745098)"
6,<http://dbpedia.org/resource/Aaron_LaCrate>,Aaron LaCrate,aaron lacrate is an american music producer re...,"(0.24754901960784317, 0.3892156862745098)"
7,<http://dbpedia.org/resource/Trevor_Ferguson>,Trevor Ferguson,trevor ferguson aka john farrow born 11 novemb...,"(0.24754901960784317, 0.3892156862745098)"
8,<http://dbpedia.org/resource/Grant_Nelson>,Grant Nelson,grant nelson born 27 april 1971 in london also...,"(0.24754901960784317, 0.3892156862745098)"
9,<http://dbpedia.org/resource/Cathy_Caruth>,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes pro...,"(0.24754901960784317, 0.3892156862745098)"
10,<http://dbpedia.org/resource/Sophie_Crumb>,Sophie Crumb,sophia violet sophie crumb born september 27 1...,"(0.24754901960784317, 0.3892156862745098)"


### Part 2)



- For the same person from step 1), use the Wikipedia API to access the whole content of that person's Wikipedia page.
- The goal of part 2) is to produce the capability to:
  1. For that Wikipedia page determine the sentiment of the entire page
  1. Print out the Wikipedia article
  1. Collect the Wikipedia pages from the 10 nearest neighbors in Step 1)
  1. Determine the nearness ranking of these 10 to your main subject based on their entire Wikipedia page
  1. Compare the nearest ranking from Step 1) with the Wikipedia page nearness ranking



In [22]:
!curl -s https://ddc-datascience.s3.amazonaws.com/Projects/Project.5-NLP/Data/NLP.csv | wc -l

42786


In [23]:
import requests
import json
import re
from bs4 import BeautifulSoup

In [24]:
# determine sentiment of the entire wikipedia page
def wikiread(name):
  # match entry to name in df
  name = name.replace(' ', '_')
  name = name.replace("'", "")
  url = f'https://en.wikipedia.org/wiki/{name}'
  response = requests.get(url)
  soup = BeautifulSoup(response.text, 'html.parser')
  text = soup.get_text()
  text = re.sub(r'\[.*?\]+', '', text)
  text
  blob = TextBlob(text)
  print (blob)
  print(blob.sentiment)

In [25]:
wikiread(person)





Harpdog Brown - Wikipedia




































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Early years








2
Career








3
Personal life and death








4
Awards and recognitions








5
Discography








6
References








7
External links


















Toggle the table of contents







Harpdog Brown



1 language




Simple English

Edit links











ArticleTalk





English

















ReadEditView history







Tools






In [26]:
# do wikiread in a for loop for the people present in other_people
for name in other_people['name']:
  wikiread(name)
  print("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Digby Morrell - Wikipedia



























Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
References








2
External links


















Toggle the table of contents







Digby Morrell



Add languages





Add links











ArticleTalk





English

















ReadEditView history







Tools





Tools
move to sidebar
hide



		Actions
	


ReadEditView history





		General
	


Wha

### Part 3)


Make an interactive notebook.

In addition to presenting the project slides, at the end of the presentation each student will demonstrate their code using a famous person suggested by the other students that exists in the DBpedia set.


In [27]:
import ipywidgets as widgets
from IPython.display import display

# Create sliders for subjectivity and sentiment
subjectivity_slider = widgets.FloatSlider(
    value=0.5,
    min=0,
    max=1,
    step=0.1,
    description='Subjectivity:',
    continuous_update=False,
)

sentiment_slider = widgets.FloatSlider(
    value=0,
    min=-1,
    max=1,
    step=0.1,
    description='Sentiment:',
    continuous_update=False,
)

# Create an output widget to display the selected person
output_widget = widgets.Output()

# Define a function to find the closest person
def find_closest_person(subjectivity, sentiment):
    # Calculate sentiment and subjectivity differences for each person
    df['sentiment_diff'] = abs(df['sentiment'].apply(lambda x: x.polarity) - sentiment)
    df['subjectivity_diff'] = abs(df['sentiment'].apply(lambda x: x.subjectivity) - subjectivity)

    # Find the person with the minimum combined difference
    closest_person = df.loc[
        (df['sentiment_diff'] + df['subjectivity_diff']).idxmin(), 'name'
    ]

    with output_widget:
        output_widget.clear_output()
        print(f"Closest person: {closest_person}")

# Define an event handler for the sliders
def on_slider_change(change):
    find_closest_person(subjectivity_slider.value, sentiment_slider.value)

# Observe slider changes and call the event handler
subjectivity_slider.observe(on_slider_change, names='value')
sentiment_slider.observe(on_slider_change, names='value')

# Display the widgets
display(subjectivity_slider, sentiment_slider, output_widget)

# Initial calculation
find_closest_person(subjectivity_slider.value, sentiment_slider.value)

FloatSlider(value=0.5, continuous_update=False, description='Subjectivity:', max=1.0)

FloatSlider(value=0.0, continuous_update=False, description='Sentiment:', max=1.0, min=-1.0)

Output()

In [28]:
name_input = widgets.Text(
    placeholder='Enter a name',
    description='Name:',
    disabled=False
)

In [29]:
output_widget = widgets.Output()

In [30]:
def search_name(name):
    """Searches for a name in the DataFrame and prints the name and sentiment."""
    try:
        result = df[df['name'] == name][['name', 'sentiment']].iloc[0]  # Select only name and sentiment
        with output_widget:
            output_widget.clear_output()
            print(f"Name: {result['name']}")
            print(f"Sentiment: {result['sentiment']}")
    except IndexError:
        with output_widget:
            output_widget.clear_output()
            print("Name not found in the DataFrame.")

In [31]:
def handle_submit(sender):
    """Handles the submit event of the input box."""
    search_name(name_input.value)

In [32]:
name_input.on_submit(handle_submit)

In [33]:
display(name_input, output_widget)

Text(value='', description='Name:', placeholder='Enter a name')

Output()