In [9]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [10]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
from bs4 import BeautifulSoup

In [37]:
def get_article_content(url):
    try:
        response = requests.get(url)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            article_content = soup.find('article')
            if article_content:
                return article_content.text.strip()
            else:
                return "No content found"
        
        else:
            print("Failed to fetch article content. Status code: ", response.status_code)
            return None
    
    except Exception as e:
        print("An error occurred whiile fetching NEWS article: ", str(e))
        return None

In [67]:
def get_yahoo_news():
    url = "https://news.yahoo.com/rss/"
    
    try:
        response = requests.get(url)
        
        if response.status_code == 200:
            root = ET.fromstring(response.content)
            
            today = datetime.now().strftime('%Y-%m-%d')
            
            news_data = []
            for item in root.findall('.//item'):
                title = item.find('title').text
                link = item.find('link').text
                pub_date = item.find('pubDate').text
                
                if pub_date.startswith(today):
                    article_content = get_article_content(link)
                    # print(article_content)
                    news_data.append({
                        'pub_date': pub_date,
                        'link': link,
                        'title': title,
                        'body': article_content
                    })
            
            return news_data
        else:
            print("Falied to fetch data from Yahoo! News. Status Code: ", response.status_code)
            return None
    
    except Exception as e:
        print("An error occurred: ", str(e))
        return None

In [76]:
news = get_yahoo_news()

In [116]:
news

[{'pub_date': '2024-04-14T01:44:47Z',
  'link': 'https://www.yahoo.com/news/man-punches-9-old-girl-014447931.html',
  'title': 'Man punches 9-year-old girl in the face at Grand Central Terminal: NYPD',
  'body': 'WPIX New York City, NYMan punches 9-year-old girl in the face at Grand Central Terminal: NYPDMatthew EuzarragaApril 13, 2024 at 6:44 PM·1 min read250Link CopiedRead full articleMan punches 9-year-old girl in the face at Grand Central Terminal: NYPDNEW YORK (PIX11) – Police are looking for a man who punched a 9-year-old girl at Grand Central Terminal on Saturday morning, according to police.\xa0\xa0The young victim was standing near her mother near the dining concourse when the assailant approached and punched the child in the face in an unprovoked attack. The 9-year-old girl suffered dizziness and pain and was transported to NYU Langone Tisch Hospital for treatment, authorities said. NYC teacher’s aide arrested again, sent illicit texts to students: NYPD According to officials

In [84]:
import json
import pandas as pd

In [90]:
json_crime_data = json.dumps(news)
json_crime_data = json.loads(json_crime_data)

In [94]:
df = pd.DataFrame(json_crime_data)
df.to_csv("News_Data.csv", mode="a", header=False, index=False)

## Load Model

In [130]:
import pickle

In [133]:
# Load Model
with open("VotingClassifierModel.pkl", "rb") as fid:
  model = pickle.load(fid)

In [134]:
data = pd.read_csv("News_Data.csv")

In [135]:
data.head()

Unnamed: 0,pub_date,link,title,body
0,2024-04-14T01:44:47Z,https://www.yahoo.com/news/man-punches-9-old-g...,Man punches 9-year-old girl in the face at Gra...,"WPIX New York City, NYMan punches 9-year-old g..."
1,2024-04-14T03:17:07Z,https://www.yahoo.com/news/massive-failure-isr...,‘Massive failure of Israeli and American deter...,CNN‘Massive failure of Israeli and American de...
2,2024-04-14T01:44:47Z,https://www.yahoo.com/news/man-punches-9-old-g...,Man punches 9-year-old girl in the face at Gra...,"WPIX New York City, NYMan punches 9-year-old g..."
3,2024-04-14T03:17:07Z,https://www.yahoo.com/news/massive-failure-isr...,‘Massive failure of Israeli and American deter...,CNN‘Massive failure of Israeli and American de...


In [162]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import spacy
from geopy.geocoders import Nominatim
from tqdm import tqdm
tqdm.pandas()

In [163]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [164]:
nlp = spacy.load('en_core_web_sm')

def perform_ner(text):
  doc = nlp(text)
  entities = [(ent.text, ent.label_) for ent in doc.ents]
  return entities

In [165]:
geolocator = Nominatim(user_agent="crime_geocoder")

def geocode_locations(location_names):
  coordinates = []
  for location_name in location_names:
    try:
      location = geolocator.geocode(location_name)
      if location:
        coordinates.append((location.latitude, location.longitude))
    except Exception as e:
      print(f"Error geocoding {location_name}: {e}")

  return coordinates

In [166]:
def clean_text(text):
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  text = text.lower()
  stop_words = set(stopwords.words('english'))
  words = nltk.word_tokenize(text)
  filtered_words = [word for word in words if word not in stop_words]
  clean_text = ' '.join(filtered_words)
  return clean_text

def get_entities(clean_text):
  entities = perform_ner(clean_text)
  return entities

def get_coordinates(entities):
  location_entities = [entity for entity in entities if (entity[1] == "GPE" or entity[1] == "LOC")]
  coordinates = geocode_locations(location_entities)
  return coordinates

def process_test_data(data):
  data['combined_text'] = data['title'].astype(str).apply(clean_text) + ' ' + data['body'].astype(str).apply(clean_text)

  vectorizer = TfidfVectorizer(vocabulary=pickle.load(open("tfidf_vocab.pkl", "rb")))
  X = vectorizer.fit_transform(data['combined_text'])
  data['prediction'] = model.predict(X)
  data = data[data['prediction'] != 0]

  data['entities'] = data['combined_text'].apply(get_entities)
  data['coordinates'] = data['entities'].progress_apply(get_coordinates)

  return data

In [167]:
data = process_test_data(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['entities'] = data['combined_text'].apply(get_entities)
100%|██████████| 2/2 [00:04<00:00,  2.02s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['coordinates'] = data['entities'].progress_apply(get_coordinates)


In [168]:
data.head()

Unnamed: 0,pub_date,link,title,body,combined_text,prediction,entities,coordinates
0,2024-04-14T01:44:47Z,https://www.yahoo.com/news/man-punches-9-old-g...,Man punches 9-year-old girl in the face at Gra...,"WPIX New York City, NYMan punches 9-year-old g...",man punches yearold girl face grand central te...,1,"[(new york city, GPE), (nypdnew york, GPE), (s...","[(45.8419437, 1.2475966948424153)]"
2,2024-04-14T01:44:47Z,https://www.yahoo.com/news/man-punches-9-old-g...,Man punches 9-year-old girl in the face at Gra...,"WPIX New York City, NYMan punches 9-year-old g...",man punches yearold girl face grand central te...,1,"[(new york city, GPE), (nypdnew york, GPE), (s...","[(45.8419437, 1.2475966948424153)]"
