#### Project Idea : News Headlines (Canada, World, Business, Technology, Entertainment, Sports, Science, Health) ######

The idea for this project was to have a program scrape the Google News website for headlines. Each section of the site (Canada, World, Buisiness, Technology, Entertainment, Sports, Science, Health) was scraped and the headlines collected were subjected to cleaning

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

import json 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,mean_squared_error
from sklearn.metrics import silhouette_score

#### STEP 1. IMPORT URLS & CONTENT AND GET THE HEADLINES ####
8 MAJOR SECTIONS OF GOOGLE NEWS: Canada, World, Business, Technology, Entertainment, Sports, Science, Health

In [2]:
def scrape_headlines_from_urls(url_list):
    all_headlines = []

    for url in url_list:
        # Fetch content from URL
        response = requests.get(url)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the content with BeautifulSoup
            soup = bs(response.text, 'html.parser')
            # Find h4 elements that might contain headlines
            headlines = soup.find_all('a',class_='gPFEn')
            
            # Extract the text from each headline and add to the list
            for headline in headlines:
                all_headlines.append(headline.text.strip())  # .strip() to remove leading/trailing whitespaces
        else:
            print(f'Failed to retrieve {url} with status code: {response.status_code}')

    return all_headlines

# Example usage:
urls = [
    #Canada
    'https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSkwyMHZNR1F3TmpCbkVnVmxiaTFIUWlnQVAB?hl=en-CA&gl=CA&ceid=CA%3Aen',
    #World: 
    'https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JXVnVMVWRDR2dKRFFTZ0FQAQ?hl=en-CA&gl=CA&ceid=CA%3Aen',
    #Business
    'https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JXVnVMVWRDR2dKRFFTZ0FQAQ?hl=en-CA&gl=CA&ceid=CA%3Aen',
    #Technology
    'https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGRqTVhZU0JXVnVMVWRDR2dKRFFTZ0FQAQ?hl=en-CA&gl=CA&ceid=CA%3Aen',
    #Entertainment
    'https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JXVnVMVWRDR2dKRFFTZ0FQAQ?hl=en-CA&gl=CA&ceid=CA%3Aen'
    #Sports
    'https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JXVnVMVWRDR2dKRFFTZ0FQAQ?hl=en-CA&gl=CA&ceid=CA%3Aen',
    #Science
    'https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp0Y1RjU0JXVnVMVWRDR2dKRFFTZ0FQAQ/sections/CAQiSkNCQVNNUW9JTDIwdk1EWnRjVGNTQldWdUxVZENHZ0pEUVNJT0NBUWFDZ29JTDIwdk1ETTJYeklxQ2hJSUwyMHZNRE0yWHpJb0FBKi4IACoqCAoiJENCQVNGUW9JTDIwdk1EWnRjVGNTQldWdUxVZENHZ0pEUVNnQVABUAE?hl=en-CA&gl=CA&ceid=CA%3Aen',
    # Health
    'https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNR3QwTlRFU0JXVnVMVWRDS0FBUAE?hl=en-CA&gl=CA&ceid=CA%3Aen'
]

# Call the function and print the results
headlines = scrape_headlines_from_urls(urls)
for headline in headlines:
    print(headline)
len(headlines)

Snow Day for Jan. 18: Which Metro Vancouver schools are open and closed
Parts of B.C. dealing with major snowfall, closed schools, difficult travel
STORM CENTRE: Lower Mainland blanketed with snow
Snow Day: Here is a list of live webcams showing current road conditions on Vancouver Island
Driver dead in Vancouver parkade crash at UBC: police - BC News
Car crashes through UBC parkade
Vehicle crashes through 2nd-storey parkade on UBC campus: Vancouver Fire Rescue
VFRS: SUV falls out of parkade at UBC, rescue service called
Prime Minister Justin Trudeau heads to Nunavut for signing on transfer of powers
Trudeau to sign long-awaited devolution agreement with Nunavut Thursday
Justin Trudeau in Nunavut to sign 'historic' agreement
Trudeau, Akeeagok to sign ‘largest land transfer in Canada’s history’
Canadian charged for allegedly lighting a fire that grew to be the largest in Nova Scotia's history
N.S. news: Man charged following Barrington Lake fire
Nova Scotia man charged with igniting mas

1092

#### STEP 2. CREATE DATAFRAME & FREEZE THE DATA GENERATED (CSV FILE) ####

In [3]:
df_=pd.DataFrame(headlines)
df_.columns=["Headline"]
df_.shape

#df_.to_csv("newsheadlines_11062023.csv")
#df_.to_csv("newsheadlines_11072023.csv")
df_.to_csv("newsheadlines_10122024.csv")

In [4]:
csv_files=["newsheadlines_10122024.csv"]

df_csv_append = pd.DataFrame()

for file in csv_files:
    df=pd.read_csv(file)
    df_csv_append=df_csv_append.concat(df, ignore_index=True)
    
df=df_csv_append
df #2349 headlines

AttributeError: 'DataFrame' object has no attribute 'concat'

#### STEP 3. CLEAN DATAFRAME  ####

In [None]:
# Remove duplicates
df=df.drop_duplicates(subset = "Headline", keep = 'first')
df #reduced to 2016

# Remove \n
df.loc[:,"Headline Cleaned"]= df["Headline"].replace(r'\n',' ', regex=True) 

# Remove Punctuation
df.loc[:,"Headline Cleaned"]=df["Headline Cleaned"].str.replace(r'[^\w\s]+', '')

# Remove Capital Letters
df["Headline Cleaned"]=df["Headline Cleaned"].str.lower()

# Remove Stop Words
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords=(stopwords.words('english'))
stopwords

df["Headline Cleaned"]=df["Headline Cleaned"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))

# Remove Numbers
df["Headline Cleaned"]=df["Headline Cleaned"].str.replace('\d+', '',regex=True)


# Lemmatizing
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
lemmatized_string=[]

for row in range(0,len(df["Headline Cleaned"])):
    tokens=nltk.word_tokenize(df["Headline Cleaned"].iloc[row])
    lemmatized_string.append( ' '.join([lemmatizer.lemmatize(words) for words in tokens]))
    
df["Headline Cleaned"]=lemmatized_string

#Stemming
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
df['Headline Cleaned'] = df['Headline Cleaned'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df['Headline'] = df['Headline'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

df

#### STEP 4. VECTORIZATION ####

In [None]:
tf=TfidfVectorizer()
X=df['Headline Cleaned']
X_=tf.fit_transform(X)
X_

words=tf.get_feature_names_out() #word associated to each vector
len(words) #3327 (without lemmatization and stemming, 4048)
# for word in words:
#      print(word)

#### STEP 5. CLUSTERING ####

In [None]:
#Determining optimal number of clusters

# Calculate WCSS for a range of k values
WCSS = []
silhouette_scores = []
K_range = range(2, 15)  # You can choose the max value of k based on your dataset and needs

for k in K_range:
    #Fit a kmeans model to X_:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init = 10)
    kmeans.fit(X_)
    #Calculate silhouette score
    silhouette_average = silhouette_score(X_, kmeans.labels_)
    silhouette_scores.append(silhouette_average)
    # Inertia method returns WCSS for that model
    WCSS.append(kmeans.inertia_)

# Plot for WCSS
plt.figure(figsize=(10, 5))
plt.plot(K_range, WCSS, 'bo-', markerfacecolor='red')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.xticks(K_range)
plt.grid(True)
plt.show()  # This shows the first plot for WCSS

# Plot for silhouette scores
plt.figure(figsize=(10, 5))
plt.plot(K_range, silhouette_scores, 'bo-', markerfacecolor='blue')
plt.title('Silhouette Scores For Different k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.xticks(K_range)
plt.grid(True)
plt.show()  # This shows the second plot for silhouette scores

In [None]:
#Kmean clustering: 

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Determine the number of clusters (e.g., using the elbow method)
k=8

# Apply KMeans clustering
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_)

# Dimensionality reduction for visualization
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(X_.toarray())

# Plot the clusters
plt.figure(figsize=(10, 5))
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=clusters, cmap='viridis', marker='o')
plt.title('KMeans Clustering Visualization')
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')

# Plot the centroids
centroids = pca.transform(kmeans.cluster_centers_)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=150, c='red')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
import numpy as np

# Create a new plot
plt.figure(figsize=(10, 5))

colors = plt.cm.tab10(np.arange(k))  # Generate color array for each cluster

# Plot each cluster
for i in range(k):
    # Select the indices of points in the current cluster
    indices = np.where(clusters == i)
    #print(indices)
    
    # Select the points that form the cluster
    cluster_points = reduced_features[indices]
    
    # Plot the points
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], c=[colors[i]], label=f'Cluster {i}')
    
    # Create the convex hull
    if len(cluster_points) > 2:  # ConvexHull can't be created with fewer than 3 points
        hull = ConvexHull(cluster_points)
        
        # Draw the convex hull
        # Generate the coordinates for the hull points
        hull_points = cluster_points[hull.vertices]
        
        # Fill the convex hull with a semi-opaque color and no border
        plt.fill(hull_points[:,0], hull_points[:,1], alpha=0.3, c=colors[i], edgecolor='none')

#Plot the centroids
centroids = pca.transform(kmeans.cluster_centers_)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=150, c='red', label='Centroids')

# Final plot adjustments
plt.title('KMeans Clustering with Convex Hulls')
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')
plt.legend()
plt.show()

In [None]:
common_words = kmeans.cluster_centers_.argsort()[:,-1:-21:-1] #converts each centroid to a list of the columns most highly-valued words
for index, centroid in enumerate(common_words):
    print('Cluster '+str(index)+"'s top key words" + ' : ' + ', '.join(words[word] for word in centroid))
    print('\n')

In [None]:
#Plot the words

import matplotlib.pyplot as plt
   
x_axis=reduced_features[:, 0]
y_axis=reduced_features[:, 1]

fig,plt=plt.subplots(figsize=(20,20))

plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=clusters, cmap='viridis', marker='o')

words=tf.get_feature_names_out() #Get output feature names for transformation

for index,word in enumerate(words):
    plt.annotate(word,(x_axis[index],y_axis[index]))
    #print(word,(x_axis[index],y_axis[index]))

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

# Let k be the optimal number of clusters determined from the last section
k =8  # Replace with the number you determined

#Perform Principal Component Analysis to reduce to 3 features:
pca = PCA(n_components=3)
reduced_features = pca.fit_transform(X_.toarray())

# Apply KMeans clustering to the reduced data
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(reduced_features)

# Create a new plot for 3D
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

colors = plt.cm.tab10(np.arange(10))  # Adjust range according to the number of clusters

# Plot each cluster in 3D
for i in range(k):
    # Select the indices of points in the current cluster
    indices = np.where(clusters == i)
    
    # Select the points that form the cluster
    cluster_points = reduced_features[indices]
    
    # Plot the points
    ax.scatter(cluster_points[:, 0], cluster_points[:, 1], cluster_points[:, 2], c=[colors[i]], label=f'Cluster {i}')

# Plotting centroids in 3D
centroids = pca.transform(kmeans.cluster_centers_)
ax.scatter(centroids[:, 0], centroids[:, 1], centroids[:, 2], marker='x', s=150, c='red', label='Centroids')

# Final plot adjustments
ax.set_title('3D KMeans Clustering')
ax.set_xlabel('PCA Feature 1')
ax.set_ylabel('PCA Feature 2')
ax.set_zlabel('PCA Feature 3')
plt.legend()
plt.show()