# Data Mining

We'll be learning (i) the basics of web scraping and (ii) how to analyze unstructured text so that you can identify Reddit posts that discuss suicide.

<div style="text-align: center;"> <img src = "res/data_mining/reddit_suicides_logo.jpg" width="20%"/> </div>

To do this, we'll scrape and clean web data (§1-3), transform it (§4), and analyze it via principal component analysis, or PCA (§5).

<div style="text-align: center;"> <img src = "res/data_mining/reddit_suicides_pipeline.jpg" width="50%"/> </div>

## 0 | Google Colab Setup

In [None]:
import os
import shutil
import stat

In [None]:
def copy_safe(src, dst, max_len=200):
    """Recursively copy ``src`` into ``dst``, skipping over-long destination paths.

    Any destination directory or file whose path is ``max_len`` characters or
    longer is left out instead of aborting the whole copy.

    Args:
        src: Directory to copy from.
        dst: Directory to copy into (created if missing).
        max_len: Destination paths must be strictly shorter than this.

    Returns:
        Number of files that were not copied, either because the destination
        path was too long or because the copy itself failed.
    """
    skipped = 0
    for root, _dirs, files in os.walk(src):
        rel_path = os.path.relpath(root, src)
        dst_root = dst if rel_path == '.' else os.path.join(dst, rel_path)

        # The directory itself is too long: none of its files can be copied
        if len(dst_root) >= max_len:
            skipped += len(files)
            continue

        os.makedirs(dst_root, exist_ok=True)
        for file in files:
            dst_file = os.path.join(dst_root, file)
            if len(dst_file) >= max_len:
                skipped += 1
                continue
            try:
                shutil.copy2(os.path.join(root, file), dst_file)
            except OSError:
                # Best-effort copy: count filesystem failures and move on, but
                # let KeyboardInterrupt and programming errors propagate.
                skipped += 1
    return skipped

In [None]:
# Setup resources if needed
setup_ran = False
if not os.path.exists('res'):
    print("Setting up resources...")
    setup_ran = True
    
    # Cleanup, clone, copy
    repo = 'deep_learning_resources'
    if os.path.exists(repo):
        shutil.rmtree(repo, onerror=lambda f,p,e: os.chmod(p, stat.S_IWRITE) or f(p))
    
    !git clone --depth=1 https://github.com/jjv31/deep_learning_resources
    
    if os.path.exists(f'{repo}/res'):
        skipped = copy_safe(f'{repo}/res', 'res')
        print(f"Setup complete! {'(' + str(skipped) + ' long filenames skipped)' if skipped else ''}")
    
    shutil.rmtree(repo, onerror=lambda f,p,e: os.chmod(p, stat.S_IWRITE) or f(p))

In [None]:
# Only refresh if we just downloaded resources
# (setup_ran is set by the setup cell above)
if setup_ran:
    from IPython.display import Javascript, display
    import time
    
    print("Refreshing images...")
    
    # Try browser refresh + aggressive image reload.
    # The JavaScript below (doubled braces are f-string escapes for literal { }):
    #   1. schedules a page reload after 2000 ms, ignoring any error;
    #   2. for every <img> whose src contains 'res/', drops any existing query
    #      string and appends '?v=<timestamp>_<index>' so the URL changes,
    #      staggering the updates by 50 ms per image.
    display(Javascript(f'''
    try {{ setTimeout(() => window.location.reload(true), 2000); }} catch(e) {{}}
    
    const t = {int(time.time())};
    document.querySelectorAll('img').forEach((img, i) => {{
        if (img.src.includes('res/')) {{
            const src = img.src.split('?')[0];
            setTimeout(() => img.src = src + '?v=' + t + '_' + i, i * 50);
        }}
    }});
    '''))
    
    print("If images don't appear, press Ctrl+Shift+R to hard refresh!")
else:
    print("Resources already exist, skipping setup.")

## 1 | Web Scraping Basics

In [None]:
#Text pre-processing and visualization, respectively
%pip install neattext wordcloud

In [None]:
%pip install spacy

### 1.1 | Accessing our first webpage

In [None]:
# Libraries for downloading webpages
import requests  # Main library for sending/receiving requests
import urllib.request  # For customizing requests (e.g., adding a header)
from urllib.parse import urljoin  # Resolves relative links against a base URL
# Once we download our webpages, we need to make them legible. We'll do this via BeautifulSoup
from bs4 import BeautifulSoup

# Saving our webpages to the system
import os
import shutil

In [None]:
url = "https://www.college.police.uk/app"
request = urllib.request.Request(url)
request

In [None]:
def crawl_webpage(request):
    """Fetch ``request`` and return the raw (undecoded) response body.

    Args:
        request: Anything ``urllib.request.urlopen`` accepts, i.e. a URL
            string or a ``urllib.request.Request``.

    Returns:
        The response body as bytes, or ``None`` if anything went wrong (the
        error is printed after a "FAILED" line).
    """
    try:
        with urllib.request.urlopen(request) as page:
            # Still bytes at this point; decoding is left to the caller
            payload = page.read()
    except Exception as error:
        print("FAILED")
        print(error)
        return None

    print("Webpage crawled")
    return payload

In [None]:
crawl_webpage(request)

It thinks we're a bot because we're accessing the website without a web browser. We can fix this though via the 'headers' parameter.

In [None]:
url = "https://www.college.police.uk/app"
request = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0'})

crawl_webpage(request)

The webpage is read as a series of bytes, which makes it very hard to read. Let's convert it to characters via utf-8, the format most websites are encoded in 

In [None]:
# Decoding it makes it much more legible
# Try it a couple of times if you get an error. The CoP website is inconsistent when it comes to blocking webscrapers.

cleaned_webpage = crawl_webpage(request).decode('utf-8')
print(cleaned_webpage)

In [None]:
# It's filled with a lot of tags we don't need. We'll fix that with Beautiful Soup.
# Send the same browser-like User-Agent as before; without it the site may
# treat us as a bot again and return its error page instead of the content.
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, "html.parser")
text = soup.get_text()
print(text)

### 1.2 | Save webpage as file

In [None]:
# First, let's create a folder to save our output to
FILE_PATH = "res/data_mining/first_webpage"

try:
    # Start from an empty folder: drop any previous output, then recreate it.
    # rmtree ignores its own errors, so a folder that could not be deleted
    # surfaces here as makedirs failing because the folder still exists.
    shutil.rmtree(FILE_PATH, ignore_errors=True)
    os.makedirs(FILE_PATH)
except OSError as e:
    print(f"Error. The directory {FILE_PATH} already exists and cannot be deleted. Please delete it manually.")
    print(e)

In [None]:
# Let's save the website text as a file, so we can access it later
with open(f"{FILE_PATH}/website_text.txt", "w", encoding="utf-8") as file:
    file.write(text)

In [None]:
def find_and_save_website_images(soup, base_url=None, save_dir=None):
    """Download every ``<img>`` found in ``soup`` and save it to a folder.

    Images without a usable ``src`` (missing, empty, or an inline ``data:``
    URI), images whose URL has no file name, and images that fail to download
    are skipped so one bad image does not stop the rest.

    Args:
        soup: Parsed page to search; must offer ``find_all('img')``.
        base_url: Address of the page ``soup`` came from. Relative image paths
            are resolved against it. Defaults to the notebook-level ``url``.
        save_dir: Folder the images are written to. Defaults to the
            notebook-level ``FILE_PATH``.
    """
    if base_url is None:
        base_url = url
    if save_dir is None:
        save_dir = FILE_PATH

    # Find all image tags
    img_tags = soup.find_all('img')
    print(img_tags)

    # Download and save each image
    for img_tag in img_tags:

        # Lazy-loaded/placeholder images may have no src; data: URIs hold the
        # image inline, so there is nothing to download
        img_src = img_tag.get('src')
        if not img_src or img_src.startswith('data:'):
            continue

        # Handle relative URLs by joining them with the base URL
        img_url = urljoin(base_url, img_src)

        # Name the file after the URL path only (no ?query or #fragment).
        # A URL ending in "/" has no file name to save under.
        img_name = os.path.basename(img_url.split('#')[0].split('?')[0])
        if not img_name:
            continue
        img_filename = os.path.join(save_dir, img_name)

        # Get the image data
        try:
            img_data = requests.get(img_url).content
        except requests.RequestException as e:
            print(f"Could not download {img_url}: {e}")
            continue

        # Save the image to a file
        with open(img_filename, 'wb') as img_file:
            img_file.write(img_data)
        print(f"Image saved: {img_filename}")

find_and_save_website_images(soup)

### 1.3 | Web Scraping Exercise

In [None]:
# Replace the URL with your website
my_url = "https://www.netflix.com/gb/"

In [None]:
try:
    # First, let's get the website's text (with a browser-like User-Agent, as in §1.1)
    response = requests.get(my_url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, "html.parser")

    # Second, let's create a folder to save it to
    FILE_PATH = "res/data_mining/my_webpage"
    os.makedirs(FILE_PATH, exist_ok=True)

    # Third, let's save the website's text as a file
    with open(f"{FILE_PATH}/website_text.txt", "w", encoding="utf-8") as file:
        file.write(soup.get_text())

    # Finally, let's save the website's images.
    # find_and_save_website_images resolves relative image paths against the
    # notebook-level `url`, so point it at this page first; otherwise they
    # would be joined onto the previous website's address.
    url = my_url
    find_and_save_website_images(soup)
except Exception as e:
    print("Invalid website: ", my_url)
    print("Did you forget the 'https://' ?")
    print(e)

## 2 | Web Scraping for Deeply Linked Webpages

Larger websites are deeply linked: they have a lot of URLs that take you to other parts of their website. We will take this into account 

In [None]:
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd

### 2.1 | Scrape Website

Essentially, we'll be scraping the webpage as normal. However, we're going to also scrape any internal hyperlinks that it has. These are links to other parts of the webpage, and we'll scrape those as well. Scraping internal hyperlinks requires a lot of custom code that is beyond the scope of this class, so we'll stick to a high-level overview

In [None]:
# We're going to need to create our own HTML parser to extract all the hyperlinks
# This will require us to inherit from 'HTMLParser' and override a function.
# It's far beyond the scope of the class. Please just run this code, though feel free to ask questions.

# Create a class to parse the HTML and get the hyperlinks
HTTP_URL_PATTERN = r'^http[s]*://.+'

class HyperlinkParser(HTMLParser):
    """HTML parser that collects the ``href`` of every ``<a>`` tag it is fed.

    Usage: create an instance, call ``feed(html)``, then read ``hyperlinks``.
    """

    def __init__(self):
        super().__init__()

        # Link targets found so far, in document order (duplicates included)
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        """Record the link target whenever an anchor tag with an href opens."""
        if tag != "a":
            return

        # HTMLParser reports a valueless attribute (e.g. ``<a href>``) as
        # None; skip it so callers only ever receive strings.
        href = dict(attrs).get("href")
        if href is not None:
            self.hyperlinks.append(href)

In [None]:
# Function to get the hyperlinks from a URL. It's very similar to §1.1

def get_hyperlinks(url):
    """Download ``url`` and return the raw ``href`` values of its anchor tags.

    Args:
        url: Address of the page to fetch.

    Returns:
        List of href strings exactly as written in the page (they may be
        relative). An empty list is returned when the page cannot be fetched,
        is not HTML, or cannot be decoded, so a crawl can carry on past it.
    """
    request = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0'})

    # Try to open the URL and read the HTML
    try:
        with urllib.request.urlopen(request) as response:
            headers = response.info()

            # If the response is not HTML (or does not say what it is), return an empty list
            content_type = headers.get('Content-Type') or ''
            if not content_type.startswith("text/html"):
                return []

            # Decode with the charset the server declared, falling back to
            # UTF-8; undecodable bytes are replaced rather than losing the page
            charset = headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
    except Exception as e:
        # Best-effort boundary: network errors, HTTP errors and unknown
        # charsets all mean "no links from this page"
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

In [None]:
# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the de-duplicated links on ``url`` that stay within ``local_domain``.

    Args:
        local_domain: Host name (``netloc``) the crawl is restricted to.
        url: Page whose links are collected; relative links are resolved
            against it.

    Returns:
        List of absolute http(s) URLs on ``local_domain``, each without a
        trailing slash, in no particular order.
    """
    from urllib.parse import urljoin

    clean_links = set()
    for link in set(get_hyperlinks(url)):
        # A valueless href arrives as None
        if link is None:
            continue

        # "#..." only points somewhere within the same page
        link = link.strip()
        if link.startswith("#"):
            continue

        # Resolve relative ("a/b"), root-relative ("/a") and protocol-relative
        # ("//host/a") links the way a browser would; absolute links pass
        # through unchanged.
        try:
            clean_link = urljoin(url, link)
            url_obj = urlparse(clean_link)
        except ValueError:
            # Malformed link (e.g. a broken IPv6 host): nothing to crawl
            continue

        # Drops mailto:, javascript:, tel:, ... and links to other websites
        if url_obj.scheme not in ("http", "https") or url_obj.netloc != local_domain:
            continue

        if clean_link.endswith("/"):
            clean_link = clean_link[:-1]
        clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)

In [None]:
def crawl(url, MAX_NUMBER_OF_URLS_TO_CRAWL = None, verbose = False):
    """Crawl ``url`` and the same-domain pages it links to, saving each page's text.

    Every visited page is written to
    ``res/data_mining/<domain>/text_raw/<name derived from the URL>.txt``.
    Any previous output folder for that domain is deleted first.

    Args:
        url: Page to start from; its domain limits the crawl.
        MAX_NUMBER_OF_URLS_TO_CRAWL: Stop after visiting this many URLs.
            ``None`` (or 0) keeps going until no unseen links remain.
        verbose: If true, print every newly discovered link.
    """
    # Parse the URL and get the domain
    FILE_PATH = "res/data_mining/"
    local_domain = urlparse(url).netloc
    domain_dir = f"{FILE_PATH}{local_domain}/"

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = {url}

    # Start from empty output folders. makedirs (unlike mkdir) also creates
    # FILE_PATH itself when it does not exist yet.
    shutil.rmtree(domain_dir, ignore_errors=True)
    os.makedirs(f"{domain_dir}text_raw/", exist_ok=True)
    os.makedirs(f"{domain_dir}text_cleaned/", exist_ok=True)

    # While the queue is not empty, continue crawling
    counter = 0
    while queue:

        # Check the limit first so no URL is announced without being crawled
        if MAX_NUMBER_OF_URLS_TO_CRAWL and counter >= MAX_NUMBER_OF_URLS_TO_CRAWL:
            break

        # Get the next URL from the queue
        # NOTE(review): pop() takes the most recently found link (depth-first);
        # popleft() would visit pages breadth-first. Confirm which is intended.
        url = queue.pop()

        # Debugging
        print(f"URL #{counter} : {url}") # see progress
        counter += 1

        # Overly long URLs are neither saved nor followed
        if len(url) >= 500:
            continue

        try:
            # Get the text from the URL using BeautifulSoup (tags removed).
            # Same browser-like User-Agent as get_hyperlinks uses.
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            text = BeautifulSoup(response.text, "html.parser").get_text()

            # Warn about pages that only render with JavaScript (their text is still saved)
            if "You need to enable JavaScript to run this app." in text:
                print("Unable to parse page " + url + " due to JavaScript being required")

            # Save the text to a <url>.txt file: drop the 8-character
            # "https://" prefix and make the rest a flat file name. The file is
            # opened only after the download, so a failed request leaves no
            # empty file behind.
            file_name = url[8:].replace("/", "_") + ".txt"
            with open(f"{domain_dir}text_raw/{file_name}", "w", encoding="UTF-8") as f:
                f.write(text)

            # Get the hyperlinks from the URL and add them to the queue
            for link in get_domain_hyperlinks(local_domain, url):
                if link not in seen:
                    queue.append(link)
                    seen.add(link)

                    if verbose:
                        print(f"--> Found new hyperlink! {link}")
        except Exception as e:
            # Skip this page but keep crawling. Exception (not a bare except)
            # so that interrupting the kernel still stops the crawl.
            print(f"URL not valid: {url} ({e})")

In [None]:
url = "https://www.reddit.com/r/SuicideWatch/"

crawl(url, MAX_NUMBER_OF_URLS_TO_CRAWL = 5, verbose = False)

## 3 | Data Cleaning

These webpages contain a lot of blank space and information we don't need, so we'll need to clean them

In [None]:
# Let's get all our files before we do anything
DIRECTORY = "res/data_mining/www.college.police.uk"
files = os.listdir(f"{DIRECTORY}/text_raw")

print(files)

### 3.1 | Problem of Blank Space, Illustrated

In [None]:
#Let's see how much blank space our files contain.
with open(f"{DIRECTORY}/text_raw/{files[0]}", "r", encoding="UTF-8") as f:
    text = f.read()
    print(text)

### 3.2 | Remove Blank Space (and some irrelevant information)

In [None]:
# Removes newlines (real and escaped) and redundant spaces
def remove_newlines(text):
    """Flatten ``text`` onto one line with words separated by single spaces.

    Real newline characters and literal backslash-n sequences become spaces,
    then every run of spaces is collapsed to a single space.
    """
    text = text.replace('\n', ' ')
    text = text.replace('\\n', ' ')

    # One replace('  ', ' ') pass only halves each run of spaces (three blank
    # lines would still leave a double space), so repeat until none is left.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text

In [None]:
cleaned_texts = []

# Walk through every file found in the raw-text folder
for individual_file in files:

    # Jupyter's checkpoint folder is not a scraped page
    if ".ipynb_checkpoints" in individual_file:
        continue

    raw_path = f"{DIRECTORY}/text_raw/{individual_file}"
    cleaned_path = f"{DIRECTORY}/text_cleaned/{individual_file}"

    # Read the raw page text and strip its blank space
    with open(raw_path, "r", encoding="UTF-8") as f:
        text = remove_newlines(f.read())

    # Build a readable title from the file name: drop the first 11 characters
    # and the 4-character extension, then turn separators into spaces.
    # NOTE(review): the 11 presumably matches a fixed domain prefix - confirm
    # it fits the website being cleaned.
    title = individual_file[11:-4].replace('-',' ').replace('_', ' ').replace('#update','')

    # Keep (title, text) in memory so we can turn it into a dataframe later
    cleaned_texts.append((title, text))

    # Save the cleaned text under the same file name in text_cleaned
    with open(cleaned_path, "w", encoding="utf-8") as file:
        file.write( text )


In [None]:
#The problem of blank space is now fixed! Let's see an example
with open(f"{DIRECTORY}/text_cleaned/{files[0]}", "r", encoding="UTF-8") as f:
    text = f.read()
    print(text)

### 3.3 | Saves our Cleaned Text as Pandas Dataframe

In [None]:
# Our cleaned_texts is a list of tuples in the format of (title, text).
print("EXAMPLE")
print(f"Title = {cleaned_texts[0][0]}\nText = {cleaned_texts[0][1]}")

In [None]:
df = pd.DataFrame(cleaned_texts, columns = ['title', 'text'])
df.head(5)

In [None]:
df.to_csv(f"{DIRECTORY}/results.csv", index=False)

## 4 | Data Transformation

The College of Policing website doesn't contain anything particularly juicy that needs analysis. Instead, we'll turn to a far juicier place: Reddit! We'll look at posts that may contain suicidal messages. These are posts that may deserve law enforcement attention. These posts have already been scraped (§2) and cleaned (§3).

In [None]:
import pandas as pd 
import neattext.functions as nfx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import numpy as np

### 4.1 | Data Reddit Import

In [None]:
df=pd.read_csv('res/data_mining/reddit_suicides_abridged.csv', encoding='utf-8')
df.head(3)

This webscraped dataset looks pretty similar to the webscraped dataset that we scraped ourselves in §2.0, albeit the contents are different. Let's see what it looks like...

In [None]:
# Example of a non-suicidal text
df["text"][1]

In [None]:
# Example of a text that discusses suicide
df["text"][4]

In [None]:
# Let's see how many posts we have
df["text"].shape

Wow. 1,000 posts. That's a bit too much to read by hand, so let's analyze it

### 4.2 | Transformation #1: Language Transformations 

There are a lot of meaningless words in language, words that tell us little about the content. We need to remove them

In [None]:
def clean_text(text):
    """Normalize every post in ``text`` and return the results as a list.

    Each entry is processed in three steps:
      1. lower-cased, so 'Wife', 'wIFE', and 'WIFE' count as the same word;
      2. stripped of odd characters via ``nfx.remove_special_characters``;
      3. stripped of insignificant words via ``nfx.remove_stopwords``.

    Args:
        text: Iterable of strings (e.g. a dataframe column of posts).

    Returns:
        List with one cleaned string per input entry, in the same order.
    """
    return [
        nfx.remove_stopwords(nfx.remove_special_characters(post.lower()))
        for post in text
    ]

In [None]:
df["text_cleaned"] =clean_text(df["text"])

In [None]:
# Let's print out an example cleaned vs dirty text 

sample_dirty = df["text"][0]
sample_clean = df["text_cleaned"][0]

print(f"{'*'*30}\nSAMPLE DIRTY TEXT\n{'*'*30}\n{ sample_dirty }\n\n")
print(f"{'*'*30}\nSAMPLE CLEAN TEXT\n{'*'*30}\n{ sample_clean }\n\n")

### 4.3 | Transformation #2: Text Vectorization

Machines have a hard time reading words. We'll convert them to numbers to make them more conducive to later analyses

In [None]:
# Assign each word a unique ID and count it via CountVectorizer
vectorizer = CountVectorizer()
vectorized_text_df = vectorizer.fit_transform(df["text_cleaned"])

# fit_transform returns a sparse matrix, which is hard to inspect, so we'll need to do some dataset conversions to use it
vectorized_text_df = pd.DataFrame.sparse.from_spmatrix(vectorized_text_df) # sparse matrix
vectorized_text_df = vectorized_text_df.sparse.to_dense() # dense matrix  (for PCA)

vectorized_text_df.head(5)

In [None]:
# Let's get the size
vectorized_text_df.shape

Looks like there were around 10,000 unique words, represented as unique IDs. Thus, each row now has about 10,000 columns, with each column counting the frequency of a particular word

It may look a little scary with all these zeros, but this is completely normal! There are ~10,000 unique words, and not every post will have 10,000 unique words.

In [None]:
# Let's look at columns with a non-zero value

# Get all columns with a non zero value for the first row (i.e., the first post)
indices_of_interest = vectorized_text_df.columns.get_indexer( vectorized_text_df.columns[vectorized_text_df.iloc[0] != 0] )

# Print them
vectorized_text_df.iloc[0:2, indices_of_interest]

## 5 | Run & Interpret PCA

### 5.1 | Runs the PCA

In [None]:
# Runs the PCA
pca = PCA(n_components=2)
pca_results = pca.fit_transform(vectorized_text_df)

In [None]:
# Results are saved as a numpy array, so let's convert it to a pandas dataframe for legibility
pca_df = pd.DataFrame(data = {"PCA1" : pca_results[: , 0],
                             "PCA2" : pca_results[: , 1],
                             "raw_text" : df["text"] })
pca_df.head(5)

In [None]:
import matplotlib.pyplot as plt

# Create the figure once: an extra bare plt.figure() call would leave an
# additional, empty figure in the cell output
plt.figure(figsize=(8,8))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1',fontsize=20)
plt.ylabel('Principal Component - 2',fontsize=20)
plt.title("Principal Component Analysis of Text Data",fontsize=20)

# All posts (we have not looked at any labels yet)
plt.scatter(pca_df["PCA1"], pca_df["PCA2"], c='black', s=1)

# Zoom in on the lower-left region of the plot
plt.xlim(-.5, 0)
plt.ylim(-.5, 0)

plt.show()

### 5.2 | Interpret PCA

This is what our unstructured text looks like. The posts appear to exist on a spectrum, from the lower left to the upper right. It's unclear what this spectrum means, so we'll need to sample text from both ends of it.

In [None]:
# Let's look at a post from the lower left
list_of_extreme_datapoints = pca_df[(pca_df["PCA1"] < -0.4) & (pca_df["PCA2"] < -0.4)].index
pca_df["raw_text"][list_of_extreme_datapoints[2]]

This looks extremely dark. Maybe the lower left are those texts that are the most suicidal. Let's try the other extreme

In [None]:
# Let's look at a post from the upper right
list_of_extreme_datapoints = pca_df[(pca_df["PCA1"] > -0.2) & (pca_df["PCA2"] > -0.2)].index
pca_df["raw_text"][list_of_extreme_datapoints[3]]

This text looks pretty innocent! It looks like the PCA arranged this text based on HOW SUICIDAL IT IS: the more suicidal a post is, the closer to the lower left it will be, and the smaller the PCA1 and PCA2 values will be 

### 5.3 | Double Check our PCA

There is something we failed to disclose earlier: this text data is ALREADY LABELLED! In other words, someone already went through these posts and labelled them, one by one, to determine if they are suicidal. We hid the label in a different file, the addendum file, so that you can learn how to analyze unstructured text data without any aid whatsoever. However, now that you have already analyzed this data, let's use the labels to double check our work.

In [None]:
answer_key = pd.read_csv('res/data_mining/reddit_suicides_abridged_addendum.csv', encoding='utf-8')
pca_df["answer_key"] = answer_key

pca_df.head(3)

In [None]:
import matplotlib.pyplot as plt

# Create the figure once: an extra bare plt.figure() call would leave an
# additional, empty figure in the cell output
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1',fontsize=20)
plt.ylabel('Principal Component - 2',fontsize=20)
plt.title("Principal Component Analysis of Text Data",fontsize=20)

# Split the posts by their label once
is_suicide = pca_df["answer_key"] == "suicide"
is_non_suicide = pca_df["answer_key"] == "non-suicide"

# Suicides (red). Labelling each scatter directly keeps the legend correct
# even if the plotting order changes.
plt.scatter(pca_df.loc[is_suicide, "PCA1"],
            pca_df.loc[is_suicide, "PCA2"],
            c='r', s=1, label="Suicides")

# Non Suicides (green)
plt.scatter(pca_df.loc[is_non_suicide, "PCA1"],
            pca_df.loc[is_non_suicide, "PCA2"],
            c='g', s=1, label="Non-Suicides")

# Same zoom as the unlabelled plot, so the two can be compared
plt.xlim(-.5, 0)
plt.ylim(-.5, 0)

plt.legend(prop={'size': 15})
plt.show()

Well done. It looks like our interpretation is faithful to the answer key.

## 6 | Miscellaneous: Visualize Words (for fun)

In [None]:
from wordcloud import WordCloud

# Gets suicides text
suicides = pca_df[pca_df['answer_key'] == 'suicide']
suicides_text = ' '.join(suicides['raw_text'].astype(str))

# Displays suicides
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(suicides_text)
plt.figure(figsize=(5, 3))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Turn off axis labels
plt.show()

In [None]:
# Gets non suicides text
non_suicides = pca_df[pca_df['answer_key'] == 'non-suicide']
non_suicides_text = ' '.join(non_suicides['raw_text'].astype(str))

# Displays non suicides
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(non_suicides_text)
plt.figure(figsize=(5, 3))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()