In [None]:
# Mount Google Drive (Colab only) so the files under '/content/drive/My Drive'
# that the rest of the notebook reads and writes are available.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Use the %pip magic so the package lands in the running kernel's environment,
# and pin the release so a fresh runtime reproduces the same install.
%pip install langdetect==1.0.9

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=fc89da9ec4b312f6bbe9f450742852a40c11e9808c3bf1ec9400ef071335a6c3
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
# Use %pip (not !pip) so the install targets the running kernel's environment,
# and pin the release so a fresh runtime reproduces the same install.
%pip install gensim==4.3.3

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
# This cell runs before the notebook's main imports cell, so nltk has to be
# imported here; otherwise a fresh kernel (Restart & Run All) fails with NameError.
import nltk

# Fetch the NLTK resources used later: the stop-word list, the tokenizer
# models, and WordNet for lemmatization.
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Imports

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from textblob import TextBlob
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

import re

from gensim import corpora
from gensim.models import LdaModel

import datetime

# Preprocessing

For the sake of saving time and staying relatively within one era of hockey to avoid dramatic topic changes in posts, I chose to select the first 100000 observations from the dataset for training and testing.

In [None]:
# Load the posts from Drive; lines=True reads the file as JSON Lines
# (one JSON object per line).
df = pd.read_json('/content/drive/My Drive/filtered_posts.jsonl', lines=True)

In [None]:
# Save a CSV copy of the full dataset to Drive.
# index=False keeps the row index out of the file.

df.to_csv('/content/drive/My Drive/filtered_posts.csv', index=False)


In [None]:
# Function to check if a piece of text (here: a post title) is in English
def is_english(text):
    """
    Return True when langdetect classifies `text` as English.

    Parameters:
    - text: The text to classify. Non-string values (e.g. None or NaN) and
      empty / whitespace-only strings are reported as not English.

    Returns:
    - bool: True if the detected language code is 'en', otherwise False.
    """
    # detect() only handles real strings; a missing title would otherwise
    # escape as a TypeError, which the except clause below does not catch.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Raised when no language can be determined from the text.
        return False

# Spell-correct a piece of text with TextBlob
def spell_check(text):
    """Return `text` after TextBlob's spelling correction, as a plain str."""
    corrected = TextBlob(text).correct()
    return str(corrected)

In [None]:
# Training subset: .loc slicing is label-inclusive, so labels 0..99999 are kept
# (assumes df still carries the default integer index from read_json).
# .copy() makes text_data an independent frame, so the column assignment below
# cannot trigger SettingWithCopyWarning or write through to df.
text_data = df.loc[:99999, ['title', 'created_utc']].copy()

# Lowercase only the selected titles instead of lowercasing all of df and
# relying on index alignment to discard the rest.
text_data['title'] = text_data['title'].str.lower()

## Spell check

No matter the size of the dataset, spellcheck took more than 2 hours in most cases, and when I needed to restart my kernel, it made preprocessing an incredibly time-inefficient task.

In [None]:
# Show one title before and after spell checking
sample_title = text_data['title'][20]
print(sample_title)
print(spell_check(sample_title))

let's play hockey - the hockey show
let's play hockey - the hockey show


In [None]:
def _correct_if_english(title):
    """Spell-correct titles detected as English; return any other title unchanged."""
    return spell_check(title) if is_english(title) else title

text_data['title'] = text_data['title'].apply(_correct_if_english)

## Filtering out non-english

The same could be said for filtering out non-English posts. Reddit is typically more well-curated than Twitter, meaning language is uniform and typically correct with few typos.

In [None]:
# Keep only the rows whose title is detected as English.
# is_english expects a single string, so it has to be mapped over the 'title'
# column; DataFrame.apply would hand it whole columns (Series) instead.
text_data = text_data[text_data['title'].apply(is_english)]

## Lemmatization

I decided to remove team names, as they seemed to muddy the waters when creating categories with LDA (each team name tended to be found in every category). I kept numbers as well because they provide meaning when talking about statistics in hockey.

In [None]:
# Team nicknames to strip from titles. Multi-word names are listed ahead of
# their shortened forms ("blue jackets" before "jackets", "maple leafs" before
# "leafs", and so on).
teams = [
    "ducks", "coyotes", "bruins", "sabres",
    "flames", "hurricanes", "blackhawks", "avalanche",
    "blue jackets", "stars", "red wings", "oilers",
    "panthers", "kings", "wild", "canadiens",
    "predators", "devils", "islanders", "rangers",
    "senators", "flyers", "penguins", "sharks",
    "kraken", "blues", "lightning", "maple leafs",
    "canucks", "golden knights", "capitals", "jets",
    "leafs", "knights", "jackets", "wings",
]

# One regex alternation in which every name is wrapped in word boundaries,
# e.g. r'\bducks\b|\bcoyotes\b|...'.
teams_with_boundaries = '|'.join(rf'\b{team}\b' for team in teams)

In [None]:
# Module-level WordNet lemmatizer and English stop-word collection.
# The stop words are stored in a set, so duplicates are dropped.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Normalize a post title into a space-joined string of lemmatized tokens.

    Steps, in order: strip URLs and the "&amp;" HTML entity, drop generic
    hockey words and team names, remove every character that is not a letter,
    digit or whitespace (digits are kept), tokenize, drop English stop words,
    and lemmatize what remains.

    Uses the module-level `lemmatizer`, `stop_words` and
    `teams_with_boundaries` rather than rebuilding them on every call.

    Parameters:
    - text: The raw title. Non-string values (e.g. None or NaN) are treated
      as an empty title.

    Returns:
    - str: The processed tokens joined by single spaces ('' if nothing is left).
    """
    # A missing title would make re.sub raise TypeError; treat it as empty.
    if not isinstance(text, str):
        return ''

    # Remove URLs ('http\S+' already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove "&amp;" an HTML entity
    text = re.sub(r'&amp;', '', text)

    # Remove hockey related words and team names
    text = re.sub(r'\bhockey\b|\bnhl\b|\bv\b|' + teams_with_boundaries, '', text, flags=re.IGNORECASE)

    # Remove special characters (letters, digits and whitespace are kept)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Lowercase once, drop stop words, then lemmatize what is left
    processed_tokens = [
        lemmatizer.lemmatize(token)
        for token in (raw_token.lower() for raw_token in tokens)
        if token not in stop_words
    ]

    return ' '.join(processed_tokens)

In [None]:
# Clean every title and keep the result in a new column next to the original
text_data['preprocessed_title'] = text_data['title'].map(preprocess_text)

## Perform LDA

In [None]:
def perform_lda(series, num_topics=3, passes=15, no_below=2, no_above=0.5, top_words=10, random_state=None):
    """
    Perform Latent Dirichlet Allocation (LDA) on a pandas Series of preprocessed text.

    Parameters:
    - series: pandas Series containing preprocessed text (tokens separated by whitespace).
    - num_topics: Number of topics to identify.
    - passes: Number of passes through the corpus during training.
    - no_below: Keep tokens which are contained in at least `no_below` documents.
    - no_above: Keep tokens which are contained in no more than `no_above` documents (fraction of total corpus size).
    - top_words: Number of highest-weighted words to report for each topic.
    - random_state: Optional seed forwarded to LdaModel so a run can be reproduced.
      The default (None) leaves the model unseeded.

    Returns:
    - topics: A list of (topic id, formatted 'weight*"word" + ...' string) pairs.
    - lda_model: The trained LdaModel.
    - dictionary: The gensim Dictionary (after filtering) used to build the corpus.
    - corpus: The bag-of-words corpus the model was trained on.
    """

    texts = [text.split() for text in series]

    # Create a dictionary representation of the documents.
    dictionary = corpora.Dictionary(texts)

    # Filter out words that occur less than `no_below` documents, or more than `no_above` fraction of the documents.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Create the corpus: a list of bags of words
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Build the LDA model
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    # Extract every topic with its `top_words` highest-weighted words.
    # num_topics is passed explicitly because print_topics otherwise caps the
    # number of topics it returns.
    topics = lda_model.print_topics(num_topics=num_topics, num_words=top_words)

    return topics, lda_model, dictionary, corpus

In [None]:
# Apply LDA to the preprocessed post titles: 4 topics, 25 passes, and a
# vocabulary limited to tokens found in at least 100 titles and at most 75% of them.
topics, model, dictionary, corpus = perform_lda(text_data['preprocessed_title'], num_topics=4, passes=25, no_below=100, no_above=0.75, top_words=25)

It should be noted that the topics produced in this notebook may not exactly match the topics in the NN training notebook. This is because I continued to attempt to refine my categories by tuning the LDA model after saving the first satisfactory dataset I received. I did save the topic dictionaries from that model and they are printed in a markdown cell in the NN training notebook. The most difficult part of this whole project, for me, was fitting an LDA model that provided good categories. This is partially because the data itself is hard to divide well into categories, as there are so many topics to discuss in the world of hockey that overlap in very nuanced ways, as well as changing drastically over time as new news arrives around the NHL and the sport as a whole.

## Label Topics

In [None]:
# identify the most likely topic for a preprocessed text
def get_topic_for_tweet(tweet, model, dictionary):
    """
    Get the most probable topic for a given text using the LDA model.

    Parameters:
    - tweet: The text, already preprocessed the same way as the training data
      (tokens separated by whitespace).
    - model: The trained LDA model.
    - dictionary: The dictionary used in the LDA model.

    Returns:
    - topic: The id of the topic with the highest probability for the text.
    """
    # Convert the text to bag-of-words format
    bow = dictionary.doc2bow(tweet.split())

    # Ask for the full distribution: with the model's default probability
    # cut-off, low-probability topics are omitted, and if every topic fell
    # below it the list would be empty and max() would raise ValueError.
    topic_distribution = model.get_document_topics(bow, minimum_probability=0.0)

    # Get the most probable topic (entries are (topic id, probability) pairs)
    topic, _probability = max(topic_distribution, key=lambda pair: pair[1])

    return topic

In [None]:
# Label each row with the id of its most probable topic
text_data['topic'] = text_data['preprocessed_title'].apply(
    get_topic_for_tweet, args=(model, dictionary)
)

## Topic Analysis

In [None]:
topics_df = pd.DataFrame(topics, columns=['Topic ID', 'Words'])
# Each topic arrives as a string such as '0.042*"player" + 0.041*"new" + ...'.
# Build a {word: weight} dictionary from it, with the quotes stripped from the
# word and the weight converted to float. The dictionary must be keyed by the
# word: keying by the weight string silently drops words that share a weight.
topics_df['Words'] = topics_df['Words'].apply(
    lambda topic_repr: {
        word.strip('"'): float(weight)
        for weight, word in (term.split('*', 1) for term in topic_repr.split(' + '))
    }
)


In [None]:
# Print every topic id followed by the entries of its dictionary, one per line
for topic_id, word_map in zip(topics_df['Topic ID'], topics_df['Words']):
    print(f"Topic {topic_id}: <br>")
    for word, freq in word_map.items():
        print(f"-  {word}: {freq}")
    print()

Topic 0: <br>
-  0.042: "player"
-  0.041: "new"
-  0.027: "team"
-  0.026: "jersey"
-  0.020: "r"
-  0.017: "anyone"
-  0.016: "league"
-  0.014: "xpost"
-  0.013: "get"

Topic 1: <br>
-  0.042: "year"
-  0.023: "sign"
-  0.019: "goal"
-  0.017: "draft"
-  0.015: "2"
-  0.014: "deal"
-  0.012: "hit"

Topic 2: <br>
-  0.043: "team"
-  0.039: "fan"
-  0.023: "one"
-  0.020: "best"
-  0.018: "like"
-  0.016: "guy"
-  0.012: "look"
-  0.011: "trade"

Topic 3: <br>
-  0.091: "game"
-  0.030: "cup"
-  0.026: "playoff"
-  0.023: "thread"
-  0.022: "2013"
-  0.019: "stanley"
-  0.017: "last"
-  0.016: "season"
-  0.015: "time"



In [None]:
# Destination in Google Drive for the labelled training data.
file_path = '/content/drive/My Drive/text_data.csv'

# Write text_data (including the columns added above) to CSV.
text_data.to_csv(file_path, index=False)  # index=False prevents writing row indices to the file.

# Confirm where the file was written.
print(f"DataFrame saved to {file_path}")


DataFrame saved to /content/drive/My Drive/text_data.csv


In [None]:
# Saving test set
# .copy() makes the slice an independent frame, so the column assignment below
# cannot trigger SettingWithCopyWarning or write through to df.
# NOTE(review): .loc is label-inclusive, so 100000:110000 selects 10,001 rows
# (assuming the default integer index) - confirm whether 109999 was intended.
test_hockey_data = df.loc[100000: 110000, ['title', 'created_utc']].copy()

# Lowercase only the selected titles instead of lowercasing all of df and
# relying on index alignment to discard the rest.
test_hockey_data['title'] = test_hockey_data['title'].str.lower()

file_path = '/content/drive/My Drive/test_hockey_data.csv'

# Save the DataFrame to a CSV file.
test_hockey_data.to_csv(file_path, index=False)  # index=False prevents writing row indices to the file.

In [None]:
# Run the held-out titles through the same cleaning as the training titles
cleaned_test_titles = test_hockey_data['title'].apply(preprocess_text)
test_hockey_data['preprocessed_title'] = cleaned_test_titles
# Peek at the rows in positions 10 through 19
print(test_hockey_data['preprocessed_title'].iloc[10:20])