# NLP Processing of Avalanche Forecasts

In [4]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model
import torch

In [5]:
# Read the forecast reports CSV, then replace its column labels with the
# descriptive names below (the list is also kept in `columns`).
all_data = pd.read_csv('output_data/incomplete_All_Zones_Current_Season_reports_data.csv')
columns = [
    'date',
    'zone',
    'overall_risk',
    'above_treeline_risk',
    'near_treeline_risk',
    'below_treeline_risk',
    'bottom_line_text',
    'problem_type_text',
    'forecast_discussion_text',
]
all_data.columns = columns
all_data.head()

Unnamed: 0,date,zone,overall_risk,above_treeline_risk,near_treeline_risk,below_treeline_risk,bottom_line_text,problem_type_text,forecast_discussion_text
0,2023-04-15,Mt Hood,MODERATE,2.0,2.0,1.0,"As new snow starts piling up, think about avoi...",An approaching storm will bring moderate preci...,This incoming system is expected to hit the Mt...
1,2023-04-15,East Slopes South,LOW,1.0,1.0,1.0,Generally safe avalanche conditions exist. Ho...,A skiff of new snow and a good dose of strong ...,The main story for Sunday will likely be the g...
2,2023-04-15,East Slopes North,MODERATE,2.0,1.0,1.0,A weak storm will bring light rain below treel...,"Sunday's storm will bring more wind than snow,...",Access in the East North zone is difficult and...
3,2023-04-15,West Slopes South,MODERATE,2.0,2.0,1.0,You may see the wind building new slabs throug...,You may not find any wind slabs to start the d...,"Right off the bat in the morning, you may not ..."
4,2023-04-15,Snoqualmie Pass,MODERATE,2.0,1.0,1.0,A couple of inches of new snow in the afternoo...,You may see a wind slab problem start to devel...,"First thing Sunday morning, you may find gener..."


### Creating and using an LDA Model on the data:
- Should compare the differences between the three text columns


Preparing the data for the LDA Model:

In [6]:
# Import necessary libraries
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from gensim import corpora, models

# Fetch the NLTK resources used by the preprocessing code below.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# word_tokenize needs the Punkt tokenizer models; without them it raises a
# LookupError on a fresh environment. Older NLTK releases look for 'punkt',
# newer ones for 'punkt_tab', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

def preprocess(text):
    """
    Turns one raw document into a list of cleaned tokens.

    The text is lower-cased and tokenized with NLTK's word_tokenize, stop
    words (the English list plus weekday and month names) are dropped, the
    remaining tokens are lemmatized with WordNetLemmatizer, and the result
    is passed through gensim's simple_preprocess with de-accenting enabled.

    Args:
        text (str): the raw document.

    Returns:
        list[str]: the tokens returned by simple_preprocess.
    """
    stop_words = set(stopwords.words('english'))
    # adding days of the week to stop words
    stop_words.update(['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'])
    # adding months to stop words
    stop_words.update(['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
                       'september', 'october', 'november', 'december'])
    lemmatizer = WordNetLemmatizer()

    words = word_tokenize(text.lower())
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    # Hand simple_preprocess real text, not str(list): the list repr escapes
    # non-printable characters (a zero-width space becomes the literal text
    # "\u200b"), and pieces of that escape sequence would be tokenized as
    # if they were part of a word.
    words = simple_preprocess(' '.join(words), deacc=True)

    return words

def prepare_text_column(column):
    """
    Prepares a text column for LDA analysis.

    Args:
        column: iterable of raw documents, one per row. Missing entries
            (None or NaN) are treated as empty documents so that they do
            not contribute the literal words "none" / "nan" to the corpus.

    Returns:
        tuple: (doc_term_matrix, dictionary), where dictionary is the
        corpora.Dictionary built from the preprocessed documents and
        doc_term_matrix holds one dictionary.doc2bow result per input
        document, in input order.
    """
    def as_text(item):
        # NaN is the only float that compares unequal to itself.
        if item is None or (isinstance(item, float) and item != item):
            return ''
        return str(item)

    processed = [preprocess(as_text(item)) for item in column]

    # Create a dictionary of terms and their frequency
    dictionary = corpora.Dictionary(processed)

    # Create a document-term matrix
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed]

    return doc_term_matrix, dictionary

# Build a document-term matrix and dictionary for each of the three text columns
(bl_matrix, bl_dict), (pt_matrix, pt_dict), (fd_matrix, fd_dict) = (
    prepare_text_column(all_data[text_column].to_list())
    for text_column in ('bottom_line_text', 'problem_type_text', 'forecast_discussion_text')
)

[nltk_data] Downloading package stopwords to /home/jaymin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jaymin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jaymin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Creating the LDA Model:

In [8]:
# Create the LDA models
# LDA training is stochastic; a fixed seed makes the topics reproducible
# across kernel restarts.
LDA_SEED = 42
bl_model = LdaModel(bl_matrix, num_topics=5, id2word=bl_dict, passes=10, random_state=LDA_SEED)
# NOTE: pd_model is the model for problem_type_text (built from pt_matrix /
# pt_dict); the name is kept because the topic-printing cell refers to it.
pd_model = LdaModel(pt_matrix, num_topics=5, id2word=pt_dict, passes=10, random_state=LDA_SEED)
fd_model = LdaModel(fd_matrix, num_topics=10, id2word=fd_dict, passes=10, random_state=LDA_SEED)

In [None]:
# Show every topic of the problem-type model with its weighted keywords
for topic_id, keywords in pd_model.print_topics(num_topics=-1):
    print(f'Topic: {topic_id} \nWords: {keywords}')

Topic: 0 
Words: 0.061*"wind" + 0.048*"slab" + 0.031*"snow" + 0.014*"slope" + 0.012*"new" + 0.011*"avalanche" + 0.011*"could" + 0.010*"terrain" + 0.009*"loaded" + 0.008*"find"
Topic: 1 
Words: 0.049*"wind" + 0.044*"snow" + 0.028*"slab" + 0.028*"slope" + 0.018*"avalanche" + 0.018*"terrain" + 0.013*"steep" + 0.012*"surface" + 0.011*"trigger" + 0.011*"could"
Topic: 2 
Words: 0.039*"snow" + 0.039*"avalanche" + 0.036*"wet" + 0.023*"slope" + 0.017*"could" + 0.016*"loose" + 0.013*"slab" + 0.012*"slide" + 0.011*"surface" + 0.011*"steep"
Topic: 3 
Words: 0.023*"avalanche" + 0.021*"day" + 0.020*"slope" + 0.020*"storm" + 0.014*"snow" + 0.013*"large" + 0.013*"slab" + 0.013*"could" + 0.012*"wind" + 0.010*"steep"
Topic: 4 
Words: 0.039*"snow" + 0.034*"avalanche" + 0.022*"wind" + 0.021*"slab" + 0.021*"could" + 0.016*"slope" + 0.014*"steep" + 0.011*"terrain" + 0.010*"wet" + 0.010*"new"
