# Thematic Analysis

## Import Required Libraries

In [1]:
import pandas as pd
import plotly.express as px
# HTML Library to parse html special characters
import html

# NLP requirements
import nltk 
nltk.download("stopwords")
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
  
# spacy for lemmatization
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joelchoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/joelchoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read in filings data and save down for easy Loading

In [None]:
filings_df = pd.read_csv("https://storage.googleapis.com/iig-ds-test-data/all_filings_and_sections.csv",index_col = 0)

In [None]:
filings_df.to_csv("data/filings.csv")

In [2]:
filings_df = pd.read_csv("data/filings.csv",index_col = 0)

# General information about dataset

In [3]:
filings_df.head()

Unnamed: 0,ticker,companyName,formType,description,filedAt,linkToFilingDetails,Section1,Section1A,Section7
0,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-17T16:12:13-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...
1,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2022-02-22T16:24:39-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...
2,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2021-02-19T16:44:57-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...
3,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2020-02-19T17:13:43-05:00,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...
4,AIZ,ASSURANT INC,10-K,Form 10-K - Annual report [Section 13 and 15(d...,2019-02-22T16:48:45-05:00,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...


In [4]:
filings_df.dtypes

ticker                 object
companyName            object
formType               object
description            object
filedAt                object
linkToFilingDetails    object
Section1               object
Section1A              object
Section7               object
dtype: object

### Data Transformation


#### Extract Date and Convert Datetime

In [5]:
filings_df['date'] = filings_df.filedAt.apply(lambda x: x.split("T")[0])
filings_df['date']= pd.to_datetime(filings_df.date)
filings_df["yearMonth"] = filings_df.date.dt.to_period('M').dt.to_timestamp()
print(f"data types {filings_df.dtypes}")
filings_df.head()



data types ticker                         object
companyName                    object
formType                       object
description                    object
filedAt                        object
linkToFilingDetails            object
Section1                       object
Section1A                      object
Section7                       object
date                   datetime64[ns]
yearMonth              datetime64[ns]
dtype: object


Unnamed: 0,ticker,companyName,formType,description,filedAt,linkToFilingDetails,Section1,Section1A,Section7,date,yearMonth
0,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-17T16:12:13-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...,2023-02-17,2023-02-01
1,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2022-02-22T16:24:39-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...,2022-02-22,2022-02-01
2,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2021-02-19T16:44:57-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...,2021-02-19,2021-02-01
3,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2020-02-19T17:13:43-05:00,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...,2020-02-19,2020-02-01
4,AIZ,ASSURANT INC,10-K,Form 10-K - Annual report [Section 13 and 15(d...,2019-02-22T16:48:45-05:00,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management&#8217;s Discussion and Ana...,2019-02-22,2019-02-01


#### convert html special characters

In [6]:
filings_df.Section1 = filings_df.Section1.apply(lambda x: html.unescape(str(x)))
filings_df.Section1A = filings_df.Section1A.apply(lambda x: html.unescape(str(x)))
filings_df.Section7 = filings_df.Section7.apply(lambda x: html.unescape(str(x)))
filings_df

Unnamed: 0,ticker,companyName,formType,description,filedAt,linkToFilingDetails,Section1,Section1A,Section7,date,yearMonth
0,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2023-02-17T16:12:13-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management’s Discussion and Analysis ...,2023-02-17,2023-02-01
1,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2022-02-22T16:24:39-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management’s Discussion and Analysis ...,2022-02-22,2022-02-01
2,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2021-02-19T16:44:57-05:00,https://www.sec.gov/Archives/edgar/data/126723...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management’s Discussion and Analysis ...,2021-02-19,2021-02-01
3,AIZ,"ASSURANT, INC.",10-K,Form 10-K - Annual report [Section 13 and 15(d...,2020-02-19T17:13:43-05:00,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management’s Discussion and Analysis ...,2020-02-19,2020-02-01
4,AIZ,ASSURANT INC,10-K,Form 10-K - Annual report [Section 13 and 15(d...,2019-02-22T16:48:45-05:00,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"Item 1. Business \n\nAssurant, Inc. was incor...",Item 1A. Risk Factors \n\nCertain factors may...,Item 7. Management’s Discussion and Analysis ...,2019-02-22,2019-02-01
...,...,...,...,...,...,...,...,...,...,...,...
2415,HWM,Arconic Inc.,10-K,Form 10-K - Annual report [Section 13 and 15(d...,2019-02-21T17:24:56-05:00,https://www.sec.gov/Archives/edgar/data/4281/0...,Item 1. Business. \n\nGeneral \n\nArconic Inc...,Item 1A. Risk Factors. \n\nArconic’s business...,Item 7. Management’s Discussion and Analysis ...,2019-02-21,2019-02-01
2416,SNPS,SYNOPSYS INC,10-K,Form 10-K - Annual report [Section 13 and 15(d...,2022-12-12T17:28:23-05:00,https://www.sec.gov/Archives/edgar/data/883241...,Item 1. Business \n\n##TABLE_END\n\nCompany a...,Item 1A. Risk Factors \n\n##TABLE_END\n\nA de...,Item 7. Management’s Discussion and Analysis ...,2022-12-12,2022-12-01
2417,SNPS,SYNOPSYS INC,10-K,Form 10-K - Annual report [Section 13 and 15(d...,2021-12-13T17:22:49-05:00,https://www.sec.gov/Archives/edgar/data/883241...,Item 1. Business \n\n##TABLE_END\n\nCompany a...,Item 1A. Risk Factors \n\n##TABLE_END\n\nA de...,Item 7. Management’s Discussion and Analysis ...,2021-12-13,2021-12-01
2418,SNPS,SYNOPSYS INC,10-K,Form 10-K - Annual report [Section 13 and 15(d...,2020-12-14T20:00:04-05:00,https://www.sec.gov/Archives/edgar/data/883241...,Item 1. Business \n\n##TABLE_END\n\nCompany a...,Item 1A. Risk Factors \n\n##TABLE_END\n\nA de...,Item 7. Management’s Discussion and Analysis ...,2020-12-14,2020-12-01


In [7]:
plot_df = filings_df.groupby("yearMonth")["ticker"].count().to_frame()
plot_df.columns = ["numOfFilings"]
plot_df = plot_df.reset_index()

fig = px.line(
    plot_df, x="yearMonth", y="numOfFilings",
    title='Number of Filings in each month',
)

# Customize the layout
fig.update_layout(
    xaxis_title='Month Year',
    yaxis_title='Number of Filings',
)

# Show the plot
fig.show()

## Start modelling: We use topic modelling to extract themes from SEC filings 

## Data Preprocessing to create bigram and trigram model

Preprocessing functions are refactored in production code. Refer to src folder for details


In [8]:
stop_words = stopwords.words('english')
# Add words as neccessary
#  stop_words.extend([""])

# convert content to list
content = filings_df.Section1.values.tolist()

# Remove new line characters
content = [re.sub('\s+', ' ', s) for s in content]

# A fast was to remove numbers , punctuations etc that doesnt add any value to the model
content = [gensim.utils.simple_preprocess(str(s), deacc=True) for s in content]
content[0]



['item',
 'business',
 'assurant',
 'inc',
 'was',
 'incorporated',
 'as',
 'delaware',
 'corporation',
 'in',
 'we',
 'are',
 'leading',
 'global',
 'business',
 'services',
 'company',
 'that',
 'supports',
 'protects',
 'and',
 'connects',
 'major',
 'consumer',
 'purchases',
 'we',
 'support',
 'the',
 'advancement',
 'of',
 'the',
 'connected',
 'world',
 'by',
 'partnering',
 'with',
 'the',
 'world',
 'leading',
 'brands',
 'to',
 'develop',
 'innovative',
 'solutions',
 'and',
 'to',
 'deliver',
 'an',
 'enhanced',
 'customer',
 'experience',
 'we',
 'operate',
 'in',
 'north',
 'america',
 'latin',
 'america',
 'europe',
 'and',
 'asia',
 'pacific',
 'through',
 'two',
 'operating',
 'segments',
 'global',
 'lifestyle',
 'and',
 'global',
 'housing',
 'through',
 'our',
 'global',
 'lifestyle',
 'segment',
 'we',
 'provide',
 'mobile',
 'device',
 'solutions',
 'extended',
 'service',
 'products',
 'and',
 'related',
 'services',
 'for',
 'consumer',
 'electronics',
 'and',
 '

In [9]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(content, min_count=5, threshold=100) # higher threshold fewer phrases.
# trigram = gensim.models.Phrases(bigram[content], threshold=100)  

# # Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)


In [10]:
# Save / load an exported collocation model.
bigram_mod.save("models/bigram_mod.pkl")
# trigram_mod.save("models/trigram_mod.pkl")
# bigram_reloaded = Phraser.load("/tmp/my_bigram_model.pkl")
# bigram_reloaded[['trees', 'graph', 'minors']]  

Preprocessing functions are refactored in production code. Refer to src folder for details

In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove Stop Words
data_words_nostops = remove_stopwords(content)

# # Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# # Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['item', 'incorporate', 'lead', 'global', 'business', 'service', 'company', 'support', 'protect', 'connect', 'major', 'consumer', 'purchase', 'support', 'advancement', 'connect', 'world', 'partnering', 'world', 'lead', 'brand', 'develop', 'innovative', 'solution', 'deliver', 'enhanced', 'customer', 'experience', 'operate', 'asia_pacific', 'operate', 'segment', 'global', 'lifestyle', 'global', 'lifestyle', 'segment', 'provide', 'mobile', 'device', 'solution', 'extend', 'service', 'product', 'relate', 'service', 'consumer', 'electronic', 'appliance', 'credit', 'insurance', 'product', 'refer', 'connected', 'live', 'vehicle', 'protection', 'lease', 'finance', 'solution', 'relate', 'service', 'refer', 'global', 'automotive', 'global', 'housing', 'segment', 'provide', 'lender_place', 'homeowner', 'insurance', 'lender_place', 'manufacture', 'housing', 'insurance', 'lender_place', 'flood', 'insurance', 'refer', 'lender_placed', 'insurance', 'renter', 'insurance', 'relate', 'product', 'refer',

## Create corpus (word_id, word_frequency)
Preprocessing functions are refactored in production code. Refer to src folder for details

In [39]:
len(data_lemmatized[0:2])

2

In [12]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 2), (2, 6), (3, 1), (4, 5), (5, 1), (6, 1), (7, 2), (8, 1), (9, 2), (10, 2), (11, 6), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 3), (20, 5), (21, 10), (22, 1), (23, 2), (24, 10), (25, 4), (26, 1), (27, 3), (28, 5), (29, 17), (30, 17), (31, 4), (32, 4), (33, 2), (34, 3), (35, 1), (36, 1), (37, 1), (38, 4), (39, 1), (40, 3), (41, 2), (42, 7), (43, 3), (44, 5), (45, 5), (46, 1), (47, 1), (48, 5), (49, 3), (50, 1), (51, 2), (52, 3), (53, 1), (54, 2), (55, 1), (56, 1), (57, 6), (58, 3), (59, 2), (60, 1), (61, 6), (62, 5), (63, 2), (64, 4), (65, 1), (66, 1), (67, 8), (68, 4), (69, 1), (70, 2), (71, 12), (72, 1), (73, 22), (74, 1), (75, 1), (76, 2), (77, 1), (78, 1), (79, 4), (80, 2), (81, 1), (82, 6), (83, 1), (84, 2), (85, 2), (86, 1), (87, 2), (88, 1), (89, 4), (90, 1), (91, 5), (92, 1), (93, 1), (94, 3), (95, 6), (96, 1), (97, 1), (98, 1), (99, 2), (100, 6), (101, 4), (102, 1), (103, 10), (104, 2), (105, 4), (106, 2), (107, 4), (108, 2), (109, 3), (

## Build Model

In [13]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [14]:
# import pprint
lda_model.print_topics()


[(0,
  '0.026*"employee" + 0.016*"company" + 0.013*"include" + 0.010*"program" + 0.008*"work" + 0.008*"business" + 0.007*"support" + 0.006*"diversity" + 0.006*"well" + 0.006*"continue"'),
 (1,
  '0.042*"product" + 0.024*"treatment" + 0.021*"patient" + 0.015*"patent" + 0.013*"development" + 0.012*"clinical" + 0.012*"drug" + 0.011*"include" + 0.009*"phase" + 0.009*"use"'),
 (2,
  '0.036*"product" + 0.027*"system" + 0.024*"design" + 0.015*"technology" + 0.015*"software" + 0.013*"include" + 0.011*"use" + 0.011*"device" + 0.010*"application" + 0.010*"customer"'),
 (3,
  '0.069*"market" + 0.039*"trading" + 0.037*"exchange" + 0.021*"trade" + 0.018*"datum" + 0.016*"clearing" + 0.015*"platform" + 0.014*"list" + 0.013*"option" + 0.013*"seller"'),
 (4,
  '0.019*"contract" + 0.018*"company" + 0.014*"production" + 0.013*"total" + 0.012*"cost" + 0.012*"operation" + 0.011*"estimate" + 0.011*"government" + 0.011*"water" + 0.011*"include"'),
 (5,
  '0.044*"officer" + 0.044*"vice" + 0.033*"chief" + 0.02

In [15]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.468696818644215

Coherence Score:  0.5070954598633468


## Determine optimise number of themes

In [16]:
def compute_coherence_values(id2word, corpus, texts, limit, start, step):
    """
    Compute c_v coherence for different number of themes

    Parameters:
    ----------
    id2word: Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Maximum number of topics

    Returns:
    -------
    model_list : List of topic models 
    coherence_values : Coherence scores with respect to each model
    """
    coherence_list = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
        coherence_list.append(coherencemodel.get_coherence())

    return model_list, coherence_list

model_list, coherence_values = compute_coherence_values(id2word=id2word, corpus=corpus, texts=data_lemmatized, start=6, limit=24, step=2)

## Select best model based on coherence score

In [17]:
import matplotlib.pyplot as plt
%matplotlib inline
x = range(6, 24, 2)
df = pd.DataFrame({"numberOfThemes":range(6,24,2),"coherenceScore":coherence_values})
# plt.plot(x, coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()

fig = px.line(df, x="numberOfThemes", y="coherenceScore", title='Coherence Score wrt Number of Themes')
fig.show()

## Save down model and dictionary

In [18]:
best_model = model_list[-2]
best_model.save("models/lda_model")


In [19]:
## Test loading

from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Load the LDA model
loaded_lda_model = LdaModel.load('models/lda_model')

# Load the dictionary
loaded_dictionary = Dictionary.load('models/lda_model.id2word')


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment intensity analyzer
analyzer = SentimentIntensityAnalyzer()

# Analyze the sentiment of a sample text
sentiment_scores = analyzer.polarity_scores(content[1000])
print(sentiment_scores["compound"])
content[1000]

## Interpreting Topics

0. **Employee Programs and Support**
   - Keywords: employee, company, program, work, business, support, diversity
   - Theme: This topic seems to be about employee-related programs, support, and diversity initiatives within the company.
   
1. **Medical Treatment and Drug Development**
   - Keywords: product, treatment, patient, patent, development, clinical, drug
   - Theme: This topic revolves around medical treatments, drug development, and clinical trials.

2. **Technology and Software Design**
   - Keywords: product, system, design, technology, software, device, application
   - Theme: This topic focuses on technology and software design, including systems and applications.

3. **Financial Markets and Trading**
   - Keywords: market, trading, exchange, trade, datum, clearing, platform
   - Theme: This topic relates to financial markets, trading, and exchange platforms.

4. **Production and Operational Costs**
   - Keywords: contract, company, production, cost, operation, estimate, government
   - Theme: This topic likely concerns production, contracts, and operational costs.

5. **Executive Positions and Leadership**
   - Keywords: officer, vice, chief, serve, executive, president, senior, director
   - Theme: This topic pertains to executive positions and leadership roles within the organization.

6. **Customer Services and Solutions**
   - Keywords: service, customer, solution, business, technology, provide
   - Theme: This topic covers customer services and solutions, including technology-related offerings.

7. **Energy and Resource Management**
   - Keywords: gas, natural, pipeline, oil, facility, operation, capacity
   - Theme: This topic revolves around energy resources, including gas, oil, and facility operations.

8. **Hospitality and Travel Industry**
   - Keywords: hotel, brand, travel, gaming, ship, resort, restaurant, guest
   - Theme: This topic is about the hospitality and travel industry, including hotels, resorts, and gaming.

9. **Transportation and Logistics**
   - Keywords: service, carrier, customer, shipment, transportation, airline, aircraft
   - Theme: This topic relates to transportation services, carriers, and logistics.

10. **Insurance and Risk Management**
    - Keywords: insurance, company, risk, business, loss, claim, coverage
    - Theme: This topic focuses on insurance-related matters, including risk and coverage.

11. **Property and Investment**
    - Keywords: property, income, investment, lease, tax
    - Theme: This topic likely concerns property, investments, and income-related matters.

12. **Retail Services & Sales**
    - Keywords: product, store, brand, sale, customer, sell
    - Theme: This topic relates to retail services and sales.

13. **Content and Entertainment Industry**
    - Keywords: service, content, network, company, license, entertainment, advertising
    - Theme: This topic relates to content and entertainment services, including networks and licenses.

14. **Healthcare Services and Programs**
    - Keywords: health, care, service, medical, dental, program, healthcare, provider
    - Theme: This topic is about healthcare services, care programs, and medical providers.

15. **Financial Services and Asset Management**
    - Keywords: client, investment, service, management, fund, asset, financial
    - Theme: This topic focuses on financial services, investment management, and assets.

16. **Capital and Financial Regulation**
    - Keywords: capital, financial, bank, risk, company, rule, banking
    - Theme: This topic pertains to capital, financial institutions, and regulations.

17. **Regulations and Legal Considerations**
    - Keywords: regulation, law, subject, requirement, risk
    - Theme: This topic concerns regulations, legal considerations, and associated risks.

18. **Product Segmentation and Market**
    - Keywords: product, company, segment, market, business, material, customer
    - Theme: This topic is about product segmentation, markets, and customer considerations.

19. **Energy Utility and Customer Service**
    - Keywords: energy, utility, customer, electric, gas, service, power, cost
    - Theme: This topic covers energy utilities, customer services, and power-related aspects.


In [20]:
id2theme = {0: 'Employee Programs and Support',
  1: 'Medical Treatment and Drug Development',
  2: 'Technology and Software Design',
  3: 'Financial Markets and Trading',
  4: 'Production and Operational Costs',
  5: 'Executive Positions and Leadership',
  6: 'Customer Services and Solutions',
  7: 'Energy and Resource Management',
  8: 'Hospitality and Travel Industry',
  9: 'Transportation and Logistics',
  10: 'Insurance and Risk Management',
  11: 'Property and Investment',
  12: 'Retail Services & Sales',
  13: 'Content and Entertainment Industry',
  14: 'Healthcare Services and Programs',
  15: 'Financial Services and Asset Management',
  16: 'Capital and Financial Regulation',
  17: 'Regulations and Legal Considerations',
  18: 'Product Segmentation and Market',
  19: 'Energy Utility and Customer Service'}
id2theme

{0: 'Employee Programs and Support',
 1: 'Medical Treatment and Drug Development',
 2: 'Technology and Software Design',
 3: 'Financial Markets and Trading',
 4: 'Production and Operational Costs',
 5: 'Executive Positions and Leadership',
 6: 'Customer Services and Solutions',
 7: 'Energy and Resource Management',
 8: 'Hospitality and Travel Industry',
 9: 'Transportation and Logistics',
 10: 'Insurance and Risk Management',
 11: 'Property and Investment',
 12: 'Retail Services & Sales',
 13: 'Content and Entertainment Industry',
 14: 'Healthcare Services and Programs',
 15: 'Financial Services and Asset Management',
 16: 'Capital and Financial Regulation',
 17: 'Regulations and Legal Considerations',
 18: 'Product Segmentation and Market',
 19: 'Energy Utility and Customer Service'}

In [21]:
best_model.print_topics()

[(0,
  '0.026*"employee" + 0.016*"company" + 0.013*"include" + 0.010*"program" + 0.008*"work" + 0.008*"business" + 0.007*"support" + 0.006*"diversity" + 0.006*"well" + 0.006*"continue"'),
 (1,
  '0.042*"product" + 0.024*"treatment" + 0.021*"patient" + 0.015*"patent" + 0.013*"development" + 0.012*"clinical" + 0.012*"drug" + 0.011*"include" + 0.009*"phase" + 0.009*"use"'),
 (2,
  '0.036*"product" + 0.027*"system" + 0.024*"design" + 0.015*"technology" + 0.015*"software" + 0.013*"include" + 0.011*"use" + 0.011*"device" + 0.010*"application" + 0.010*"customer"'),
 (3,
  '0.069*"market" + 0.039*"trading" + 0.037*"exchange" + 0.021*"trade" + 0.018*"datum" + 0.016*"clearing" + 0.015*"platform" + 0.014*"list" + 0.013*"option" + 0.013*"seller"'),
 (4,
  '0.019*"contract" + 0.018*"company" + 0.014*"production" + 0.013*"total" + 0.012*"cost" + 0.012*"operation" + 0.011*"estimate" + 0.011*"government" + 0.011*"water" + 0.011*"include"'),
 (5,
  '0.044*"officer" + 0.044*"vice" + 0.033*"chief" + 0.02

Function to predict theme based on corpus

In [22]:
def predict_topic(corpus: list[tuple]) -> int:
    '''
    corpus:: bag of words
    '''
    # Get the topic distribution for the new document
    topic_distribution = best_model.get_document_topics(corpus)

    # Select the dominant topic (topic with the highest probability)
    dominant_topic = max(topic_distribution, key=lambda x: x[1])

    # Print the dominant topic and its probability
    topic_id, topic_prob = dominant_topic
    # print(f"Dominant Topic: {topic_id}, Probability: {topic_prob}")
    return id2theme[topic_id]

## Predict all filings

In [23]:

filings_df["corpus"] = corpus
filings_df['predictedTheme'] = filings_df['corpus'].apply(predict_topic)
filings_df['year'] = filings_df['yearMonth'].dt.year.apply(str)


## Q1. Common Topics

In [24]:
theme_counts = filings_df['predictedTheme'].value_counts().reset_index()
theme_counts.columns = ['predictedTheme', 'count']

# Sort the data in descending order
theme_counts = theme_counts.sort_values(by='count', ascending=False)

# Plot the bar chart using Plotly
bar_chart = px.bar(theme_counts, x='predictedTheme', y='count', title='Number of predictedTheme in Descending Order',
                   labels={'predictedTheme': 'Predicted Theme', 'count': 'Number of Occurrences'})

bar_chart.show()

## Q2/3 Declining and emerging themes/topics

In [25]:
import numpy as np
# Group by yearMonth and predictedTheme, then count occurrences
grouped_df = filings_df.groupby(['year', 'predictedTheme']).size().reset_index(name='count')
grouped_df["count"] = grouped_df["count"].apply(np.log)  
# Plot the line chart
fig = px.line(grouped_df, x='year', y='count', color='predictedTheme', 
              title='Number of predictedTheme across year', labels={'count': 'Log (Number of Occurrences)'})
fig.update_layout(height=800)
fig.show()

## Q4 Sentiment analysis
Due to time constraints, we use a ready to use model Blob. We conduct the sentiment analysis on Section 1A, Risk



In [26]:
from nltk.tokenize import RegexpTokenizer

def preprocess_senti_text(text):

    # Tokenize and remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')

    # Convert to lowercase
    tokens = tokenizer.tokenize(text.lower())  

    # Remove numbers using a regular expression
    tokens = [re.sub(r'\d+','', token) for token in tokens]
    tokens = [i for i in tokens if i]

    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into a single string
    return filtered_tokens

# Preprocess the sample text
filings_df['sentiInputText'] =  filings_df.Section1A.apply(lambda x: preprocess_senti_text(x))




In [None]:
from textblob import TextBlob
senti_list = []
for i in filings_df['sentiInputText']:


    text = ' '.join(i)
    blob = TextBlob(text)
    sentiment = blob.sentiment
    senti_list.append(sentiment)
    print(sentiment)

sentiment = [sentiment[0] for sentiment in senti_list]

In [37]:
filings_df['sentiment'] = sentiment

fig = px.box(filings_df , x="predictedTheme", y="sentiment", title="Sentiment Distribution for each Predicted Theme")
fig.update_layout(height=800)
fig.show()

In [65]:
import requests
from bs4 import BeautifulSoup
# from w3lib.html import replace_entities
# Specify the URL of the SEC filing
url = 'https://www.sec.gov/Archives/edgar/data/1267238/000126723822000006/aiz-20211231.htm'
headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
# Send an HTTP GET request to fetch the HTML content
response = requests.get(url, headers = headers)
html_content = response.content

# # Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(html_content, "html.parser")

all_text = soup.get_text()
all_text_processed = html.unescape(str(all_text))

ls = all_text_processed.encode('ascii', 'ignore').decode('ascii').split("Item1. Business")

item1Content = ls[1].split("Item1A. Risk Factors")[1]
item1Content


"Certain factors may have a material adverse effect on our business, financial condition, results of operations and cash flows. You should carefully consider them, along with the other information presented in this Report. It is not possible to predict or identify all such factors. Additional risks and uncertainties that are not yet identified or that we currently believe to be immaterial may also materially harm our business, financial condition, results of operations and cash flows.The following is a summary of the principal risks that could adversely affect our business, financial condition, results of operations and cash flows.Business, Strategic and Operational RisksOur revenues and profits may decline if we are unable to maintain relationships with significant clients, distributors and other parties, or renew contracts with them on favorable terms, or if those parties face financial, reputational or regulatory issues. 18Significant competitive pressures, changes in customer prefe