In [None]:
!pip install -U gensim

from functools import lru_cache

import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.models import Phrases
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
from sklearn.manifold import TSNE
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import spacy

# Download the NLTK data packages this notebook relies on.
for resource in ('wordnet', 'stopwords', 'averaged_perceptron_tagger'):
  nltk.download(resource)

In [None]:
!pip install pyLDAvis==2.1.2

import pyLDAvis
import pyLDAvis.gensim

In [None]:
# Load the company table and keep only the id plus its two industry labels.
industry_columns = ['id', 'Industry Sector', 'Primary Industry Classification']
fin_ind = pd.read_csv('fin_with_ind.csv')
id_ind = fin_ind.loc[:, industry_columns]

In [None]:
# Mount Google Drive at /content/drive (Colab only); the corpus path used
# later in the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Expose the files under the 10-K folder as an NLTK plain-text corpus.
# NOTE(review): the '.*' pattern matches every file id under corpus_root, so
# any stray non-filing file in that folder would be read too — confirm the
# folder contains only filings.
corpus_root = '/content/drive/MyDrive/10-k'
corpus = PlaintextCorpusReader(corpus_root, '.*')

In [None]:
# One (file id, full raw text) pair per corpus file, collected into a frame.
rows = [(file_id, corpus.raw(file_id)) for file_id in corpus.fileids()]
df = pd.DataFrame(rows, columns=['name', 'text'])

In [None]:
def find_id(s):
  """Return the integer formed by concatenating every digit character in `s`.

  Args:
    s: A string, e.g. a corpus file name.

  Returns:
    The digit characters of `s`, in order, parsed as a single int
    (for example 'a12b3.txt' -> 123).

  Raises:
    ValueError: If the collected digits cannot be parsed by int() (for
      example `s` contains no digits at all). The message names `s`.
  """
  digits = ''.join(ch for ch in s if ch.isdigit())
  try:
    return int(digits)
  except ValueError:
    # int('') alone does not say which name failed, which is hard to trace
    # when this function is run over a whole column of names.
    raise ValueError(f'cannot derive a numeric id from {s!r}') from None

# Derive each row's numeric id from its file name.
df['id'] = df['name'].map(find_id)

In [None]:
# Attach the industry labels to each document; the inner join keeps only ids
# that appear in both frames.
data = pd.merge(df, id_ind, how='inner', on='id')

In [None]:
# Per-sector count of non-null values in every column.
data.groupby('Industry Sector').count()

In [None]:
# One DataFrame per industry sector
def _rows_for_sector(sector_name):
  """Return the rows of `data` whose 'Industry Sector' equals `sector_name`."""
  return data[data['Industry Sector'] == sector_name]

industrials_df = _rows_for_sector('Industrials')
condiscret_df = _rows_for_sector('Consumer Discretionary')
it_df = _rows_for_sector('Information Technology')
energy_df = _rows_for_sector('Energy')
healthcare_df = _rows_for_sector('Health Care')
comservice_df = _rows_for_sector('Communication Services')
constaple_df = _rows_for_sector('Consumer Staples')
materials_df = _rows_for_sector('Materials')
realestate_df = _rows_for_sector('Real Estate')
utilities_df = _rows_for_sector('Utilities')

In [None]:
# English stop words from NLTK; used to filter tokens during preprocessing.
stop_words = stopwords.words('english')

# First letter of the tag returned by nltk.pos_tag -> WordNet POS constant.
# Built once instead of on every call.
_WORDNET_POS_BY_TAG_INITIAL = {
  'J': wordnet.ADJ,
  'N': wordnet.NOUN,
  'V': wordnet.VERB,
  'R': wordnet.ADV,
}

@lru_cache(maxsize=None)
def get_pos(word):
  """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

  The word is tagged on its own (a one-element list is passed to
  nltk.pos_tag), so no sentence context is involved and the result depends
  only on `word`. That makes the result safe to memoize: each distinct word
  is tagged once, however often it occurs in the corpus.

  Args:
    word: A single token (str).

  Returns:
    wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV according to the
    first letter of the tag; wordnet.NOUN for any other tag.
  """
  tag_initial = nltk.pos_tag([word])[0][1][0].upper()
  return _WORDNET_POS_BY_TAG_INITIAL.get(tag_initial, wordnet.NOUN)

# Shared lemmatizer instance: created once rather than once per token.
_LEMMATIZER = WordNetLemmatizer()

def token_lemmatize(token):
  """Lemmatize `token` with WordNet, using the part of speech from get_pos().

  Args:
    token: A single token (str).

  Returns:
    The lemma returned by WordNetLemmatizer.lemmatize for `token`.
  """
  return _LEMMATIZER.lemmatize(token, get_pos(token))

In [None]:
def _preprocess_documents(texts, stopword_list):
  """Tokenize, stop-word-filter and lemmatize a sequence of raw documents.

  Args:
    texts: Iterable of raw document strings.
    stopword_list: Iterable of words to remove.

  Returns:
    A list with one list of lemmas per input document, in input order.
  """
  stopword_set = set(stopword_list)  # set lookup instead of scanning a list per token
  processed = []
  for raw_text in texts:
    tokens = simple_preprocess(raw_text)
    lemmas = [token_lemmatize(tok) for tok in tokens if tok not in stopword_set]
    # Filter a second time: a lemma can itself be a stop word.
    processed.append([lemma for lemma in lemmas if lemma not in stopword_set])
  return processed

# Token lists for the Utilities sector documents (input to the noise filter below)
utilities_corpus = _preprocess_documents(utilities_df['text'], stop_words)

# Optional bigram detection, currently disabled:
#bigram_phrases = gensim.models.Phrases(utilities_corpus, min_count=5, threshold=50)
#bigram = gensim.models.phrases.Phraser(bigram_phrases)
#bigram_corpus = [bigram[doc] for doc in utilities_corpus]

# Vocabulary and bag-of-words representation of every document
id2word = corpora.Dictionary(utilities_corpus)
train_corpus = [id2word.doc2bow(doc) for doc in utilities_corpus]


In [None]:
# Fit TF-IDF on the bag-of-words corpus, then strip low-weight terms from
# every document.
# NOTE: train_corpus is rewritten in place, so re-running this cell without
# rebuilding train_corpus first filters the already-filtered documents again.
tfidf = TfidfModel(train_corpus, id2word=id2word)

low_value = 0.04  # terms whose TF-IDF weight is below this are dropped
drop_words = [] #dropped words (one entry per document in which a term is dropped)
for doc_index, bow in enumerate(train_corpus):
  doc_weights = tfidf[bow]  # evaluated once per document
  weighted_ids = {term_id for term_id, _ in doc_weights}
  low_value_ids = [term_id for term_id, weight in doc_weights if weight < low_value]
  # words with tf-idf score 0 will be missing from the model's output
  missing_ids = [term_id for term_id, _ in bow if term_id not in weighted_ids]
  # Same order as before: low-weight terms first, then the missing ones.
  drop_ids = low_value_ids + missing_ids
  drop_words.extend(id2word[term_id] for term_id in drop_ids)
  drop_id_set = set(drop_ids)
  train_corpus[doc_index] = [entry for entry in bow if entry[0] not in drop_id_set]

In [None]:
# Distinct dropped words, in order of first appearance.
# A Series is used rather than DataFrame(...)[0] so that an empty drop_words
# list gives an empty result instead of a KeyError on column 0.
noise = pd.Series(drop_words, dtype=object).unique()
len(noise)

4334

In [None]:
# Display every dropped word as one space-separated string (long output).
' '.join(noise)

'ability able acceptable accepted access accordance accordingly account accounting accrue accumulate acequia achieve acquire acquisition acre act action activate actively actual actuarially addition additional additionally address adequate adjacent adjust adjustment adjusts administer administrative adopt adoption advance adversely advice affect affected affirmative agency aggregate agree aid aligns alliance allocation allow allows also alternative among amortization amortize amount analysis announce annual annually antenna anticipate anticipates antonio applicable application applies apply approach appropriate approval approve approximate approximately april area arise arizona arrangement asc aspect asphalt assess assignment associate assume assumption attract attributable audit austin authority authorization authorize availability available average award back bank base basin basis bear become becomes begin believe beneficiary benefit beyond bi bill billing blanco board bond borrower 

In [None]:
# LDA settings gathered in one place; random_state is fixed so the fit is repeatable.
lda_settings = dict(
  corpus=train_corpus,
  id2word=id2word,
  num_topics=10,
  random_state=100,
  chunksize=100,
  update_every=1,
  passes=10,
)
ldaModel = gensim.models.ldamodel.LdaModel(**lda_settings)

In [None]:
# Number of bag-of-words entries in each document, then (mean, median, min, max).
doc_len = [len(bow) for bow in train_corpus]
np.mean(doc_len), np.median(doc_len), np.min(doc_len), np.max(doc_len)

(30.8125, 25.5, 11, 80)

In [None]:
# Build and display the interactive pyLDAvis view of the fitted LDA model.
pyLDAvis.enable_notebook()
# NOTE(review): mds='mmds' is forwarded to pyLDAvis to pick how topics are laid
# out in the plot (presumably metric MDS — confirm against the pyLDAvis docs).
vis = pyLDAvis.gensim.prepare(topic_model=ldaModel, corpus=train_corpus, dictionary=id2word, mds='mmds')
vis

In [None]:
# How many distinct terms appear in the pyLDAvis topic_info table.
vis.topic_info['Term'].nunique(dropna=False)

320

In [None]:
# Every distinct term from the pyLDAvis topic_info table, space-separated.
unique_terms = vis.topic_info['Term'].unique()
sw = ' '.join(unique_terms.tolist())
sw