In [0]:
import numpy as np

import pandas as pd
import glob
import os
import nltk
import string
import re

In [0]:
#Creating corpus :

file_list = glob.glob(os.path.join(os.getcwd(), "/Users/michaelwehbe/Desktop/MSCI data/10-K_i1&2", "*.txt"))

corpus = []

for file_path in file_list:
    with open(file_path) as f_input:
        corpus.append(f_input.read())



[]

In [0]:
#Importing extra stopwords:

file_list2 = glob.glob(os.path.join(os.getcwd(), "/Users/michaelwehbe/Desktop/MSCI data/extra_stop_words", "*.txt"))

extra_stop_words_corpus = []

for file_path1 in file_list2:
    with open(file_path1) as f_input1:
        extra_stop_words_corpus.append(f_input1.read())

#names_stop_words = extra_stop_words_corpus[0]
#geographic_stop_words = extra_stop_words_corpus[1]
#generic_stop_words = extra_stop_words_corpus[3]
#dates_stop_words = extra_stop_words_corpus[2]

In [0]:
extra_stop_words_corpus

[]

In [0]:
#Data preprocessing:

#Let's define some useful functions:

In [0]:
#Get all the text to lowercase:

def text_lowercase(text):
    return text.lower()

In [0]:
# Remove numbers 
def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result 

In [0]:
# remove punctuation

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [0]:
# remove whitespace from text

def remove_whitespace(text):
    return " ".join(text.split())

In [0]:
# Let's clean the extra stop words lists:

def clean_extra_stop_words(extra_stop_words):
    a = text_lowercase(extra_stop_words)
    b = remove_numbers(a)
    a = remove_punctuation(b)
    b = remove_whitespace(a)
    return list(b.split())


names_stop_words_clean = clean_extra_stop_words(names_stop_words)
geographic_stop_words_clean = clean_extra_stop_words(geographic_stop_words)
generic_stop_words_clean = clean_extra_stop_words(generic_stop_words)
dates_stop_words_clean = clean_extra_stop_words(dates_stop_words)

NameError: ignored

In [0]:
#Dealing with stopwords:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#Creating stopwords full list
from collections import Counter
intersection1 = Counter(names_stop_words_clean) & Counter(stopwords.words("english"))
intersection2 = Counter(geographic_stop_words_clean) & Counter(stopwords.words("english"))
intersection3 = Counter(stopwords.words("english")) & Counter(generic_stop_words_clean)
intersection4 = Counter(stopwords.words("english")) & Counter(dates_stop_words_clean)

names_without_common = list(Counter(names_stop_words_clean) - intersection1)
geographic_without_common = list(Counter(geographic_stop_words_clean) - intersection2)
generic_without_common = list(Counter(generic_stop_words_clean) - intersection3)
dates_without_common = list(Counter(dates_stop_words_clean) - intersection4)

full_stop_words_list = stopwords.words("english") + names_without_common + geographic_without_common + generic_without_common + dates_without_common

# full_stop_words_list = list(set(stopwords.words("english") + names_without_common + geographic_without_common + generic_without_common + dates_without_common))

def remove_stopwords(text):
    stop_words = set(full_stop_words_list)
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text

NameError: ignored

In [0]:
#Lemmatize the words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()


#Get word type for each word: 

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# lemmatize string
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    # provide context i.e. part-of-speech
    lemmas = [lemmatizer.lemmatize(word, pos =get_wordnet_pos(word)) for word in word_tokens]
    return ' '.join(lemmas)


In [0]:
#Removing numeric of words:
def remove_lastfirst_numeric(txt):
    for j, i in enumerate(txt):
        if i[-1].isnumeric():
            txt[j] = i[:-1]
        if i[0].isnumeric():
            txt[j] = i[1:]
    return txt

In [0]:
#Define text preprocessing function:

def text_preprocessing(text):

    a = text_lowercase(text)
    b = remove_numbers(a)
    a = remove_punctuation(b)
    b = remove_whitespace(a)
    a = lemmatize_word(b)
    b = remove_stopwords(a)
    return remove_lastfirst_numeric(b)






In [0]:
#n_docs = len(corpus)
n_docs = 30

#Let's now create the preprocessed corpus:
clean_corpus = []
clean_corpus_tokenized = []
for i in range(n_docs):
    clean_corpus.append(' '.join(text_preprocessing(corpus[i])))  #We have a list of strings, each string represent one document 
    clean_corpus_tokenized.append(text_preprocessing(corpus[i]))

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_id_vectorizer = TfidfVectorizer(stop_words = 'english', strip_accents = 'ascii', min_df = 5, max_df = 0.5)

x = tf_id_vectorizer.fit_transform(clean_corpus)

## Let's visualize our data through a word cloud: ##

*   List item
*   List item



In [0]:
from wordcloud import WordCloud

In [0]:
# Join the different processed titles together.
long_string = ','.join(list(clean_corpus))

In [0]:
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')

In [0]:
wordcloud.generate(long_string)

In [0]:
wordcloud.to_image()

In [0]:
## Exploratory data analysis for each document

In [0]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_gram(corpus, gram, n):
    vec = CountVectorizer(ngram_range=(gram, gram)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

for i in range(n_docs):   
    f = plt.figure(figsize=(15,3))
    ax1 = f.add_subplot(131)
    ax2 = f.add_subplot(132)
    ax3 = f.add_subplot(133)

    wordcloud = WordCloud(background_color="white",max_words=5000, contour_width=3, contour_color='steelblue').\
        generate(clean_corpus[i])
    ax1.imshow(wordcloud)
    ax1.axis('off')
    
    common_words = get_top_n_gram([clean_corpus[i]], 2,10)
    df2 = pd.DataFrame(common_words, columns = ['Text' , 'count'])
    ax2.bar(df2['Text'],df2['count'])
    ax2.tick_params(labelrotation=90)

    common_words = get_top_n_gram([clean_corpus[i]], 3,10)
    df3 = pd.DataFrame(common_words, columns = ['Text' , 'count'])
    ax3.bar(df3['Text'],df3['count'])
    ax3.tick_params(labelrotation=90)

    plt.show()

## LDA

In [0]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

In [0]:
#Parameters: 

n_topics = 10
n_words = 10

In [0]:
lda = LatentDirichletAllocation(n_components= n_topics,random_state=9)

In [0]:
lda.fit(x)

In [0]:
#Function that yields the topics found by LDA:

def print_topics(model, tf_id_vectorizer, n_top_words):
    words = tf_id_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [0]:
print_topics(lda, tf_id_vectorizer, n_words)

#### LDA performance:

In [0]:
#Log Likelyhood: Higher the better
print("Log Likelihood: ", lda.score(x))

#### Grid search hyperparams to optimize LDA: 

Mainly "number of topics" and "learning decay"

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
# Define Search Param
search_params = {'n_components': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda_opt = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda_opt, param_grid=search_params)

# Do the Grid Search
model.fit(x)

In [0]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(x))

 Now the LDA model to use is "best_lda_model"

## Visualization of LDA results

In [0]:
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis

In [0]:
LDAvis_prepared = sklearn_lda.prepare(best_lda_model, x, tf_id_vectorizer)

In [0]:
pyLDAvis.display(LDAvis_prepared)

In [0]:
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(x)

# column names
topicnames = ["Topic" + str(i) for i in range(model.best_params_['n_components'])]

# index names
docnames = ["Doc" + str(i) for i in range(n_docs)]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)



In [0]:
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2))

In [0]:
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [0]:
# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)



In [0]:
# Apply Style
df_document_topics = df_document_topic.style.applymap(color_green).applymap(make_bold)
df_document_topics