In [5]:
import tomotopy as tp
import little_mallet_wrapper
import pandas as pd
from pathlib import Path
import pandas as pd
import re

In [6]:
# Read the article table from the CSV file one directory up
article_df = pd.read_csv('../articles.csv')

# Extract the 'text' and 'title' columns as plain Python lists (text first, then title)
original_texts, titles = (article_df[column].tolist() for column in ('text', 'title'))

In [7]:
# Run every article through little_mallet_wrapper's string processor with
# numbers='remove' (per the original note: strips stop words and numbers),
# keeping the processed strings in the same order as original_texts
training_data = [
    little_mallet_wrapper.process_string(raw_text, numbers='remove')
    for raw_text in original_texts
]

# Model settings: number of topics to fit and number of top words listed per topic
num_topics, num_topic_words = 5, 30

# Build an LDA model with num_topics topics
model = tp.LDAModel(k=num_topics)

# Add each processed article to the model as a list of tokens.
# A bare split() already ignores leading/trailing whitespace, so no strip() is needed.
for processed_text in training_data:
    tokens = processed_text.split()
    model.add_doc(tokens)

# Train in batches of `iterations` steps until 100 steps have been requested
# (10 calls to train() with the value below)
iterations = 10
steps_requested = 0
while steps_requested < 100:
    model.train(iterations)
    steps_requested += iterations

# Retrieve and display the top words of every topic.
#   topics                 -> one comma-separated display string per topic
#   topic_individual_words -> one list of bare word strings per topic
topics = []
topic_individual_words = []
for topic_number in range(num_topics):
    # get_topic_words is unpacked as (word, probability) pairs; only the words are kept
    words = [word for word, prob in model.get_topic_words(topic_id=topic_number, top_n=num_topic_words)]
    topic_words = ', '.join(words)
    topics.append(topic_words)
    # Store the word list itself: re-splitting the display string on whitespace
    # would leave the ", " separator's comma attached to every word but the last
    topic_individual_words.append(words)
    print(f"Topic {topic_number}\n{topic_words}\n")

Topic 0
would, human, one, could, way, machine, like, system, future, learning, think, intelligence, world, make, something, new, google, even, game, really, research, language, going, take, artificial, learn, text, need, real, long

Topic 1
people, like, data, also, computer, new, time, work, many, companies, one, even, much, years, day, technology, well, user, great, use, might, information, product, would, app, questions, human, users, get, things

Topic 2
image, images, cnn, features, one, face, feature, different, object, process, step, see, deep, using, pixels, original, objects, computer, single, pixel, way, like, recognition, convolution, recognize, results, output, would, region, really

Topic 3
network, data, neural, learning, model, training, networks, function, time, use, deep, one, using, input, like, layer, example, code, much, get, also, first, set, see, output, two, need, used, train, number

Topic 4
learning, machine, data, course, python, science, free, hours, reviews

In [8]:
# Retrieve the topic distribution of every document held by the model
# (one list per document, indexed by topic number below)
topic_distributions = [list(doc.get_topic_dist()) for doc in model.docs]

# zip() below stops silently at the shorter input, so a length mismatch could
# pair probabilities with the wrong titles; fail loudly instead
assert len(topic_distributions) == len(titles), (
    f"{len(topic_distributions)} documents in the model but {len(titles)} titles"
)

# Number of top-ranked rows inspected per topic
num_top_docs = 5

# Iterate over every topic; range(num_topics) includes the last topic,
# which range(0, num_topics - 1) left out
for topic_index in range(num_topics):
    # Rank (probability, title) pairs for this topic, highest probability first
    sorted_data = sorted(
        [(distribution[topic_index], title) for distribution, title in zip(topic_distributions, titles)],
        reverse=True,
    )

    # Display identified topic words
    topic_words = topics[topic_index]
    print(f"Topic {topic_index}:\n{topic_words}\n")

    # Show the top-ranked rows, printing each title only once; repeated titles
    # among those rows mean fewer than num_top_docs entries are displayed
    seen_docs = []
    for probability, doc in sorted_data[:num_top_docs]:
        if doc not in seen_docs:
            print(f'\nTopic Probability: {probability}  \nDocument: {doc}\n\n')
            seen_docs.append(doc)

Topic 0:
would, human, one, could, way, machine, like, system, future, learning, think, intelligence, world, make, something, new, google, even, game, really, research, language, going, take, artificial, learn, text, need, real, long


Topic Probability: 0.7948070764541626  
Document: The mind-blowing AI announcement from Google that you probably missed.



Topic Probability: 0.7661344408988953  
Document: Why AI Research Loves Pac-Man – Tommy Thompson – Medium


Topic 1:
people, like, data, also, computer, new, time, work, many, companies, one, even, much, years, day, technology, well, user, great, use, might, information, product, would, app, questions, human, users, get, things


Topic Probability: 0.7496511340141296  
Document: Do algorithms reveal sexual orientation or just expose our stereotypes?



Topic Probability: 0.6885490417480469  
Document: Did Google Duplex just pass the Turing Test? – Lance Ulanoff – Medium


Topic 2:
image, images, cnn, features, one, face, feature, di