# Install required Libraries

In [None]:
# Mount Google Drive at /content/drive; the data and model paths used later in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install tomotopy
! pip install tomotopy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tomotopy
  Downloading tomotopy-0.12.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (16.5 MB)
[K     |████████████████████████████████| 16.5 MB 4.7 MB/s 
Installing collected packages: tomotopy
Successfully installed tomotopy-0.12.3


# Imports and Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import re
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # allow multiple outputs in a cell

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


# import tomotopy
import tomotopy as tp
import pickle

In [None]:
# Loading NLTK Modules
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Download the NLTK resources in one loop instead of one copy-pasted call each.
# quiet=True keeps the download log out of the cell output.
# 'punkt_tab' is the tokenizer resource that recent NLTK releases look up for
# word_tokenize; 'punkt' is kept for older releases.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4',
                      'averaged_perceptron_tagger'):
  nltk.download(nltk_resource, quiet=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
class topicModel:
  """Text preprocessing plus Multi-Grain LDA (tomotopy ``MGLDAModel``) helpers.

  The notebook trains one MGLDA model per article cluster, extracts labels and
  top words for every topic, and uses a random forest to assign cluster labels
  to unseen articles from their embeddings.
  """

  # Corpus-specific filler words removed on top of NLTK's English stop words.
  EXTRA_STOP_WORDS = ("said", "says", "just", "like", "would", "could", "use",
                      "told", "new", "also", "thats", "even", "dont")

  # Punctuation characters deleted from the text (no replacement character, so
  # "don't" becomes "dont" and "well-known" becomes "wellknown").
  _PUNCTUATION_RE = re.compile('[!"#$%&\'()’*+,-./:;<=>?—@[\\]^_`{|}~’]')

  # A run of digits that stands alone as a whitespace-delimited token. The
  # look-arounds do not consume the surrounding whitespace, so consecutive
  # numbers ("12 34 56") are all matched.
  _STANDALONE_NUMBER_RE = re.compile(r'(?<!\S)\d+(?!\S)')

  # Lazily built stop-word set shared by all instances (see _stop_word_set).
  _stop_word_cache = None

  @classmethod
  def _stop_word_set(cls):
    """Return the cached stop-word set, building it on first use.

    Punctuation is stripped from the article text before stop words are
    filtered, so a stop word such as "you've" can only ever appear as "youve".
    The set therefore contains every stop word both as listed and with the
    punctuation removed. Side effect of that normalisation: a few stripped
    contractions coincide with ordinary words (e.g. "she'll" -> "shell") and
    are filtered as well.
    """
    if cls._stop_word_cache is None:
      listed = list(stopwords.words('english')) + list(cls.EXTRA_STOP_WORDS)
      stripped = (cls._PUNCTUATION_RE.sub('', word) for word in listed)
      cls._stop_word_cache = frozenset(listed).union(stripped)
    return cls._stop_word_cache

  def preprocess_article_text(self, doc_article):
    """Clean one article and return it as a space-separated string of tokens.

    Steps: lower-case, turn non-breaking spaces into regular spaces, delete
    punctuation, drop stand-alone numbers, tokenize with NLTK, then remove
    stop words and every token of three characters or fewer.

    Args:
      doc_article: the raw article text (a single string, e.g. one cell of a
        pandas column when used with ``Series.apply``).

    Returns:
      The cleaned text. Non-string input (``None``, NaN from a missing cell)
      yields an empty string instead of raising.
    """
    if not isinstance(doc_article, str):
      return ''
    # A non-breaking space separates words; replacing it with '' would glue
    # the neighbouring words together.
    text = doc_article.lower().replace('\xa0', ' ')
    text = topicModel._PUNCTUATION_RE.sub('', text)
    text = topicModel._STANDALONE_NUMBER_RE.sub(' ', text)
    stop_words = topicModel._stop_word_set()
    tokens = nltk.word_tokenize(text)
    return ' '.join(tok for tok in tokens if len(tok) > 3 and tok not in stop_words)

  def multi_grainLdaModel_train(self, doc_list, k_g=10, k_l=10, min_cf=20, min_df=30,
                                total_iterations=100, iterations_per_log=10, seed=None):
    """Train an MGLDA model on a list of preprocessed documents.

    Args:
      doc_list: iterable of whitespace-tokenizable strings, one per document.
      k_g: number of global topics.
      k_l: number of local topics.
      min_cf: minimum collection frequency passed to ``MGLDAModel``.
      min_df: minimum document frequency passed to ``MGLDAModel``.
      total_iterations: total number of training iterations.
      iterations_per_log: iterations to run between two progress lines.
      seed: optional random seed forwarded to ``MGLDAModel``; omitted when
        ``None`` so the library default applies.

    Returns:
      ``(result_dict, mdl)``: the dictionary produced by ``extract_topic`` and
      the trained model.
    """
    model_kwargs = dict(k_g=k_g, k_l=k_l, min_cf=min_cf, min_df=min_df)
    if seed is not None:
      model_kwargs['seed'] = seed
    mdl = tp.MGLDAModel(**model_kwargs)

    for document in doc_list:
      tokens = document.split()
      # Documents that became empty during preprocessing carry no information.
      if tokens:
        mdl.add_doc(tokens)

    trained = 0
    while trained < total_iterations:
      step = min(iterations_per_log, total_iterations - trained)
      mdl.train(step)
      trained += step
      # Report the number of iterations completed so far.
      print('Iteration: {}\tLog-likelihood: {}'.format(trained, mdl.ll_per_word))

    result_dict_train = self.extract_topic(mdl)
    return result_dict_train, mdl

  def extract_topic(self, mdl):
    """Collect labels and top words for every topic of a trained model.

    Topics are numbered with the ``mdl.k_g`` global topics first, followed by
    the ``mdl.k_l`` local topics.

    Returns:
      Dict keyed ``"topic#<id>"``; each value is a dict with ``"labels"`` (top
      5 labels joined by ``', '``) and ``"topics"`` (top 10 words joined by
      ``' ,'``; that separator is kept as-is so results stay comparable with
      previously saved ones).
    """
    extractor = tp.label.PMIExtractor(min_cf=20, min_df=30, max_len=5, max_cand=10000)
    cands = extractor.extract(mdl)

    # ranking the candidates of labels for a specific topic
    labeler = tp.label.FoRelevance(mdl, cands, min_df=30, smoothing=1e-2, mu=0.25)

    result_dict = {}
    for topic_id in range(mdl.k_g + mdl.k_l):
      top_labels = labeler.get_topic_labels(topic_id, top_n=5)
      top_words = mdl.get_topic_words(topic_id, top_n=10)
      result_dict["topic#" + str(topic_id)] = {
        "labels": ', '.join(label for label, score in top_labels),
        "topics": ' ,'.join(entry[0] for entry in top_words),
      }
    return result_dict

  def multi_grainLdaModel_predict(self, doc_list, mdl):
    """Infer the topics of a group of documents with a trained model.

    All documents in ``doc_list`` are concatenated into one document, whose
    topic distribution is inferred with ``mdl.infer``.

    Returns:
      Dict keyed ``"topic#<rank>"`` where rank 0 is the most probable topic.
      The key is the RANK in the inferred distribution, not the model's topic
      id. Values are the matching entries from ``extract_topic``. Topics with
      zero probability are omitted. Returns ``{}`` when the documents contain
      no words.
    """
    docs_words = []
    for doc in doc_list:
      docs_words.extend(doc.strip().split())
    if not docs_words:
      return {}

    doc_inst = mdl.make_doc(docs_words)
    topic_dist, ll = mdl.infer(doc_inst)

    # sort the topic dist and take index (most probable topic first)
    topic_dist_arr = np.array(topic_dist)
    ranked_topic_ids = topic_dist_arr.argsort()[::-1]
    mdl_topic = self.extract_topic(mdl)

    pred_result = {}
    for rank, topic_id in enumerate(ranked_topic_ids):
      if topic_dist_arr[topic_id] > 0:
        pred_result["topic#" + str(rank)] = mdl_topic["topic#" + str(topic_id)]
    return pred_result

  def test_data_labels(self, train_emb, train_label, test_emb, max_depth=2, random_state=0):
    """Predict labels for test embeddings with a random forest.

    Args:
      train_emb: training feature matrix.
      train_label: labels aligned row-by-row with ``train_emb``.
      test_emb: feature matrix to label; must have the same columns as
        ``train_emb``.
      max_depth: maximum tree depth of the forest (default keeps the original
        setting).
      random_state: seed of the forest, for repeatable predictions.

    Returns:
      The array of predicted labels, one per row of ``test_emb``.
    """
    # Imported here because scikit-learn is only needed by this method.
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(max_depth=max_depth, random_state=random_state)
    clf.fit(train_emb, train_label)
    return clf.predict(test_emb)


In [None]:
# Labeled training articles. Later cells read the 'article' column (raw text)
# and the 'cluster_label' column (one topic model is trained per label).
data_path = '/content/drive/MyDrive/GLG_project/data/GLG_train_data_labeled.csv'
df_train = pd.read_csv(data_path, sep=',')

In [None]:
# Create a topicModel instance; its methods are used below for preprocessing,
# training, label prediction and topic inference.
topic_object = topicModel()

In [None]:
# Preprocess training data: clean every article and keep the result in a new
# 'preprocessed_article' column next to the raw text.
df_train['preprocessed_article'] =  df_train['article'].apply(topic_object.preprocess_article_text)

In [None]:
# Preview the first two rows, including the new 'preprocessed_article' column.
df_train.head(2)

Unnamed: 0,date,year,month,day,title,article,url,section,publication,tech_health_tag,article_word_len,cluster_label,preprocessed_article
0,2018-08-09 09:11:14,2018,8.0,9,Psychologists’ Group Maintains Ban on Work at ...,MIND Members of the American Psychological Ass...,https://www.nytimes.com/2018/08/09/health/inte...,health,The New York Times,health,700,22,mind members american psychological associatio...
1,2016-04-26 00:00:00,2016,4.0,26,Prince autopsy: What examiners looked for,(CNN)Pop superstar Prince died from an accide...,https://www.cnn.com/2016/04/26/health/prince-d...,health,CNN,health,889,9,cnnpop superstar prince died accidental overdo...


In [None]:
# Run Multi Grain LDA Model for training data: one model per cluster label.
model_dir = '/content/drive/MyDrive/GLG_project/GLG_topic_model'
# Create the output folder up front so mdl.save() does not fail after a full
# training run just because the folder is missing.
os.makedirs(model_dir, exist_ok=True)

# cluster label -> topic dictionary returned by extract_topic
model_result_train = {}

for cluster_label in sorted(df_train['cluster_label'].unique()):
  print('Starting training model {}'.format(cluster_label))
  in_cluster = df_train['cluster_label'] == cluster_label
  cluster_doc = df_train.loc[in_cluster, 'preprocessed_article'].tolist()
  model_result_train[cluster_label], mdl = topic_object.multi_grainLdaModel_train(cluster_doc)
  # save the model file
  mdl.save(os.path.join(model_dir, 'mdl_topic_model_' + str(cluster_label) + '.bin'))

In [None]:
# Inspect the extracted topics (labels and top words) for cluster 22.
model_result_train[22]

In [None]:
# Save the model_result_train dictionary with pickle.
path_file = '/content/drive/MyDrive/GLG_project/GLG_topic_model/train_doc_result.pkl'
# The context manager flushes and closes the file even if pickling fails;
# a bare open(...) passed to pickle.dump leaves the handle open.
with open(path_file, 'wb') as result_file:
  pickle.dump(model_result_train, result_file)

# Test 

In [None]:
# Unlabeled test articles; the 'article' column is preprocessed further down and
# a predicted 'cluster_label' column is added.
test_path = '/content/drive/MyDrive/GLG_project/data/GLG_test_data.csv'
df_test = pd.read_csv(test_path, sep=',')

In [None]:
# Embeddings of the training articles; used as classifier features below.
train_embd_path = '/content/drive/MyDrive/GLG_project/data/GLG_train_data_emb.csv'
df_train_embd = pd.read_csv(train_embd_path, sep=',')

In [None]:
# Embeddings of the test articles; the classifier predicts a cluster label for each row.
test_embd_path = '/content/drive/MyDrive/GLG_project/data/GLG_test_data_emb.csv'
df_test_embd = pd.read_csv(test_embd_path, sep=',')

In [None]:
# Fit a classifier on (training embeddings, training cluster labels) and use it
# to assign a cluster label to every test article.
# NOTE(review): assumes row i of df_train_embd belongs to row i of df_train, and
# likewise for df_test_embd / df_test — TODO confirm.
# NOTE(review): every column of the embedding frames is used as a feature;
# verify the CSVs contain no saved index or ID column.
train_data_labels = df_train['cluster_label']
test_data_labels = topic_object.test_data_labels(df_train_embd, train_data_labels, df_test_embd)
df_test['cluster_label'] = test_data_labels

In [None]:
# How many test articles were assigned to each predicted cluster.
df_test['cluster_label'].value_counts()

-1     610
 1     243
 17    159
 18     88
Name: cluster_label, dtype: int64

In [None]:
# Preprocess test data with the same cleaning function that was applied to the
# training articles.
df_test['preprocessed_article'] =  df_test['article'].apply(topic_object.preprocess_article_text)

In [None]:
# Infer topics for the test articles of each predicted cluster, using the model
# that was trained and saved for that cluster.
model_dir = '/content/drive/MyDrive/GLG_project/GLG_topic_model'

# str(cluster label) -> ranked topics returned by multi_grainLdaModel_predict
test_prediction_result = {}

# Sorted like the training loop, so the result dict has a stable, readable order.
for cluster_label in sorted(set(test_data_labels)):
  mdl = tp.MGLDAModel.load(os.path.join(model_dir, 'mdl_topic_model_' + str(cluster_label) + '.bin'))
  in_cluster = df_test['cluster_label'] == cluster_label
  test_docs = df_test.loc[in_cluster, 'preprocessed_article'].tolist()
  test_prediction_result[str(cluster_label)] = topic_object.multi_grainLdaModel_predict(test_docs, mdl)

In [None]:
import json
# Pretty-print the predicted topics for cluster 18 (the result dict is keyed by
# the label as a string). sort_keys=True orders the keys as text, so "topic#10"
# is listed right after "topic#1".
print(json.dumps(test_prediction_result['18'], sort_keys=True, indent=4))

{
    "topic#0": {
        "labels": "youve, anything, things, getting, probably",
        "topics": "people ,want ,many ,much ,know ,well ,years ,around ,work ,service"
    },
    "topic#1": {
        "labels": "motherboard, professor, violating, violence, allows",
        "topics": "users ,people ,content ,post ,online ,still ,site ,public ,information ,email"
    },
    "topic#10": {
        "labels": "tweet, tweets, jack, followers, twitters",
        "topics": "twitter ,instagram ,account ,accounts ,tweets ,users ,twitters ,tweet ,user ,followers"
    },
    "topic#11": {
        "labels": "really, american, three, email, wrote",
        "topics": "accounts ,facebook ,researchers ,information ,million ,research ,pages ,account ,activity ,security"
    },
    "topic#12": {
        "labels": "content, publicly, experience, separate, look",
        "topics": "advertising ,digital ,google ,business ,microsoft ,technology ,executive ,people ,industry ,consumer"
    },
    "topic#13": {