In [1]:
# Necessary for importing project modules from the parent directory when this notebook runs inside a sub-directory
import os
import sys
# Put the parent directory on sys.path (once) so modules that live there can be imported.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# When started from a directory named 'notebooks', move the working directory one level up.
# os.path.basename is separator-agnostic, unlike splitting the path on '/'.
if os.path.basename(os.getcwd()) == 'notebooks':
    print("CHANGE DIR TO ROOT")
    os.chdir(os.pardir)

import pickle
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import csv
import time
from datetime import datetime
from tqdm import tqdm

from evaluation import aggregate_metrics, get_metric_umass, get_metric_cv
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora

CHANGE DIR TO ROOT


In [2]:
# Switch between production mode (True) and dev mode (False); later cells branch on this flag.
PROD = True

In [3]:
# Select and load dataset

if PROD:
    # Identifiers that are interpolated into the dataset file names.
    DATASET_TYPE = 'BN'
    DATASET_SIZE = '10000'
    # NOTE(review): DATASET_INDEX was used by a single-split load that is now disabled;
    # it looks unused in the visible cells - confirm before removing.
    DATASET_INDEX = '1'
    data_dir = 'var/data_prod_noun_propn'

    # Extrinsic corpus, loaded with a context manager so the file handle is closed.
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    with open(f'{data_dir}/efselab_extrinsic_20000.pkl', 'rb') as ext_corpus_file:
        ext_corpus = pickle.load(ext_corpus_file)

In [4]:
# Load the five dataset splits (file suffixes _1 .. _5) and concatenate them into one corpus.
# Each file is opened with a context manager so its handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
corpus_parts = []
for split_index in range(1, 6):
    split_path = f'{data_dir}/efselab_{DATASET_TYPE}_{DATASET_SIZE}_{split_index}.pkl'
    with open(split_path, 'rb') as split_file:
        corpus_parts.append(pickle.load(split_file))

# Keep the per-split names available for any cell that still refers to them.
c1, c2, c3, c4, c5 = corpus_parts
corpus = c1 + c2 + c3 + c4 + c5
NUM_ARTICLES = len(corpus)
len(corpus)


50005

In [5]:
# Preprocess variables
MIN_DF = 2   # float | int: 0.001-0.002 (~4000), 0.002 (~1000)
MAX_DF = 0.95 # float (0.95)

# N-gram handling: NGRAM_OPT is '-' when n-grams are disabled.
NGRAM = False # False | True
if NGRAM is True:
    NGRAM_OPT = 'ext+int' # 'ext+int' | 'int'
else:
    NGRAM_OPT = '-'

# Run variables: the topic counts to sweep over (10, 20, ..., 150)
NUM_TOPICS = range(10, 151, 10)

# Model variables
NMF_NORM = 'frobenius' # 'frobenius' | 'kullback-leibler' | 'itakura-saito'
NMF_SOLVER = 'mu' # 'cd' | 'mu'
INIT = 'nndsvda' # None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom'
ALPHA = 0    # float 0-1
L1_RATIO = 0.0 # float 0-1

MAX_ITERATIONS = 500

# Build the model name from the settings; regularisation is only encoded when it is active.
MODEL_NAME = f'NMF_{NMF_NORM}_{NMF_SOLVER}'
if INIT is not None:
    MODEL_NAME = f'{MODEL_NAME}_init-{INIT}'
if ALPHA != 0:
    MODEL_NAME = f'{MODEL_NAME}_alpha-{ALPHA}_L1-{L1_RATIO}'

if PROD:
    # The metric file name is built before the '_i-ALL' suffix is appended to MODEL_NAME.
    METRIC_FILENAME = f'metric_log/prod_noun_propn/{DATASET_TYPE}{DATASET_SIZE}/model={MODEL_NAME}_data={DATASET_TYPE}{DATASET_SIZE}_ALL.csv'
    MODEL_NAME = f'{MODEL_NAME}_i-ALL'
else:
    # The original referenced an undefined name DATA_SET here; DATASET_TYPE is the name
    # used everywhere else. NOTE(review): the visible cells only define DATASET_TYPE
    # when PROD is True - confirm the dev-mode setup defines it too.
    METRIC_FILENAME = f'metric_log/nmf_metrics_data={DATASET_TYPE}{NUM_ARTICLES}_v10.csv'

print('MODEL_NAME: ', MODEL_NAME)
print('METRIC_FILE_NAME: ', METRIC_FILENAME)

print()
print('RUNNING IN PRODUCTION MODE!' if PROD else 'IN DEV MODE')

MODEL_NAME:  NMF_frobenius_mu_init-nndsvda_i-ALL
METRIC_FILE_NAME:  metric_log/prod_noun_propn/BN10000/model=NMF_frobenius_mu_init-nndsvda_data=BN10000_ALL.csv

RUNNING IN PRODUCTION MODE!


In [6]:
# Define NGRAM, if applicable
def get_ngram(corpus, level=3):
    """Build a phrase detector for `corpus` and return it.

    The `min_count` / `threshold` settings handed to `Phrases` are picked from
    three tiers based on `len(corpus)` (< 1000, < 4000, otherwise).

    Args:
        corpus: sized collection of documents passed straight to `Phrases`
            (presumably lists of tokens - confirm against the caller).
        level: 2 builds a bigram detector only; 3 or more builds a second
            detector on top of the bigram-transformed corpus.

    Returns:
        The `Phraser` from the last pass that ran, or None when `level` < 2.
    """
    n_docs = len(corpus)
    if n_docs < 1000:
        bigram_cfg, trigram_cfg = (10, 70), (15, 50)
    elif n_docs < 4000:
        bigram_cfg, trigram_cfg = (25, 250), (25, 200)
    else:
        bigram_cfg, trigram_cfg = (45, 400), (45, 400)

    # Nothing to build below bigram level.
    if level < 2:
        return None

    bigram_min_count, bigram_threshold = bigram_cfg
    bigram_model = Phrases(
        corpus, min_count=bigram_min_count, threshold=bigram_threshold, delimiter=b'_')
    detector = Phraser(bigram_model)
    if level < 3:
        return detector

    # Second pass: train on the corpus after the bigram detector has been applied.
    trigram_min_count, trigram_threshold = trigram_cfg
    bigram_docs = [detector[document] for document in corpus]
    trigram_model = Phrases(
        bigram_docs, min_count=trigram_min_count, threshold=trigram_threshold, delimiter=b'_')
    return Phraser(trigram_model)

#if (NGRAM is True):
    #if NGRAM_OPT == 'ext+int':
        #ngram = get_ngram(corpus+ext_corpus, level=3)
    #else:
        #ngram = get_ngram(corpus, level=3)
    #corpus = [ngram[doc] for doc in corpus]
    #ext_corpus = [ngram[doc] for doc in ext_corpus]

In [7]:
# Create TF-IDF
# The identity tokenizer and lowercase=False leave each document's items untouched
# (presumably the documents are already token lists - confirm).
vectorizer = TfidfVectorizer(
    lowercase=False,
    tokenizer=lambda x: x,
    max_df=MAX_DF,
    min_df=MIN_DF,
    max_features=None,
    use_idf=True,
)
tf_idf = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and convert to a plain list of str,
# so downstream code sees the same type as before; otherwise fall back to the old call.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

NUM_FEATURES = len(feature_names)
print(f'#Features from tfidf is {NUM_FEATURES}')

#Features from tfidf is 130026


In [8]:
# Get list of topics (nested list)
def get_NMF_topics(model, feature_names, n_top_words):
    """Return the highest-weighted terms of every component of `model`.

    Args:
        model: object exposing `components_`, an iterable of per-topic weight
            rows that support `.argsort()`.
        feature_names: sequence mapping a column index to its term.
        n_top_words: number of terms to keep per topic.

    Returns:
        A nested list with one inner list per topic, holding the terms ordered
        from largest to smallest weight.
    """
    # Walking the ascending argsort backwards yields the indices of the
    # n_top_words largest weights, biggest first.
    top_slice = slice(None, -n_top_words - 1, -1)
    return [
        [feature_names[term_id] for term_id in component.argsort()[top_slice]]
        for component in model.components_
    ]

# Number of top-weighted terms kept per topic when topics are extracted with get_NMF_topics.
N_TOP_WORDS = 10

In [9]:
# Format Gensim components for Coherence metrics
# Dictionary and bag-of-words form of the intrinsic corpus
int_gensim_dict = corpora.Dictionary(corpus)
int_gensim_corpus = list(map(int_gensim_dict.doc2bow, corpus))

# Dictionary of the extrinsic corpus and its token -> id lookup
ext_gensim_dict = corpora.Dictionary(ext_corpus)
tok2id = ext_gensim_dict.token2id

# Every TF-IDF feature term missing from the extrinsic dictionary becomes its own
# one-term document, so the "smooth" dictionary also covers those terms.
feature_terms_not_in_ext_corpus = []
for feature_term in feature_names:
    if feature_term in tok2id:
        continue
    feature_terms_not_in_ext_corpus.append([feature_term])

smooth_documents = ext_corpus + feature_terms_not_in_ext_corpus
smooth_gensim_dict = corpora.Dictionary(smooth_documents)
print(f'{len(feature_terms_not_in_ext_corpus)} terms not in extrinsic corpus added to smooth_gensim_dict')

62490 terms not in extrinsic corpus added to smooth_gensim_dict


In [10]:
def _relative_sparseness(doc_topic_matrix, top_n=1):
    """Return, per row, the share of the row total held by its `top_n` largest entries.

    Rows whose total is not positive are divided by 1 instead of their total,
    which avoids a division by zero.
    """
    shares = []
    for row in doc_topic_matrix:
        row_total = sum(row)  # computed once per row
        top_mass = sum(sorted(row, reverse=True)[:top_n])
        shares.append(top_mass / (row_total if row_total > 0 else 1))
    return shares


# Column names of the metric log; the order must match `fields` built in the loop.
METRIC_HEADERS = ['date', 'model', '#articles', '#topics', 'iterations', 'time', 'ngram', '#features',
                  'cv_avg', 'cv_top', 'cv_bot',
                  'umass_avg', 'umass_top', 'umass_bot',
                  'rs_avg', 'rs_top', 'rs_bot',
                  '_']

# Number of largest per-document topic weights that count towards relative sparseness.
top_n_topics = 1

# Create the log directory up front so the first write cannot fail on a missing folder.
metric_dir = os.path.dirname(METRIC_FILENAME)
if metric_dir:
    os.makedirs(metric_dir, exist_ok=True)

# Fit one NMF model per topic count, score it and append one row to the metric log.
# Despite its name, `topic_index` is the number of topics (n_components) of the run.
for topic_index in tqdm(NUM_TOPICS):
    # Fit model
    t_start = time.time()

    # NOTE(review): whether the `alpha` keyword is accepted depends on the installed
    # scikit-learn version - confirm against the environment this notebook runs in.
    model = NMF(random_state=0, tol=0.0001, verbose=0, shuffle=False,
                n_components=topic_index, beta_loss=NMF_NORM, solver=NMF_SOLVER,
                init=INIT, alpha=ALPHA, l1_ratio=L1_RATIO, max_iter=MAX_ITERATIONS)

    # The `y` argument of fit_transform is ignored by NMF, so it is no longer passed.
    W = model.fit_transform(tf_idf)
    H = model.components_

    RUN_TIME = round(time.time() - t_start, 3)
    ITERATIONS = model.n_iter_
    topics = get_NMF_topics(model, feature_names, N_TOP_WORDS)

    # Create smoothened corpus by extending ext corpus with 1-term documents containing missing topic terms
    flat_topic_terms = [term for topic in topics for term in topic]
    topic_terms_not_in_ext_corpus = [[term] for term in flat_topic_terms if term not in tok2id]
    smooth_corpus = ext_corpus + topic_terms_not_in_ext_corpus
    print(f'{len(topic_terms_not_in_ext_corpus)} number of terms not in extrinsic corpus added as separate one-term documents')

    # Calculate Coherence metrics; each helper result is unpacked as (coherence, std) pairs
    m_cv = get_metric_cv(topics, smooth_corpus, smooth_gensim_dict)
    coh_cv = [e[0] for e in m_cv]
    std_cv = [e[1] for e in m_cv]
    m_umass = get_metric_umass(topics, int_gensim_dict, int_gensim_corpus)
    coh_umass = [e[0] for e in m_umass]
    std_umass = [e[1] for e in m_umass]

    # The 'std' entries are computed but are not part of the CSV layout written below.
    reduced_cv = aggregate_metrics(coh_cv)
    reduced_cv['std'] = round(np.mean(std_cv), 4)
    reduced_umass = aggregate_metrics(coh_umass)
    reduced_umass['std'] = round(np.mean(std_umass), 4)

    relative_sparseness = _relative_sparseness(W, top_n_topics)
    reduced_rs = aggregate_metrics(relative_sparseness)

    # Save metrics to csv
    DATE = datetime.now().strftime('%m%d-%H:%M')

    fields = [DATE, MODEL_NAME, NUM_ARTICLES, topic_index, ITERATIONS, RUN_TIME, NGRAM_OPT, NUM_FEATURES,
              reduced_cv.get('avg'), reduced_cv.get('top'), reduced_cv.get('bot'),
              reduced_umass.get('avg'), reduced_umass.get('top'), reduced_umass.get('bot'),
              reduced_rs.get('avg'), reduced_rs.get('top'), reduced_rs.get('bot'), '_']

    # Append in one code path; the header is written only when the file is new or empty.
    # newline='' is what the csv module expects; like '\n', it does no translation on write.
    needs_header = (not os.path.exists(METRIC_FILENAME)
                    or os.path.getsize(METRIC_FILENAME) == 0)
    with open(METRIC_FILENAME, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',')
        if needs_header:
            writer.writerow(METRIC_HEADERS)
        writer.writerow(fields)

  0%|          | 0/15 [00:00<?, ?it/s]

0 number of terms not in extrinsic corpus added as separate one-term documents


  7%|▋         | 1/15 [00:38<09:05, 38.98s/it]

1 number of terms not in extrinsic corpus added as separate one-term documents


 13%|█▎        | 2/15 [01:32<09:21, 43.21s/it]

2 number of terms not in extrinsic corpus added as separate one-term documents


 20%|██        | 3/15 [02:40<10:07, 50.66s/it]

2 number of terms not in extrinsic corpus added as separate one-term documents


 27%|██▋       | 4/15 [03:57<10:44, 58.59s/it]

3 number of terms not in extrinsic corpus added as separate one-term documents


 33%|███▎      | 5/15 [05:23<11:10, 67.04s/it]

2 number of terms not in extrinsic corpus added as separate one-term documents


 40%|████      | 6/15 [07:04<11:35, 77.24s/it]

3 number of terms not in extrinsic corpus added as separate one-term documents


 47%|████▋     | 7/15 [09:03<11:56, 89.53s/it]

3 number of terms not in extrinsic corpus added as separate one-term documents


 53%|█████▎    | 8/15 [11:29<12:26, 106.64s/it]

4 number of terms not in extrinsic corpus added as separate one-term documents


 60%|██████    | 9/15 [14:19<12:33, 125.61s/it]

4 number of terms not in extrinsic corpus added as separate one-term documents


 67%|██████▋   | 10/15 [17:28<12:03, 144.68s/it]

5 number of terms not in extrinsic corpus added as separate one-term documents


 73%|███████▎  | 11/15 [20:48<10:45, 161.32s/it]

6 number of terms not in extrinsic corpus added as separate one-term documents


 80%|████████  | 12/15 [24:23<08:52, 177.38s/it]

5 number of terms not in extrinsic corpus added as separate one-term documents


 87%|████████▋ | 13/15 [28:10<06:24, 192.23s/it]

6 number of terms not in extrinsic corpus added as separate one-term documents


 93%|█████████▎| 14/15 [31:29<03:14, 194.33s/it]

6 number of terms not in extrinsic corpus added as separate one-term documents


100%|██████████| 15/15 [35:59<00:00, 143.94s/it]
