In [None]:
import pandas as pd
import numpy as np

# Locations of the two input CSVs and of the results CSV
file_preprocessed_data = '../../../Endpoint Mapping Data/Domain Data/Support Party Mapping/serp_merged_pre_processed.csv'
file_target_domains = '../../../Endpoint Mapping Data/Domain Data/v5_unique_domains.csv'
file_final_results = '../../../Endpoint Mapping Data/Domain Data/Support Party Mapping/SERP_NMF_RESULTS.csv'

# Load the pre-processed SERP dataset and the list of target domains
serp_preprocessed = pd.read_csv(file_preprocessed_data)
target_domains = pd.read_csv(file_target_domains)

# Left-join on 'domain' with the target list on the left: every target domain keeps
# a row, and SERP rows for any other domain are dropped. Target domains without a
# SERP match end up with NaN in the SERP columns.
serp_merged = pd.merge(
    target_domains,
    serp_preprocessed,
    on=['domain'],
    how='left',
)

## Extract feature vectors using TF-IDF and create an NMF Model 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

# Cast the token column to unicode strings once and reuse it. The left merge that
# builds serp_merged can leave NaN tokens for domains without SERP data, and the
# vectorizer rejects NaN documents; astype('U') turns them into the string 'nan'.
tokens = serp_merged['token'].values.astype('U')

# TF-IDF features: vocabulary capped at 4000 terms, English stop words removed,
# terms in more than 95% of documents or in fewer than 2 documents are ignored.
vec = TfidfVectorizer(max_features=4000, stop_words="english", max_df=0.95, min_df=2)
features = vec.fit_transform(tokens)

## Run Topic model
n_topics = 30
random_state = 0

cls = NMF(n_components=n_topics, random_state=random_state, max_iter=600)
cls.fit(features)

# Assign each row the topic with the largest weight. The already-computed TF-IDF
# matrix is reused instead of re-vectorizing the raw (uncast) column, which would
# fail on NaN tokens. argsort(...)[:, -1] is kept so tie-breaking is unchanged.
target = cls.transform(features).argsort(axis=1)[:, -1]
serp_merged['target'] = target

In [None]:
# Vocabulary learned by the TF-IDF vectorizer, indexed by feature id.
# The per-topic word lists printed below are used to create Table 11 in the appendix
# and to manually label the clusters shown in
# Endpoint Mapping Data/Domain Data/Support Party Mapping/Target 1 TF-IDF NMF Clusters.txt

feature_names = vec.get_feature_names_out()

# how many of the highest-weighted words to show for each topic
n_top_words = 10

# One row per topic: topic id, its top words (padded to 95 chars), then the
# '&' separators and trailing '\\' (looks like LaTeX table-row syntax).
for topic_id, weights in enumerate(cls.components_):
    # feature ids ordered from highest to lowest weight, truncated to the top n
    ranked_ids = weights.argsort()[::-1][:n_top_words]
    row_words = ''.join(' ' + feature_names[j] for j in ranked_ids)
    print(topic_id, '\t & \t', row_words.ljust(95), '\t & \t'.ljust(5), '\\\\')

## Extract features using count vectors and create an NMF model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

# Cast tokens to unicode strings, exactly as the TF-IDF cell does. The left merge
# that builds serp_merged can leave NaN tokens, and CountVectorizer raises on NaN
# (or any non-string) documents; astype('U') turns NaN into the string 'nan'.
tokens2 = serp_merged['token'].values.astype('U')

# Raw term counts over unigrams and bigrams, English stop words removed
vec2 = CountVectorizer(ngram_range=(1, 2), stop_words="english")
features2 = vec2.fit_transform(tokens2)


n_topics = 30
random_state = 1

cls2 = NMF(n_components=n_topics, random_state=random_state, max_iter=300)
cls2.fit(features2)

# Assign each row the topic with the largest weight. The count matrix computed
# above is reused rather than vectorizing the corpus a second time.
# argsort(...)[:, -1] is kept so tie-breaking is unchanged.
target2 = cls2.transform(features2).argsort(axis=1)[:, -1]
serp_merged['target2'] = target2

In [None]:
# Vocabulary learned by the count vectorizer, indexed by feature id.
# The per-topic word lists printed below are used to create Table 11 in the appendix
# and to manually label the clusters shown in
# Endpoint Mapping Data/Domain Data/Support Party Mapping/Target 2 Count NMF Cluster.txt

feature_names = vec2.get_feature_names_out()

# how many of the highest-weighted words to show for each topic
n_top_words = 10

# One row per topic: topic id, its top words (padded to 105 chars), then the
# '&' separators and trailing '\\' (looks like LaTeX table-row syntax).
for topic_id, weights in enumerate(cls2.components_):
    # feature ids ordered from highest to lowest weight, truncated to the top n
    ranked_ids = weights.argsort()[::-1][:n_top_words]
    row_words = ''.join(' ' + feature_names[j] for j in ranked_ids)
    print(topic_id, '\t & \t', row_words.ljust(105), '\t & \t'.ljust(5), '\\\\')

In [None]:
## TODO: save the final results (uncomment the line below to write serp_merged to file_final_results)
# serp_merged.to_csv(file_final_results, index=False)