# Latent Dirichlet Allocation

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from ast import literal_eval

import pandas as pd
import numpy as np
import json

In [2]:
# Load the tab-separated preprocessing output and keep only the four columns
# this notebook works with.
data = pd.read_csv('../out/01-preprocessing.tsv', sep='\t')[
    ['raw_content', 'username', 'mention', 'cleaned_content']
]
# `mention` is stored as the text form of a Python literal; parse it back into
# an object (presumably a list of handles, since it is exploded later on).
data['mention'] = data['mention'].apply(lambda cell: literal_eval(cell))

In [3]:
def extract_topic(tfidf_input, num_topics=1, num_top_words=20):
    """Return the highest-weighted vocabulary terms of an LDA model.

    The documents are vectorised with TF-IDF, an LDA model with
    ``num_topics`` components is fitted on the resulting matrix, and the
    ``num_top_words`` heaviest terms of every component are collected.

    Args:
        tfidf_input: Iterable of documents, each one a single string.
        num_topics: Number of LDA components to fit.
        num_top_words: How many terms to take from each component.

    Returns:
        A flat list of terms. Components are visited in model order and,
        within a component, terms are ordered from highest to lowest weight.
        A term that already appeared for an earlier component is not
        repeated. With ``num_topics=1`` this is simply the top terms of the
        single topic. An empty list is returned when no document contains
        any non-whitespace text.
    """
    # Materialise once so generators can be both inspected and fitted.
    documents = list(tfidf_input)

    # The vectoriser raises ValueError on an empty vocabulary; a group whose
    # documents are all blank has no topic, so report that instead of
    # aborting the caller's loop.
    if not any(doc.strip() for doc in documents):
        return []

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    lda_model = LDA(n_components=num_topics, random_state=48)
    lda_model.fit(tfidf_matrix)

    feature_names = tfidf_vectorizer.get_feature_names_out()

    extracted_topics = []
    seen = set()
    for topic in lda_model.components_:
        # argsort is ascending, so reverse it and keep the leading entries.
        for i in topic.argsort()[::-1][:num_top_words]:
            word = feature_names[i]
            # Accumulate across components rather than overwriting, so that
            # every topic contributes when num_topics > 1.
            if word not in seen:
                seen.add(word)
                extracted_topics.append(word)

    return extracted_topics

In [4]:
# Expand the `mention` column so that each element gets a row of its own,
# renumber the rows from zero, and take an independent deep copy of the
# result.
data_by_mention = (
    data
    .explode('mention')
    .reset_index(drop=True)
    .copy(deep=True)
)

In [5]:
# Accumulator filled by the loops below: maps a lower-cased handle to a dict
# with that handle's 'topic' terms and 'source' posts.
data_output = dict()

In [6]:
# Build one entry per mentioned handle from every post that mentions it.
# Groups consisting of a single post are not skipped.
for mention, df in data_by_mention.groupby(by='mention'):
    # Each `cleaned_content` cell is the text form of a Python literal
    # (presumably a token list); parse it and join the tokens back into one
    # whitespace-separated document per post.
    tfidf_input = [
        " ".join(literal_eval(cell)) for cell in df['cleaned_content']
    ]
    topic = extract_topic(tfidf_input)

    # 'source' records who wrote each mentioning post next to its raw text.
    data_output[mention.lower()] = dict(
        topic=topic,
        source=[
            dict(username=author.lower(), content=text)
            for author, text in zip(df['username'], df['raw_content'])
        ],
    )

In [7]:
# Build one entry per author from that author's own posts.
# NOTE(review): keys are lower-cased handles in the same `data_output` dict
# that the per-mention loop fills, so an author who was also mentioned has
# the earlier entry replaced here -- confirm this overwrite is intended.
for username, df in data.groupby(by='username'):
    # Authors with a single post are not skipped.
    # Each `cleaned_content` cell is parsed with literal_eval (presumably
    # into a token list) and joined into one document per post.
    content = list(df['cleaned_content'].apply(literal_eval))
    raw_content = list(df['raw_content'])

    tfidf_input = [" ".join(c) for c in content]
    topic = extract_topic(tfidf_input)

    # Every source row belongs to this author, so normalise the handle once.
    # The per-row mention values were never used in the output and are no
    # longer collected.
    key = username.lower()
    data_output[key] = {
        'topic': topic,
        'source': [{'username': key, 'content': c} for c in raw_content],
    }
    

In [14]:
# Flatten the per-handle mapping into a list of records under a single
# 'data' key, carrying the handle along as 'username'.
output = dict(
    data=[
        dict(username=handle, topic=entry['topic'], source=entry['source'])
        for handle, entry in data_output.items()
    ]
)

In [15]:
# Write the assembled result to disk as JSON indented by four spaces.
with open('../out/02-lda.json', 'w') as f:
    f.write(json.dumps(output, indent=4))