In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            # print(f'These are the top comments: {documents[doc_index]}')
            pass

df = pd.read_csv('expanded_comments.csv')

def preprocess(text):
    t = text.lower()
    # t = re.sub('_',r'',t)
    # t = re.sub('\d+',r'',t)
    t = re.sub(r'@[^ ]*',r'',t)
    t = re.sub(r'\W+',r' ',t)
    t = re.sub(r'(could|would|like|list|net|mailto|subject|http)', '', t)
    t = re.sub(r'\b\w{1,3}\b', '', t)
    stopwords_list = stopwords.words('english')
    txt = ' '.join([word for word in t.split() if word not in stopwords_list])
    return txt

lemmatizer = WordNetLemmatizer()


df['documents'] = [' '.join([lemmatizer.lemmatize(preprocess(email))])
                 .strip() for email in df['comment']]

documents = df['documents']

no_features = 1000


# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 5

# Run LDA
lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=1).fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

no_top_words = 15
no_top_documents = 500
display_topics(lda_H, lda_W, tf_feature_names, df.comment, no_top_words, no_top_documents)


Topic 0:
unplug beginning progress advances turns know scared remember invent problems larger simply doubt facts able
Topic 1:
stupidity extinction destruction global industry defense assured public general energy warming example instead nuclear human
Topic 2:
companies people government development want regulation tech make money technology responsible power dangerous stop control
Topic 3:
government regulate humans regulation developing leaders continue self systems know human race computers machine mind
Topic 4:
people think technology human world humans need stop time going make know threat good better


In [119]:
topic = pd.DataFrame(lda_W.argmax(axis=1))

df_top = pd.concat([df,topic],axis=1)

df_top.columns = ['drop','id','name','location','comment','time','likes','rep_count','replies','documents','topic']

df_top.drop('drop',axis=1,inplace=True)

In [120]:
df_top

Unnamed: 0,id,name,location,comment,time,likes,rep_count,replies,documents,topic
0,125405146,Paul,Milwaukee,It is very possible that social media have alr...,2023-05-30 14:37:15,522,5,"[{'commentID': 125405281, 'status': 'approved'...",possible social media already cost american de...,4
1,125405170,jzu,port angeles,I wholeheartedly agree with the huge potential...,2023-05-30 14:38:52,145,1,"[{'commentID': 125406103, 'status': 'approved'...",wholeheartedly agree huge potential disruption...,2
2,125405175,CV Danes,Upstate NY,"""Leaders from OpenAI, Google Deepmind, Anthrop...",2023-05-30 14:39:23,1154,11,"[{'commentID': 125405606, 'status': 'approved'...",leaders openai google deepmind anthropic labs ...,1
3,125405267,Roger Reynolds,Barnesville OH,I wish we could get more specific facts and in...,2023-05-30 14:45:17,89,4,"[{'commentID': 125405506, 'status': 'approved'...",wish specific facts information extinction thr...,0
4,125405275,Stephen,Grosse Pointe,"It seems incongruous that ""industries leaders""...",2023-05-30 14:46:08,22,0,[],seems incongruous industries leaders warning e...,3
...,...,...,...,...,...,...,...,...,...,...
893,125431033,C,"N.,Y,",Purdue Pharma's Sackler family paid $6 billion...,2023-05-31 15:57:59,218,0,[],purdue pharma sackler family paid billion doll...,2
894,125430839,RjW,RollingPrairie,"Unless prevented, the nihilistic qualities of ...",2023-05-31 15:48:57,39,0,[],unless prevented nihiic qualities currently sm...,4
895,125428441,LB,U.S.,"Putting ""the"" government in charge of things i...",2023-05-31 13:27:24,100,0,[],putting government charge things hardly soluti...,3
896,125426057,JL Turriff,Concordia MO,This is just another example of 'extinction by...,2023-05-31 06:16:50,157,0,[],another example extinction stupidity much glob...,1


In [116]:
print(df_top.replies[1])

[{'commentID': 125406103, 'status': 'approved', 'commentSequence': 125406103, 'userID': 51063424, 'userDisplayName': 'MG', 'userLocation': 'NY', 'userTitle': 'NULL', 'userURL': 'NULL', 'picURL': None, 'commentTitle': '<br\\//>', 'commentBody': '@jzu \nYou say: "the question is how we keep AI out of the hands of bad people". Impossible. How do we keep nuclear weapons out of the hands of bad people? (North Korea, Iran, etc.) How do we keep guns out of the hands of bad people? (every mass murderer - which in this country is out of control!). This is faulty logic. What will we say, AI doesn\'t kill people, people kill people?', 'createDate': '1685449986', 'updateDate': '1686091933', 'approveDate': '1685450127', 'recommendations': 13, 'replyCount': 0, 'replies': [], 'editorsSelection': False, 'parentID': 125405170, 'parentUserDisplayName': 'jzu', 'depth': 2, 'commentType': 'userReply', 'trusted': 0, 'recommendedFlag': 0, 'permID': '125406103', 'isAnonymous': False}]


In [85]:
import plotly.graph_objects as go

import networkx as nx

In [103]:
G = nx.Graph()

for comment in df_top.comment:
    G.add_node(comment)

In [106]:
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = G.nodes[edge[0]]['pos']
    x1, y1 = G.nodes[edge[1]]['pos']
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
for node in G.nodes():
    x, y = G.nodes[node]['pos']
    node_x.append(x)
    node_y.append(y)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        # colorscale options
        #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
        #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
        #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
        colorscale='YlGnBu',
        reversescale=True,
        color=[],
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line_width=2))

ValueError: not enough values to unpack (expected 2, got 0)

In [88]:
node_adjacencies = []
node_text = []
for node, adjacencies in enumerate(G.adjacency()):
    node_adjacencies.append(len(adjacencies[1]))
    node_text.append('# of connections: '+str(len(adjacencies[1])))

node_trace.marker.color = node_adjacencies
node_trace.text = node_text

In [89]:
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='<br>Network graph made with Python',
                titlefont_size=16,
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="Python code: <a href='https://plotly.com/ipython-notebooks/network-graphs/'> https://plotly.com/ipython-notebooks/network-graphs/</a>",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                )
fig.show()