In [1]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [2]:
#https://drive.google.com/file/d/1nhFEB4katYxJbS8C9A0lPa7r-vumnHx5/view?usp=share_link
file = drive.CreateFile({'id':'1nhFEB4katYxJbS8C9A0lPa7r-vumnHx5'}) # replace the id with id of file you want to access
file.GetContentFile('merged_data.csv')  # tab-separated

In [3]:
import numpy as np
import pandas as pd
import nltk
# import gensim

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Load data into dataframe
df = pd.read_csv('merged_data.csv')

In [5]:
df.head(10)

Unnamed: 0,victim_name,text_combine
0,Aaron Brockway,text
1,Aaron Brockway,R.I.P Aaron Brockway Im lighting this one fo...
2,Aaron Brockway,Rip. Aaron Brockway 🙏
3,Aaron Carlos Martinez,text
4,Aaron Carlos Martinez,Aaron Carlos Martinez 18 : #IHateTimWaterman
5,Aaron Carlos Martinez,1st three listed RT @Scottelands36: 3 keepers...
6,Aaron Christopher Scott,text
7,Aaron Christopher Scott,Christopher Eccleston Domhnall Gleeson Aaro...
8,Aaron Harts,text
9,Aaron Harts,Check it out! I donated to My son Aaron Harts...


In [6]:
# Remove missing value
d = df.dropna(subset=['text_combine'])
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 611432 entries, 0 to 611509
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   victim_name   611432 non-null  object
 1   text_combine  611432 non-null  object
dtypes: object(2)
memory usage: 14.0+ MB


In [7]:
d.reset_index(inplace=True, drop=True)

In [8]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem import WordNetLemmatizer 

stemmer = SnowballStemmer("english")

# tokenization and stemming
def tokenization_and_stemming(text):
    tokens = []
    # exclude stop words and tokenize the document, generate a list of string 
    for word in nltk.word_tokenize(text):
        if word.lower() not in stopwords:
            tokens.append(word.lower())

    filtered_tokens = []
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        # print(token)
        if token == 'r.i.p' or token.isalpha():
            filtered_tokens.append(token)
            
    # stemming
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [10]:
text = d.loc[:, 'text_combine'].tolist()

In [11]:
text[:10]

[' text',
 ' R.I.P Aaron Brockway  Im lighting this one for you',
 ' Rip. Aaron Brockway 🙏',
 ' text',
 ' Aaron Carlos Martinez  18 : #IHateTimWaterman',
 ' 1st three listed RT @Scottelands36: 3 keepers AL only Roto 5x5. Price $23  JD Martinez $12  Salazar $6  Aaron Hicks $6  Carlos Rodon $6.',
 ' text',
 ' Christopher Eccleston  Domhnall Gleeson  Aaron Paul Wes Bentley  Ben Foster Scott Speedman https://t.co/8Y4Ljqeds3',
 ' text',
 ' Check it out! I donated to My son Aaron Harts final exspensive https://t.co/i6w7GhvlNG']

In [12]:
tokenization_and_stemming(text[1])

['r.i.p', 'aaron', 'brockway', 'im', 'light', 'one']

In [13]:
tokenization_and_stemming(text[2])

['rip', 'aaron', 'brockway']

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# define vectorizer parameters
# TfidfVectorizer will help us to create tf-idf matrix
# max_df : maximum document frequency for the given word
# min_df : minimum document frequency for the given word
# max_features: maximum number of words
# use_idf: if not true, we only calculate tf
# stop_words : built-in stop words
# tokenizer: how to tokenize the document
# ngram_range: (min_value, max_value), eg. (1, 3) means the result will include 1-gram, 2-gram, 3-gram
tfidf_model = TfidfVectorizer(max_df=0.99, max_features=1000,
                                 min_df=0.01, stop_words='english',
                                 use_idf=True, tokenizer=tokenization_and_stemming, ngram_range=(1,1))

tfidf_matrix = tfidf_model.fit_transform(text) #fit the vectorizer to synopses

print ("In total, there are " + str(tfidf_matrix.shape[0]) + \
      " reviews and " + str(tfidf_matrix.shape[1]) + " terms.")



In total, there are 611432 reviews and 136 terms.


In [15]:
tfidf_matrix

<611432x136 sparse matrix of type '<class 'numpy.float64'>'
	with 3641747 stored elements in Compressed Sparse Row format>

In [16]:
tfidf_matrix[0]

<1x136 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [17]:
tokenization_and_stemming(text[1])

['r.i.p', 'aaron', 'brockway', 'im', 'light', 'one']

In [18]:
# words
tf_selected_words = tfidf_model.get_feature_names_out()
tf_selected_words

array(['abraham', 'ahora', 'al', 'alberto', 'alcald', 'alejandro',
       'amigo', 'amp', 'andré', 'antonio', 'asesinado', 'año', 'brown',
       'caicedo', 'carlo', 'castillo', 'como', 'cortez', 'cruz',
       'cárdena', 'da', 'danc', 'daniel', 'david', 'dedic', 'del',
       'deputi', 'desd', 'director', 'día', 'díaz', 'e', 'el', 'en', 'es',
       'est', 'esta', 'está', 'father', 'fernando', 'flore', 'fue',
       'fueron', 'garcia', 'garcía', 'gonzález', 'ha', 'hace', 'hay',
       'hoy', 'http', 'https', 'instructor', 'jair', 'jean', 'jesú',
       'jorg', 'jose', 'josé', 'juan', 'julianroman', 'julio', 'kill',
       'la', 'las', 'le', 'leyder', 'león', 'lo', 'los', 'lui', 'lópez',
       'manuel', 'martinez', 'martín', 'martínez', 'mass', 'mendoza',
       'miguel', 'ming', 'montaño', 'monterey', 'moreno', 'más',
       'ndelriego', 'nuestro', 'omar', 'oscar', 'pablo', 'para', 'park',
       'paul', 'pedro', 'perlaza', 'pimentel', 'por', 'president',
       'pérez', 'que', 'ramo

In [52]:
# Use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components = 5)

# document topic matrix for tfidf_matrix_lda
lda_output = lda.fit_transform(tfidf_matrix)

## 5 topics

In [53]:
print(lda_output.shape)
print(lda_output)

(611432, 5)
[[0.2        0.2        0.2        0.2        0.2       ]
 [0.2        0.2        0.2        0.2        0.2       ]
 [0.2        0.2        0.2        0.2        0.2       ]
 ...
 [0.08387324 0.4346064  0.08637002 0.0844087  0.31074164]
 [0.1        0.59877135 0.10046769 0.10065886 0.10010211]
 [0.1        0.59877135 0.10046769 0.10065886 0.10010211]]


In [54]:
# column names
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]

# index names
doc_names = ["Doc" + str(i) for i in range(len(text))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# get dominant topic for each document
topic = np.argmax(df_document_topic.values, axis=1)  #df.values():返回数组形式的数据
df_document_topic['topic'] = topic

df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,topic
Doc0,0.2,0.2,0.2,0.2,0.2,0
Doc1,0.2,0.2,0.2,0.2,0.2,0
Doc2,0.2,0.2,0.2,0.2,0.2,0
Doc3,0.2,0.2,0.2,0.2,0.2,0
Doc4,0.08,0.42,0.33,0.08,0.08,1
Doc5,0.07,0.36,0.43,0.07,0.07,2
Doc6,0.2,0.2,0.2,0.2,0.2,0
Doc7,0.42,0.33,0.08,0.08,0.08,0
Doc8,0.2,0.2,0.2,0.2,0.2,0
Doc9,0.09,0.1,0.64,0.09,0.09,2


In [55]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
2,181404
4,143100
1,116665
0,110961
3,59302


In [28]:
# topic-word matrix
df_topic_words = pd.DataFrame(lda.components_)

# column and index
df_topic_words.columns = tfidf_model.get_feature_names_out()
df_topic_words.index = topic_names

df_topic_words.head()

Unnamed: 0,abraham,ahora,al,alberto,alcald,alejandro,amigo,amp,andré,antonio,...,torr,twyman,una,valencia,venezuela,victim,wei,álvaro,ángel,óscar
Topic0,0.200044,0.20029,0.200312,0.200149,0.200067,0.200136,0.200094,9967.10788,0.200057,0.200166,...,0.20017,0.200163,0.200221,0.20019,0.200146,6087.583106,5562.315008,0.200126,0.200066,0.200144
Topic1,0.200093,3.040893,5470.165028,0.201574,0.200465,0.572698,0.201485,0.200243,1.315093,0.201153,...,6.637885,0.200005,8213.769635,0.201111,2701.716534,0.20003,0.200007,761.279695,8441.913617,917.663212
Topic2,0.200038,0.200133,0.214086,0.200124,0.20008,0.200136,0.200119,0.200297,15213.92498,0.200154,...,0.200483,0.200007,0.200184,0.20045,0.20019,0.200011,0.200006,2668.761913,0.20011,0.200345
Topic3,3778.489055,1169.052,9340.823317,3032.850327,2892.044982,4638.838334,2671.673162,0.200374,0.202424,6708.370155,...,4180.783674,0.200005,1278.733121,3954.900421,0.20126,0.20002,0.200004,0.201256,0.224877,2128.581154
Topic4,0.200095,2236.244272,0.204182,608.685253,0.200136,0.200823,0.200353,0.201591,0.200192,0.476417,...,0.200872,2983.209245,1.265824,0.201856,1052.480584,0.200124,0.200026,0.335596,0.200416,0.201017


In [29]:
# print top n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    words = np.array(tfidf_model.get_feature_names_out())
    topic_words = []
    # for each topic, we have words weight
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words)) #array.take(): 从给出的index取出数据
    return topic_words

topic_keywords = print_topic_words(tfidf_model=tfidf_model, lda_model=lda, n_words=15)        

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word '+str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic '+str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,rt,amp,http,shoot,park,victim,danc,monterey,ming,wei,mass,thread,dedic,instructor,ndelriego
Topic 1,https,la,que,rt,el,en,los,miguel,del,es,le,para,se,por,su
Topic 2,año,manuel,andré,juan,paul,jean,montaño,jair,cortez,perlaza,cárdena,leyder,silva,david,rt
Topic 3,rt,en,el,lui,la,carlo,josé,https,del,al,garcía,por,se,garcia,antonio
Topic 4,rt,https,kill,smith,julio,shot,martinez,martínez,brown,desd,father,pedro,time,deputi,hay


topic 0 and topic 4 may pertain to homicide

## 10

In [56]:
lda_10 = LatentDirichletAllocation(n_components = 10)

# document topic matrix for tfidf_matrix_lda
lda_output = lda_10.fit_transform(tfidf_matrix)

In [64]:
# column names
topic_names = ["Topic" + str(i) for i in range(lda_10.n_components)]

# index names
doc_names = ["Doc" + str(i) for i in range(len(text))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# get dominant topic for each document
topic = np.argmax(df_document_topic.values, axis=1)  #df.values():返回数组形式的数据
df_document_topic['topic'] = topic

df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,topic
Doc0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc3,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc4,0.04,0.04,0.04,0.04,0.04,0.37,0.3,0.04,0.04,0.04,5
Doc5,0.04,0.04,0.04,0.04,0.04,0.3,0.42,0.04,0.04,0.04,6
Doc6,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc7,0.04,0.04,0.04,0.29,0.04,0.04,0.04,0.37,0.04,0.04,7
Doc8,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc9,0.04,0.04,0.04,0.17,0.04,0.04,0.04,0.04,0.49,0.04,8


In [65]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
2,132075
7,104129
4,57883
0,56607
8,56535
6,54281
5,40332
1,38999
9,35972
3,34619


In [69]:
# topic-word matrix
df_topic_words = pd.DataFrame(lda_10.components_)

# column and index
df_topic_words.columns = tfidf_model.get_feature_names_out()
df_topic_words.index = topic_names

df_topic_words

Unnamed: 0,abraham,ahora,al,alberto,alcald,alejandro,amigo,amp,andré,antonio,...,torr,twyman,una,valencia,venezuela,victim,wei,álvaro,ángel,óscar
Topic0,3778.389323,0.100001,0.100019,0.100005,0.100002,0.100005,0.100001,0.100003,0.10001,297.087008,...,0.100003,0.1,0.100005,0.100003,19.227221,0.1,0.1,0.100002,0.100002,3045.945856
Topic1,0.1,0.100002,200.122759,1227.79447,0.100001,0.100005,0.100002,0.100005,1444.143171,0.397151,...,4187.122056,0.1,0.100004,1039.743285,1158.498649,0.1,0.1,0.100004,0.100001,0.100002
Topic2,0.1,0.112482,9126.047297,614.003091,0.100003,0.100008,0.100007,0.100001,0.101329,33.828125,...,0.10001,0.1,7151.078864,0.107263,1479.154033,0.1,0.1,0.234419,0.100004,0.100003
Topic3,0.1,0.1,0.100006,0.100001,0.1,0.100002,0.1,0.100005,0.100001,3279.361539,...,0.100001,0.1,0.100005,2469.864613,0.100001,0.100001,0.1,0.100003,0.1,0.100001
Topic4,0.1,0.100005,2010.105127,5.222857,0.100001,0.100004,0.100004,0.128512,0.100023,1136.389389,...,0.100004,0.1,485.47451,0.100001,0.100005,0.1,0.1,897.985259,8441.839071,0.100003
Topic5,0.1,0.1,0.100002,0.100011,0.1,592.569611,0.1,982.907392,0.1,1.918541,...,0.100001,2983.109424,0.100001,0.100004,0.1,0.100001,0.1,0.1,0.1,0.100001
Topic6,0.1,3406.929445,816.534992,1175.769657,2529.417634,679.358771,2671.575195,0.100002,0.100011,1946.924899,...,0.100918,0.1,709.627692,445.388855,0.100003,0.1,0.1,0.100019,0.100003,0.100005
Topic7,0.1,0.1,0.100005,0.1,0.1,0.1,0.1,0.1,13759.446153,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,2435.181488,0.1,0.1
Topic8,0.1,0.995651,2658.296718,618.847335,362.62809,3367.383721,0.100003,0.100002,11.552047,13.341393,...,0.100089,0.1,1147.387904,0.100003,0.124008,0.1,0.1,96.777391,0.100004,0.100003
Topic9,0.1,0.1,0.1,0.1,0.1,0.1,0.1,8984.174462,0.100001,0.100001,...,0.1,0.1,0.1,0.1,1097.294793,6087.483288,5562.21505,0.1,0.1,0.1


In [71]:
# print top n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    words = np.array(tfidf_model.get_feature_names_out())
    topic_words = []
    # for each topic, we have words weight
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words)) #array.take(): 从给出的index取出数据
    return topic_words

topic_keywords = print_topic_words(tfidf_model=tfidf_model, lda_model=lda_10, n_words=15)        

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word '+str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic '+str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,http,josé,pérez,roberto,la,rt,abraham,león,se,soto,en,oscar,óscar,hoy,díaz
Topic 1,david,silva,garcia,rt,https,torr,e,sergio,ramo,da,san,lui,se,carlo,andré
Topic 2,la,el,en,que,rt,del,para,es,se,al,por,los,su,una,lo
Topic 3,https,rt,antonio,valencia,la,si,en,el,está,que,por,se,josé,le,como
Topic 4,miguel,ángel,garcía,la,le,que,rodríguez,rt,richard,https,te,el,pablo,jorg,en
Topic 5,rt,kill,smith,https,shot,martinez,brown,father,time,deputi,ryan,twyman,lui,fernando,pedro
Topic 6,el,carlo,en,del,pedro,rt,sánchez,josé,jose,la,moreno,martínez,https,juan,ahora
Topic 7,año,rt,manuel,paul,andré,jean,montaño,jair,cortez,perlaza,juan,cárdena,leyder,julianroman,caicedo
Topic 8,los,la,gonzález,lópez,las,rt,daniel,en,el,son,https,martín,jesú,alejandro,flore
Topic 9,amp,park,victim,danc,shoot,monterey,ming,wei,mass,thread,dedic,instructor,ndelriego,rt,julio


topic 5 and topic 9 may pertain to homicide

##15

In [72]:
lda_15 = LatentDirichletAllocation(n_components = 15)

# document topic matrix for tfidf_matrix_lda
lda_output = lda_15.fit_transform(tfidf_matrix)

In [76]:
# column names
topic_names = ["Topic" + str(i) for i in range(lda_15.n_components)]

# index names
doc_names = ["Doc" + str(i) for i in range(len(text))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# get dominant topic for each document
topic = np.argmax(df_document_topic.values, axis=1)  #df.values():返回数组形式的数据
df_document_topic['topic'] = topic

df_document_topic.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,topic
Doc0,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0
Doc1,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0
Doc2,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0
Doc3,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0.07,0
Doc4,0.36,0.03,0.03,0.03,0.03,0.03,0.03,0.28,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0


In [77]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
10,88958
1,69234
9,61499
8,44421
7,43320
4,42294
11,41006
2,40434
5,36591
6,29983


In [79]:
# topic-word matrix
df_topic_words = pd.DataFrame(lda_15.components_)

# column and index
df_topic_words.columns = tfidf_model.get_feature_names_out()
df_topic_words.index = topic_names

df_topic_words

Unnamed: 0,abraham,ahora,al,alberto,alcald,alejandro,amigo,amp,andré,antonio,...,torr,twyman,una,valencia,venezuela,victim,wei,álvaro,ángel,óscar
Topic0,0.066667,0.066667,2.937865,0.066667,0.066667,0.066667,0.066667,5559.279798,0.066667,0.066873,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
Topic1,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,13395.87844,0.066667,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,2574.428988,0.066667,0.066667
Topic2,0.066667,0.066667,593.976154,0.57226,0.066667,3758.010626,0.066667,0.066667,0.320276,2.478003,...,0.066667,2983.076091,0.269859,0.066667,2923.260736,0.066667,0.066667,3.705476,0.066667,2232.626444
Topic3,0.066667,0.066667,434.961942,0.609966,0.066667,0.066667,0.066667,0.066667,664.520883,128.986369,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,52.192922,0.066667,0.066667
Topic4,3778.355991,0.066667,1075.391513,0.066668,0.066667,2.732746,118.298979,0.066667,59.970989,0.072993,...,0.079476,0.066667,289.486409,0.066667,53.273252,0.066667,0.066667,0.066667,0.066667,813.352762
Topic5,0.066667,0.066667,0.066667,2613.293847,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
Topic6,0.066667,0.066667,399.077426,10.197162,0.066667,5.494834,0.066667,0.066667,0.066687,406.498845,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,297.658861,0.066667,0.066667
Topic7,0.066667,0.066667,1072.025935,659.563895,0.066667,873.040586,0.066667,0.066667,189.27859,1552.652037,...,4187.076938,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.073244,0.066667,0.066667
Topic8,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,16.904908,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
Topic9,0.066667,0.066667,7097.698881,0.066744,1926.35968,0.066667,2553.308234,0.066667,0.066667,4554.766035,...,0.066667,0.066667,435.918043,3954.770694,0.066667,0.066667,0.066667,1.761281,0.066667,0.066667


In [80]:
# print top n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    words = np.array(tfidf_model.get_feature_names_out())
    topic_words = []
    # for each topic, we have words weight
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words)) #array.take(): 从给出的index取出数据
    return topic_words

topic_keywords = print_topic_words(tfidf_model=tfidf_model, lda_model=lda_15, n_words=15)        

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word '+str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic '+str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,amp,martinez,pedro,deputi,rt,https,david,shot,la,los,por,en,carlo,al,del
Topic 1,año,manuel,paul,jean,montaño,andré,jair,cortez,perlaza,juan,cárdena,leyder,rt,julianroman,álvaro
Topic 2,la,en,https,time,gonzález,hoy,rt,alejandro,ryan,el,twyman,venezuela,esta,sobr,del
Topic 3,lópez,daniel,sergio,ramo,cruz,https,san,el,martín,rt,ramón,en,la,josé,su
Topic 4,que,lo,te,si,abraham,rt,pérez,el,soto,oscar,hace,los,todo,daniel,es
Topic 5,rt,alberto,el,la,asesinado,esta,por,al,en,martínez,pérez,https,gonzález,álvaro,del
Topic 6,roberto,garcía,pablo,jorg,castillo,moreno,https,el,rt,en,la,del,más,que,lui
Topic 7,carlo,lui,http,rt,torr,https,juan,ramírez,fernando,flore,la,garcia,garcía,en,para
Topic 8,https,rt,kill,smith,shot,jose,oscar,juan,los,la,el,antonio,del,josé,le
Topic 9,el,al,josé,rt,para,del,la,richard,en,león,antonio,valencia,sánchez,president,se
