In [22]:
# https://github.com/derekgreene/topic-model-tutorial/blob/master/1%20-%20Text%20Preprocessing.ipynb
from pathlib import Path
import operator, joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [23]:
# each line of the input file is treated as one article
in_path = Path("data") / "articles.txt"
raw_documents, snippets = [], []
with open(in_path, "r", encoding="utf8") as fin:
    # iterate the file handle directly; surrounding whitespace/newlines are stripped
    raw_documents.extend(row.strip() for row in fin)
# the first 100 characters (or fewer) of every article double as its title
snippets.extend(doc[:100] for doc in raw_documents)
print("Read %d raw text documents" % len(raw_documents))

Read 171 raw text documents


In [24]:
# read one stop word per line into a plain list (whitespace stripped)
custom_stop_words = []
with open("stopwords.txt", "r", encoding="utf8") as fin:
    custom_stop_words.extend(entry.strip() for entry in fin)
print("Stopword list has %d entries" % len(custom_stop_words))

Stopword list has 1298 entries


In [25]:
# raw term counts: drop the custom stop words and any term that appears
# in fewer than 20 documents
vectorizer = CountVectorizer(min_df=20, stop_words=custom_stop_words)
A = vectorizer.fit_transform(raw_documents)
# A.shape is (number of documents, number of terms)
print("Created %d X %d document-term matrix" % A.shape)

Created 171 X 78 document-term matrix


In [26]:
# vocabulary of the fitted vectorizer, as a plain list
terms = [*vectorizer.get_feature_names_out()]
print("Vocabulary has %d distinct terms" % len(terms))

Vocabulary has 78 distinct terms


In [27]:
# persist the count matrix together with its vocabulary and the article snippets
joblib.dump(
    (A, terms, snippets),
    "articles-raw.pkl",
)

['articles-raw.pkl']

In [28]:
# TF-IDF weighting with the same stop words and minimum document frequency;
# note that this rebinds `vectorizer` and `A` from the raw-count versions
vectorizer = TfidfVectorizer(min_df=20, stop_words=custom_stop_words)
A = vectorizer.fit_transform(raw_documents)
# A.shape is (number of documents, number of terms)
print("Created %d X %d TF-IDF-normalized document-term matrix" % A.shape)

Created 171 X 78 TF-IDF-normalized document-term matrix


In [29]:
# vocabulary of the fitted TF-IDF vectorizer, as a plain list
terms = [*vectorizer.get_feature_names_out()]
print("Vocabulary has %d distinct terms" % len(terms))

Vocabulary has 78 distinct terms


In [30]:
def rank_terms(A, term):
    """Rank vocabulary terms by their total weight over all documents.

    Parameters
    ----------
    A : document-term matrix
        Its column sums are read as ``A.sum(axis=0)[0, col]``, so the result of
        ``sum(axis=0)`` must be two-dimensional (assumed to be the sparse matrix
        produced by the vectorizer's ``fit_transform`` -- confirm for other inputs).
    term : sequence of str
        The vocabulary; the entry at position ``col`` labels column ``col`` of
        ``A``. The parameter name is singular only to keep the existing
        signature; it holds the whole list of terms.

    Returns
    -------
    list of (term, weight) tuples
        Sorted by weight, largest first. Terms with equal weight keep their
        order from ``term``.
    """
    # get the sums over each column
    sums = A.sum(axis=0)
    # map weights to the terms; iterate the argument itself rather than the
    # notebook-level `terms` variable, so the function works on whatever
    # vocabulary the caller passes in
    weights = {name: sums[0, col] for col, name in enumerate(term)}
    # rank the terms by their weight over all documents
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

In [32]:
ranking = rank_terms(A, terms)
# list the highest-weighted terms (at most 100), numbered from 1
for position, (name, weight) in enumerate(ranking[:100], start=1):
    print("%02d. %s (%.2f)" % (position, name, weight))

01. model (22.19)
02. software (18.99)
03. learning (18.85)
04. based (16.87)
05. devops (16.40)
06. systems (16.18)
07. testing (15.09)
08. engineering (14.77)
09. data (14.60)
10. models (14.38)
11. development (14.13)
12. continuous (13.08)
13. deployment (12.43)
14. driven (12.02)
15. integration (11.89)
16. approach (11.56)
17. cloud (11.26)
18. modeling (11.08)
19. applications (10.21)
20. process (10.09)
21. code (9.35)
22. requirements (9.09)
23. machine (9.08)
24. design (8.59)
25. architecture (8.49)
26. framework (8.42)
27. quality (8.36)
28. domain (7.98)
29. paper (7.94)
30. time (7.23)
31. analysis (6.88)
32. tools (6.82)
33. cyber (6.69)
34. physical (6.60)
35. specific (6.13)
36. techniques (6.00)
37. support (5.83)
38. application (5.82)
39. industrial (5.75)
40. management (5.66)
41. automation (5.65)
42. study (5.61)
43. challenges (5.53)
44. language (5.39)
45. automated (5.31)
46. proposed (5.08)
47. monitoring (5.03)
48. performance (5.00)
49. approaches (4.97)
50

In [33]:
# persist the TF-IDF matrix together with its vocabulary and the article snippets
joblib.dump(
    (A, terms, snippets),
    "articles-tfidf.pkl",
)

['articles-tfidf.pkl']