<p>Refs:</p>
<p>(1) https://pythonprogramminglanguage.com/kmeans-text-clustering/</p>
<p>(2) https://financetrain.com/k-means-algorithm-python-example/</p>
<p>(3) https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py</p>
<p>(4) https://en.wikipedia.org/wiki/List_of_S%26P_500_companies</p>

In [1]:
import bs4
import requests
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def obtain_parse_wiki_snp500(): 
    """ Download and parse the Wikipedia list of S&P500 constituents using requests and Beautiful Soup.
    """     
    
    response = requests.get( "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies" )
    
    soup = bs4.BeautifulSoup(response.text)
    
    # This selects the first table, using CSS Selector syntax and then ignores the header row ([1:])
    symbolslist = soup.select('table')[0].select('tr')[1:]
    
    # Obtain the symbol information for each row in the S&P500 constituent table
    symbols = []
    for i, symbol in enumerate(symbolslist):
        tds = symbol.select('td')
        symbols.append((#tds[0].select('a')[0].text, # Ticker
                        tds[1].select('a')[0].text, # Name
                        tds[3].text, # Sector
                        tds[4].text # SubSector
                      )) 
 
    return symbols
 
tickers = obtain_parse_wiki_snp500()

In [3]:
documents = [' '.join([word for word in row]) for row in tickers]

In [4]:
documents

['3M Company Industrials Industrial Conglomerates',
 'Abbott Laboratories Health Care Health Care Equipment',
 'AbbVie Inc. Health Care Pharmaceuticals',
 'Abiomed Health Care Health Care Equipment',
 'Accenture Information Technology IT Consulting & Other Services',
 'Activision Blizzard Communication Services Interactive Home Entertainment',
 'Adobe Inc. Information Technology Application Software',
 'Advanced Micro Devices Information Technology Semiconductors',
 'Advance Auto Parts Consumer Discretionary Automotive Retail',
 'AES Corp Utilities Independent Power Producers & Energy Traders',
 'Aflac Financials Life & Health Insurance',
 'Agilent Technologies Health Care Health Care Equipment',
 'Air Products & Chemicals Materials Industrial Gases',
 'Akamai Technologies Information Technology Internet Services & Infrastructure',
 'Alaska Air Group Industrials Airlines',
 'Albemarle Corporation Materials Specialty Chemicals',
 'Alexandria Real Estate Equities Real Estate Office REITs

In [5]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [6]:
print(X.toarray())

[[1 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]


In [7]:
X.shape

(505, 811)

In [8]:
true_k = 11 #2 11 13
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=11, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [9]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    term = "Cluster %d:" % i
    print(term.replace(term, "\033[48;5;0m\033[38;5;226m {term} \033[0;0m".format(term=term)))    
    for ind in order_centroids[i, :]:
        print(' %s' % terms[ind])
    print

Top terms per cluster:
[48;5;0m[38;5;226m Cluster 0: [0;0m
 utilities
 energy
 electric
 multi
 corp
 power
 american
 water
 group
 edison
 producers
 traders
 independent
 dominion
 company
 dte
 duke
 cms
 ameren
 nisource
 atmos
 public
 nextera
 gas
 entergy
 enterprise
 centerpoint
 evergy
 sempra
 service
 eversource
 exelon
 firstenergy
 capital
 consolidated
 nrg
 southern
 pinnacle
 works
 aes
 int
 west
 wec
 xcel
 ppl
 pseg
 alliant
 eog
 equipment
 equinix
 dentsply
 exxon
 equifax
 environmental
 equity
 f5
 entertainment
 facebook
 depot
 enphase
 design
 equities
 decker
 exploration
 defense
 expeditors
 expedia
 express
 deere
 exchanges
 exchange
 delta
 essex
 dennison
 extra
 everest
 engineering
 estée
 estate
 etsy
 dish
 employment
 dwight
 dickinson
 digital
 drug
 drinks
 dow
 direct
 dover
 discover
 domino
 discovery
 dollar
 diversified
 distributors
 discretionary
 distillers
 dupont
 dxc
 disney
 dynamics
 eli
 electronics
 electronic
 electrical
 devi

 lyondellbasell
 luxury
 lumen
 lowe
 logistics
 loews
 lockheed
 lkq
 live
 lines
 line
 linde
 lincoln
 limited
 lilly
 lifesciences
 life
 lennar
 leisure
 leidos
 machines
 managed
 management
 masco
 meats
 mcmoran
 mclennan
 mckesson
 mcdonald
 mccormick
 maxim
 materials
 mastercard
 martin
 manufacturers
 marsh
 marriott
 markit
 markets
 marketing
 marketaxess
 marietta
 marathon
 manufacturing
 leggett
 lauren
 lauder
 int
 intuitive
 intuit
 interpublic
 internet
 international
 intercontinental
 interactive
 zions
 integrated
 insurance
 ipg
 instruments
 ingersoll
 ingalls
 infrastructure
 information
 industries
 industrials
 independent
 incyte
 invesco
 iqvia
 las
 kimberly
 lamb
 lam
 laboratory
 laboratories
 l3harris
 kroger
 kraft
 kla
 kinder
 keysight
 jack
 keycorp
 kellogg
 kansas
 juniper
 jpmorgan
 johnson
 jm
 james
 jacobs
 media
 mellon
 philip
 merchandise
 pacific
 paccar
 outsourced
 otis
 oracle
 oneok
 omnicom
 old
 oil
 occidental
 nxp
 nvr
 nvidia
 n

 resmed
 residential
 research
 republic
 rentals
 reits
 reinsurance
 reilly
 regions
 regional
 regeneron
 regency
 stryker
 super
 supply
 watson
 water
 waste
 walt
 walmart
 walgreens
 vulcan
 vornado
 visa
 vintners
 viatris
 viacomcbs
 vf
 vertex
 verizon
 verisk
 verisign
 ventas
 vegas
 valero
 utilities
 unum
 waters
 wec
 united
 wells
 zimmer
 zebra
 yum
 york
 xylem
 xilinx
 xcel
 wynn
 worldwide
 works
 wireless
 willis
 williams
 wholesale
 whirlpool
 weyerhaeuser
 westrock
 weston
 westinghouse
 western
 welltower
 unitedhealth
 union
 support
 tobacco
 thrifts
 thermo
 textron
 texas
 tesla
 teradyne
 teleflex
 teledyne
 telecommunication
 technology
 technologies
 te
 target
 tapestry
 systems
 sysco
 synopsys
 synchrony
 svc
 svb
 surgical
 tjx
 toledo
 ulta
 tool
 udr
 tyson
 tyler
 twitter
 trust
 truist
 trucks
 trucking
 trimble
 tree
 travelers
 transportation
 transport
 transdigm
 trane
 trading
 traders
 tractor
 towers
 tower
 tools
 producers
 processing
 i

 insurance
 ingersoll
 ingalls
 jm
 johnson
 jpmorgan
 lamb
 lifesciences
 life
 lennar
 leisure
 leidos
 leggett
 lauren
 lauder
 las
 laboratory
 kansas
 laboratories
 l3harris
 kroger
 kraft
 kinder
 kimco
 kimberly
 keycorp
 kellogg
 microchip
 microsoft
 ppl
 penn
 payments
 paycom
 paychex
 parts
 parker
 parcel
 paper
 palmolive
 packard
 packaging
 packaged
 pacific
 paccar
 outsourced
 otis
 oracle
 oneok
 omnicom
 old
 oil
 office
 paypal
 pentair
 mid
 people
 ppg
 power
 pool
 pnc
 plc
 platt
 pizza
 pioneer
 pinnacle
 photonics
 phillips
 philip
 pharmaceuticals
 pharmaceutical
 pfizer
 petroleum
 personal
 perrigo
 perkinelmer
 peripherals
 pepsico
 occidental
 nxp
 nvr
 nvidia
 myers
 multi
 msci
 movies
 mountain
 motors
 motor
 mosaic
 mortgage
 morris
 morgan
 moody
 monster
 monolithic
 mondelez
 molson
 mohawk
 mobile
 mobil
 mills
 midland
 nasdaq
 nation
 national
 nike
 nucor
 nrg
 nov
 norwegian
 nortonlifelock
 northrop
 northern
 norfolk
 nisource
 nielsen
 na

 food
 fmc
 flir
 fleetcor
 flavors
 fisher
 diamondback
 zoetis
 dexcom
 beauty
 ball
 baker
 avery
 avalonbay
 autozone
 automotive
 automobile
 automation
 automatic
 autodesk
 auto
 atmos
 associates
 arts
 armour
 arista
 archer
 aptiv
 applied
 baxter
 becton
 appliances
 best
 broadridge
 broadcom
 broadcasting
 bristol
 brewers
 brands
 brake
 boston
 borgwarner
 boots
 booking
 boeing
 blizzard
 black
 biotechnology
 biomet
 biogen
 bio
 beverage
 application
 apple
 building
 alexandria
 alaska
 akamai
 airlines
 air
 agricultural
 agilent
 aes
 aerospace
 advertising
 advanced
 advance
 adobe
 activision
 accessories
 accenture
 abiomed
 abbvie
 abbott
 66
 albemarle
 alexion
 apparel
 align
 apartments
 apa
 anthem
 ansys
 analytics
 analog
 amphenol
 amgen
 ametek
 amerisourcebergen
 ameren
 amcor
 amazon
 altria
 alternative
 alphabet
 alliant
 alliance
 allegion
 brown
 business
 devon
 corning
 copart
 coors
 cooper
 controls
 containers
 host
 consulting
 construction


In [10]:
print("\n")
print("Prediction")

Y = vectorizer.transform(["PicPay Financials Financial Exchanges & Data"])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["Vale Industrials Industrial Conglomerates"])
prediction = model.predict(Y)
print(prediction)



Prediction
[7]
[8]


In [11]:
for i in range(true_k):
    term = "===Cluster %d===" % i
    print(term.replace(term, "\033[48;5;0m\033[38;5;226m {term} \033[0;0m".format(term=term)))
    for j in range(len(documents)):
        if model.labels_[j] == i:
            print(model.labels_[j], ' ', tickers[j][0], '\t\t', tickers[j][1])

[48;5;0m[38;5;226m ===Cluster 0=== [0;0m
0   AES Corp 		 Utilities
0   Alliant Energy 		 Utilities
0   Ameren Corp 		 Utilities
0   American Electric Power 		 Utilities
0   American Water Works 		 Utilities
0   Atmos Energy 		 Utilities
0   CenterPoint Energy 		 Utilities
0   CMS Energy 		 Utilities
0   Consolidated Edison 		 Utilities
0   Dominion Energy 		 Utilities
0   DTE Energy Co. 		 Utilities
0   Duke Energy 		 Utilities
0   Edison Int'l 		 Utilities
0   Entergy Corp. 		 Utilities
0   Evergy 		 Utilities
0   Eversource Energy 		 Utilities
0   Exelon Corp. 		 Utilities
0   FirstEnergy Corp 		 Utilities
0   NextEra Energy 		 Utilities
0   NiSource Inc. 		 Utilities
0   NRG Energy 		 Utilities
0   Pinnacle West Capital 		 Utilities
0   PPL Corp. 		 Utilities
0   Public Service Enterprise Group (PSEG) 		 Utilities
0   Sempra Energy 		 Utilities
0   Southern Company 		 Utilities
0   WEC Energy Group 		 Utilities
0   Xcel Energy Inc 		 Utilities
[48;5;0m[38;5;226m ===Cluster 1===

10   ResMed 		 Health Care
10   Steris 		 Health Care
10   Stryker Corp. 		 Health Care
10   Teleflex 		 Health Care
10   Thermo Fisher Scientific 		 Health Care
10   UnitedHealth Group Inc. 		 Health Care
10   Vertex Pharmaceuticals Inc 		 Health Care
10   Viatris 		 Health Care
10   Waters Corporation 		 Health Care
10   Zimmer Biomet 		 Health Care
10   Zoetis 		 Health Care
