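References:

- scikit-learn example, "Inductive Clustering": https://scikit-learn.org/stable/auto_examples/cluster/plot_inductive_clustering.html
- Stack Abuse, "Hierarchical Clustering with Python and Scikit-Learn": https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/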

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import numpy as np
import bs4
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, clone
from sklearn.utils.metaestimators import if_delegate_has_method

  from numpy.core.umath_tests import inner1d


In [2]:
class InductiveClusterer(BaseEstimator):
    """Meta-estimator that lets a clusterer label samples it was not fitted on.

    ``fit`` clusters the training data with a clone of ``clusterer`` and then
    trains a clone of ``classifier`` to reproduce the resulting cluster labels.
    Prediction is delegated to that trained classifier.

    Parameters
    ----------
    clusterer : estimator
        Unfitted prototype; its clone must provide ``fit_predict``.
    classifier : estimator
        Unfitted prototype; its clone must provide ``fit``.

    Attributes
    ----------
    clusterer_ : estimator
        Fitted clone of ``clusterer`` (set by ``fit``).
    classifier_ : estimator
        Fitted clone of ``classifier`` (set by ``fit``).
    """

    def __init__(self, clusterer, classifier):
        # Store the prototypes untouched; fitting always works on clones.
        self.clusterer = clusterer
        self.classifier = classifier

    def fit(self, X, y=None):
        """Cluster ``X`` and train the classifier on the cluster labels.

        Parameters
        ----------
        X : training samples, passed unchanged to both estimators.
        y : ignored; the targets are the labels produced by the clusterer.

        Returns
        -------
        self
        """
        self.clusterer_ = clone(self.clusterer)
        self.classifier_ = clone(self.classifier)
        # The cluster assignments become the supervised targets.
        cluster_labels = self.clusterer_.fit_predict(X)
        self.classifier_.fit(X, cluster_labels)
        return self

    @if_delegate_has_method(delegate='classifier_')
    def predict(self, X):
        """Return the fitted classifier's predictions for ``X``."""
        return self.classifier_.predict(X)

    @if_delegate_has_method(delegate='classifier_')
    def decision_function(self, X):
        """Return the fitted classifier's ``decision_function`` output for ``X``."""
        return self.classifier_.decision_function(X)

In [3]:
def obtain_parse_wiki_snp500(
    url="https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
    timeout=30,
):
    """Download and parse the Wikipedia list of S&P500 constituents.

    The page is fetched with ``requests`` and parsed with Beautiful Soup.
    Only the first ``<table>`` of the page is read, and its first row is
    treated as the header and skipped.

    Parameters
    ----------
    url : str, optional
        Page to download. Defaults to the Wikipedia S&P 500 list.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    list of tuple of str
        One ``(name, sector, sub_sector)`` tuple per usable table row, taken
        from cells 1, 3 and 4 of the row (cell 0, the ticker, is not kept).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the downloaded page contains no ``<table>`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), which can change the resulting tree.
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # This selects the first table, using CSS Selector syntax.
    table = soup.select_one('table')
    if table is None:
        raise ValueError("no <table> element found at %s" % url)

    # Ignore the header row ([1:]).
    rows = table.select('tr')[1:]

    # Obtain the company information for each row in the constituent table.
    symbols = []
    for row in rows:
        tds = row.select('td')
        if len(tds) < 5:
            # Header-style or malformed row: it lacks the cells read below.
            continue
        name_links = tds[1].select('a')
        # Prefer the link text; fall back to the raw cell text if the name
        # is not wrapped in an anchor.
        name = name_links[0].text if name_links else tds[1].text.strip()
        symbols.append((name,          # Name
                        tds[3].text,   # Sector
                        tds[4].text))  # SubSector

    return symbols


tickers = obtain_parse_wiki_snp500()

In [4]:
# One whitespace-joined text document per company row (all fields of the tuple).
documents = [' '.join(row) for row in tickers]

In [5]:
# Bag-of-words encoding of the company documents; English stop words are
# dropped via the vectorizer's built-in 'english' list.
vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from `documents` and encode them in a single step.
# The result is converted with .toarray() wherever it is used further down.
X = vectorizer.fit_transform(documents)

In [6]:
# linkage -> ward, complete, average, single
# distance (`affinity`, renamed `metric` in newer scikit-learn releases)
#          -> euclidean, l1, l2, manhattan, cosine
# Ward linkage only supports the euclidean distance, which is also the
# default, so the distance argument is left out: this keeps the cell working
# both before and after the `affinity` -> `metric` rename.
N_CLUSTERS = 11  # presumably one cluster per GICS sector -- TODO confirm
clusterer = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='ward')
# The clusterer is fitted on a dense array, hence .toarray().
clusterer.fit(X.toarray())

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=11,
            pooling_func=<function mean at 0x000002A047D71950>)

In [7]:
# Declare the inductive learning model that will be used to
# predict cluster membership for unknown instances.
# InductiveClusterer.fit clones both estimators, so `clusterer` itself is
# not modified here.
RANDOM_STATE = 42  # fixed seed so the forest, and the predictions below, are reproducible
classifier = RandomForestClassifier(random_state=RANDOM_STATE)
inductive_learner = InductiveClusterer(clusterer, classifier).fit(X.toarray())

print("\n")
print("Prediction")

# Unseen "name sector sub-sector" documents, in the same format as `documents`.
new_documents = [
    "PicPay Financials Financial Exchanges & Data",
    "Vale Industrials Industrial Conglomerates",
]
for document in new_documents:
    # Encode with the already-fitted vocabulary (transform, not fit_transform).
    Y = vectorizer.transform([document])
    prediction = inductive_learner.predict(Y.toarray())
    print(prediction)



Prediction
[2]
[3]


In [8]:
# Print every cluster as a highlighted header followed by its members
# (cluster label, company name, sector).
# The cluster count is read from the model so it cannot drift from the value
# the clusterer was configured with.
for i in range(clusterer.n_clusters):
    term = "===Cluster %d===" % i
    # ANSI 256-colour escapes: background 0 (black), foreground 226 (yellow),
    # then reset.
    print("\033[48;5;0m\033[38;5;226m {term} \033[0;0m".format(term=term))
    # labels_ is aligned row-for-row with `tickers` (one document per ticker row).
    for label, ticker in zip(clusterer.labels_, tickers):
        if label == i:
            print(label, ' ', ticker[0], '\t\t', ticker[1])

[48;5;0m[38;5;226m ===Cluster 0=== [0;0m
0   Accenture 		 Information Technology
0   Adobe Inc. 		 Information Technology
0   Advanced Micro Devices 		 Information Technology
0   Akamai Technologies 		 Information Technology
0   Amphenol Corp 		 Information Technology
0   Analog Devices, Inc. 		 Information Technology
0   ANSYS, Inc. 		 Information Technology
0   Apple Inc. 		 Information Technology
0   Applied Materials Inc. 		 Information Technology
0   Arista Networks 		 Information Technology
0   Autodesk Inc. 		 Information Technology
0   Automatic Data Processing 		 Information Technology
0   Broadcom Inc. 		 Information Technology
0   Broadridge Financial Solutions 		 Information Technology
0   Cadence Design Systems 		 Information Technology
0   CDW 		 Information Technology
0   Cisco Systems 		 Information Technology
0   Citrix Systems 		 Information Technology
0   Cognizant Technology Solutions 		 Information Technology
0   Corning Inc. 		 Information Technology
0   DXC Te

10   Dow Inc. 		 Materials
10   DuPont de Nemours Inc 		 Materials
10   Eastman Chemical 		 Materials
10   Ecolab Inc. 		 Materials
10   FMC Corporation 		 Materials
10   Freeport-McMoRan Inc. 		 Materials
10   International Paper 		 Materials
10   International Flavors & Fragrances 		 Materials
10   Linde plc 		 Materials
10   LyondellBasell 		 Materials
10   Martin Marietta Materials 		 Materials
10   The Mosaic Company 		 Materials
10   Newmont Corporation 		 Materials
10   Nucor Corp. 		 Materials
10   Packaging Corporation of America 		 Materials
10   PPG Industries 		 Materials
10   Sealed Air 		 Materials
10   Sherwin-Williams 		 Materials
10   Vulcan Materials 		 Materials
10   WestRock 		 Materials
