# K-Means Segmentation

## Step 01: Download the Data

In [1]:
#Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import yfinance as yf
from yahoofinancials import YahooFinancials

In [2]:
# Company name -> ticker symbol for every stock considered in the clustering.
# Entry order is preserved because the ticker list handed to the downloader is
# built from this dict's values.
companies_dict = {
    "Amazon": "AMZN",
    "Apple": "AAPL",
    "Walgreen": "WBA",
    "Northrop Grumman": "NOC",
    "Boeing": "BA",
    "Lockheed Martin": "LMT",
    "McDonalds": "MCD",
    "Intel": "INTC",
    "Navistar": "NAV",  # listed under "Failed downloads" in the recorded run
    "IBM": "IBM",
    "Texas Instruments": "TXN",
    "MasterCard": "MA",
    "Microsoft": "MSFT",
    "General Electrics": "GE",
    "Symantec": "SYMC",  # listed under "Failed downloads" in the recorded run
    "American Express": "AXP",
    "Pepsi": "PEP",
    "Coca Cola": "KO",
    "Johnson & Johnson": "JNJ",
    "Toyota": "TM",
    "Honda": "HMC",
    "Mistubishi": "MSBHY",  # listed under "Failed downloads" in the recorded run
    "Sony": "SNE",  # listed under "Failed downloads" in the recorded run
    "Exxon": "XOM",
    "Chevron": "CVX",
    "Valero Energy": "VLO",
    "Ford": "F",
    "Bank of America": "BAC",
}

In [4]:
# Download price history for every ticker in companies_dict, from a fixed
# start date up to the day the cell is executed.
from datetime import datetime as dt

start = '2021-01-01'
end = dt.now().strftime('%Y-%m-%d')  # today's date, so the window grows on every re-run

# The date bounds are passed by keyword so the call reads unambiguously.
p = yf.download(
    list(companies_dict.values()),
    start=start,
    end=end,
)

print(f'Data is from {start} to {end}')

[*********************100%***********************]  28 of 28 completed

4 Failed downloads:
['NAV', 'MSBHY', 'SYMC', 'SNE']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')


Data is from 2021-01-01 to 2024-10-14


In [5]:
#Let's import the scikit-learn libraries for K-Means clustering
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

## Step 02: Calculating Daily Movements

In [10]:
# Daily movement = closing price minus opening price, per ticker.
# The result is transposed in the same expression so that each row is one
# ticker and each column is one date (easier to handle per company).
movements = (p['Close'] - p['Open']).transpose()
movements

Date,2021-01-04 00:00:00+00:00,2021-01-05 00:00:00+00:00,2021-01-06 00:00:00+00:00,2021-01-07 00:00:00+00:00,2021-01-08 00:00:00+00:00,2021-01-11 00:00:00+00:00,2021-01-12 00:00:00+00:00,2021-01-13 00:00:00+00:00,2021-01-14 00:00:00+00:00,2021-01-15 00:00:00+00:00,...,2024-09-30 00:00:00+00:00,2024-10-01 00:00:00+00:00,2024-10-02 00:00:00+00:00,2024-10-03 00:00:00+00:00,2024-10-04 00:00:00+00:00,2024-10-07 00:00:00+00:00,2024-10-08 00:00:00+00:00,2024-10-09 00:00:00+00:00,2024-10-10 00:00:00+00:00,2024-10-11 00:00:00+00:00
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAPL,-4.110001,2.119995,-1.120003,2.559998,-0.37999,-0.210007,0.300003,2.130005,-1.889999,-1.639999,...,2.960007,-3.309998,0.889999,0.529999,-1.099991,-2.809998,1.470001,4.309998,1.259995,-1.75
AMZN,-4.168503,2.625,-0.404999,0.257996,0.134995,-1.690002,0.041504,1.872498,-2.002502,-0.938507,...,-0.809998,0.230011,0.319992,-1.089996,0.759995,-2.149994,0.800003,2.349991,-0.480011,2.190002
AXP,-3.260002,0.409996,2.059998,-2.439995,-0.290001,0.639999,-0.379997,0.800003,1.129997,-0.330002,...,1.100006,-2.679993,2.320007,-1.050018,3.720001,-0.559998,-2.140015,2.610016,-0.459991,4.779999
BA,-7.279999,6.889999,0.809998,-0.679993,-3.710007,3.759995,1.76001,0.060013,1.12001,-4.639999,...,-2.75,2.759995,-0.869995,-1.599991,2.740005,0.910004,-1.340012,-2.680008,-2.450012,4.720001
BAC,-0.429998,0.09,0.250002,-0.450001,-0.389999,0.98,0.240002,0.029999,0.510002,-0.280003,...,0.450001,-0.289997,-0.02,0.27,0.18,-0.25,-0.119999,0.32,-0.039997,1.450001
CVX,-0.440002,1.510002,0.800003,-0.169998,-1.0,2.479996,0.679993,0.050003,1.549995,-2.130005,...,1.900009,3.349991,-0.729996,-0.059998,-1.73999,-0.380005,-0.449997,2.139999,-0.059998,1.039993
F,-0.29,0.179999,0.05,0.120001,-0.1,0.43,0.48,-0.13,0.36,-0.21,...,0.080001,0.03,-0.25,0.04,-0.02,-0.06,-0.01,0.13,0.1,0.08
GE,-2.093338,1.694607,2.641594,-1.4454,0.249207,1.694607,1.345715,-1.046665,0.249207,-1.345715,...,3.5,-1.190002,0.869995,-2.610001,0.5,0.020004,-0.149994,2.649994,-0.029999,1.75
HMC,-0.24,0.15,0.190001,-0.049999,-0.129999,0.27,0.18,-0.119999,0.1,-0.01,...,-0.210001,-0.17,0.27,-0.049999,0.48,-0.200001,-0.019999,0.18,0.129999,0.32
IBM,-1.826004,1.080307,2.284897,-1.003822,-0.038239,0.602295,0.114723,-2.131927,0.908218,0.105164,...,0.430008,-1.279999,1.419998,3.220001,2.25,1.73999,0.509995,5.100006,-2.080002,0.009995


## Step 03: Normalizing the Movements

In [13]:
# Drop every ticker (row) that has at least one missing value, e.g. the
# tickers whose download failed and therefore contain only NaN.
# The cleaned frame is re-bound to `movements` rather than dropped with
# inplace=True, so the object shown by the earlier cell is not mutated in
# place; later cells still read the cleaned frame through the same name.
movements = movements.dropna()

# Normalizer rescales each row (one ticker) to unit norm (L2 by default in
# scikit-learn), so clustering compares the pattern of the movements rather
# than their absolute price scale.
norm_engine = Normalizer()
norm_movements = norm_engine.fit_transform(movements)

In [16]:
# Dimensions of the normalized array: one row per ticker that survived the
# dropna step, one column per date in the downloaded window.
norm_movements.shape

(24, 950)

In [17]:
# Preview the normalized movements of the first five tickers
norm_movements[:5]

array([[-0.05819297,  0.03001674, -0.01585798, ...,  0.06102471,
         0.0178401 , -0.02477803],
       [-0.05694872,  0.03586189, -0.00553296, ...,  0.03210481,
        -0.00655775,  0.0299191 ],
       [-0.04181616,  0.00525903,  0.02642366, ...,  0.03347877,
        -0.00590033,  0.06131321],
       [-0.06358845,  0.06018193,  0.00707507, ..., -0.02340901,
        -0.02140007,  0.04122769],
       [-0.02774038,  0.00580616,  0.01612831, ...,  0.02064406,
        -0.00258032,  0.09354353]])

## Step 04: Applying K-Means

In [18]:
# Number of clusters to partition the tickers into
num_of_clusters = 3

# random_state seeds the centroid initialization so repeated runs give the
# same clustering.
# n_init is pinned explicitly: the library default differs between
# scikit-learn releases (10 restarts in older versions, 'auto' in newer ones),
# so leaving it implicit makes the result depend on the installed version.
# Ten restarts keep the best of several initializations, which matters with
# this few samples.
kmeans = KMeans(n_clusters=num_of_clusters, n_init=10, random_state=42)
kmeans.fit(norm_movements)

## Step 05: Analyze the Cluster Results

In [24]:
# Cluster labels for each company: one integer cluster id per row of
# norm_movements, in the same row order as the data passed to fit().
labels = kmeans.labels_
labels

array([2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 2, 1, 2, 1, 1, 0, 2, 0,
       0, 0], dtype=int32)

In [29]:
# Combine the labels and the companies into a single DataFrame.
# The 'Company' column is taken from the row labels of `movements` (the ticker
# symbols), which are in the same order as the rows that were clustered.
company_clusters = pd.DataFrame(
    data=dict(Company=movements.index, Cluster=labels)
)
company_clusters

Unnamed: 0,Company,Cluster
0,AAPL,2
1,AMZN,2
2,AXP,0
3,BA,0
4,BAC,0
5,CVX,0
6,F,0
7,GE,0
8,HMC,0
9,IBM,0


In [35]:
# Print the members of every cluster.
# The heading is 1-based (label + 1), so "Cluster 1" corresponds to label 0
# in the company_clusters table.
for cluster in range(num_of_clusters):
    members = company_clusters.loc[company_clusters['Cluster'] == cluster, 'Company']
    print(f"\nCluster {cluster + 1}:")
    print(members.values)


Cluster 1:
['AXP' 'BA' 'BAC' 'CVX' 'F' 'GE' 'HMC' 'IBM' 'TM' 'VLO' 'WBA' 'XOM']

Cluster 2:
['JNJ' 'KO' 'LMT' 'MCD' 'NOC' 'PEP']

Cluster 3:
['AAPL' 'AMZN' 'INTC' 'MA' 'MSFT' 'TXN']
