<a href="https://colab.research.google.com/github/lilyontherocks/lilyontherocks/blob/main/Clustering_Linda_Z.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from collections import Counter
from scipy.stats import mstats


Data Preprocessing:


In [2]:
from google.colab import drive

# Mount Google Drive so the CSV stored there becomes readable from the runtime.
drive.mount('/content/drive')

# The dataset was saved to the author's Google Drive beforehand.
DATA_PATH = '/content/drive/My Drive/AdultDataset.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Mounted at /content/drive


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


**Defining a dataframe and removing question marks**

In [3]:
# Columns screened for the '?' placeholder, listed in the order the filters run.
columns_to_screen = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country', 'income',
]

# Keep only the rows whose value in each screened column is not '?'.
for column_name in columns_to_screen:
  data = data[data[column_name] != '?']
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


Factorising the categorical columns to obtain numerical values for K-Means clustering:

1.   Each category (e.g. Private, Local-gov) is turned into a unique integer code, and a mapping array of the original categories is obtained.
2.   This array is kept so that an integer code can later be mapped back to the category it stands for.

In [4]:
# Work on a copy so the string-valued frame `data` stays untouched.
cluster_data = data.copy()

# Categorical columns that are replaced by integer codes.
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# pd.factorize returns (codes, uniques): the codes overwrite the column and the
# uniques are stored so a code can be translated back to its category later.
category_lookup = {}
for column_name in categorical_columns:
  cluster_data[column_name], category_lookup[column_name] = pd.factorize(cluster_data[column_name])

# Expose each lookup array under the module-level name used by later cells.
(workclass, education, marital_status, occupation,
 relationship, race, sex, country) = (category_lookup[name] for name in categorical_columns)

def score(x):
  """Encode an income label as a binary flag.

  Args:
    x: Raw income label, expected to be '<=50K' or '>50K'.

  Returns:
    0 for '<=50K', 1 for '>50K', and None for any other value.
  """
  if x == '<=50K':  # earning <=50K is encoded as 0
    return 0
  if x == '>50K':  # earning >50K is encoded as 1
    return 1
  # Unrecognised label: keep the original fall-through result, but make it explicit.
  return None

# Replace the income labels with their binary encoding, then show the encoded frame.
income_flags = cluster_data['income'].map(score)
cluster_data['income'] = income_flags
cluster_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,0,226802,0,7,0,0,0,0,0,0,0,40,0,0
1,38,0,89814,1,9,1,1,1,1,0,0,0,50,0,0
2,28,1,336951,2,12,1,2,1,1,0,0,0,40,0,1
3,44,0,160323,3,10,1,0,1,0,0,7688,0,40,0,1
5,34,0,198693,4,6,0,3,2,1,0,0,0,30,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,0,257302,2,12,1,8,4,1,1,0,0,38,0,0
48838,40,0,154374,1,9,1,0,1,1,0,0,0,40,0,1
48839,58,0,151910,1,9,2,6,3,1,1,0,0,40,0,0
48840,22,0,201490,1,9,0,6,0,1,0,0,0,20,0,0


Standardisation of the different, independent numerical columns, so that every column is weighted equally during clustering.

In [8]:
# Fit the scaler on every column of cluster_data and transform in one step.
# NOTE(review): cluster_data still holds the encoded 'income' column here, so it
# is part of the features that get clustered — confirm that is intended.
scaler = StandardScaler()
cluster_data_std = scaler.fit_transform(cluster_data)
cluster_data_std

array([[-1.02498291, -0.50785209,  0.35088942, ..., -0.07812006,
        -0.24197389, -0.57403082],
       [-0.04145504, -0.50785209, -0.94587846, ...,  0.7547014 ,
        -0.24197389, -0.57403082],
       [-0.79801494,  0.23179645,  1.39359159, ..., -0.07812006,
        -0.24197389,  1.74206675],
       ...,
       [ 1.47166476, -0.50785209, -0.35805983, ..., -0.07812006,
        -0.24197389, -0.57403082],
       [-1.25195088, -0.50785209,  0.11127873, ..., -1.74376299,
        -0.24197389, -0.57403082],
       [ 1.01772882,  3.19039064,  0.92951628, ..., -0.07812006,
        -0.24197389,  1.74206675]])

In [9]:
# Two-cluster K-Means on the standardised data, seeded for repeatable results.
kmean2 = KMeans(
    n_clusters=2,
    init='k-means++',
    n_init="auto",
    random_state=0,
)
kmean2.fit(cluster_data_std)

Error value (inertia) of the fitted model

In [10]:
# Inertia of the k=2 fit (scikit-learn: sum of squared distances of samples to their closest centre).
kmean2.inertia_

613784.7123263436

In [11]:
# Number of iterations the k=2 fit ran.
kmean2.n_iter_

18

The center points for each cluster

In [12]:
# Cluster centres in standardised feature space: one row per cluster, one value per input column.
kmean2.cluster_centers_

array([[ 0.45000881,  0.40844326, -0.05140215,  0.4183835 ,  0.6646066 ,
        -0.04440492,  0.05910191, -0.17876288,  0.12127675, -0.43860846,
         0.31148503,  0.30304193,  0.50526788,  0.02802596,  1.34827717],
       [-0.18371996, -0.16675047,  0.02098537, -0.17080866, -0.27133136,
         0.01812869, -0.02412886,  0.07298148, -0.04951228,  0.17906567,
        -0.12716644, -0.12371947, -0.20627995, -0.01144184, -0.55044574]])

Each row is assigned to its cluster, and a new feature named "Segment K-means" is introduced to hold the cluster label.

In [13]:
# Readable names for the two cluster ids.
segment_names_k2 = {0: 'first', 1: 'second'}

# New frame = encoded data plus the cluster id assigned to each row by kmean2.
cluster_kmean2 = cluster_data.assign(**{'Segment K-means': kmean2.labels_})
cluster_kmean2['Segment'] = cluster_kmean2['Segment K-means'].map(segment_names_k2)
cluster_kmean2

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,Segment K-means,Segment
0,25,0,226802,0,7,0,0,0,0,0,0,0,40,0,0,1,second
1,38,0,89814,1,9,1,1,1,1,0,0,0,50,0,0,1,second
2,28,1,336951,2,12,1,2,1,1,0,0,0,40,0,1,0,first
3,44,0,160323,3,10,1,0,1,0,0,7688,0,40,0,1,0,first
5,34,0,198693,4,6,0,3,2,1,0,0,0,30,0,0,1,second
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,0,257302,2,12,1,8,4,1,1,0,0,38,0,0,1,second
48838,40,0,154374,1,9,1,0,1,1,0,0,0,40,0,1,0,first
48839,58,0,151910,1,9,2,6,3,1,1,0,0,40,0,0,1,second
48840,22,0,201490,1,9,0,6,0,1,0,0,0,20,0,0,1,second


The income and cluster columns are paired up to count, for each cluster, the number of rows with income less than or equal to 50k and with income greater than 50k.
First output (0, 1) explanation:
0 means income less than or equal to 50k
1 means cluster 1
31823 rows have income less than or equal to 50k and belong to cluster 1

Second output (1, 0):
1 means income greater than 50k
0 means cluster 0
10882 rows have income greater than 50k and belong to cluster 0




In [16]:
# Pair each row's income flag with its cluster id, then tally every (income, cluster) pair.
income_flags_k2 = cluster_kmean2['income']
segment_ids_k2 = cluster_kmean2['Segment K-means']
zipped = [(income, segment) for income, segment in zip(income_flags_k2, segment_ids_k2)]
Counter(zipped)

Counter({(0, 1): 31823, (1, 0): 10882, (0, 0): 2191, (1, 1): 326})

One cluster (cluster 0) corresponds to income greater than 50k: 10882 of its 13073 rows fit this description and only 2191 do not, while just 326 rows with income greater than 50k fall into the other cluster.

In [17]:
# Four-cluster K-Means on the standardised data, seeded for repeatable results.
kmean4 = KMeans(
    n_clusters=4,
    init='k-means++',
    n_init="auto",
    random_state=0,
)
kmean4.fit(cluster_data_std)

In [18]:
# Inertia of the k=4 fit (scikit-learn: sum of squared distances of samples to their closest centre).
kmean4.inertia_

538547.027237887

In [19]:
# Number of iterations the k=4 fit ran.
kmean4.n_iter_

13

In [20]:
# Cluster centres in standardised feature space: one row per cluster, one value per input column.
kmean4.cluster_centers_

array([[ 0.4554042 ,  2.18837029, -0.16237973,  0.2300128 ,  0.41251275,
        -0.10972591, -0.06761487, -0.18711499,  0.09063804, -0.34636662,
         0.10046059,  0.04344439,  0.35880286, -0.0087038 ,  0.19854616],
       [ 0.35299759, -0.22434139, -0.02895337, -0.13917149, -0.20618789,
         1.2665953 , -0.08448004,  1.07718425, -0.0869966 ,  0.918464  ,
        -0.11210031, -0.09912397, -0.19314924,  0.0528898 , -0.37492536],
       [-0.48375016, -0.3844076 ,  0.06119724, -0.14206135, -0.26477996,
        -0.57351612,  0.02532119, -0.37510839, -0.01608344, -0.14039916,
        -0.12939937, -0.11721301, -0.20382139, -0.02117516, -0.57220855],
       [ 0.40682914, -0.2568035 , -0.0023999 ,  0.34650005,  0.59290964,
        -0.10783135,  0.08565902, -0.29228401,  0.08231528, -0.54453993,
         0.3686876 ,  0.3618472 ,  0.47073664, -0.0082688 ,  1.64538659]])

In [21]:
# Readable names for the four cluster ids.
segment_names_k4 = {0: 'first', 1: 'second', 2: 'third', 3: 'fourth'}

# New frame = encoded data plus the cluster id assigned to each row by kmean4.
cluster_kmean4 = cluster_data.assign(**{'Segment K-means': kmean4.labels_})
cluster_kmean4['Segment'] = cluster_kmean4['Segment K-means'].map(segment_names_k4)
cluster_kmean4

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,Segment K-means,Segment
0,25,0,226802,0,7,0,0,0,0,0,0,0,40,0,0,2,third
1,38,0,89814,1,9,1,1,1,1,0,0,0,50,0,0,2,third
2,28,1,336951,2,12,1,2,1,1,0,0,0,40,0,1,3,fourth
3,44,0,160323,3,10,1,0,1,0,0,7688,0,40,0,1,3,fourth
5,34,0,198693,4,6,0,3,2,1,0,0,0,30,0,0,2,third
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,0,257302,2,12,1,8,4,1,1,0,0,38,0,0,1,second
48838,40,0,154374,1,9,1,0,1,1,0,0,0,40,0,1,3,fourth
48839,58,0,151910,1,9,2,6,3,1,1,0,0,40,0,0,1,second
48840,22,0,201490,1,9,0,6,0,1,0,0,0,20,0,0,2,third


In [22]:
# Pair each row's income flag with its cluster id, then tally every (income, cluster) pair.
income_flags_k4 = cluster_kmean4['income']
segment_ids_k4 = cluster_kmean4['Segment K-means']
zipped = [(income, segment) for income, segment in zip(income_flags_k4, segment_ids_k4)]
Counter(zipped)

Counter({(0, 2): 20325,
         (1, 3): 8411,
         (0, 0): 3786,
         (0, 1): 9544,
         (1, 0): 1891,
         (1, 1): 891,
         (0, 3): 359,
         (1, 2): 15})

In [23]:
# Ten-cluster K-Means on the standardised data.
# NOTE(review): this fit is seeded with 42 while the k=2 and k=4 fits use 0 —
# confirm the different seed is intentional.
kmean10 = KMeans(
    n_clusters=10,
    init='k-means++',
    n_init="auto",
    random_state=42,
)
kmean10.fit(cluster_data_std)

In [24]:
# Inertia of the k=10 fit (scikit-learn: sum of squared distances of samples to their closest centre).
kmean10.inertia_

381293.2529705341

In [25]:
# Number of iterations the k=10 fit ran.
kmean10.n_iter_

35

In [26]:
# Cluster centres in standardised feature space: one row per cluster, one value per input column.
kmean10.cluster_centers_

array([[ 3.96628068e-01, -7.81730574e-02, -3.85323233e-02,
        -2.92101960e-01, -8.87774561e-02,  1.82429616e+00,
        -5.07559221e-02,  7.02655737e-01, -1.64883291e-01,
         7.76903923e-01, -1.17750240e-01, -2.10051409e-01,
        -1.06618025e-01, -1.96658426e-01, -5.03826524e-01],
       [-8.69947136e-01, -3.17872803e-01,  1.25173365e-01,
        -6.72645477e-01, -3.82922641e-01, -6.18396367e-01,
         4.41915469e-02, -7.14022174e-01, -1.00763009e-01,
        -2.61973141e-01, -1.30917287e-01, -2.18496043e-01,
        -3.92885845e-01, -1.97060193e-01, -5.69148654e-01],
       [ 6.55227965e-01,  2.96374229e-01, -1.42074720e-01,
        -6.59492442e-01, -3.31087784e-01, -1.69162301e-01,
         1.10087702e-01, -4.18100611e-01, -4.16373743e-02,
        -6.77066338e-01, -3.26304248e-02, -2.18439864e-01,
         4.44532432e-01, -2.22628259e-01,  5.23115056e-01],
       [-3.85779410e-01, -1.74118039e-01,  3.88526304e-03,
        -3.96450552e-01, -1.26858138e-01, -5.90680725

In [27]:
# Readable names for the ten cluster ids. A model fitted with n_clusters=10
# yields labels 0-9, so no entry for 10 is needed; 'eighth' is spelled out correctly.
segment_names_k10 = {
    0: 'first', 1: 'second', 2: 'third', 3: 'fourth', 4: 'fifth',
    5: 'sixth', 6: 'seventh', 7: 'eighth', 8: 'ninth', 9: 'tenth',
}

# Copy the encoded data and attach the cluster id assigned to each row by kmean10,
# together with its readable name.
cluster_kmean10 = cluster_data.copy()
cluster_kmean10['Segment K-means'] = kmean10.labels_
cluster_kmean10['Segment'] = cluster_kmean10['Segment K-means'].map(segment_names_k10)
cluster_kmean10

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,Segment K-means,Segment
0,25,0,226802,0,7,0,0,0,0,0,0,0,40,0,0,1,second
1,38,0,89814,1,9,1,1,1,1,0,0,0,50,0,0,2,third
2,28,1,336951,2,12,1,2,1,1,0,0,0,40,0,1,8,ninth
3,44,0,160323,3,10,1,0,1,0,0,7688,0,40,0,1,2,third
5,34,0,198693,4,6,0,3,2,1,0,0,0,30,0,0,1,second
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,0,257302,2,12,1,8,4,1,1,0,0,38,0,0,3,fourth
48838,40,0,154374,1,9,1,0,1,1,0,0,0,40,0,1,2,third
48839,58,0,151910,1,9,2,6,3,1,1,0,0,40,0,0,0,first
48840,22,0,201490,1,9,0,6,0,1,0,0,0,20,0,0,1,second


In [28]:
# Pair each row's income flag with its cluster id, then tally every (income, cluster) pair.
income_flags_k10 = cluster_kmean10['income']
segment_ids_k10 = cluster_kmean10['Segment K-means']
zipped = [(income, segment) for income, segment in zip(income_flags_k10, segment_ids_k10)]
Counter(zipped)

Counter({(0, 1): 9488,
         (0, 2): 4072,
         (1, 8): 4645,
         (1, 2): 3688,
         (0, 3): 3866,
         (0, 7): 2327,
         (0, 6): 4862,
         (0, 0): 6974,
         (1, 3): 736,
         (0, 4): 956,
         (1, 0): 218,
         (1, 4): 1072,
         (0, 9): 1454,
         (1, 7): 72,
         (1, 5): 229,
         (1, 9): 528,
         (1, 1): 20,
         (0, 8): 15})

In [29]:
# Rows whose encoded income flag is 1 (the '>50K' label).
is_over_50k = cluster_kmean10['income'] == 1
over_50k = cluster_kmean10.loc[is_over_50k]
over_50k

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,Segment K-means,Segment
2,28,1,336951,2,12,1,2,1,1,0,0,0,40,0,1,8,ninth
3,44,0,160323,3,10,1,0,1,0,0,7688,0,40,0,1,2,third
7,63,2,104626,5,15,1,4,1,1,0,3103,0,32,0,1,8,ninth
10,65,0,184454,1,9,1,0,1,1,0,6418,0,40,0,1,2,third
14,48,0,279724,1,9,1,0,1,1,0,3103,0,48,0,1,2,third
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48819,38,0,139180,7,13,4,4,3,0,1,15020,0,45,0,1,0,first
48826,39,1,111499,2,12,1,6,4,1,1,0,0,20,0,1,3,fourth
48835,53,0,321865,8,14,1,7,1,1,0,0,0,40,0,1,8,ninth
48838,40,0,154374,1,9,1,0,1,1,0,0,0,40,0,1,2,third


In [30]:
# Rows whose encoded income flag is 0 (the '<=50K' label).
is_under_50k = cluster_kmean10['income'] == 0
under_50k = cluster_kmean10.loc[is_under_50k]
under_50k

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,Segment K-means,Segment
0,25,0,226802,0,7,0,0,0,0,0,0,0,40,0,0,1,second
1,38,0,89814,1,9,1,1,1,1,0,0,0,50,0,0,2,third
5,34,0,198693,4,6,0,3,2,1,0,0,0,30,0,0,1,second
8,24,0,369667,3,10,0,3,3,1,1,0,0,40,0,0,3,fourth
9,55,0,104996,6,4,1,5,1,1,0,0,0,10,0,0,7,eigth
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48834,32,0,116138,8,14,0,8,2,4,0,0,0,11,22,0,9,tenth
48836,22,0,310152,3,10,0,2,2,1,0,0,0,40,0,0,1,second
48837,27,0,257302,2,12,1,8,4,1,1,0,0,38,0,0,3,fourth
48839,58,0,151910,1,9,2,6,3,1,1,0,0,40,0,0,0,first


In [31]:
# One frame per cluster id 0-9, holding that cluster's rows from over_50k.
cluster_over = [
    over_50k[over_50k['Segment K-means'] == cluster_id]
    for cluster_id in range(10)
]

In [32]:
# One frame per cluster id 0-9, holding that cluster's rows from under_50k.
cluster_under = [
    under_50k[under_50k['Segment K-means'] == cluster_id]
    for cluster_id in range(10)
]

Cluster feature analysis for income greater than 50k:
For each cluster, these are the categorical feature values that appear most often among the rows with income greater than 50k, together with their counts.

In [38]:
# Categorical columns to summarise, paired position-by-position with the arrays
# of original category labels returned by pd.factorize.
columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
lookup = [workclass, education, marital_status, occupation, relationship, race, sex, country]

# result[i] collects one [most frequent category, count] pair per column for cluster i.
result = [list() for _ in cluster_over]

for feature, labels in zip(columns, lookup):
  for i, cluster_rows in enumerate(cluster_over):
    a = cluster_rows[feature]
    if a.empty:
      # This cluster has no >50K rows, so there is no mode to report.
      # NOTE(review): without this guard the code decoded whatever mstats.mode
      # yields for empty input (apparently code 0 with count 0) into a real
      # category name — confirm against the scipy version in use.
      result[i].append([None, 0.0])
      continue
    mode, count = mstats.mode(a)
    # Translate the modal integer code back to its category label.
    l = labels[int(mode[0])]
    result[i].append([l, count[0]])
result

[[['Private', 171.0],
  ['Some-college', 72.0],
  ['Divorced', 173.0],
  ['Exec-managerial', 58.0],
  ['Not-in-family', 106.0],
  ['White', 187.0],
  ['Female', 172.0],
  ['United-States', 213.0]],
 [['Private', 18.0],
  ['Some-college', 12.0],
  ['Married-civ-spouse', 12.0],
  ['Prof-specialty', 5.0],
  ['Own-child', 11.0],
  ['White', 20.0],
  ['Male', 15.0],
  ['United-States', 19.0]],
 [['Private', 2533.0],
  ['HS-grad', 1846.0],
  ['Married-civ-spouse', 3428.0],
  ['Craft-repair', 885.0],
  ['Husband', 3417.0],
  ['White', 3422.0],
  ['Male', 3654.0],
  ['United-States', 3631.0]],
 [['Private', 528.0],
  ['HS-grad', 200.0],
  ['Married-civ-spouse', 694.0],
  ['Adm-clerical', 209.0],
  ['Wife', 681.0],
  ['White', 662.0],
  ['Female', 728.0],
  ['United-States', 719.0]],
 [['Private', 683.0],
  ['Bachelors', 326.0],
  ['Married-civ-spouse', 942.0],
  ['Exec-managerial', 321.0],
  ['Husband', 842.0],
  ['White', 1000.0],
  ['Male', 924.0],
  ['United-States', 1025.0]],
 [['Private',

Cluster feature analysis for income less than or equal to 50k:
For each cluster, these are the categorical feature values that appear most often among the rows with income less than or equal to 50k, together with their counts.

In [40]:
# Categorical columns to summarise, paired position-by-position with the arrays
# of original category labels returned by pd.factorize.
columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
lookup = [workclass, education, marital_status, occupation, relationship, race, sex, country]

# result[i] collects one [most frequent category, count] pair per column for cluster i.
# Sized from cluster_under, the list this cell actually analyses.
result = [list() for _ in cluster_under]

for feature, labels in zip(columns, lookup):
  for i, cluster_rows in enumerate(cluster_under):
    a = cluster_rows[feature]
    if a.empty:
      # This cluster has no <=50K rows, so there is no mode to report.
      # NOTE(review): without this guard the code decoded whatever mstats.mode
      # yields for empty input (apparently code 0 with count 0) into a real
      # category name — confirm against the scipy version in use.
      result[i].append([None, 0.0])
      continue
    mode, count = mstats.mode(a)
    # Translate the modal integer code back to its category label.
    l = labels[int(mode[0])]
    result[i].append([l, count[0]])
result

[[['Private', 5253.0],
  ['HS-grad', 2981.0],
  ['Divorced', 4989.0],
  ['Adm-clerical', 1503.0],
  ['Not-in-family', 3564.0],
  ['White', 5829.0],
  ['Female', 4782.0],
  ['United-States', 6745.0]],
 [['Private', 8401.0],
  ['HS-grad', 4573.0],
  ['Never-married', 6106.0],
  ['Craft-repair', 1785.0],
  ['Own-child', 4464.0],
  ['White', 8280.0],
  ['Male', 7575.0],
  ['United-States', 9115.0]],
 [['Private', 2166.0],
  ['HS-grad', 2316.0],
  ['Married-civ-spouse', 3426.0],
  ['Craft-repair', 902.0],
  ['Husband', 3422.0],
  ['White', 3718.0],
  ['Male', 4045.0],
  ['United-States', 4006.0]],
 [['Private', 3219.0],
  ['HS-grad', 1737.0],
  ['Never-married', 2746.0],
  ['Adm-clerical', 1011.0],
  ['Not-in-family', 1392.0],
  ['White', 2991.0],
  ['Female', 3378.0],
  ['United-States', 3655.0]],
 [['Private', 682.0],
  ['HS-grad', 334.0],
  ['Married-civ-spouse', 362.0],
  ['Craft-repair', 155.0],
  ['Not-in-family', 368.0],
  ['White', 846.0],
  ['Male', 655.0],
  ['United-States', 911.