In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture

In [3]:
# Load the raw table from a CSV expected in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='General Health by Religion by Sex by Age.csv')

In [4]:
# Drop the 'date' and 'geography code' columns; no later cell shown here reads them.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell safe
# to re-run once the columns are already gone; `columns=` alone is enough, so the
# redundant axis=1 is removed.
data = data.drop(columns=['date', 'geography code'], errors='ignore')

In [5]:
# Column 0 is the 'geography' label; every remaining column is used as a feature.
Y = data.iloc[:, 0]
X = data.iloc[:, 1:]

In [6]:
# Label encoder for the first column of `data` (Y).
encoder = LabelEncoder()

In [7]:
# Learn the label classes from Y; the bare call also displays the fitted estimator.
# NOTE(review): the fitted encoder is not referenced again in the cells shown here.
encoder.fit(Y)

LabelEncoder()

In [8]:
# Scaler used to standardise the feature columns before clustering.
scaler = StandardScaler()

In [9]:
# Fit the scaler on X and transform it in one step; the result feeds the mixture model.
scaled_data = scaler.fit_transform(X)

In [10]:
# Gaussian mixture with three spherical-covariance components; the fixed
# random_state makes the fitted clusters reproducible between runs.
gmm = GaussianMixture(
    n_components=3,
    covariance_type='spherical',
    random_state=0,
)

In [11]:
# Fit the mixture on the scaled features and get one component index per row.
y_pred = gmm.fit_predict(scaled_data)

In [12]:
def transform_labels(predictions):
    """Turn cluster ids into display names of the form "CLUSTER <id>".

    Parameters
    ----------
    predictions : iterable of hashable cluster ids (e.g. the array returned
        by ``fit_predict``).

    Returns
    -------
    list of str
        One name per element of ``predictions``, in the same order.
    """
    cluster_ids = np.unique(predictions)
    # Build the id -> name lookup once from the distinct ids.
    names = dict(zip(cluster_ids, (f"CLUSTER {cid}" for cid in cluster_ids)))

    return [names[cid] for cid in predictions]

In [13]:
# Convert the numeric cluster ids to "CLUSTER <id>" names and preview the first ten.
new_labels = transform_labels(y_pred)
print(new_labels[:10])

['CLUSTER 2', 'CLUSTER 1', 'CLUSTER 0', 'CLUSTER 2', 'CLUSTER 1', 'CLUSTER 0', 'CLUSTER 2', 'CLUSTER 1', 'CLUSTER 2', 'CLUSTER 1']


In [14]:
# Attach the cluster name of each row as a new last column of `data`.
data['Cluster'] = new_labels

# Data Pre-Processing

In [15]:
# Report every column (skipping 'geography' at position 0) whose name does not
# end with the suffix 'measures: Value'. The comparison is case-sensitive, so
# the message quotes the suffix exactly as it is checked.
name_lst = data.columns

for name in name_lst[1:]:
    if not name.endswith('measures: Value'):
        print("{} does not end with 'measures: Value'.".format(name))

Cluster does not ends with 'measures: value'. 


In [16]:
# Remove the literal '; measures: Value' text from every column name.
data.columns = data.columns.str.replace(
    pat='; measures: Value',
    repl='',
    regex=False,
)

In [17]:
# Display the wide table after the column names were shortened.
data

Unnamed: 0,geography,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: All categories: Religion,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: Christian,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: Buddhist,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: Hindu,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: Jewish,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: Muslim,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: Sikh,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: Other religion,Sex: All persons; Age: All categories: Age; General Health: All categories: General health; Religion: No religion,...,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: Christian,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: Buddhist,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: Hindu,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: Jewish,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: Muslim,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: Sikh,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: Other religion,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: No religion,Sex: Females; Age: Age 65 and over; General Health: Bad or very bad health; Religion: Religion not stated,Cluster
0,AL - St Albans,250427,146941,1504,4528,2442,7907,675,939,67263,...,2062,3,18,12,52,4,5,163,282,CLUSTER 2
1,B - Birmingham,1904658,1015781,6476,32857,2801,267511,61540,8872,386177,...,23871,44,432,56,2395,968,101,1364,2601,CLUSTER 1
2,BA - Bath,434166,260769,1880,856,464,2238,204,2917,129285,...,4537,6,3,8,8,3,26,374,550,CLUSTER 0
3,BB - Blackburn,488917,294116,1054,1078,148,75194,278,1350,86713,...,6240,4,14,6,590,4,21,321,648,CLUSTER 2
4,BD - Bradford,578336,276991,1148,5131,277,130111,5546,1890,120858,...,5506,6,64,8,1014,96,24,408,732,CLUSTER 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,CF - Cardiff,1005334,530981,3202,5899,1067,26489,1756,3874,359280,...,15848,22,35,28,197,13,63,1847,2206,CLUSTER 1
101,LD - Llandrindod Wells,49792,30604,311,245,27,84,18,283,13758,...,724,0,0,0,0,1,2,68,78,CLUSTER 2
102,LL - Llandudno,537467,339502,1562,1157,246,3542,193,2007,146538,...,7859,13,5,5,8,0,26,607,973,CLUSTER 0
103,NP - Newport,488368,271818,1137,1222,221,7672,354,1784,166810,...,8264,12,11,1,39,3,26,881,1026,CLUSTER 0


In [18]:
def col_split(x : pd.Series):
    """Split a melted row's 'variable' text into its four category fields.

    The 'variable' entry is expected to hold four ';'-separated parts in the
    order Sex, Age, General Health, Religion. Each part is stored on the row
    under that key with surrounding whitespace removed: the separator in the
    column names is '; ', so a plain split on ';' would otherwise leave a
    leading space on every part after the first.

    Parameters
    ----------
    x : pd.Series
        A row containing a string entry under 'variable'. It is modified in
        place.

    Returns
    -------
    pd.Series
        The same row with 'Sex', 'Age', 'General Health' and 'Religion' set.

    Raises
    ------
    IndexError
        If 'variable' contains fewer than four ';'-separated parts.
    """
    parts = [part.strip() for part in x['variable'].split(';')]

    x['Sex'] = parts[0]
    x['Age'] = parts[1]
    x['General Health'] = parts[2]
    x['Religion'] = parts[3]

    return x

In [19]:
# Reshape from wide to long (one row per geography/cluster/original column),
# then let col_split derive the Sex, Age, General Health and Religion fields
# from each row's 'variable' text.
new_data = (
    data
    .melt(id_vars=['geography', 'Cluster'])
    .apply(col_split, axis=1)
)

In [20]:
# 'variable' has been split into separate fields, so it is no longer needed.
new_data = new_data.drop(columns='variable')

In [21]:
# Each derived column still carries its own name as a "<name>: " prefix
# (e.g. 'Sex: '); remove that text from the values of all four columns.
for column in ('Sex', 'Age', 'General Health', 'Religion'):
    new_data[column] = new_data[column].str.replace(f'{column}: ', '')

In [22]:
# Move 'value' to the end of the frame under the name 'Counts':
# pop() removes the column and the assignment appends it as the last column.
new_data['Counts'] = new_data.pop('value')

In [23]:
# Move 'Cluster' to the end of the frame under the name 'Clusters'.
new_data['Clusters'] = new_data.pop('Cluster')

In [24]:
# Split 'geography' (e.g. 'AL - St Albans') at the first hyphen only, so names
# that themselves contain a hyphen stay intact. `n` is passed by keyword because
# pandas >= 2.0 rejects it as a positional argument. Both parts are stripped so
# the spaces around the hyphen do not end up in the values.
area_parts = new_data['geography'].str.split('-', n=1, expand=True)
new_data['area code'] = area_parts[0].str.strip()
new_data['area name'] = area_parts[1].str.strip()

In [25]:
# Display the long-format table with the derived category and area columns.
new_data

Unnamed: 0,geography,Sex,Age,General Health,Religion,Counts,Clusters,area code,area name
0,AL - St Albans,All persons,All categories: Age,All categories: General health,All categories: Religion,250427,CLUSTER 2,AL,St Albans
1,B - Birmingham,All persons,All categories: Age,All categories: General health,All categories: Religion,1904658,CLUSTER 1,B,Birmingham
2,BA - Bath,All persons,All categories: Age,All categories: General health,All categories: Religion,434166,CLUSTER 0,BA,Bath
3,BB - Blackburn,All persons,All categories: Age,All categories: General health,All categories: Religion,488917,CLUSTER 2,BB,Blackburn
4,BD - Bradford,All persons,All categories: Age,All categories: General health,All categories: Religion,578336,CLUSTER 1,BD,Bradford
...,...,...,...,...,...,...,...,...,...
62995,CF - Cardiff,Females,Age 65 and over,Bad or very bad health,Religion not stated,2206,CLUSTER 1,CF,Cardiff
62996,LD - Llandrindod Wells,Females,Age 65 and over,Bad or very bad health,Religion not stated,78,CLUSTER 2,LD,Llandrindod Wells
62997,LL - Llandudno,Females,Age 65 and over,Bad or very bad health,Religion not stated,973,CLUSTER 0,LL,Llandudno
62998,NP - Newport,Females,Age 65 and over,Bad or very bad health,Religion not stated,1026,CLUSTER 0,NP,Newport


In [26]:
# Write the long-format table to CSV without the row index.
new_data.to_csv(
    path_or_buf='General Health by Religion and Sex GMM data.csv',
    index=False,
)