In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import matplotlib

from sklearn.decomposition import PCA

In [8]:
# Exploring the data
# The CSV is referenced by a relative path, so it is resolved against the
# notebook's current working directory.
data_file = 'cervical_cancer.csv'
train = pd.read_csv(data_file)
train.head()

FileNotFoundError: [Errno 2] File cervical_cancer.csv does not exist: 'cervical_cancer.csv'

In [None]:
# Report the dimensions of the loaded frame as a (rows, columns) tuple.
print(train.shape)

In [None]:
# The 36 variables included are a mix of continuous (e.g. "Number of sexual partners")
# and discrete (e.g. "Dx: HPV") measurements.
# Before we can perform PCA, we must ensure every variable is
# coded numerically and search for potentially problematic or null values.
# Listing the dtype of each column shows which ones are still "object".
train.dtypes

In [None]:
# Converting "object" variables to numeric types (columns at positions 1-27).
# Entries that cannot be parsed as numbers are coerced to NaN.
#
# The converted values are assigned back by column label. Writing through
# `train.iloc[:, i] = ...` sets values inside the existing column, and on
# recent pandas an object column keeps its object dtype, so the conversion
# would not show up in `train.dtypes`.
numeric_cols = list(train.columns[1:28])
train[numeric_cols] = train[numeric_cols].apply(pd.to_numeric, errors='coerce')
train.dtypes

In [None]:
# Count the missing entries in every column
train.isna().sum()

In [None]:
# Method 1: Removing any columns with null values and performing PCA on remaining data
# Keep the positional block of columns starting at position 28, followed by
# the two named columns, in that order, on a fresh 0..n-1 index.
kept_columns = list(train.columns[28:37]) + ["Age", "STDs: Number of diagnosis"]
first_PCA = train[kept_columns].reset_index(drop=True)
first_PCA.head()

In [None]:
# Confirm the reduced frame has no missing entries left
first_PCA.isna().sum()

In [None]:
# Applying PCA with a total of 6 components on the incomplete data
# (fit the model, then project the same rows onto the fitted components)
pca = PCA(n_components=6).fit(first_PCA)
pca_output = pca.transform(first_PCA)
ps = pd.DataFrame(pca_output)
ps.head()

In [None]:
# Checking how much variation is captured by each component
# (one ratio per fitted component, in component order)
pca.explained_variance_ratio_

In [None]:
# Plotting the resulting scatterplot from the first two principal components
from mpl_toolkits.mplot3d import Axes3D, proj3d

# Columns 0 and 1 of the projected data are the first two components.
two_comp = pd.DataFrame(ps.loc[:, [0, 1]])

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(two_comp[0], two_comp[1], 'x', color='blue', markersize=6, alpha=0.5)

ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# Performing K-means clustering to see if data falls neatly into groups
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit three clusters on the first two principal components (fixed seed),
# then label every row with its assigned cluster.
clusterer = KMeans(n_clusters=3, random_state=42)
clusterer.fit(two_comp)
centers = clusterer.cluster_centers_
pred = clusterer.predict(two_comp)

# One colour per cluster label.
colors = ['orange', 'blue', 'green']
colored = [colors[label] for label in pred]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(two_comp[0], two_comp[1], color=colored)
# Overlay each cluster centre as a red "X", labelled with its cluster index.
for i, c in enumerate(centers):
    ax.plot(c[0], c[1], 'X', markersize=10, color='red', alpha=0.9, label=str(i))

ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.legend()
plt.show()

In [None]:
# Check how closely age corresponds with clustering
# (attach each row's predicted cluster label as a new column)
first_PCA = first_PCA.assign(cluster=pred)
first_PCA.head(10)

In [None]:
# Plot age, cluster to verify hypothesis that age has a large influence on this instance of PCA
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(first_PCA['cluster'], first_PCA['Age'], 'x', color='blue', markersize=6)

ax.set_xlabel('Cluster')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Method 2: Imputing null values (using population level data from CDC when available)
# and keeping all columns to perform PCA

# Continuous variables, imputed with the column median
continuous_cols = [
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives (years)',
    'STDs (number)',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
    'IUD (years)',
]

# Discrete variables without population estimates, also imputed with the column median
discrete_cols = [
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:molluscum contagiosum',
    'STDs',
    'STDs:AIDS',
    'STDs:HIV',
]

# Assign the filled column back to `train` instead of calling
# `train[col].fillna(..., inplace=True)`. That chained form operates on a
# temporary Series, is not guaranteed to update `train`, and is reported as
# deprecated chained assignment by recent pandas.
for col in continuous_cols + discrete_cols:
    train[col] = train[col].fillna(train[col].median())

In [None]:
# Discrete variable imputation (with population estimates)
# Assumed share of positive (1) values for each binary column. The figures
# are carried over unchanged; presumably they are population-level
# estimates -- TODO confirm the source of each rate.
population_rates = {
    'Smokes': .136,
    'IUD': .103,
    'Hormonal Contraceptives': .229,
    'STDs:HPV': .399,
    'STDs:Hepatitis B': .034,
    'STDs:genital herpes': .159,
    'STDs:pelvic inflammatory disease': .044,
}

# Seeded generator so the random imputation gives the same result on every run.
rng = np.random.default_rng(42)
n_rows = train.shape[0]

for col, rate in population_rates.items():
    n_positive = int(rate * n_rows)
    # Exactly n_positive ones and the remaining rows zeros, in random order.
    draws = rng.permutation(
        np.hstack((np.ones(n_positive), np.zeros(n_rows - n_positive)))
    )
    # fillna with a Series matches on index labels, so the draws are labelled
    # with train's own index to cover every missing row. The result is assigned
    # back rather than using a chained in-place fillna, which works on a
    # temporary Series and may leave `train` unchanged.
    train[col] = train[col].fillna(pd.Series(draws, index=train.index))

In [None]:
# Verify that imputation left no missing entries in any column
train.isna().sum()

In [None]:
# Re-running PCA with imputed values
# (fit six components on the full frame, then project the same rows)
pca = PCA(n_components=6).fit(train)
pca_output2 = pca.transform(train)
ps2 = pd.DataFrame(pca_output2)
ps2.head()

In [None]:
# Variance ratio captured by each component of the PCA fitted on the imputed data.
pca.explained_variance_ratio_ # First component no longer explains an overwhelming amount of variation

In [None]:
# Plotting the resulting scatterplot from the first two principal components
# Columns 0 and 1 of the projected data are the first two components.
two_comp = pd.DataFrame(ps2.loc[:, [0, 1]])

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(two_comp[0], two_comp[1], 'x', color='blue', markersize=6)

ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# Performing K-means clustering to see if data falls neatly into groups
# Fit four clusters on the first two principal components (fixed seed),
# then label every row with its assigned cluster.
tocluster = pd.DataFrame(ps2.loc[:, [0, 1]])
clusterer = KMeans(n_clusters=4, random_state=42)
clusterer.fit(tocluster)
centers = clusterer.cluster_centers_
pred = clusterer.predict(tocluster)

# One colour per cluster label.
colors = ['orange', 'blue', 'green', 'purple']
colored = [colors[label] for label in pred]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(two_comp[0], two_comp[1], color=colored)
# Overlay each cluster centre as a red "X", labelled with its cluster index.
for i, c in enumerate(centers):
    ax.plot(c[0], c[1], 'X', markersize=10, color='red', alpha=0.9, label=str(i))

ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.legend()
plt.show()

In [None]:
# Checking to see if age still has strong correlation with cluster
# (attach each row's predicted cluster label, then plot age per cluster)
train = train.assign(cluster=pred)

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(train['cluster'], train['Age'], 'x', color='blue', markersize=6)

ax.set_xlabel('Cluster')
ax.set_ylabel('Age')
plt.show()