In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
# Load the raw myopia dataset from the Resources folder (the path is relative
# to the notebook's working directory) and preview the first rows.
file_path = Path("Resources/myopia.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [3]:
# List the column names of the loaded frame.
df.columns

Index(['AGE', 'SPHEQ', 'AL', 'ACD', 'LT', 'VCD', 'SPORTHR', 'READHR', 'COMPHR',
       'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY', 'MYOPIC'],
      dtype='object')

In [4]:
# Count how many rows carry each distinct value of the MYOPIC column.
df["MYOPIC"].value_counts()

0    537
1     81
Name: MYOPIC, dtype: int64

In [5]:
# Remove the MYOPIC column so that only the remaining columns are kept in `df`.
# errors="ignore" keeps this cell safe to re-run: `df` is rebound to the result,
# so a second execution would otherwise raise KeyError because the column is
# already gone.
df = df.drop(columns=["MYOPIC"], errors="ignore")


In [6]:
# Preview the first five rows (the default for head) after dropping MYOPIC.
df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


In [7]:
# Standardize the listed columns with StandardScaler. The class is already
# imported in the notebook's import cell, so it is not imported again here.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(
    df[
        [
            'AGE', 'SPHEQ', 'AL', 'ACD', 'LT', 'VCD', 'SPORTHR', 'READHR',
            'COMPHR', 'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY',
        ]
    ]
)

In [8]:
# Re-check the column names; these labels are reused for the scaled frame.
df.columns

Index(['AGE', 'SPHEQ', 'AL', 'ACD', 'LT', 'VCD', 'SPORTHR', 'READHR', 'COMPHR',
       'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY'],
      dtype='object')

In [9]:
# Rebuild a labelled DataFrame from the scaled array, then replace the AGE
# column with the original values taken from `df`.
# NOTE(review): this leaves AGE unscaled while every other column is
# standardized -- confirm that is intended.
new_df = pd.DataFrame(scaled_data, columns=df.columns).assign(AGE=df["AGE"])
new_df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,6,-1.363917,-0.892861,0.483784,-0.281443,-1.019792,4.150661,1.69745,-0.689311,-0.672996,0.184058,0.498304,0.987138,1.003241
1,6,-0.308612,-0.17184,0.53591,-0.967997,-0.130763,-0.998898,-0.912062,-0.361875,-0.221409,-0.340932,-0.875088,0.987138,1.003241
2,6,0.604386,-0.009977,-0.506628,-0.177812,0.215809,0.257092,-0.912062,-0.034439,-0.672996,0.184058,-0.750234,-1.01303,-0.996769
3,6,-0.441325,-0.436703,1.230936,0.456927,-0.974587,0.759488,2.676017,-0.689311,-0.672996,-0.865922,0.685585,-1.01303,1.003241
4,5,-0.166306,1.167204,0.42297,-0.566427,1.180178,0.257092,-0.912062,-0.689311,-0.672996,-0.865922,-1.374503,0.987138,-0.996769


In [10]:
# Write the frame to CSV; index=False leaves the row index out of the file.
file_path = Path("Resources/myopia_data_cleaned.csv")
new_df.to_csv(file_path, index=False)

In [11]:
# Load the preprocessed myopia dataset written to CSV above and preview it.
file_path = Path("Resources/myopia_data_cleaned.csv")
df_myopia = pd.read_csv(file_path)
df_myopia.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,6,-1.363917,-0.892861,0.483784,-0.281443,-1.019792,4.150661,1.69745,-0.689311,-0.672996,0.184058,0.498304,0.987138,1.003241
1,6,-0.308612,-0.17184,0.53591,-0.967997,-0.130763,-0.998898,-0.912062,-0.361875,-0.221409,-0.340932,-0.875088,0.987138,1.003241
2,6,0.604386,-0.009977,-0.506628,-0.177812,0.215809,0.257092,-0.912062,-0.034439,-0.672996,0.184058,-0.750234,-1.01303,-0.996769
3,6,-0.441325,-0.436703,1.230936,0.456927,-0.974587,0.759488,2.676017,-0.689311,-0.672996,-0.865922,0.685585,-1.01303,1.003241
4,5,-0.166306,1.167204,0.42297,-0.566427,1.180178,0.257092,-0.912062,-0.689311,-0.672996,-0.865922,-1.374503,0.987138,-0.996769


In [12]:
# Standardize the data with StandardScaler, then print the first five rows of
# the resulting array.
myopia_scaled = StandardScaler().fit_transform(df_myopia)
print(myopia_scaled[:5])

[[-0.42021911 -1.3639169  -0.89286146  0.48378402 -0.28144315 -1.0197916
   4.1506609   1.69744958 -0.68931054 -0.67299591  0.18405806  0.49830393
   0.98713773  1.0032415 ]
 [-0.42021911 -0.30861224 -0.1718398   0.53591028 -0.96799728 -0.13076253
  -0.99889822 -0.91206246 -0.36187479 -0.22140908 -0.34093217 -0.87508755
   0.98713773  1.0032415 ]
 [-0.42021911  0.60438629 -0.00997682 -0.5066285  -0.17781207  0.21580879
   0.25709181 -0.91206246 -0.03443904 -0.67299591  0.18405806 -0.75023378
  -1.01302987 -0.99676897]
 [-0.42021911 -0.4413248  -0.43670267  1.23093648  0.4569273  -0.97458715
   0.75948782  2.6760166  -0.68931054 -0.67299591 -0.86592239  0.68558458
  -1.01302987  1.0032415 ]
 [-1.82397807 -0.16630592  1.16720365  0.42296952 -0.56642745  1.18017813
   0.25709181 -0.91206246 -0.68931054 -0.67299591 -0.86592239 -1.37450264
   0.98713773 -0.99676897]]


In [13]:
# Apply PCA to reduce dimensions

# Initialize the PCA model. A float n_components between 0 and 1 asks
# scikit-learn to keep as many components as are needed to explain that
# fraction of the variance (99% here), not a fixed number of components.
pca = PCA(n_components=0.99)

# Fit PCA on the standardized myopia data and project it onto the retained
# principal components.
myopia_pca = pca.fit_transform(myopia_scaled)

In [14]:
# Wrap the PCA output in a DataFrame (default integer column labels) and
# preview the first rows. Note that `myopia_pca` is rebound from the array to
# the DataFrame here.
myopia_pca = pd.DataFrame(myopia_pca)
myopia_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.535503,1.145004,-1.385019,0.116006,-3.226691,-1.928459,-1.958892,-1.922997,0.026753,0.178144,-1.389112,-0.574909
1,-0.624706,-1.575786,-0.794253,1.156287,0.292256,0.34906,-0.455206,0.285332,0.689683,-0.557031,-0.118153,0.527492
2,-0.933479,-0.717076,0.82015,-1.547638,0.173941,0.272783,0.025821,-0.030817,-0.431499,0.013336,-0.287739,0.128478
3,0.106354,1.192475,-0.024441,1.507469,-2.356836,-0.023139,0.236418,-1.717109,-1.230785,-0.7034,0.652956,-0.672679
4,-0.388503,-2.839655,-0.229767,-0.62251,0.271458,-0.928956,0.5379,-1.301787,0.279007,0.631183,0.00764,0.99537


In [15]:
# Fraction of the total variance explained by each retained principal
# component, in component order.
pca.explained_variance_ratio_

array([0.21177355, 0.15659716, 0.08688023, 0.08301762, 0.07584858,
       0.06997878, 0.06486986, 0.06377808, 0.05393666, 0.05205566,
       0.04286506, 0.03839767])

In [16]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [17]:
# Path to the cleaned CSV, given here as a plain string rather than a Path.
file_path = 'Resources/myopia_data_cleaned.csv'

In [18]:
# Read the cleaned CSV into a separate frame.
mycleaned_df = pd.read_csv(file_path)

In [19]:
# Show 10 randomly chosen rows. A fixed random_state makes the sample (and so
# the displayed output) identical on every run instead of changing each time
# the cell is executed.
mycleaned_df.sample(10, random_state=42)

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
490,9,-2.398435,-1.054722,-2.070436,0.366251,-0.447197,0.633889,0.066505,-0.034439,1.133351,1.759029,1.122573,-1.01303,1.003241
410,6,0.642761,0.961195,0.032017,1.635728,0.592516,0.005894,-0.259684,0.292997,-0.672996,-0.165935,-0.375672,0.987138,-0.996769
201,6,4.30115,-0.480848,2.273476,-0.113043,-1.260884,-0.245304,-0.259684,0.620432,-0.221409,-1.040919,-0.375672,0.987138,-0.996769
194,6,0.211045,-1.628598,-2.591705,0.547604,-0.899245,-0.873299,0.718883,-0.034439,-0.672996,-0.690926,-0.125965,0.987138,1.003241
119,6,-0.417341,-0.760429,-2.93053,1.363698,-0.070489,0.382691,0.718883,-0.361875,-0.221409,0.184058,0.248596,-1.01303,1.003241
188,6,-0.988164,0.048883,0.2579,-0.281443,0.034988,1.010686,0.066505,-0.361875,-0.672996,-0.690926,-0.62538,-1.01303,1.003241
408,6,-0.64599,0.240174,-1.262468,0.003543,0.682925,-0.245304,-0.259684,3.239918,-0.672996,-0.865922,0.498304,-1.01303,-0.996769
300,6,0.564413,-0.436703,-0.567443,-0.475751,-0.130763,-0.622101,-0.259684,-0.034439,-0.672996,-1.215916,-0.875088,-1.01303,-0.996769
385,8,0.564413,0.122456,0.310028,0.184896,-0.025285,-0.622101,0.718883,-0.034439,2.036525,-0.340932,1.122573,-1.01303,1.003241
434,6,0.140692,-1.054722,-0.211242,-0.37212,-0.914313,0.005894,0.392694,-0.361875,-0.672996,-1.565909,-0.750234,0.987138,1.003241
