In [8]:
def parse_arff(file_path):
    data_started = False
    attributes = []
    data = []
    nominal_mapping = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue  # Skip comments and empty lines

            if line.lower().startswith('@relation'):
                continue  # Skip relation information

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()

                if '{' in line:  # Nominal data
                    values = line[line.index('{') + 1:line.index('}')].split(',')
                    attributes.append((attr_name, 'nominal', values))
                    # Sample attribute information
                    attribute_info = ((attr_name, 'nominal', values))
                    
                    # Extract relevant information
                    attr_name, attr_type, attr_values = attribute_info

                    #Create a mapping dictionary
                    nominal_mapping.append( {value: index for index, value in enumerate(attr_values)} )


                else:  # Numeric data
                    attributes.append((attr_name, 'numeric', 0))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data_line = line.split(',')
                
                data.append(line.split(','))

    return attributes, data


file_path = r'2017.arff'
attributes, data = parse_arff(file_path)

for row in data :
    for i in range ( len (attributes) ):
        attr_name, attr_type, attr_values = attributes[i]
        if attr_type == 'nominal' :
            # Create a mapping dictionary
            nominal_mapping = {value: index for index, value in enumerate(attr_values)}
            row[i] = nominal_mapping.get(row[i])
        elif attr_type == 'numeric' :
            try :
                row[i] = float(row[i])
            except Exception as e:
                # Handle the exception
                row[i] = 1


# Assuming 'numeric_data' is already defined from your provided code

# Function to calculate the mean of a list
def mean(values):
    return sum(values) / len(values) if len(values) > 0 else 0

# Calculate the mean of each feature
means = [mean(feature) for feature in zip(*numeric_data)]

# Center the data by subtracting the mean
centered_data = [[value - mean for value, mean in zip(row, means)] for row in numeric_data]

# Function to calculate the dot product of two vectors
def dot_product(a, b):
    return sum(ai * bi for ai, bi in zip(a, b))

# Calculate the covariance matrix
num_samples = len(centered_data)
num_features = len(centered_data[0])

covariance_matrix = [[dot_product(centered_data[i], centered_data[j]) / (num_samples - 1)
                    for j in range(num_features)] for i in range(num_features)]

# Perform eigendecomposition (you may need to implement your own eigendecomposition algorithm)
# For simplicity, we'll use a simple method for demonstration purposes

# Function to find the eigenvalues and eigenvectors of a symmetric matrix
def power_iteration(matrix, num_iterations=100):
    n = len(matrix)
    b = [1] * n

    for _ in range(num_iterations):
        # Multiply matrix by vector
        Ab = [sum(matrix[i][j] * b[j] for j in range(n)) for i in range(n)]

        # Normalize the vector
        norm = max(Ab)
        b = [ai / norm for ai in Ab]

    # Calculate eigenvalue
    eigenvalue = dot_product(b, [sum(matrix[i][j] * b[j] for j in range(n)) for i in range(n)])

    return eigenvalue, b

# Find the eigenvalues and eigenvectors
eigenvalues = []
eigenvectors = []

for _ in range(num_features):
    eigenvalue, eigenvector = power_iteration(covariance_matrix)
    eigenvalues.append(eigenvalue)
    eigenvectors.append(eigenvector)

# Sort eigenvalues and corresponding eigenvectors in descending order
sorted_indices = sorted(range(len(eigenvalues)), key=lambda k: eigenvalues[k], reverse=True)
eigenvalues = [eigenvalues[i] for i in sorted_indices]
eigenvectors = [[eigenvectors[j][i] for i in sorted_indices] for j in range(len(eigenvectors))]

# Select the top k eigenvectors based on the number of components desired (k)
k = 2  # You can adjust the number of components as needed
selected_eigenvectors = [eigenvectors[i] for i in range(k)]

# Project the centered data onto the selected eigenvectors
pca_result = [[dot_product(centered_data[i], selected_eigenvectors[j]) for j in range(k)]
            for i in range(num_samples)]

# Print the result of PCA
print("\nPCA Result:")
print(pca_result)






PCA Result:
[[-5871075.013374202, -5871075.013374202], [-6026800.153956003, -6026800.153956003], [-6165026.06533539, -6165026.06533539], [-6044752.094148457, -6044752.094148457], [-6198954.855453052, -6198954.855453052], [-6198928.855660709, -6198928.855660709], [-6180053.061161205, -6180053.061161205], [-6161122.316603589, -6161122.316603589], [-6168273.724641129, -6168273.724641129], [-6156615.634126263, -6156615.634126263], [-6221758.734090735, -6221758.734090735], [-6186458.2255864795, -6186458.2255864795], [-6198752.857066388, -6198752.857066388], [-6198738.857178204, -6198738.857178204], [-6185941.306268111, -6185941.306268111], [-6214977.51482392, -6214977.51482392], [-6217979.071287994, -6217979.071287994], [-5999161.272350428, -5999161.272350428], [-5737630.052367255, -5737630.052367255], [-6138801.76377984, -6138801.76377984], [-5974329.019128817, -5974329.019128817], [-6081400.548283111, -6081400.548283111], [-6128795.744423801, -6128795.744423801], [-6096441.585331505, -60

### PCA with numpy

In [1]:
import numpy as np

def parse_arff(file_path):
    data_started = False
    attributes = []
    data = []
    nominal_mapping = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue  # Skip comments and empty lines

            if line.lower().startswith('@relation'):
                continue  # Skip relation information

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()

                if '{' in line:  # Nominal data
                    values = line[line.index('{') + 1:line.index('}')].split(',')
                    attributes.append((attr_name, 'nominal', values))
                    # Sample attribute information
                    attribute_info = ((attr_name, 'nominal', values))
                    
                    # Extract relevant information
                    attr_name, attr_type, attr_values = attribute_info

                    #Create a mapping dictionary
                    nominal_mapping.append( {value: index for index, value in enumerate(attr_values)} )


                else:  # Numeric data
                    attributes.append((attr_name, 'numeric', 0))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data_line = line.split(',')
                
                data.append(line.split(','))

    return attributes, data

file_path = r'2017.arff'
attributes, data = parse_arff(file_path)

for row in data :
    for i in range ( len (attributes) ):
        attr_name, attr_type, attr_values = attributes[i]
        if attr_type == 'nominal' :
            # Create a mapping dictionary
            nominal_mapping = {value: index for index, value in enumerate(attr_values)}
            row[i] = nominal_mapping.get(row[i])
        elif attr_type == 'numeric' :
            try :
                row[i] = float(row[i])
            except Exception as e:
                # Handle the exception
                row[i] = 1


# Extracting the numeric data for PCA
numeric_data = []
for row in data:
    numeric_row = [float(value) if attributes[i][1] == 'numeric' else row[i] for i, value in enumerate(row)]
    numeric_data.append(numeric_row)

# Convert the numeric data to a NumPy array
numeric_data_array = np.array(numeric_data)

# Center the data
mean_values = np.mean(numeric_data_array, axis=0)
centered_data = numeric_data_array - mean_values

# Calculate the covariance matrix
covariance_matrix = np.cov(centered_data, rowvar=False)

# Perform eigendecomposition
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

# Sort eigenvalues and corresponding eigenvectors in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Select the top k eigenvectors based on the number of components desired (k)
k = 2  # You can adjust the number of components as needed
selected_eigenvectors = eigenvectors[:, :k]

# Project the centered data onto the selected eigenvectors
pca_result = np.dot(centered_data, selected_eigenvectors)

# Print the result of PCA
print("\nPCA Result:")
print(pca_result)



PCA Result:
[[ 6.07619936e+06+0.j  3.02169639e+05+0.j]
 [ 6.07582642e+06+0.j  1.13076647e+05+0.j]
 [ 6.07580386e+06+0.j -2.02203085e+04+0.j]
 [ 6.07591801e+06+0.j -1.70530325e+03+0.j]
 [ 6.07619698e+06+0.j -2.55546962e+04+0.j]
 [ 6.07619698e+06+0.j -2.55547087e+04+0.j]
 [ 6.07591077e+06+0.j -2.32311621e+04+0.j]
 [ 6.07578549e+06+0.j  1.46375149e+03+0.j]
 [ 6.07580173e+06+0.j -2.38931554e+04+0.j]
 [ 6.07572529e+06+0.j -8.72847723e+03+0.j]
 [ 6.07619680e+06+0.j -4.85562348e+04+0.j]
 [ 6.07604234e+06+0.j -2.50911510e+04+0.j]
 [ 6.07619698e+06+0.j -2.55547929e+04+0.j]
 [ 6.07619698e+06+0.j -2.55547996e+04+0.j]
 [ 6.07600503e+06+0.j -2.49320382e+04+0.j]
 [ 6.07584174e+06+0.j -2.57636235e+04+0.j]
 [ 6.07617051e+06+0.j -3.99955859e+04+0.j]
 [ 6.07588907e+06+0.j  1.54940054e+05+0.j]
 [ 6.07328076e+06+0.j  3.46419198e+05+0.j]
 [ 6.07591502e+06+0.j  1.60450809e+04+0.j]
 [ 6.07416201e+06+0.j  1.95411586e+05+0.j]
 [ 6.07554503e+06+0.j  7.48555949e+04+0.j]
 [ 6.07608747e+06+0.j  4.11451337e+04+0.j

### With the use of libraries:

In [2]:
from sklearn.decomposition import PCA
import numpy as np

def parse_arff(file_path):
    data_started = False
    attributes = []
    data = []
    nominal_mapping = []
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line or line.startswith('%'):
                continue  # Skip comments and empty lines

            if line.lower().startswith('@relation'):
                continue  # Skip relation information

            if line.lower().startswith('@attribute'):
                parts = line.split()
                attr_name = parts[1].strip()

                if '{' in line:  # Nominal data
                    values = line[line.index('{') + 1:line.index('}')].split(',')
                    attributes.append((attr_name, 'nominal', values))
                    # Sample attribute information
                    attribute_info = ((attr_name, 'nominal', values))
                    
                    # Extract relevant information
                    attr_name, attr_type, attr_values = attribute_info

                    #Create a mapping dictionary
                    nominal_mapping.append( {value: index for index, value in enumerate(attr_values)} )


                else:  # Numeric data
                    attributes.append((attr_name, 'numeric', 0))

            if line.lower().startswith('@data'):
                data_started = True
                continue

            if data_started:
                data_line = line.split(',')
                
                data.append(line.split(','))

    return attributes, data

file_path = r'2017.arff'
attributes, data = parse_arff(file_path)

for row in data :
    for i in range ( len (attributes) ):
        attr_name, attr_type, attr_values = attributes[i]
        if attr_type == 'nominal' :
            # Create a mapping dictionary
            nominal_mapping = {value: index for index, value in enumerate(attr_values)}
            row[i] = nominal_mapping.get(row[i])
        elif attr_type == 'numeric' :
            try :
                row[i] = float(row[i])
            except Exception as e:
                # Handle the exception
                row[i] = 1

# Extracting the numeric data for PCA
numeric_data = []
for row in data:
    numeric_row = [float(value) if attributes[i][1] == 'numeric' else row[i] for i, value in enumerate(row)]
    numeric_data.append(numeric_row)

# Convert the numeric data to a NumPy array
numeric_data_array = np.array(numeric_data)

# Initialize PCA with the desired number of components
pca = PCA(n_components=2)  # You can adjust the number of components as needed

# Fit and transform the data
pca_result = pca.fit_transform(numeric_data_array)

# Print the result of PCA
print("\nPCA Result:")
print(pca_result)



PCA Result:
[[-6.07619936e+06  3.02169639e+05]
 [-6.07582642e+06  1.13076647e+05]
 [-6.07580386e+06 -2.02203085e+04]
 [-6.07591801e+06 -1.70530325e+03]
 [-6.07619698e+06 -2.55546962e+04]
 [-6.07619698e+06 -2.55547087e+04]
 [-6.07591077e+06 -2.32311621e+04]
 [-6.07578549e+06  1.46375149e+03]
 [-6.07580173e+06 -2.38931554e+04]
 [-6.07572529e+06 -8.72847723e+03]
 [-6.07619680e+06 -4.85562348e+04]
 [-6.07604234e+06 -2.50911510e+04]
 [-6.07619698e+06 -2.55547929e+04]
 [-6.07619698e+06 -2.55547996e+04]
 [-6.07600503e+06 -2.49320382e+04]
 [-6.07584174e+06 -2.57636235e+04]
 [-6.07617051e+06 -3.99955859e+04]
 [-6.07588907e+06  1.54940054e+05]
 [-6.07328076e+06  3.46419198e+05]
 [-6.07591502e+06  1.60450809e+04]
 [-6.07416201e+06  1.95411586e+05]
 [-6.07554503e+06  7.48555949e+04]
 [-6.07608747e+06  4.11451337e+04]
 [-6.07532345e+06  4.36554616e+04]
 [-6.07580251e+06 -3.25804812e+03]
 [-6.07546347e+06  8.78966257e+03]
 [-6.07583985e+06  1.90439220e+04]
 [-6.07574465e+06 -2.08095111e+04]
 [-6.07