# AI 201 Programming Assignment 3
## Balance the Dataset via SMOTE

Submitted by: 
Jan Lendl R. Uy, 2019-00312

## Install and import necessary libraries
- numpy
- imbalanced-learn
- matplotlib
- tqdm
- ipywidgets

In [29]:
!pip install numpy imbalanced-learn



In [30]:
import csv
import numpy as np
import random
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from collections import Counter

## Load the dataset
- $X$: Features from `data.csv`
- $y$: Labels from `data_labels.csv`

In [31]:
def read_features_file(file_path, dtype=float):
    data = []
    try:
        with open(file_path, "r", newline="") as file:
            reader = csv.reader(file)
            # Read and convert data
            for row in reader:
                try:
                    converted_row = [dtype(val) for val in row]
                    data.append(converted_row)
                except ValueError as e:
                    raise ValueError(f"Error converting value in row: {row}. {str(e)}")
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find file: {file_path}")
    except Exception as e:
        raise Exception(f"Error reading CSV file {file_path}: {str(e)}")
    
    return np.array(data)

def read_labels_file(file_path, dtype=int):
    data = []
    try:
        with open(file_path, "r", newline="") as file:
            reader = csv.reader(file)
            # Read and convert data
            for row in reader:
                try:
                    value = dtype(row[0])  # Get only first element since there is only 1
                    data.append(value)
                except ValueError as e:
                    raise ValueError(f"Error converting value in row: {row}. {str(e)}")
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find file: {file_path}")
    except Exception as e:
        raise Exception(f"Error reading CSV file {file_path}: {str(e)}")
    
    return np.array(data)

In [32]:
X = read_features_file("data.csv")
X_test = read_features_file("test_set.csv")
y_raw = read_labels_file("data_labels.csv")

### Encode labels $y_{raw}$ to one-hot vectors
Encoding labels into one-hot vectors is proven to be beneficial for classification tasks

In [33]:
def one_hot_encode(labels, num_classes=8):
    # Convert labels to 0-based indexing (subtract 1 since labels start from 1)
    zero_based_labels = labels.reshape(-1) - 1
    
    # Create a zero matrix of shape (n_samples, num_classes)
    n_samples = len(labels)
    one_hot = np.zeros((n_samples, num_classes))
    
    # Set the appropriate indices to 1
    one_hot[np.arange(n_samples), zero_based_labels] = 1
    
    return one_hot

In [34]:
y = one_hot_encode((y_raw))

## Explore the dataset
Obtain the class distribution and why data augmentation is needed via SMOTE

In [35]:
def show_class_distribution(y_one_hot):
    # Convert one-hot to class indices (adding 1 since classes are 1-8)
    y_classes = np.argmax(y_one_hot, axis=1) + 1
    
    # Get class frequencies
    class_counts = Counter(y_classes)
    
    # Sort by class labels
    sorted_counts = dict(sorted(class_counts.items()))
    
    # Calculate percentages
    total_samples = len(y)
    percentages = {k: (v/total_samples) for k, v in sorted_counts.items()}
    
    # Print frequency table
    print("\nClass Distribution:")
    print("-" * 50)
    print(f"{'Class':<10} {'Count':<10} {'Percentage':>10}")
    print("-" * 50)
    for class_label, count in sorted_counts.items():
        percentage = percentages[class_label]
        print(f"{class_label:<10} {count:<10} {percentage:>10.2f}%")
    print("-" * 50)
    print(f"Total: {total_samples}")

In [36]:
show_class_distribution(np.array(y))


Class Distribution:
--------------------------------------------------
Class      Count      Percentage
--------------------------------------------------
1          1625             0.47%
2          233              0.07%
3          30               0.01%
4          483              0.14%
5          287              0.08%
6          310              0.09%
7          52               0.01%
8          466              0.13%
--------------------------------------------------
Total: 3486


### Balance the dataset by oversampling using SMOTE

In [37]:
smote = SMOTE()
steps = [("o", smote)]
pipeline = Pipeline(steps=steps)
X, y = pipeline.fit_resample(X, y)

In [38]:
show_class_distribution(y)


Class Distribution:
--------------------------------------------------
Class      Count      Percentage
--------------------------------------------------
1          1625             0.12%
2          1625             0.12%
3          1625             0.12%
4          1625             0.12%
5          1625             0.12%
6          1625             0.12%
7          1625             0.12%
8          1625             0.12%
--------------------------------------------------
Total: 13000


## Split the balanced dataset into training and validation sets
Split the feature-label pairs $(x,y)$ into training and validation sets such that the validation set contains 800 datapoints

In [39]:
def train_test_split(X, y, test_size=0.3, random_state=None):
    if random_state is not None:
        random.seed(random_state)
    
    # Create list of indices and shuffle it
    indices = list(range(len(X)))
    random.shuffle(indices)
    
    # Calculate split point
    split = int(len(X) * (1 - test_size))
    
    # Split the data
    train_indices = indices[:split]
    test_indices = indices[split:]
    
    X_train = np.array([X[i] for i in train_indices])
    X_test = np.array([X[i] for i in test_indices])
    y_train = np.array([y[i] for i in train_indices])
    y_test = np.array([y[i] for i in test_indices])
    
    return X_train, X_test, y_train, y_test

In [40]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=800/len(X), random_state=0)

## Save the balanced dataset as CSV files 

In [41]:
def save_dataset(X, y, features_file="features.csv", labels_file="labels.csv"):
    try:
        # Save features
        np.savetxt(features_file, X, delimiter=',')
        
        # Save labels
        np.savetxt(labels_file, y, delimiter=',', fmt='%d')
        
        # Print info
        print(f"Dataset saved successfully:")
        print(f"Features shape: {X.shape} -> {features_file}")
        print(f"Labels shape: {y.shape} -> {labels_file}")
        
    except Exception as e:
        print(f"Error saving dataset: {str(e)}")

In [42]:
save_dataset(X_train, y_train, "training_set.csv", "training_labels.csv")
save_dataset(X_val, y_val, "validation_set.csv", "validation_labels.csv")

Dataset saved successfully:
Features shape: (12200, 354) -> training_set.csv
Labels shape: (12200, 8) -> training_labels.csv
Dataset saved successfully:
Features shape: (800, 354) -> validation_set.csv
Labels shape: (800, 8) -> validation_labels.csv
