
This file contains functions for reading ARFF files of the following datasets: Chemistry, CS, Philosophy, Chess, Corel5k, Medical, Langlog, Coffee, Yeast, CAL500, Birds, Emotions, Scene, 20NG, Enron, and Genbase. All datasets are available at https://www.uco.es/kdis/mllresources/.

In [None]:
!pip install numpy

In [3]:
import numpy as np

# CAL500
def parse_arff_data_CAL500(arff_path):
    """Parse a dense ARFF file (CAL500) into a binarised integer matrix.

    Attribute declarations whose line contains ``{0,1}`` are collected as
    labels; every other attribute is collected as a feature.  Each data row
    is binarised: a value greater than 0 becomes 1, anything else becomes 0.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)`` where ``data_matrix``
        is a NumPy array with one row per data line, and the other two items
        are lists of attribute names in declaration order.

    Raises:
        ValueError: If a data value cannot be converted to ``float``.
    """
    data_matrix = []
    feature_names = []
    labels = []
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Blank lines and '%' comments carry no data; feeding them to
            # float() would abort the whole parse.
            if not line or line.startswith('%'):
                continue

            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if '{0,1}' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section:
                row = np.array(
                    [1 if float(val) > 0 else 0 for val in line.split(',')],
                    dtype=int,
                )
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels

# Corel5k
def parse_arff_data_Corel5k(arff_path):
    """Parse a dense, integer-valued ARFF file (Corel5k).

    Attribute declarations whose line contains ``Cluster`` are collected as
    labels; every other attribute is collected as a feature.  Data values
    are converted with ``int`` and stored unchanged.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array with one row per data line, plus the two lists of attribute
        names in declaration order.

    Raises:
        ValueError: If a data value is not a valid integer literal.
    """
    data_matrix = []
    feature_names = []
    labels = []
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Skip blank lines and '%' comments instead of crashing in int().
            if not line or line.startswith('%'):
                continue

            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if 'Cluster' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section:
                row = np.array([int(val) for val in line.split(',')], dtype=int)
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels

# LLOG
def parse_arff_data_LLOG(arff_path):
    """Parse a sparse ARFF file (Langlog) into a dense integer matrix.

    Attribute declarations whose line contains ``{0,1}`` are collected as
    labels; every other attribute is collected as a feature.  Only data
    lines of the form ``{index value, index value, ...}`` are read; every
    position not mentioned in a row stays 0.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array with one row per sparse data line (row width equals the
        number of declared attributes), plus the two lists of attribute
        names in declaration order.

    Raises:
        ValueError: If an index or value is not a valid integer literal.
        IndexError: If an index exceeds the number of declared attributes.
    """
    data_matrix = []
    feature_names = []
    labels = []
    total_attributes = 0
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if '{0,1}' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
                total_attributes += 1
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section and line.startswith("{") and line.endswith("}"):
                row = np.zeros(total_attributes, dtype=int)
                for entry in line[1:-1].split(','):
                    pair = entry.split()
                    # "{}" is a legal all-zero instance; it produces one
                    # empty entry that must not be unpacked.
                    if not pair:
                        continue
                    index, value = pair
                    row[int(index)] = int(value)
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels

# Chess, cs
def parse_arff_data_chess_cs(arff_path):
    """Parse a dense, integer-valued ARFF file (Chess, CS).

    Attribute declarations whose line contains ``{0,1}`` are collected as
    labels; every other attribute is collected as a feature.  Data values
    are converted with ``int`` and stored unchanged.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array with one row per data line, plus the two lists of attribute
        names in declaration order.

    Raises:
        ValueError: If a data value is not a valid integer literal.
    """
    data_matrix = []
    feature_names = []
    labels = []
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Skip blank lines and '%' comments instead of crashing in int().
            if not line or line.startswith('%'):
                continue

            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if '{0,1}' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section:
                row = np.array([int(val) for val in line.split(',')], dtype=int)
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels

# genbase
def parse_arff_data_genbase(arff_path):
    """Parse a dense ARFF file (Genbase) whose values are YES/NO or 0/1.

    Attribute declarations whose line contains ``{YES, NO}`` are collected
    as labels; every other attribute is collected as a feature.

    In the data section each comma-separated token is mapped to a bit:
    ``YES`` and ``1`` become 1, ``NO`` and ``0`` become 0.  Tokens that are
    none of these four (for example an identifier column) are skipped, so a
    row may be shorter than the number of declared attributes.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy array
        built from the per-line bit lists, plus the two lists of attribute
        names in declaration order.
    """
    # The previous test `value == ('YES' or 'NO' or '1' or '0')` only ever
    # compared against 'YES'; an explicit mapping covers all four tokens.
    token_to_bit = {'YES': 1, '1': 1, 'NO': 0, '0': 0}

    data_matrix = []
    feature_names = []
    labels = []
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('%'):
                continue

            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if '{YES, NO}' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section:
                tokens = (token.strip() for token in line.split(','))
                row = [token_to_bit[token] for token in tokens if token in token_to_bit]
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels

# Coffee
def parse_arff_data_coffee(arff_path):
    """Parse a dense ARFF file (Coffee) into a binarised integer matrix.

    Attribute declarations whose line contains ``{0,1}`` are collected as
    labels; every other attribute is collected as a feature.  Each data
    value is binarised: strictly greater than 0.5 becomes 1, anything else
    becomes 0.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array with one row per data line, plus the two lists of attribute
        names in declaration order.

    Raises:
        ValueError: If a data value cannot be converted to ``float``.
    """
    data_matrix = []
    feature_names = []
    labels = []
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Skip blank lines and '%' comments instead of crashing in float().
            if not line or line.startswith('%'):
                continue

            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if '{0,1}' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section:
                row = np.array(
                    [1 if float(val) > 0.5 else 0 for val in line.split(',')],
                    dtype=int,
                )
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels

# birds
def parse_arff_data_birds(arff_path_train, arff_path_test):
    """Parse the Birds train and test ARFF files into one binarised matrix.

    Attribute names are taken from the training file only: a declaration
    containing both ``{`` and ``}`` is collected as a label, any other as a
    feature.  Data rows of the training file come first, followed by the
    rows of the test file.  Each value is binarised: strictly greater than
    0.5 becomes 1, anything else becomes 0.

    Args:
        arff_path_train: Path of the training ARFF file.
        arff_path_test: Path of the test ARFF file.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array holding train rows then test rows, plus the two lists of
        attribute names in declaration order.

    Raises:
        ValueError: If a data value cannot be converted to ``float``.
    """
    data_matrix = []
    feature_names = []
    labels = []

    def _read(path, collect_attributes):
        # Append the binarised rows of one file; optionally record its header.
        in_data_section = False
        with open(path, 'r') as file:
            for raw_line in file:
                line = raw_line.strip()
                # Skip blank lines and '%' comments instead of crashing.
                if not line or line.startswith('%'):
                    continue

                # ARFF keywords are case-insensitive (e.g. "@DATA").
                keyword = line.lower()
                if keyword.startswith("@attribute"):
                    if collect_attributes:
                        attr_name = line.split()[1]
                        # `'{' and '}' in line` only tested for '}'.
                        if '{' in line and '}' in line:
                            labels.append(attr_name)
                        else:
                            feature_names.append(attr_name)
                elif keyword.startswith("@data"):
                    in_data_section = True
                elif in_data_section:
                    row = np.array(
                        [1 if float(val) > 0.5 else 0 for val in line.split(',')],
                        dtype=int,
                    )
                    data_matrix.append(row)

    _read(arff_path_train, True)
    _read(arff_path_test, False)

    return np.array(data_matrix), feature_names, labels

# emotions
def parse_arff_data_emotions(arff_path_train, arff_path_test):
    """Parse the Emotions train and test ARFF files into one binarised matrix.

    Attribute names are taken from the training file only: a declaration
    containing both ``{`` and ``}`` is collected as a label, any other as a
    feature.  Data rows of the training file come first, followed by the
    rows of the test file.  Each value is binarised: strictly greater than
    0 becomes 1, anything else becomes 0.

    Args:
        arff_path_train: Path of the training ARFF file.
        arff_path_test: Path of the test ARFF file.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array holding train rows then test rows, plus the two lists of
        attribute names in declaration order.

    Raises:
        ValueError: If a data value cannot be converted to ``float``.
    """
    data_matrix = []
    feature_names = []
    labels = []

    def _read(path, collect_attributes):
        # Append the binarised rows of one file; optionally record its header.
        in_data_section = False
        with open(path, 'r') as file:
            for raw_line in file:
                line = raw_line.strip()
                # Skip blank lines and '%' comments instead of crashing.
                if not line or line.startswith('%'):
                    continue

                # ARFF keywords are case-insensitive (e.g. "@DATA").
                keyword = line.lower()
                if keyword.startswith("@attribute"):
                    if collect_attributes:
                        attr_name = line.split()[1]
                        # `'{' and '}' in line` only tested for '}'.
                        if '{' in line and '}' in line:
                            labels.append(attr_name)
                        else:
                            feature_names.append(attr_name)
                elif keyword.startswith("@data"):
                    in_data_section = True
                elif in_data_section:
                    row = np.array(
                        [1 if float(val) > 0 else 0 for val in line.split(',')],
                        dtype=int,
                    )
                    data_matrix.append(row)

    _read(arff_path_train, True)
    _read(arff_path_test, False)

    return np.array(data_matrix), feature_names, labels

# chemistry, philosophy
def parse_arff_data_chemistry_philosophy(arff_path):
    """Parse a dense ARFF file (Chemistry, Philosophy) into a binarised matrix.

    Attribute declarations whose line contains ``{0,1}`` are collected as
    labels; every other attribute is collected as a feature.  Each data
    value is binarised: strictly greater than 0 becomes 1, anything else
    becomes 0.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array with one row per data line, plus the two lists of attribute
        names in declaration order.

    Raises:
        ValueError: If a data value cannot be converted to ``float``.
    """
    data_matrix = []
    feature_names = []
    labels = []
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Skip blank lines and '%' comments instead of crashing in float().
            if not line or line.startswith('%'):
                continue

            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if '{0,1}' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section:
                row = np.array(
                    [1 if float(val) > 0 else 0 for val in line.split(',')],
                    dtype=int,
                )
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels

# yeast
def parse_arff_data_yeast(arff_path):
    """Parse a dense ARFF file (Yeast) into a binarised integer matrix.

    Attribute declarations whose line contains ``Class`` are collected as
    labels; every other attribute is collected as a feature.  Each data
    value is binarised: strictly greater than 0 becomes 1, anything else
    (including negative values) becomes 0.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array with one row per data line, plus the two lists of attribute
        names in declaration order.

    Raises:
        ValueError: If a data value cannot be converted to ``float``.
    """
    data_matrix = []
    feature_names = []
    labels = []
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Blank lines and '%' comments are not data rows.
            if not line or line.startswith('%'):
                continue

            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if 'Class' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section:
                row = np.array(
                    [1 if float(val) > 0 else 0 for val in line.split(',')],
                    dtype=int,
                )
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels

# Scene
def parse_arff_data_scene(arff_path_train, arrf_path_test):
    """Parse the Scene train and test ARFF files into one binarised matrix.

    Attribute names are taken from the training file only: a declaration
    whose line contains ``numeric`` is collected as a feature, any other as
    a category.  Data rows of the training file come first, followed by the
    rows of the test file.  Each value is binarised: 0.5 or greater becomes
    1, anything else becomes 0.

    Args:
        arff_path_train: Path of the training ARFF file.
        arrf_path_test: Path of the test ARFF file (the parameter name is
            kept as-is so existing keyword callers keep working).

    Returns:
        A tuple ``(data_matrix, feature_names, categories)``: a NumPy
        integer array holding train rows then test rows, plus the two lists
        of attribute names in declaration order.

    Raises:
        ValueError: If a data value cannot be converted to ``float``.
    """
    data_matrix = []
    feature_names = []
    categories = []

    def _read(path, collect_attributes):
        # Append the binarised rows of one file; optionally record its header.
        in_data_section = False
        with open(path, 'r') as file:
            for raw_line in file:
                line = raw_line.strip()
                # Blank lines and '%' comments are not data rows.
                if not line or line.startswith('%'):
                    continue

                # ARFF keywords are case-insensitive (e.g. "@DATA").
                keyword = line.lower()
                if keyword.startswith("@attribute"):
                    if collect_attributes:
                        attr_name = line.split()[1]
                        if 'numeric' in line:
                            feature_names.append(attr_name)
                        else:
                            categories.append(attr_name)
                elif keyword.startswith("@data"):
                    in_data_section = True
                elif in_data_section:
                    row = np.array(
                        [1 if float(val) >= 0.5 else 0 for val in line.split(',')],
                        dtype=int,
                    )
                    data_matrix.append(row)

    _read(arff_path_train, True)
    _read(arrf_path_test, False)

    return np.array(data_matrix), feature_names, categories

# 20NG, Enron
def parse_arff_data_enron_20NG(arff_path):
    """Parse a sparse ARFF file (Enron, 20NG) into a dense integer matrix.

    Attribute declarations whose line contains ``numeric`` are collected as
    features; every other attribute is collected as a category.  Only data
    lines of the form ``{index value, index value, ...}`` are read; every
    position not mentioned in a row stays 0.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, categories)``: a NumPy
        integer array with one row per sparse data line (row width equals
        the number of declared attributes), plus the two lists of attribute
        names in declaration order.

    Raises:
        ValueError: If an index or value is not a valid integer literal.
        IndexError: If an index exceeds the number of declared attributes.
    """
    data_matrix = []
    feature_names = []
    categories = []
    total_attributes = 0
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if 'numeric' in line:
                    feature_names.append(attr_name)
                else:
                    categories.append(attr_name)
                total_attributes += 1
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section and line.startswith("{") and line.endswith("}"):
                row = np.zeros(total_attributes, dtype=int)
                for entry in line[1:-1].split(','):
                    pair = entry.split()
                    # "{}" is a legal all-zero instance; it produces one
                    # empty entry that must not be unpacked.
                    if not pair:
                        continue
                    index, value = pair
                    # int('1') == 1, so no special case for '1' is needed.
                    row[int(index)] = int(value)
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, categories

# Medical
def parse_arff_data_medical(arff_path):
    """Parse a sparse ARFF file (Medical) into a dense integer matrix.

    Attribute declarations whose line contains ``Class`` are collected as
    labels; every other attribute is collected as a feature.  Only data
    lines of the form ``{index value, index value, ...}`` are read; every
    position not mentioned in a row stays 0.

    Args:
        arff_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(data_matrix, feature_names, labels)``: a NumPy integer
        array with one row per sparse data line (row width equals the
        number of declared attributes), plus the two lists of attribute
        names in declaration order.

    Raises:
        ValueError: If an index or value is not a valid integer literal.
        IndexError: If an index exceeds the number of declared attributes.
    """
    data_matrix = []
    feature_names = []
    labels = []
    total_attributes = 0
    in_data_section = False

    with open(arff_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # ARFF keywords are case-insensitive (e.g. "@DATA").
            keyword = line.lower()
            if keyword.startswith("@attribute"):
                attr_name = line.split()[1]
                if 'Class' in line:
                    labels.append(attr_name)
                else:
                    feature_names.append(attr_name)
                total_attributes += 1
            elif keyword.startswith("@data"):
                in_data_section = True
            elif in_data_section and line.startswith("{") and line.endswith("}"):
                row = np.zeros(total_attributes, dtype=int)
                for entry in line[1:-1].split(','):
                    pair = entry.split()
                    # "{}" is a legal all-zero instance; it produces one
                    # empty entry that must not be unpacked.
                    if not pair:
                        continue
                    index, value = pair
                    row[int(index)] = int(value)
                data_matrix.append(row)

    return np.array(data_matrix), feature_names, labels