In [1]:
import Orange

In [3]:
# Load the "lenses" dataset by name into an Orange table.
data = Orange.data.Table("lenses")
# Last expression of the cell: displays the tuple of attribute descriptors.
data.domain.attributes

(DiscreteVariable(name='age',
                  values=['pre-presbyopic', 'presbyopic', 'young']),
 DiscreteVariable(name='prescription', values=['hypermetrope', 'myope']),
 DiscreteVariable(name='astigmatic', values=['no', 'yes']),
 DiscreteVariable(name='tear_rate', values=['normal', 'reduced']))

In [4]:
# Display the class variable stored in the table's domain.
data.domain.class_var

DiscreteVariable(name='lenses', values=['hard', 'none', 'soft'])

In [6]:
# Slice off the first three instances and print each one on its own line.
for d in data[:3]:
    print(d)

[young, myope, no, reduced | none]
[young, myope, no, normal | soft]
[young, myope, yes, reduced | none]


In [7]:
import Orange
# Summarise the "lenses" table: attribute names, class name and row count.
data = Orange.data.Table("lenses")
atts = data.domain.attributes
attribute_names = [attribute.name for attribute in atts]
print("Attributes:", ", ".join(attribute_names))
print("Class:", data.domain.class_var.name)
print("Data instances", len(data))

# List every instance whose class equals `target`, printing one
# right-aligned, 14-character-wide column per attribute.
target = "soft"
print("Data instances with %s prescriptions:" % target)
for d in (inst for inst in data if inst.get_class() == target):
    columns = (str(d[a]).rjust(14) for a in atts)
    print(" ".join(columns))

Attributes: age, prescription, astigmatic, tear_rate
Class: lenses
Data instances 24
Data instances with soft prescriptions:
         young          myope             no         normal
         young   hypermetrope             no         normal
pre-presbyopic          myope             no         normal
pre-presbyopic   hypermetrope             no         normal
    presbyopic   hypermetrope             no         normal


In [8]:
# Write the current table to "new_data.tab" (relative to the working directory).
data.save("new_data.tab")

In [10]:
import Orange
# Keep only the instances whose prescription is "myope" and store them in a
# new table that shares the original domain.
data = Orange.data.Table("lenses")
myope_subset = [d for d in data if d["prescription"] == "myope"]
# Table.from_list is the explicit constructor for (domain, list of rows);
# the generic Table(domain, rows) call merely dispatches to it and is
# deprecated in recent Orange releases.
new_data = Orange.data.Table.from_list(data.domain, myope_subset)
new_data.save("lenses-subset.tab")
# Last expression of the cell: displays a preview of the subset.
new_data

[[young, myope, no, reduced | none],
 [young, myope, no, normal | soft],
 [young, myope, yes, reduced | none],
 [young, myope, yes, normal | hard],
 [pre-presbyopic, myope, no, reduced | none],
 ...
]

### Exploration of the Data Domain

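A data table stores information on data instances as well as on the data domain. The domain holds the names of the attributes and optional classes, their types and, if categorical, their value names. The following code reports the number of continuous and discrete attributes, the names of the first three attributes, and the name of the class: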

In [11]:
import Orange

# Count the attributes of the imports-85 domain by type, then report the
# first three attribute names and the class name.
data = Orange.data.Table("imports-85.tab")
attributes = data.domain.attributes
n = len(attributes)
n_cont = len([a for a in attributes if a.is_continuous])
n_disc = len([a for a in attributes if a.is_discrete])
print("%d attributes: %d continuous, %d discrete" % (n, n_cont, n_disc))

first_three = [attribute.name for attribute in attributes[:3]]
print("First three attributes:", ", ".join(first_three))

print("Class:", data.domain.class_var.name)

25 attributes: 14 continuous, 11 discrete
First three attributes: symboling, normalized-losses, make
Class: price 


In [12]:
# The domain is indexed here both by position and by variable name.
first_variable = data.domain[0]
print("First attribute:", first_variable.name)

name = "fuel-type"
value_names = ", ".join(data.domain[name].values)
print("Values of attribute '%s': %s" % (name, value_names))

First attribute: symboling
Values of attribute 'fuel-type': diesel, gas


### Data Instances

A data table stores data instances (or examples). These can be indexed or traversed like any Python list. Data instances can be treated as vectors whose elements are accessed either by index or by feature name.

In [13]:
import Orange

# Instances are sliced and indexed like list items; values inside an
# instance are read by feature name or by position.
data = Orange.data.Table("iris")
head = data[:3]
print("First three data instances:")
for d in head:
    print(d)

instance_25 = data[24]  # zero-based index 24 is the 25th instance
print("25-th data instance:")
print(instance_25)

name = "sepal width"
first_instance = data[0]
print("Value of '%s' for the first instance:" % name, first_instance[name])
print("The 3rd value of the 25th data instance:", instance_25[2])

First three data instances:
[5.1, 3.5, 1.4, 0.2 | Iris-setosa]
[4.9, 3.0, 1.4, 0.2 | Iris-setosa]
[4.7, 3.2, 1.3, 0.2 | Iris-setosa]
25-th data instance:
[4.8, 3.4, 1.9, 0.2 | Iris-setosa]
Value of 'sepal width' for the first instance: 3.5
The 3rd value of the 25th data instance: 1.9


In [14]:
def average(x):
    """Return the arithmetic mean of the values in ``x``.

    ``x`` must be a non-empty sized iterable of numbers: it is passed to
    both ``sum`` and ``len``. An empty ``x`` raises ``ZeroDivisionError``.
    """
    return sum(x) / len(x)

# Print a two-column report: each iris feature and its mean over all
# instances, with the name left-aligned in a 15-character column.
data = Orange.data.Table("iris")
print("%-15s %s" % ("Feature", "Mean"))
for x in data.domain.attributes:
    column = [d[x] for d in data]
    mean_value = average(column)
    print("%-15s %.2f" % (x.name, mean_value))

Feature         Mean
sepal length    5.84
sepal width     3.05
petal length    3.76
petal width     1.20


In [15]:
# Display the attribute descriptors of the current table's domain.
data.domain.attributes

(ContinuousVariable(name='sepal length', number_of_decimals=1),
 ContinuousVariable(name='sepal width', number_of_decimals=1),
 ContinuousVariable(name='petal length', number_of_decimals=1),
 ContinuousVariable(name='petal width', number_of_decimals=1))

In [16]:
# Evaluating the table displays a truncated preview of its instances.
data

[[5.1, 3.5, 1.4, 0.2 | Iris-setosa],
 [4.9, 3.0, 1.4, 0.2 | Iris-setosa],
 [4.7, 3.2, 1.3, 0.2 | Iris-setosa],
 [4.6, 3.1, 1.5, 0.2 | Iris-setosa],
 [5.0, 3.6, 1.4, 0.2 | Iris-setosa],
 ...
]

In [17]:
# First three rows of X; the output shows a NumPy array of attribute values.
data.X[:3]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2]])

In [18]:
# First three entries of Y; the output shows the class values as floats.
data.Y[:3]

array([0., 0., 0.])

In [21]:
import numpy as np
# Build a table straight from a NumPy array. Passing None as the domain
# lets Orange generate the feature descriptors itself.
X = np.array([[1, 2], [4, 5]])
# Table.from_numpy is the explicit constructor; calling Table(X) without a
# domain is deprecated in recent Orange releases.
data = Orange.data.Table.from_numpy(None, X)
# Last expression of the cell: displays the resulting table.
data

[[1.000, 2.000],
 [4.000, 5.000]
]

In [22]:
# Display the domain of the current table.
data.domain

[Feature 1, Feature 2]

In [24]:
# Describe the two columns of X with explicitly named continuous variables.
# NOTE(review): "lenght" looks like a typo for "length"; it is kept because
# it is the runtime feature name echoed in this cell's recorded output.
first_column = Orange.data.ContinuousVariable("lenght")
second_column = Orange.data.ContinuousVariable("width")
domain = Orange.data.Domain([first_column, second_column])
data = Orange.data.Table(domain, X)
data.domain

[lenght, width]

In [25]:
# Mixed domain: two discrete features and one continuous feature, plus a
# continuous class. Discrete cells in X hold the index of the value in the
# variable's value list (1 -> "big", 2 -> "oval"), as the printed table shows.
size = Orange.data.DiscreteVariable("size", ["small", "big"])
height = Orange.data.ContinuousVariable("height")
shape = Orange.data.DiscreteVariable("shape", ["circle", "square", "oval"])
speed = Orange.data.ContinuousVariable("speed")

features = [size, height, shape]
domain = Orange.data.Domain(features, class_vars=speed)

X = np.array([
    [1, 3.4, 0],
    [0, 2.7, 2],
    [1, 1.4, 1],
])
Y = np.array([42.0, 52.2, 13.4])

data = Orange.data.Table(domain, X, Y)
print(data)

[[big, 3.400, circle | 42.000],
 [small, 2.700, oval | 52.200],
 [big, 1.400, square | 13.400]]
