$\def\*#1{\mathbf{#1}}$
$\DeclareMathOperator*{\argmax}{arg\,max}$
# Data Types

## Imports

In [153]:
import matplotlib as mpl
# pyplot : Provides a MATLAB-like plotting framework
import matplotlib.pyplot as plt
import numpy as np

# %matplotlib notebook
%matplotlib

Using matplotlib backend: Qt5Agg


## Data Matrix

The data set is represented by a $n \times d$ **data matrix** :

$$
D = 
\begin{pmatrix}
  x_{1,1} & x_{1,2} & \cdots & x_{1,d} \\
  x_{2,1} & x_{2,2} & \cdots & x_{2,d} \\
  \vdots  & \vdots  & \ddots & \vdots  \\
  x_{n,1} & x_{n,2} & \cdots & x_{n,d} 
\end{pmatrix}
$$

* The *i*-th **row** refers, depending on the application, to an *entity*, *instance*, **record**, *transaction*, *alternative*,...

$$\*x_i = (x_{i1}, x_{i1}, \ldots, x_{id})$$

* The *j*-th **column** refers to an *attribute*, **feature**, *dimension*, *criteria*,... 

$$X_j = (x_{1j}, x_{2j}, \ldots, x_{nj})$$

$$
D = 
\left(
\begin{array}{c|cccc}
        & X_1 & X_2 & \cdots & X_d\\
        \hline
  \*x_1 & x_{1,1} & x_{1,2} & \cdots & x_{1,d} \\
  \*x_2 & x_{2,1} & x_{2,2} & \cdots & x_{2,d} \\
  \vdots & \vdots  & \vdots  & \ddots & \vdots  \\
  \*x_n & x_{n,1} & x_{n,2} & \cdots & x_{n,d} 
\end{array}
\right)
$$

## Iris Data Set

 | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | Type of iris plant |
 | ----------------- | ---------------- | ----------------- | ---------------- | ------------------ |
 | 5.1               | 3.5              | 1.4               | 0.2              | Setosa             |
 | 4.8               | 3.0              | 1.4               | 0.3              | Setosa             |
 | 6.0               | 3.4              | 4.5               | 1.6              | Versicolor         |
 | 6.8               | 3.0              | 5.5               | 2.1              | Virginica          |
 | 6.7               | 3.1              | 5.6               | 2.4              | Virginica          |

In [154]:
import numpy as np

filename = '../datasets/iris.data'

data = np.loadtxt(filename, delimiter=',', dtype=str)

print(data.nbytes)
print(data)

45300
[['SepalLength' 'SepalWidth' 'PetalLength' 'PetalWidth' 'Name']
 ['5.1' '3.5' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.0' '1.4' '0.2' 'Iris-setosa']
 ['4.7' '3.2' '1.3' '0.2' 'Iris-setosa']
 ['4.6' '3.1' '1.5' '0.2' 'Iris-setosa']
 ['5.0' '3.6' '1.4' '0.2' 'Iris-setosa']
 ['5.4' '3.9' '1.7' '0.4' 'Iris-setosa']
 ['4.6' '3.4' '1.4' '0.3' 'Iris-setosa']
 ['5.0' '3.4' '1.5' '0.2' 'Iris-setosa']
 ['4.4' '2.9' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.1' '1.5' '0.1' 'Iris-setosa']
 ['5.4' '3.7' '1.5' '0.2' 'Iris-setosa']
 ['4.8' '3.4' '1.6' '0.2' 'Iris-setosa']
 ['4.8' '3.0' '1.4' '0.1' 'Iris-setosa']
 ['4.3' '3.0' '1.1' '0.1' 'Iris-setosa']
 ['5.8' '4.0' '1.2' '0.2' 'Iris-setosa']
 ['5.7' '4.4' '1.5' '0.4' 'Iris-setosa']
 ['5.4' '3.9' '1.3' '0.4' 'Iris-setosa']
 ['5.1' '3.5' '1.4' '0.3' 'Iris-setosa']
 ['5.7' '3.8' '1.7' '0.3' 'Iris-setosa']
 ['5.1' '3.8' '1.5' '0.3' 'Iris-setosa']
 ['5.4' '3.4' '1.7' '0.2' 'Iris-setosa']
 ['5.1' '3.7' '1.5' '0.4' 'Iris-setosa']
 ['4.6' '3.6' '1.0' '0.2' 'I

In [155]:
import numpy as np

filename = '../datasets/iris.data'

data = np.loadtxt(filename, delimiter=',', skiprows=1, usecols=range(4))

print(data.nbytes)
print(data[:5,:])

labels = np.loadtxt(filename, delimiter=',', skiprows=1, usecols=4, dtype=object)


print(labels.nbytes)
print(labels[:5])

4800
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]
1200
['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa']


In [156]:
import numpy as np

filename = '../datasets/iris.data'

classes = {b'Iris-setosa': 0,
           b'Iris-versicolor': 1,
           b'Iris-virginica': 2}

classes_converter = {4: lambda c: classes[c]}

data = np.loadtxt(filename, delimiter=',', skiprows=1, converters=classes_converter)

print(data.nbytes)
print(data.dtype)
print(data)

6000
float64
[[ 5.1  3.5  1.4  0.2  0. ]
 [ 4.9  3.   1.4  0.2  0. ]
 [ 4.7  3.2  1.3  0.2  0. ]
 [ 4.6  3.1  1.5  0.2  0. ]
 [ 5.   3.6  1.4  0.2  0. ]
 [ 5.4  3.9  1.7  0.4  0. ]
 [ 4.6  3.4  1.4  0.3  0. ]
 [ 5.   3.4  1.5  0.2  0. ]
 [ 4.4  2.9  1.4  0.2  0. ]
 [ 4.9  3.1  1.5  0.1  0. ]
 [ 5.4  3.7  1.5  0.2  0. ]
 [ 4.8  3.4  1.6  0.2  0. ]
 [ 4.8  3.   1.4  0.1  0. ]
 [ 4.3  3.   1.1  0.1  0. ]
 [ 5.8  4.   1.2  0.2  0. ]
 [ 5.7  4.4  1.5  0.4  0. ]
 [ 5.4  3.9  1.3  0.4  0. ]
 [ 5.1  3.5  1.4  0.3  0. ]
 [ 5.7  3.8  1.7  0.3  0. ]
 [ 5.1  3.8  1.5  0.3  0. ]
 [ 5.4  3.4  1.7  0.2  0. ]
 [ 5.1  3.7  1.5  0.4  0. ]
 [ 4.6  3.6  1.   0.2  0. ]
 [ 5.1  3.3  1.7  0.5  0. ]
 [ 4.8  3.4  1.9  0.2  0. ]
 [ 5.   3.   1.6  0.2  0. ]
 [ 5.   3.4  1.6  0.4  0. ]
 [ 5.2  3.5  1.5  0.2  0. ]
 [ 5.2  3.4  1.4  0.2  0. ]
 [ 4.7  3.2  1.6  0.2  0. ]
 [ 4.8  3.1  1.6  0.2  0. ]
 [ 5.4  3.4  1.5  0.4  0. ]
 [ 5.2  4.1  1.5  0.1  0. ]
 [ 5.5  4.2  1.4  0.2  0. ]
 [ 4.9  3.1  1.5  0.1  0. ]
 [ 5.  

In [157]:
i = 3
xi = data[i]
print(xi)

[ 4.6  3.1  1.5  0.2  0. ]


In [158]:
j = 1
Xj = data[:,j]
print(Xj)

[ 3.5  3.   3.2  3.1  3.6  3.9  3.4  3.4  2.9  3.1  3.7  3.4  3.   3.   4.
  4.4  3.9  3.5  3.8  3.8  3.4  3.7  3.6  3.3  3.4  3.   3.4  3.5  3.4  3.2
  3.1  3.4  4.1  4.2  3.1  3.2  3.5  3.1  3.   3.4  3.5  2.3  3.2  3.5  3.8
  3.   3.8  3.2  3.7  3.3  3.2  3.2  3.1  2.3  2.8  2.8  3.3  2.4  2.9  2.7
  2.   3.   2.2  2.9  2.9  3.1  3.   2.7  2.2  2.5  3.2  2.8  2.5  2.8  2.9
  3.   2.8  3.   2.9  2.6  2.4  2.4  2.7  2.7  3.   3.4  3.1  2.3  3.   2.5
  2.6  3.   2.6  2.3  2.7  3.   2.9  2.9  2.5  2.8  3.3  2.7  3.   2.9  3.
  3.   2.5  2.9  2.5  3.6  3.2  2.7  3.   2.5  2.8  3.2  3.   3.8  2.6  2.2
  3.2  2.8  2.8  2.7  3.3  3.2  2.8  3.   2.8  3.   2.8  3.8  2.8  2.8  2.6
  3.   3.4  3.1  3.   3.1  3.1  3.1  2.7  3.2  3.3  3.   2.5  3.   3.4  3. ]


In [159]:
print(data[0:5,:])

[[ 5.1  3.5  1.4  0.2  0. ]
 [ 4.9  3.   1.4  0.2  0. ]
 [ 4.7  3.2  1.3  0.2  0. ]
 [ 4.6  3.1  1.5  0.2  0. ]
 [ 5.   3.6  1.4  0.2  0. ]]


In [160]:
Xj = data[0:5,j]
print(Xj)

[ 3.5  3.   3.2  3.1  3.6]


In [178]:
import pandas

df = pandas.read_csv('../datasets/iris.data')

df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [162]:
type(df)

pandas.core.frame.DataFrame

In [163]:
df.shape

(150, 5)

In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
SepalLength    150 non-null float64
SepalWidth     150 non-null float64
PetalLength    150 non-null float64
PetalWidth     150 non-null float64
Name           150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [165]:
i = 3
xi = df.iloc[i]
print(xi)

SepalLength            4.6
SepalWidth             3.1
PetalLength            1.5
PetalWidth             0.2
Name           Iris-setosa
Name: 3, dtype: object


In [166]:
j = 1
Xj = df.iloc[:,j]
print(Xj[:5])
print(type(Xj))

0    3.5
1    3.0
2    3.2
3    3.1
4    3.6
Name: SepalWidth, dtype: float64
<class 'pandas.core.series.Series'>


In [167]:
df['SepalWidth'][:5]

0    3.5
1    3.0
2    3.2
3    3.1
4    3.6
Name: SepalWidth, dtype: float64

In [168]:
df['Name'][:5]

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Name, dtype: object

## Attributes

* Numeric attributes

* Categorical attributes

## Numeric (quantitative) Attributes

* `domain(age)` = $\mathbb{N}$
* `domain(petal length)` = $\mathbb{R}_{>0}$
* **discrete** : finite or countably infinite set of values
* **continuous** : any real value

**Measurement scales**

* **Interval scale** :
    * Only addition and substration make sense. 
    * The *zero point* does not indicate the absence of measurement. 
    * The `temperature` measured in $^{\circ}C$ is interval-scaled. If two measurements of $20 ^{\circ}C$ and $10 ^{\circ}C$ are compared, what is the right statement ?
        * There is a temperature drop of $10 ^{\circ}C$.
        * The second measure is twice as cold as the first one.
* **Ratio scale**
    * Addition, substraction, and ratio make sense.
    * The `Age` attribute is ratio-scaled.
    * The `temperature` mesured in *Kelvin* is ratio-scaled. 

## Categorical (qualitative) Attributes
* A set of symbols, for example : 
    * `domain(Education) = {HighSchool, BS, MS, PhD}`
    * `domain(Fruits) = {Orange, Apple}`

**Measurement scales**

* **Nominal scale** : values are *unordered* 
* **Ordinal scale** : values are *ordered* 

## Geometric View

In [169]:
fig, ax = plt.subplots()

ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')

X = data[:,0:4]
Y = data[:,4]

ax.scatter(X[:, 0], X[:, 1], c=Y)

<matplotlib.collections.PathCollection at 0x7f3d0f8246a0>

In [170]:
fig, axs = plt.subplots(4, 4)

attributes = ['sepal length', 'sepal width', 'petal length', 'petal width']

for i in range(4):
    axs[i, 0].set_ylabel(attributes[i])
    axs[-1, i].set_xlabel(attributes[i])
    for j in range(4):
        axs[i, j].scatter(X[:, i], X[:, j], c=Y)

plt.tight_layout(pad=1)

### Data binning

In [171]:
fig, ax = plt.subplots()
hist = ax.hist([1,1,1,2,2,4,4], bins=3, edgecolor='black', linewidth=1)

In [172]:
fig, ax = plt.subplots()

values = np.array([1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7])

nbins = 3
size = (max(values) - min(values)) / nbins
print('Size :', size)

ax.hist(values, bins=3, edgecolor='black', linewidth=1)

ax.set_xticks([1, 1 + size, 1 + 2*size, 1 + 3*size])

print(values[values < 1 + size])
print(values[(1 + size <= values) & (values < 1 + 2*size)])
print(values[(1 + 2*size <= values) & (values <= 1 + 3*size)])

Size : 2.0
[1 1 1 2 2]
[3 3 4 4]
[5 6 7]


In [173]:
fig, ax = plt.subplots()

_, bins, _ = ax.hist(X[:,0], bins=10, edgecolor='black', linewidth=1)
ax.set_xticks(bins)
ax.set_xlabel(attributes[0])

<matplotlib.text.Text at 0x7f3d0f2f0c50>

## Dependency-oriented data

Relationships between data items :

* **Time-Series** : data generated by continouous measurement over time
    * *environmental sensor* : temperature, pressure
    * *finantial market analysis*
* **Discrete Sequences**
    * *event logs* such as web accesses : Client IP, Web page address
    * *strings*
* **Spatial** : non-spatial attributes measured at spatial locations
    * *hurricane forecasting* : sea-surface temperature, pressure
* **Spatiotemporal**
* **Network and Graph Data**
    * *Web graph*
    * *Social networks*



In [202]:
rng = pandas.date_range('2017-09-25 08:30:00', periods=30, freq='3s')
temperatures = np.random.randn(len(rng))*2 + 20
ts = pandas.Series(temperatures, index=rng)
print(ts.head())
ts[pandas.Timestamp('2017-09-25 08:30:09')]

2017-09-25 08:30:00    18.111259
2017-09-25 08:30:03    18.517365
2017-09-25 08:30:06    19.980085
2017-09-25 08:30:09    19.905471
2017-09-25 08:30:12    22.138376
Freq: 3S, dtype: float64


19.905471205177083

## Text Data

* A **string** : a discrete sequence of characters
* **Vector-space representation** : words (terms) frequencies (normalized with respect to the document length)
    * **Document-term matrix** : $n$ documents $\times$ $d$ terms

In [None]:
# import scikit-learn : Machine Learning in Python

# See : http://scikit-learn.org/stable/modules/feature_extraction.html
# and and http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer : Convert a collection of text documents to a matrix of token counts
vectorizer = CountVectorizer()

corpus = ['This is the first document.',
          'This is the second second document.',
          'And the third one.',
          'Is this the first document?']

# Learn the vocabulary dictionary and return term-document matrix.
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
X.toarray()

## Graph Data

A graph $G = (V, E)$ with $n$ ***vertices*** and $m$ ***edges*** consists of:

* $V = V(G)$ : a vertex set; $n = |V|$ is the order of $G$
* $E = E(G)$ : a set of pairs of vertices, called edges; $m = |E|$

A ***weighted graph*** is a graph $G = (V, E)$ in which each edge $e \in E(G)$ is given a numerical weight $w(e)$, where $w : E \rightarrow \mathbb{R}$.

In [None]:
import networkx as nx
    
def draw_weighted_graph(g):
    pos = nx.spectral_layout(g)
    nx.draw_networkx(g, pos)
    edge_labels = {edge[0:2]: edge[2]['weight'] for edge in g.edges(data=True)}
    nx.draw_networkx_edge_labels(g, pos, edge_labels)

g = nx.Graph()
    
g.add_nodes_from(['Lille', 'Paris', 'Amiens', 'Arras'])
g.add_edge('Lille', 'Paris', weight=225)
g.add_edge('Lille', 'Amiens', weight=62.7)
g.add_edge('Lille', 'Arras', weight=52.7)
g.add_edge('Paris', 'Amiens', weight=144.4)
g.add_edge('Paris', 'Arras', weight=185.8)
g.add_edge('Amiens', 'Arras', weight=62.6)

draw_weighted_graph(g)