In [None]:
# For array manipulation
import numpy as np

# For dataframe manipulation
import pandas as pd

# To make plots
import matplotlib.pyplot as plt
%matplotlib inline

# To set the theme of the plot
import seaborn as sns
sns.set()

# To perform PCA, sparse coding, and dictionary learning
from sklearn.decomposition import PCA, SparseCoder, DictionaryLearning

# People faces dataset
from sklearn.datasets import fetch_lfw_people, make_sparse_coded_signal, load_breast_cancer

# To split the data for training and testing
from sklearn.model_selection import train_test_split

# For standardizing data
from sklearn.preprocessing import StandardScaler

# To perform KNN classification
from sklearn.neighbors import KNeighborsClassifier

# To perform classification
from sklearn.linear_model import LogisticRegression

# To perform SVD
from numpy.linalg import svd

# For making the sparse matrix
from scipy.sparse import csr_matrix, lil_matrix

# For comparing training times
from time import time

### K Nearest Neighbors Classification
KNN is a non-parametric, lazy learning algorithm. When we say a technique is non-parametric, it means that it does not make any assumptions about the distribution of the underlying data. In other words, it makes its selection based on the proximity to other data points, regardless of what features the numerical values represent.

Let’s take a look at how we could go about classifying data using the K-Nearest Neighbors algorithm in Python. For this tutorial, we’ll be using the breast cancer dataset.

In [None]:
# Load the breast cancer dataset bundled with scikit-learn
breast_cancer = load_breast_cancer()

# Build the feature frame and keep only the two columns used in this example
feature_columns = ['mean area', 'mean compactness']
X = pd.DataFrame(breast_cancer.data, columns = breast_cancer.feature_names)[feature_columns]

# Map the integer targets to their class names, then one-hot encode them;
# drop_first discards the indicator column of the first category
labels = pd.Categorical.from_codes(breast_cancer.target, breast_cancer.target_names)
y = pd.get_dummies(labels, drop_first = True)

In [None]:
# Splitting the dataset for training and testing. No test_size is passed, so
# train_test_split uses its default; random_state = 42 fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

By default, the KNeighborsClassifier looks for the 5 nearest neighbors. Its default metric (Minkowski with p = 2) is already Euclidean distance, but we state both settings explicitly so that the measure used for determining the proximity between neighboring points is clear.

In [None]:
# 5-nearest-neighbour classifier using Euclidean distance
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')

# y_train is a single-column data frame; scikit-learn expects a 1-D label
# vector and warns (DataConversionWarning) about column vectors, so the
# labels are flattened before fitting. The fitted model is unchanged.
knn.fit(X_train, y_train.to_numpy().ravel())

Using our newly trained model, we predict whether a tumor is benign or not given its mean compactness and area.

In [None]:
# Predict a label for every sample in the held-out test set
y_pred = knn.predict(X_test)

We visually compare the predictions made by our model with the samples inside the testing set.

In [None]:
# Recombine the held-out features with their true labels (joined on the index)
test_frame = X_test.join(y_test, how='outer')

# Colour each test sample by its true 'benign' indicator
sns.scatterplot(data=test_frame, x='mean area', y='mean compactness', hue='benign')

In [None]:
# Same scatter as above, but coloured by the labels predicted by the model
predicted_style = {'c': y_pred, 'cmap': 'coolwarm', 'alpha': 0.7}
plt.scatter(X_test['mean area'], X_test['mean compactness'], **predicted_style)

### Principal Component Analysis
Principal component analysis is a fast and flexible unsupervised method for dimensionality reduction in data. 

Consider the following 200 points.

In [None]:
# Seeded random number generator so the sample below is reproducible
rng = np.random.RandomState(1)

# 200 two-dimensional points: standard-normal draws mixed by a random 2x2
# matrix (the mixing matrix is drawn first, then the normal samples)
mixing = rng.rand(2, 2)
latent = rng.randn(2, 200)
X = np.dot(mixing, latent).T

# Scatter of the generated points with equal axis scaling
x_coords, y_coords = X[:, 0], X[:, 1]
plt.scatter(x_coords, y_coords, c = 'mediumblue', alpha = 1)
plt.axis('equal');

By eye, it is clear that there is a nearly linear relationship between the x and y variables. Rather than attempting to predict the y values from the x values, the unsupervised learning problem attempts to learn about the relationship between the x and y values.

In principal component analysis, this relationship is quantified by finding a list of the principal axes in the data, and using those axes to describe the dataset. We can compute this using Scikit-Learn's PCA estimator.

In [None]:
# Making an object to perform PCA - n_components is the number of components to keep
pca = PCA(n_components = 2)

# Fit the principal component analyzer to the data
pca.fit(X)

# The analyzer now has values for the principal components, the variance along
# each of them (explained_variance_) and the fraction of the total variance each
# one accounts for (explained_variance_ratio_). Only the ratio is a percentage;
# the raw variance is reported in the data's own squared units.
pca_summary = zip(pca.components_, pca.explained_variance_, pca.explained_variance_ratio_)
for i, (vector, variance, ratio) in enumerate(pca_summary, 1):
    print(f'Principal Component {i} - {np.around(vector, 5)}, '
          f'Explained Variance - {round(variance, 5)} ({round(100 * ratio, 5)}% of total)')

To see what these numbers mean, let's visualize them as vectors over the input data, using the "components" to define the direction of the vector, and the "explained variance" to define the squared-length of the vector.

In [None]:
# Helper used to draw the principal components as arrows
def draw_vector(v0, v1, ax = None):
    """Draw a black arrow from point v0 to point v1.

    Parameters
    ----------
    v0 : start point of the arrow (passed to annotate as the text position).
    v1 : end point of the arrow (passed to annotate as the annotated point).
    ax : axes-like object to draw on; when it is None (or otherwise falsy)
         the current matplotlib axes are used instead.
    """
    target_ax = ax if ax else plt.gca()
    # Empty annotation text: only the arrow itself is rendered
    target_ax.annotate(
        '', v1, v0,
        arrowprops = dict(arrowstyle = '->', linewidth = 2, shrinkA = 0, shrinkB = 0, color = 'black'),
    )

# Scatter the data and overlay one arrow per principal axis, starting at the
# data mean and scaled to three standard deviations along that axis
plt.figure(figsize = (10, 6))
plt.scatter(X[:, 0], X[:, 1], c = 'mediumblue', alpha = 0.2)
for variance, component in zip(pca.explained_variance_, pca.components_):
    arrow_tip = pca.mean_ + component * 3 * np.sqrt(variance)
    draw_vector(pca.mean_, arrow_tip)
plt.axis('equal');

These vectors represent the principal axes of the data, and the length of each vector is an indication of how "important" that axis is in describing the distribution of the data—more precisely, it is a measure of the variance of the data when projected onto that axis. The projections of each data point onto the principal axes are the "principal components" of the data.

If we plot these principal components beside the original data, we see the plots shown here. 

In [None]:
# Shared font settings for axis labels and titles (reused by later plots)
fontdict = {'family':'serif', 'color':'black', 'weight':'normal', 'size': 14}

# Two side-by-side panels: raw data on the left, PCA coordinates on the right
fig, ax = plt.subplots(1, 2, figsize = (14, 5))
fig.subplots_adjust(wspace = 0.3)

# Left panel: the input data with the principal axes drawn from the data mean
ax[0].scatter(X[:, 0], X[:, 1], c = 'mediumblue', alpha = 0.2)
for variance, component in zip(pca.explained_variance_, pca.components_):
    draw_vector(pca.mean_, pca.mean_ + component * 3 * np.sqrt(variance), ax = ax[0])
ax[0].axis('equal')

# Right panel: the same points expressed in principal-component coordinates,
# with unit-direction arrows along the two component axes
X_pca = pca.transform(X)
ax[1].scatter(X_pca[:, 0], X_pca[:, 1], c = 'mediumblue', alpha = 0.2)
for arrow_tip in ([0, 3], [3, 0]):
    draw_vector([0, 0], arrow_tip, ax = ax[1])
ax[1].axis('equal')
ax[1].set(xlim = (-5, 5), ylim = (-3, 3.1))

# Axis labels and titles for both panels
panel_text = [('X', 'Y', 'Input'),
              ('Component 1', 'Component 2', 'Principal Components')]
for panel, (x_text, y_text, title_text) in zip(ax, panel_text):
    panel.set_xlabel(x_text, fontdict = fontdict, labelpad = 10)
    panel.set_ylabel(y_text, fontdict = fontdict, labelpad = 10)
    panel.set_title(title_text, fontdict = fontdict, pad = 10)

### PCA for dimensionality reduction
Using PCA for dimensionality reduction involves zeroing out one or more of the smallest principal components, resulting in a lower-dimensional projection of the data that preserves the maximal data variance.

In [None]:
# Keep a single principal component and fit the analyzer to the data
# (fit returns the estimator itself, so it can be chained)
pca = PCA(n_components = 1).fit(X)

# Project the data onto that component to get the reduced representation
X_pca = pca.transform(X)

for label, array in (("Original shape:", X), ("Transformed shape:", X_pca)):
    print(label, array.shape)

The transformed data has been reduced to a single dimension. To understand the effect of this dimensionality reduction, we can perform the inverse transform of this reduced data and plot it along with the original data.

In [None]:
# Map the one-dimensional data back into the original two-dimensional space
X_new = pca.inverse_transform(X_pca)

plt.figure(figsize = (10, 6))
# Faint points: the original data; strong points: the projected data
for points, opacity in ((X, 0.2), (X_new, 0.8)):
    plt.scatter(points[:, 0], points[:, 1], c = 'mediumblue', alpha = opacity)
plt.axis('equal');

The light points are the original data, while the dark points are the projected version. This makes clear what a PCA dimensionality reduction means: the information along the least important principal axis or axes is removed, leaving only the component(s) of the data with the highest variance. The fraction of variance that is cut out (proportional to the spread of points about the line formed in this figure) is roughly a measure of how much "information" is discarded in this reduction of dimensionality.

This reduced-dimension dataset is in some sense "good enough" to encode the most important relationships between the points: despite reducing the dimension of the data by 50%, the overall relationship between the data points is mostly preserved.

### Eigenfaces

In [None]:
# Fetching the data for faces; min_faces_per_person = 60 is the filter passed
# to fetch_lfw_people (the data is downloaded on first use)
faces = fetch_lfw_people(min_faces_per_person = 60)

# Names of the people kept, and the shape of the image array
print(faces.target_names)
print(faces.images.shape)

Let’s visualize some images from the dataset. You can see that each image has a complete face, and the facial features like eyes, nose, and lips are clearly visible in each image. Now that we have our dataset ready, let’s compress it.

In [None]:
# One row of five tick-less panels for the first five faces
fig, ax = plt.subplots(1, 5, figsize = (10, 3),
                       subplot_kw = {'xticks': [], 'yticks': []},
                       gridspec_kw = {'wspace': 0.1})

# Height and width are read from the dataset instead of being hard-coded, so
# the cell keeps working if the images are fetched at a different size
image_shape = faces.images.shape[1:]

for i in range(5):
    ax[i].imshow(faces.data[i].reshape(image_shape), cmap = 'binary_r')

Let's take a look at the principal axes that span this dataset. Because this is a large dataset, we will use PCA with a randomized algorithm — it contains a randomized method to approximate the first N principal components much more quickly than the standard PCA estimator, and thus is very useful for high-dimensional data (here, a dimensionality of nearly 3,000). 

We will keep the top $k$ templates (principal components) and drop the remaining. But, how many templates shall we keep? If we keep more templates, our reconstructed images will closely resemble the original images but we will need more storage to store the compressed data. If we keep too few templates, our reconstructed images will look very different from the original images.

For this notebook, we will keep $k=150$.

In [None]:
# Keep the top 150 components; the randomized solver approximates them
# quickly and random_state makes the result reproducible
pca = PCA(n_components = 150, svd_solver = 'randomized', random_state = 42)
pca.fit(faces.data)

In this case, it can be interesting to visualize the images associated with the first several principal components (these components are technically known as "eigenvectors," so these types of images are often called "eigenfaces").

In [None]:
# 3 x 8 grid of tick-less panels, one per principal component
fig, axes = plt.subplots(3, 8, figsize = (10, 5),
                         subplot_kw = {'xticks': [], 'yticks': []},
                         gridspec_kw = {'hspace': 0.1, 'wspace': 0.06})

# Height and width are read from the dataset instead of being hard-coded
image_shape = faces.images.shape[1:]

# Each component has one weight per pixel, so it can be shown as an image
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(image_shape), cmap = 'bone')

The results are very interesting, and give us insight into how the images vary: for example, the first few eigenfaces (from the top left) seem to be associated with the angle of lighting on the face, and later principal vectors seem to be picking out certain features, such as eyes, noses, and lips. Let's take a look at the cumulative variance of these components to see how much of the data information the projection is preserving.

In [None]:
plt.figure(figsize = (7, 5))

# Cumulative share of variance explained by the first k components. The x
# values start at 1 so that position k on the axis really means "k components"
# (plotting the bare array would place the first component at x = 0).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components', fontdict = fontdict, labelpad = 10)
plt.ylabel('Cumulative Explained Variance', fontdict = fontdict, labelpad = 10);

We see that these 150 components account for just over 90% of the variance. That would lead us to believe that using these 150 components, we would recover most of the essential characteristics of the data. To make this more concrete, we can compare the input images with the images reconstructed from these 150 components.

In [None]:
# Coordinates of every face in the space of the fitted principal components
components = pca.transform(faces.data)
# Map those coordinates back to pixel space to get the reconstructed images
projected = pca.inverse_transform(components)

In [None]:
# Two rows of ten tick-less panels: inputs on top, reconstructions below
fig, ax = plt.subplots(2, 10, figsize = (16, 4),
                       subplot_kw = {'xticks': [], 'yticks': []},
                       gridspec_kw = {'hspace': 0.1, 'wspace': 0.1})

# Height and width are read from the dataset instead of being hard-coded
image_shape = faces.images.shape[1:]

for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(image_shape), cmap = 'binary_r')
    ax[1, i].imshow(projected[i].reshape(image_shape), cmap = 'binary_r')

ax[0, 0].set_ylabel('Full-dim\nInput', fontdict = fontdict, labelpad = 10)
ax[1, 0].set_ylabel('150-dim\nReconstruction', fontdict = fontdict, labelpad = 10);

The top row here shows the input images, while the bottom row shows the reconstruction of the images from just 150 of the ~3,000 initial features. Although it reduces the dimensionality of the data by nearly a factor of 20, the projected images contain enough information that we might, by eye, recognize the individuals in the image. What this means is that our classification algorithm needs to be trained on 150-dimensional data rather than 3,000-dimensional data, which depending on the particular algorithm we choose, can lead to a much more efficient classification.

You can learn more about the mathematics of PCA for 2D images <a href='https://towardsdatascience.com/face-dataset-compression-using-pca-cddf13c63583'>here</a>.

### Singular Value Decomposition
SVD is a data summary method similar to PCA. It extracts important features from data. 

In [None]:
# The four measurement columns used as features
columns = ['sepal length', 'sepal width', 'petal length', 'petal width']

# Read the iris data from the UCI repository; the column names are supplied
# here, with the class label stored in 'target'
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
df = pd.read_csv(url, names = columns + ['target'])

# Feature-only view of the data
data = df[columns]

data.head()

In [None]:
# We take two singular values
n = 2

# Performing SVD. The reduced form (full_matrices = False) is enough here:
# only the first n columns of U are used, so the full square U is not needed.
U, s, V = svd(data, full_matrices = False)

# Diagonal matrix of the n largest singular values. np.diag replaces the
# former np.mat(np.eye(n) * s[:n]); np.mat no longer exists in NumPy 2.0.
sigma = np.diag(s[:n])

# Data frame of the first n left singular vectors (the columns of U) and the
# corresponding targets. The variable and column names ('SV1', 'SV2') are kept
# because the plotting cell below refers to them.
singular_values = pd.DataFrame(U[:, :n], columns = ['SV1', 'SV2'])
singular_values['target'] = df['target']

singular_values.head()

In [None]:
plt.figure(figsize = (10, 5))
plt.xlabel('Singular Value 1', fontdict = fontdict, labelpad = 10)
plt.ylabel('Singular Value 2', fontdict = fontdict, labelpad = 10)
plt.title('Singular Value Decomposition', fontdict = fontdict, pad = 10)

targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']

# One scatter call per class so that each class gets its own colour and
# legend entry (the legend labels follow the plotting order)
for target, color in zip(targets, colors):

    indices_to_keep = singular_values['target'] == target
    plt.scatter(singular_values.loc[indices_to_keep, 'SV1'],
                singular_values.loc[indices_to_keep, 'SV2'],
                c = color,
                s = 50)

plt.legend(targets)
# The flag is passed positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so the positional form is the one that works before and after
plt.grid(True)

#### Comparing the output with PCA

In [None]:
# Class labels as a single-column array
y = df[['target']].to_numpy()

# Standardizing a copy of the features, then restoring the column names
# that StandardScaler drops when it returns a plain array
x_scaled = StandardScaler().fit_transform(data.copy())
x = pd.DataFrame(x_scaled, columns = columns)

x.head()

In [None]:
# Getting the principal components (all four are kept) as a labelled frame
pc_labels = [f'PC{k}' for k in range(1, 5)]
x_pca = pd.DataFrame(PCA(n_components = 4).fit_transform(x), columns = pc_labels)

# Attach the class label of every sample
x_pca['Target'] = y

x_pca.head()

In [None]:
plt.figure(figsize = (10, 5))
plt.xlabel('Principal Component 1', fontdict = fontdict, labelpad = 10)
plt.ylabel('Principal Component 2', fontdict = fontdict, labelpad = 10)
plt.title('2 Component PCA', fontdict = fontdict, pad = 10)

targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']

# One scatter call per class so that each class gets its own colour and
# legend entry (the legend labels follow the plotting order)
for target, color in zip(targets, colors):

    indices_to_keep = x_pca['Target'] == target
    plt.scatter(x_pca.loc[indices_to_keep, 'PC1'],
                x_pca.loc[indices_to_keep, 'PC2'],
                c = color,
                s = 50)

plt.legend(targets)
# The flag is passed positionally: Matplotlib renamed the keyword from `b` to
# `visible`, so the positional form is the one that works before and after
plt.grid(True)

### Dealing with sparsity
Sparse matrices are computationally expensive when stored densely, because of the large number of redundant zeros present in the matrix structure. As the matrix grows, machine learning algorithms may not work as effectively as expected, and the available memory and computing resources may be exhausted.

The other significant issues are the longer time needed to perform computations on the matrix and the resulting decrease in the processing speed of machine learning algorithms.

In [None]:
# A 6 x 8 integer matrix with exactly one non-zero entry per row
sparse_data = np.zeros((6, 8), dtype = int)

# Column position and value of the non-zero entry in each of the six rows
nonzero_cols = [3, 6, 0, 2, 5, 1]
nonzero_vals = [5, 1, 6, 7, 8, 9]
sparse_data[np.arange(6), nonzero_cols] = nonzero_vals

sparse_data

#### CSR - Compressed Sparse Row Matrix
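It is a matrix representation for sparse matrices in which the zero entries are discarded. Only the non-zero values are stored, together with their column indices and a row-pointer array that marks where each row starts. When the matrix is printed, each stored entry is shown as a tuple of row index and column index, followed by the value of the entry in that cell.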

In [None]:
# Compress the dense array into CSR form and show the stored entries
csr = csr_matrix(sparse_data)
print('Compressed Sparse Row Matrix')
print(csr)

# Expand it back to a dense matrix to confirm that nothing was lost
ori_matrix = csr.todense()
print('\nOriginal Matrix')
print(ori_matrix)

### Working with ratings data
We will use the MovieLens 100K public data set. The training file contains 100,000 ratings, by 943 users on 1,682 items. For the scope of this analysis we will ignore the timestamp column.

In [None]:
# Column layout of the tab-separated ratings file (it has no header row)
ratings_columns = ["user_id", "movie_id", "rating", "timestamp"]

# Importing the data and naming its columns
data = pd.read_csv('ratings.data', sep="\t", header = None, engine = 'python')
data.columns = ratings_columns

# Dropping the timestamp column, which this analysis ignores
data = data.drop(columns = ["timestamp"])

# Viewing the first rows and the overall shape
display(data.head())
display(data.shape)

Since the user_id and the movie_id variables are categorical, we need to one-hot encode them. But before doing that let's first create a function to calculate the memory usage by the data frame before and after the encoding.

In [None]:
# Multiplying a byte count by this factor converts it to megabytes
BYTES_TO_MB_DIV = 1e-6

def print_memory_usage_of_data_frame(df):
    """Print the memory used by a data frame, in MB rounded to 3 decimals.

    The figure is the sum of `df.memory_usage()` over all entries it reports.
    """
    total_bytes = df.memory_usage().sum()
    memory = round(total_bytes * BYTES_TO_MB_DIV, 3)
    print("Memory usage is", memory, "MB")

# Baseline: memory used by the ratings frame before any encoding
print('Original data frame')
print_memory_usage_of_data_frame(data)

#### One-hot encoding
We start by one-hot encoding the user_id and the movie_id columns using Pandas.

In [None]:
# One-hot encoding the user_id and movie_id columns using Pandas
categorical_columns = ['user_id', 'movie_id']
data_encoded = pd.get_dummies(data, columns = categorical_columns)

# Viewing the data
display(data_encoded.head())

# Shape and memory footprint of the dense encoding
print('\nOne-hot encoded data frame\n')
print('Shape is', data_encoded.shape)
print_memory_usage_of_data_frame(data_encoded)

#### Sparse one-hot encoding with Pandas
Now, we encode the data as sparse arrays using Pandas.

In [None]:
# Same one-hot encoding as before, but stored in sparse columns
categorical_columns = ['user_id', 'movie_id']
pd_data_sparse = pd.get_dummies(data, columns = categorical_columns, sparse = True)

# The dtypes show which columns are stored sparsely
display(pd_data_sparse.dtypes)

# Memory footprint of the sparse encoding
print('Pandas sparse one-hot encoded data frame')
print_memory_usage_of_data_frame(pd_data_sparse)

#### Sparse one-hot encoding with Scipy
Now, we encode the data as sparse arrays using Scipy.

In [None]:
# Function to convert a one-hot data frame to a scipy sparse matrix
def data_frame_to_scipy_sparse_matrix(df):
    """Return a float32 CSR matrix with a 1 wherever `df` is non-zero.

    Parameters
    ----------
    df : data frame (dense or with sparse columns) of shape (n_rows, n_cols).

    Returns
    -------
    scipy.sparse.csr_matrix of shape `df.shape` and dtype float32 whose
    entry (i, j) is 1 when `df.iloc[i, j] != 0`, and is not stored otherwise.
    """
    n_rows, n_cols = df.shape

    # Collect the (row, column) coordinates of the non-zero cells column by
    # column. Columns are accessed by position so that duplicate column
    # labels cannot return more than one column at a time.
    row_chunks, col_chunks = [], []
    for j in range(n_cols):
        nonzero_rows = np.flatnonzero(np.asarray(df.iloc[:, j] != 0))
        row_chunks.append(nonzero_rows)
        col_chunks.append(np.full(nonzero_rows.size, j, dtype = nonzero_rows.dtype))

    if row_chunks:
        rows = np.concatenate(row_chunks)
        cols = np.concatenate(col_chunks)
    else:
        # No columns at all: np.concatenate rejects an empty list
        rows = np.empty(0, dtype = np.intp)
        cols = np.empty(0, dtype = np.intp)

    # Building the CSR matrix once from coordinate arrays avoids the
    # per-column fancy assignment into a LIL matrix
    values = np.ones(rows.size, dtype = np.float32)
    return csr_matrix((values, (rows, cols)), shape = (n_rows, n_cols), dtype = np.float32)

# Function to get the memory usage of a sparse matrix
def get_csr_memory_usage(matrix):
    """Print the memory used by a CSR matrix, in MB rounded to 3 decimals.

    The figure is the combined size of the three arrays that make up the
    matrix: `data`, `indptr` and `indices`.

    Parameters
    ----------
    matrix : CSR matrix to measure. The argument itself is measured; the
             function no longer reads the global `X_csr`, so it reports
             the right size for any matrix it is given.
    """
    total_bytes = matrix.data.nbytes + matrix.indptr.nbytes + matrix.indices.nbytes
    # Rounded like the data-frame helper so the reported figures are comparable
    memory = round(total_bytes * BYTES_TO_MB_DIV, 3)
    print("Memory usage is", memory, "MB")

In [None]:
# Target: the rating column of the original frame
y = data['rating']

# Dense one-hot features: every encoded column except the rating
dense_feature_cols = data_encoded.columns.difference(['rating'])
X = data_encoded[dense_feature_cols]
print('One-hot encoded X')
print_memory_usage_of_data_frame(X)

# Pandas-sparse one-hot features, selected the same way
sparse_feature_cols = pd_data_sparse.columns.difference(['rating'])
X_sparse = pd_data_sparse[sparse_feature_cols]
print('\nPandas sparse one-hot encoded X')
print_memory_usage_of_data_frame(X_sparse)

# Scipy CSR version built from the pandas-sparse features
X_csr = data_frame_to_scipy_sparse_matrix(X_sparse)
print('\nScipy sparse one-hot encoded X')
get_csr_memory_usage(X_csr)

#### Comparing training speeds
We'll compare the training speeds of a classification model using these encoded matrices.

In [None]:
# The three encodings of the same features, each paired with the same target
vector_dict = {'Pandas data frame': [X, y],
               'Sparse pandas data frame': [X_sparse, y],
               'Scipy sparse matrix': [X_csr, y]}

for key, (features, target) in vector_dict.items():

    print(key)

    # Time the train-test split for this representation
    start = time()
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.3, random_state = 42)
    end = time()

    duration = round(end-start, 2)
    print("Train-test split:", duration, "seconds.")

    # Time the model fit. `multi_class` is not passed: 'auto' was already the
    # default, and newer scikit-learn releases deprecate the argument.
    start = time()
    model = LogisticRegression(random_state = 0, solver = 'liblinear')
    model.fit(X_train, y_train)
    end = time()

    duration = round(end-start, 2)
    # A blank line separates the entries, except after the last one
    print('Training:', duration, 'seconds.', '\n' if key != 'Scipy sparse matrix' else '')

### Transform Model
Here we implement the Transform Model. We find a dictionary whose product with a sparse 'code' closely approximates the data vector. 

In [None]:
# Defining parameters
n_samples = 100           # Number of data points
n_components = 15         # Length of the encoding
n_features = 20           # Number of features in the data vector
n_nonzero_coefs = 10      # Number of nonzero coefficients in the encoding

# Generating sample data
X, dictionary, code = make_sparse_coded_signal(n_samples = n_samples,
                                               n_components = n_components,
                                               n_features = n_features,
                                               n_nonzero_coefs = n_nonzero_coefs,
                                               random_state = 42)

# The orientation of the returned arrays depends on the scikit-learn version:
# older releases return one sample per column, newer ones one sample per row.
# The rest of this section assumes one sample per column, so the row-wise
# layout is transposed here (the shapes are unambiguous because
# n_samples != n_features).
if X.shape == (n_samples, n_features):
    X, dictionary, code = X.T, dictionary.T, code.T

# Asserting that our expectations about array dimensions hold
assert X.shape == (n_features, n_samples)
assert dictionary.shape == (n_features, n_components)
assert code.shape == (n_components, n_samples)

# Every sample's code has exactly n_nonzero_coefs non-zero entries, and the
# data is the product of dictionary and code. The checks are element-wise:
# comparing `a.all() == b.all()` would only compare two single booleans.
assert np.all(np.sum(code != 0, axis = 0) == n_nonzero_coefs)
assert np.allclose(X, dictionary @ code)

print('Shape of the data array is', X.shape)
print('X =', X, sep = '\n')
print('\nShape of the dictionary array is', dictionary.shape)
print('Dictionary =', dictionary, sep = '\n')
print('\nShape of the code array is', code.shape)
print('Code =', code, sep = '\n')

Now we create a dictionary learner and fit to the generated data. 

In [None]:
# Dictionary learner with as many atoms as the generating dictionary;
# codes are computed with the lasso-LARS algorithm
learner_settings = {'n_components': n_components,
                    'transform_algorithm': 'lasso_lars',
                    'random_state': 42}
dict_learner = DictionaryLearning(**learner_settings)

# The learner takes X.T as input; its output is transposed back so that the
# codes have the same orientation as X
sample_codes = dict_learner.fit_transform(X.T)
X_encoded = sample_codes.T

print('Shape of encoded data is', X_encoded.shape)
print('X_encoded =', X_encoded, sep = '\n')

We can check the level of sparsity of X_encoded.

In [None]:
# Fraction of entries of X_encoded that are exactly zero
print('Sparsity of X_encoded is', np.mean(X_encoded == 0))

We can compare the average squared Euclidean norm of the reconstruction error of the sparse coded signal relative to the squared Euclidean norm of the original signal.

In [None]:
# Reconstruct the data from the learned dictionary and the sparse codes
X_predicted = dict_learner.components_.T @ X_encoded

# Per-column squared error relative to the squared norm of the original
# column, averaged over all columns
residual_energy = np.sum((X_predicted - X) ** 2, axis = 0)
signal_energy = np.sum(X ** 2, axis = 0)
round(np.mean(residual_energy / signal_energy), 6)

### Synthesis Dictionary Model
Here we implement the Synthesis Dictionary Model. We find a sparse representation of data given a fixed dictionary. The goal is to find a sparse array, 'code', such that $$X\approx code * dictionary$$

In [None]:
# Fixed dictionary of five atoms (rows) and two data vectors to encode
dictionary = np.array([[0, 1, 0], [-1, -1, 2], [1, 1, 1], [0, 1, 1], [0, 2, 1]], dtype = np.float64)
X = np.array([[-1, -1, -1], [0, 0, 3]], dtype = np.float64)

# Encode each row of X with the LARS algorithm, configured for 2 non-zero coefficients
coder_settings = {'dictionary': dictionary,
                  'transform_algorithm': 'lars',
                  'transform_n_nonzero_coefs': 2}
coder = SparseCoder(**coder_settings)
code = coder.transform(X)

# Show the inputs, the code and the reconstruction code @ dictionary
report = (('Dictionary =', dictionary),
          ('Code =', code),
          ('X =', X),
          ('Approximation =', code @ dictionary))
for label, value in report:
    print(label, value, sep = '\n')

In [None]:
np.set_printoptions(linewidth = 120)

# Seeded generator so that X, the dictionary and every result below are the
# same on each run of the notebook
coding_rng = np.random.RandomState(42)

X = coding_rng.randint(low = -10, high = 10, size = (6, 8)).astype(np.float64)
dictionary = coding_rng.randint(low = -10, high = 10, size = (12, X.shape[1])).astype(np.float64)

# Root-mean-square distance between X and its approximation, indexed by the
# number of non-zero coefficients allowed
distances = []

print('Dictionary =', dictionary, sep = '\n', end = '\n\n')

for i in range(X.shape[0]+1):

    if i != 0:
        print()

    print('Number of non-zero coefficients per column =', i, end = '\n\n')

    if i == 0:
        # With no coefficients allowed the code is all zeros. It is built
        # directly because recent scikit-learn releases reject
        # transform_n_nonzero_coefs = 0 (the value must be at least 1).
        code = np.zeros((X.shape[0], dictionary.shape[0]))
    else:
        coder = SparseCoder(dictionary = dictionary,
                            transform_algorithm = 'lars',
                            transform_n_nonzero_coefs = i)
        code = coder.transform(X)

    print('Code =', np.around(code, 4), sep = '\n')
    print('X =', X, sep = '\n')
    approximation = code @ dictionary
    print('Approximation =', np.around(approximation, 4), sep = '\n')
    distance = np.sqrt(np.mean(np.square(np.ravel(approximation) - np.ravel(X))))
    print('Distance =', round(distance, 4))

    distances.append(distance)

Let's see how the average Euclidean distance between the elements of the original array and those of the approximation varies with the number of non-zero coefficients allowed per column.

In [None]:
plt.figure(figsize = (7, 5))

# distances[k] was recorded with k non-zero coefficients, so the list index
# is the x value
coefficient_counts = range(len(distances))
plt.plot(coefficient_counts, distances)
plt.xlabel('Number of Non-zero Coefficients', fontdict = fontdict, labelpad = 10)
plt.ylabel('Distance', fontdict = fontdict, labelpad = 10);