# Visualização dos dados

![alt text](https://joaomrcarvalho.github.io/images/1_QVXJVEQ9Nz_uvq1GJIsU2Q.png)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

## Uma introdução ao matplotlib.pyplot

In [None]:
# Load the Iris table from the course site; column names are supplied explicitly.
data = pd.read_csv(
    'https://joaomrcarvalho.github.io/datasets/dm/iris.data',
    names=['x1', 'x2', 'x3', 'x4', 'species'],
)

In [None]:
# Create a figure holding a grid of 2 rows x 3 columns of subplots.
fig, axis = plt.subplots(nrows=2, ncols=3)

# Print the array of Axes, a separator line, and then the Figure object,
# each on its own line.
print(axis, "---------------------------------------", fig, sep="\n")

In [None]:
# Scatter x1 against x2 on the top-left subplot of the existing grid.
axis[0, 0].scatter(x=data['x1'], y=data['x2'])

# The figure is the cell's last expression so the notebook renders it again.
fig

In [None]:
from itertools import combinations

# Every unordered pair of columns, excluding the last column of `data`.
cols = list(combinations(data.columns[:-1], 2))

print(cols)

print('--------------------------------------------------------------------------------------------------')

# Visit the subplots row by row and give each one a column pair.
# zip() stops at the shorter sequence, so a mismatch between the number of
# subplots and the number of pairs cannot raise an IndexError.
axes_in_order = [ax for row in axis for ax in row]
for ax, (x_col, y_col) in zip(axes_in_order, cols):
    ax.scatter(data[x_col], data[y_col])

fig

In [None]:
# Unique species names in sorted order. A bare set of strings has no stable
# iteration order across interpreter runs, so sorting keeps the
# species -> integer mapping (and therefore the colours) reproducible.
labels = sorted(set(data['species']))


def f(x):
    """Return the integer code of species name `x`: its position in `labels`.

    Raises ValueError if `x` is not present in `labels`.
    """
    return labels.index(x)

In [None]:
# One integer colour code per row; computed once because it is the same for
# every subplot.
point_colors = [f(x) for x in data['species']]

# Visit the subplots row by row, pairing each with one column pair.
axes_in_order = [ax for row in axis for ax in row]
for ax, (x_col, y_col) in zip(axes_in_order, cols):
    ax.scatter(data[x_col], data[y_col], c=point_colors)

fig

In [None]:
# Fresh 2x3 grid of subplots.
fig, axis = plt.subplots(2, 3)

# Widen the horizontal and vertical gaps between subplots.
fig.subplots_adjust(hspace=0.5, wspace=0.5)

# One integer colour code per row; computed once because it is the same for
# every subplot.
point_colors = [f(x) for x in data['species']]

# Visit the subplots row by row, pairing each with one column pair.
axes_in_order = [ax for row in axis for ax in row]
for ax, (x_col, y_col) in zip(axes_in_order, cols):
    ax.scatter(data[x_col], data[y_col], c=point_colors)

In [None]:
# Fresh 2x3 grid on a larger canvas.
fig, axis = plt.subplots(2, 3, figsize=(15, 7))

# Widen the horizontal and vertical gaps between subplots.
fig.subplots_adjust(hspace=0.5, wspace=0.5)

fig.suptitle('Variáveis do conjunto de dados Iris')

# Limit the subplot area to the lower 90% of the figure height.
fig.subplots_adjust(top=0.9)

# One integer colour code per row; computed once because it is the same for
# every subplot.
point_colors = [f(x) for x in data['species']]

# Visit the subplots row by row, pairing each with one column pair.
axes_in_order = [ax for row in axis for ax in row]
for ax, (x_col, y_col) in zip(axes_in_order, cols):
    ax.scatter(data[x_col], data[y_col], c=point_colors)

    # label the x axis with the name of the column plotted on it
    ax.set_xlabel(x_col)

    # label the y axis with the name of the column plotted on it
    ax.set_ylabel(y_col)

In [None]:
# Same layout as the previous figure: 2x3 grid, wider gaps, title, and the
# subplot area limited to the lower 90% of the figure height.
fig, axis = plt.subplots(2, 3, figsize=(15, 7))
fig.subplots_adjust(hspace=0.5, wspace=0.5)
fig.suptitle('Variáveis do conjunto de dados Iris')
fig.subplots_adjust(top=0.9)

# One integer colour code per row; computed once because it is the same for
# every subplot.
point_colors = [f(x) for x in data['species']]

# Visit the subplots row by row, pairing each with one column pair.
axes_in_order = [ax for row in axis for ax in row]
for ax, (x_col, y_col) in zip(axes_in_order, cols):
    # the only call that differs from the previous cell: semi-transparent
    # markers (alpha) with an explicit marker size (s)
    ax.scatter(data[x_col], data[y_col], c=point_colors, alpha=0.75, s=50)

    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)

## Gráficos de uma variável

### BoxPlot 

In [None]:
# Load the Iris table again so this section starts from the raw data.
data = pd.read_csv(
    'https://joaomrcarvalho.github.io/datasets/dm/iris.data',
    names=['x1', 'x2', 'x3', 'x4', 'species'],
)

# Box plot of the x1 column.
data['x1'].plot(kind='box')

In [None]:
# Colour for each part of the box plot, keyed by the part's name.
color = dict(
    boxes='DarkGreen',
    whiskers='DarkOrange',
    medians='DarkBlue',
    caps='Gray',
)

data['x1'].plot(kind='box', color=color)

In [None]:
# Same coloured box plot of x1, drawn horizontally.
data['x1'].plot(kind='box', vert=False, color=color)

### Histogramas  

In [None]:
# Histogram of x1 using 25 bins.
data['x1'].plot(kind='hist', bins=25)

In [None]:
# Cumulative histogram of x1 using 25 bins.
data['x1'].plot(kind='hist', cumulative=True, bins=25)

In [None]:
# Histogram of x1 with bars drawn at 90% of the bin width.
data['x1'].plot(kind='hist', rwidth=0.9, bins=25)

### Desenhar a distribuição dos dados (KDE)

In [None]:
# Kernel density estimate of x2.
data['x2'].plot(kind='kde')

mean_val = data['x2'].mean()

# Mark the mean with a dashed green vertical line.
plt.axvline(x=mean_val, c='g', linewidth=2, linestyle='dashed')

# Legend entries are matched to the artists in drawing order.
plt.legend(["density", "mean"])

In [None]:
# Kernel density estimate of x2.
data['x2'].plot(kind='kde')

mean_val = data['x2'].mean()
median_val = np.median(data['x2'])

# Dashed vertical markers: mean in green, median in red.
for position, colour in ((mean_val, 'g'), (median_val, 'r')):
    plt.axvline(position, linestyle='dashed', linewidth=2, c=colour)

# Legend entries are matched to the artists in drawing order.
plt.legend(["density", "mean", "median"])

**Sugestão de Exercício:** Desenhar a distribuição das outras variáveis do dataset e analisar a relação entre a média e a mediana; comparar também o boxplot de `x1` com os das restantes variáveis.

## Séries Temporais com uma única variável independente

In [None]:
# The CSV is read from the working directory. The two commented-out shell
# commands below fetch and unpack the archive that contains it.
# !wget 'https://joaomrcarvalho.github.io/datasets/dm/coinbaseUSD_1-min_data_2014-12-01_to_2019-01-09.zip'
# !unzip 'coinbaseUSD_1-min_data_2014-12-01_to_2019-01-09.zip'

data = pd.read_csv(
    'coinbaseUSD_1-min_data_2014-12-01_to_2019-01-09.csv',
)

# Preview the first ten rows.
data.head(n=10)

In [None]:
# Not referenced by this cell; kept so that any other cell relying on the
# name `datetime` keeps working.
from datetime import datetime

# Drop rows with any missing value. Rebinding (rather than inplace=True)
# leaves the cell safe to re-run.
data = data.dropna()

# Convert the epoch-seconds `Timestamp` column in one vectorised call.
# The result is timezone-naive UTC, so it does not depend on the local
# timezone of the machine running the notebook.
data['date'] = pd.to_datetime(data['Timestamp'], unit='s')

data.head(10)

In [None]:
# Weighted price as a standalone Series indexed by the `date` column.
btc_data = data.set_index('date')['Weighted_Price'].copy()

btc_data.head()

In [None]:
# Line plot of the whole price series on a wide canvas.
btc_data.plot(kind='line', figsize=(20, 8))

In [None]:
# vamos desenhar apenas um intervalo com 10000 minutos:
# Plot only the rows at positions 10000..19999, as black dots.
btc_window = btc_data.iloc[10000:20000]
btc_window.plot(style='k.', figsize=(20, 6))

In [None]:
# One group per calendar year. Grouping on the index's `year` attribute
# avoids the offset alias 'A', which recent pandas releases deprecate.
groups = btc_data.groupby(btc_data.index.year)

# Plot every yearly group with the same figure size.
groups.plot(figsize=(20, 7))

In [None]:
# One subplot row per group. squeeze=False makes plt.subplots always return
# a 2-D array, so indexing also works when there is a single group.
fig, ax_grid = plt.subplots(groups.ngroups, figsize=(15, 13), squeeze=False)
ax = ax_grid[:, 0]  # the single column as a 1-D array of Axes

fig.subplots_adjust(hspace=0.5)

# Draw each group on its own subplot, in iteration order.
for group_ax, (_, group) in zip(ax, groups):
    group_ax.plot(group.index, group.values, c='r')
    

In [None]:
# Keep only the observations whose index falls in 2018.
in_2018 = btc_data.index.year == 2018
btc_data_2018 = btc_data.loc[in_2018]

# Bucket the 2018 observations by month.
# NOTE(review): recent pandas deprecates the 'M' alias in favour of 'ME';
# confirm the minimum supported pandas version before changing it.
monthly_grouper = pd.Grouper(freq='M')
group_2018 = btc_data_2018.groupby(monthly_grouper)

In [None]:
# One column per month, named by month number. Values are re-indexed from 0
# so months of different lengths line up row by row (shorter ones are padded
# with NaN).
monthly_columns = [
    pd.Series(group.values, name=name.month)
    for name, group in group_2018
]

# Concatenate once instead of growing the frame inside a loop; fall back to
# an empty frame when there are no groups.
months = pd.concat(monthly_columns, axis=1) if monthly_columns else pd.DataFrame()

months.head()

In [None]:
# One box per column of `months`, with the grid turned on.
months.plot(kind='box', grid=True)

In [None]:
# Lag plot with a lag of one observation (the function's default).
pd.plotting.lag_plot(btc_data, lag=1)

In [None]:
# Lag plot with a lag of 60 observations.
pd.plotting.lag_plot(series=btc_data, lag=60)

In [None]:
# Boolean mask selecting observations from March 2018.
is_march_2018 = (btc_data.index.year == 2018) & (btc_data.index.month == 3)
btc_data_2018_m3 = btc_data[is_march_2018]

# Autocorrelation plot for that month only.
pd.plotting.autocorrelation_plot(series=btc_data_2018_m3)

**Sugestão de Exercício:** Explorar a autocorrelação para os restantes meses desse ano (ou de outros).

**Sugestão de Exercício 2:** Ver o que aconteceu com o valor da criptomoeda bitcoin nesse período de tempo e justificar o porquê de a autocorrelação se comportar assim.

## Gráficos de duas variáveis

### Variável discreta vs variável contínua 

In [None]:
# Full Iris table, kept under `in_data` so later cells can take other column subsets.
in_data = pd.read_csv(
    'https://joaomrcarvalho.github.io/datasets/dm/iris.data',
    names=['x1', 'x2', 'x3', 'x4', 'species'],
)

# Working subset: the x2 column and the species column.
selected_columns = ['x2', 'species']
data = in_data[selected_columns]

In [None]:
# Group the rows by their `species` value.
group = data.groupby(by='species')

In [None]:
# Kernel density plot for each group.
group.plot.kde()

In [None]:
# Box plot for each group.
group.plot.box()

In [None]:
# Mean and median of the grouped columns, one row per `species` value.
summary_stats = ['mean', 'median']
agg_data = data.groupby('species').agg(summary_stats)

# Bar chart of the aggregated values.
agg_data.plot(kind='bar')

### Variável contínua vs variável contínua  

In [None]:
# Working subset: the x1 and x3 columns.
data = in_data[['x1', 'x3']]

# Scatter plot of x3 against x1.
data.plot.scatter(x='x1', y='x3')

In [None]:
# Least-squares fit of a degree-1 polynomial (a straight line) of x3 on x1.
coefs = np.polyfit(data['x1'], data['x3'], 1)

# Wrap the coefficients in a callable polynomial: f(x) evaluates the fitted line.
f = np.poly1d(coefs)

In [None]:
# Integer grid running from one unit below the smallest x1 to one unit above
# the largest x1 (after rounding); the upper bound itself is excluded by arange.
x_start = round(data['x1'].min() - 1)
x_stop = round(data['x1'].max() + 1)
x = np.arange(x_start, x_stop)

# Fitted polynomial evaluated on the grid.
line = f(x)

data.plot.scatter(x='x1', y='x3')

# Draw the fitted line in red via pyplot's current axes.
plt.plot(x, line, c='r')

In [None]:
# Hexagonal-bin plot of x3 against x1 with a grid size of 7 ...
data.plot.hexbin(x='x1', y='x3', gridsize=7)
# ... followed by a plain scatter plot of the same two columns.
data.plot.scatter(x='x1', y='x3')

### Variável discreta vs Variável discreta

In [None]:
# Auto-MPG table: no header row, fields separated by runs of whitespace,
# and '?' treated as a missing value.
# sep=r'\s+' is the equivalent of delim_whitespace=True, which recent pandas
# releases deprecate.
data = pd.read_csv(
    'https://joaomrcarvalho.github.io/datasets/dm/auto-mpg.data',
    sep=r'\s+',
    header=None,
    names=['mpg', 'cylinders', 'displacement', 'horsepower',
           'weight', 'acceleration', 'model_year', 'origin', 'name'],
    na_values='?',
)

# Keep only the two columns compared in this section.
data = data[['cylinders', 'model_year']]

data.head()

In [None]:
# Contingency table of counts: rows are model years, columns are cylinder values.
crostab = pd.crosstab(data['model_year'], data['cylinders'])

crostab.head()

In [None]:
# Bar chart of the contingency table on a wide canvas.
crostab.plot(kind='bar', figsize=(20, 7))

In [None]:
from statsmodels.graphics.mosaicplot import mosaic

# NOTE(review): read without explicit column names, so the file is presumably
# shipped with a header row containing 'race' and 'sex'; confirm.
data = pd.read_csv('https://joaomrcarvalho.github.io/datasets/dm/adult.data')

fig, ax = plt.subplots(figsize=(20, 5))

# Mosaic plot of the `race` and `sex` columns, drawn on the axes created above.
mosaic(data, index=['race', 'sex'], ax=ax)

plt.show()

## Gráficos de várias variáveis

In [None]:
from sklearn.datasets import load_digits

# Bundled digits dataset; `data` and `target` are read as attributes of the
# returned object.
digits = load_digits()

data = digits.data
labels = digits.target

# Show the shape of the feature array.
print(data.shape)

In [None]:
from sklearn.decomposition import PCA 

# Keep three principal components. random_state pins the seed of the
# randomized SVD solver that some scikit-learn versions select for this
# input, so the projection is reproducible across runs.
pca = PCA(n_components=3, random_state=0)

# Fit the model and project the data onto the three components in one step.
reduced_data = pca.fit_transform(data)

print(reduced_data)

In [None]:
# NOTE(review): Axes3D is never referenced by name; presumably imported for
# its side effect of registering the '3d' projection on older Matplotlib.
# Confirm before removing.
from mpl_toolkits.mplot3d import Axes3D 

fig = plt.figure(figsize=(10, 10))

# A single subplot using the 3-D projection.
ax = fig.add_subplot(1, 1, 1, projection='3d')

# Split the reduced data into its three columns and colour each point by `labels`.
comp_1, comp_2, comp_3 = reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2]
ax.scatter(comp_1, comp_2, comp_3, c=labels)

**Sugestão de Exercício**: Aplicar o t-SNE aos dados e desenhar o resultado.

### Representar matrizes de gráficos

In [None]:
# Load the Iris table again; column names are supplied explicitly.
data = pd.read_csv(
    'https://joaomrcarvalho.github.io/datasets/dm/iris.data',
    names=['x1', 'x2', 'x3', 'x4', 'species'],
)

data.head()

In [None]:
# Matrix of pairwise scatter plots for the table.
pd.plotting.scatter_matrix(frame=data, figsize=(10, 10))

plt.show()

In [None]:
# Unique species names in sorted order. A bare set of strings has no stable
# iteration order across interpreter runs, so sorting keeps the
# species -> integer mapping (and therefore the colours) reproducible.
labels = sorted(set(data['species']))


def f(x):
    """Return the integer code of species name `x`: its position in `labels`.

    Raises ValueError if `x` is not present in `labels`.
    """
    return labels.index(x)


# One integer colour code per row of the table.
colors = data['species'].apply(f)

# Scatter matrix with points coloured by species code.
pd.plotting.scatter_matrix(data, figsize=(10, 10), c=colors)

plt.show()

In [None]:
# Coloured scatter matrix again, with density curves on the diagonal.
pd.plotting.scatter_matrix(
    frame=data,
    c=colors,
    diagonal='kde',
    figsize=(10, 10),
)
plt.show()