# Introdução ao Pandas

<img src='https://pandas.pydata.org/docs/_images/01_table_dataframe.svg'>

- Sumário
    - Leitura de Dados
    - Manipulando índices

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from numpy.random import randn
# Seed NumPy's legacy global random generator so the random values below are the same on every run.
np.random.seed(101)

### 1. Criando um DataFrame 

#### 1.1 Usando uma matriz randômica

In [2]:
# Splitting the string on whitespace yields the list of column labels used in the next cell.
'W X Y Z'.split()

['W', 'X', 'Y', 'Z']

In [3]:
# Build a 20-row, 4-column frame of randn samples, one column per label.
column_labels = 'W X Y Z'.split()
df = pd.DataFrame(data=randn(20, 4), columns=column_labels)

In [4]:
# Preview the first five rows.
df.head()

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [5]:
# Preview the last five rows.
df.tail()

Unnamed: 0,W,X,Y,Z
15,0.38603,2.084019,-0.376519,0.230336
16,0.681209,1.035125,-0.03116,1.939932
17,-1.005187,-0.74179,0.187125,-0.732845
18,-1.38292,1.482495,0.961458,-2.141212
19,0.992573,1.192241,-1.04678,1.292765


In [6]:
# Print a summary of the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       20 non-null     float64
 1   X       20 non-null     float64
 2   Y       20 non-null     float64
 3   Z       20 non-null     float64
dtypes: float64(4)
memory usage: 768.0 bytes


In [7]:
# (number of rows, number of columns)
df.shape

(20, 4)

#### 1.2 Usando arquivos do disco

<img src='https://pandas.pydata.org/docs/_images/02_io_readwrite.svg'>

##### CSV

In [8]:
# Preview the frame that the following cells write to disk.
df.head()

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [10]:
# Make sure the output folder exists first: to_csv does not create missing
# directories, so on a fresh checkout without data/ this cell would fail.
Path('data').mkdir(parents=True, exist_ok=True)
# index=False leaves the row index out of the file, so reading it back does not
# produce an extra unnamed column.
df.to_csv('data/exemplo.csv', index=False)

In [11]:
# Load the CSV written in the previous cell into a new frame and preview it to check the round trip.
df_csv = pd.read_csv('data/exemplo.csv')
df_csv.head()

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


##### Excel

In [13]:
# Write the frame to an Excel workbook; index=False leaves the row index out of the sheet.
# NOTE(review): writing .xlsx needs an Excel writer engine (e.g. openpyxl) installed — confirm the environment.
df.to_excel('data/exemplo.xlsx', index=False)

In [14]:
# Load the workbook written in the previous cell into a new frame and preview it.
df_excel = pd.read_excel('data/exemplo.xlsx')
df_excel.head()

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


##### JSON
- `orient`: parâmetro que indica o formato esperado da string JSON; use o mesmo valor ao gravar (`to_json`) e ao ler (`read_json`).
- Pode assumir os valores {'split', 'records', 'index', 'columns', 'values', 'table'}
- [Exemplos de orient](https://appdividend.com/2022/03/15/pandas-to_json/#:~:text=To%20convert%20the%20object%20to,use%20the%20to_json()%20function)

In [15]:
# Serialize the frame to a JSON file using the 'table' orient.
df.to_json('data/exemplo.json', orient='table')

In [16]:
# Read the JSON back with the same orient that was used to write it, then preview the result.
df_json = pd.read_json('data/exemplo.json', orient='table')
df_json.head()

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


##### HTML
- Para usar o pd.read_html é necessário instalar as seguintes dependências
    - conda install lxml -y
    - conda install html5lib
    - conda install BeautifulSoup4

In [17]:
# Fetch the page and parse its HTML tables (requires network access).
url = 'https://en.wikipedia.org/wiki/Minnesota'
# read_html gives back a sequence of parsed tables, not a single frame.
tables = pd.read_html(url)
# Keep only the first table found on the page.
df_html = tables[0]

In [18]:
# Display the first table parsed from the page.
df_html

Unnamed: 0,Minnesota,Minnesota.1
0,State,State
1,State of Minnesota,State of Minnesota
2,.mw-parser-output .ib-settlement-cols{text-ali...,.mw-parser-output .ib-settlement-cols{text-ali...
3,"Nickname(s): Land of 10,000 Lakes;North Star S...","Nickname(s): Land of 10,000 Lakes;North Star S..."
4,Motto(s): L'Étoile du Nord (French: The Star o...,Motto(s): L'Étoile du Nord (French: The Star o...
5,"Anthem: ""Hail! Minnesota""","Anthem: ""Hail! Minnesota"""
6,Map of the United States with Minnesota highli...,Map of the United States with Minnesota highli...
7,Country,United States
8,Before statehood,Minnesota Territory
9,Admitted to the Union,"May 11, 1858 (32nd State in the Union)"


### 2. Manipulando index

In [19]:
# Row labels of the frame; at this point it is the default RangeIndex from 0 to 19.
df.index

RangeIndex(start=0, stop=20, step=1)

In [20]:
# Materialize the index labels as a plain Python list.
list(df.index)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [21]:
# Show the frame before any index change, for comparison with the cells below.
df

Unnamed: 0,W,X,Y,Z
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509
5,0.302665,1.693723,-1.706086,-1.159119
6,-0.134841,0.390528,0.166905,0.184502
7,0.807706,0.07296,0.638787,0.329646
8,-0.497104,-0.75407,-0.943406,0.484752
9,-0.116773,1.901755,0.238127,1.996652


In [22]:
# Returns a new frame indexed by the values of column W. The result is only displayed,
# not assigned, so df itself still has W as a regular column afterwards.
df.set_index('W')

Unnamed: 0_level_0,X,Y,Z
W,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.70685,0.628133,0.907969,0.503826
0.651118,-0.319318,-0.848077,0.605965
-2.018168,0.740122,0.528813,-0.589001
0.188695,-0.758872,-0.933237,0.955057
0.190794,1.978757,2.605967,0.683509
0.302665,1.693723,-1.706086,-1.159119
-0.134841,0.390528,0.166905,0.184502
0.807706,0.07296,0.638787,0.329646
-0.497104,-0.75407,-0.943406,0.484752
-0.116773,1.901755,0.238127,1.996652


In [23]:
# Same operation, but inplace=True mutates df itself: W stops being a column and becomes the index.
# This cell is not idempotent — running it a second time raises KeyError because W is no longer a column.
df.set_index('W', inplace=True)

In [24]:
# Setting Z as the index replaces the current index (W). The W values are discarded rather than
# moved back into a column, which is why the next output only shows X and Y.
df.set_index('Z', inplace=True)

In [25]:
# Z is now the index and W is gone from the frame.
df

Unnamed: 0_level_0,X,Y
Z,Unnamed: 1_level_1,Unnamed: 2_level_1
0.503826,0.628133,0.907969
0.605965,-0.319318,-0.848077
-0.589001,0.740122,0.528813
0.955057,-0.758872,-0.933237
0.683509,1.978757,2.605967
-1.159119,1.693723,-1.706086
0.184502,0.390528,0.166905
0.329646,0.07296,0.638787
0.484752,-0.75407,-0.943406
1.996652,1.901755,0.238127


In [26]:
# Builds an empty DataFrame; the result is not assigned, so df is unaffected.
# NOTE(review): looks like a leftover scratch cell — consider removing it.
pd.DataFrame()

In [27]:
# Move the current index (Z) back into a regular column and restore the default integer index, in place.
df.reset_index(inplace=True)

In [28]:
# Z is a regular column again and the rows are labelled 0, 1, 2, ...
df

Unnamed: 0,Z,X,Y
0,0.503826,0.628133,0.907969
1,0.605965,-0.319318,-0.848077
2,-0.589001,0.740122,0.528813
3,0.955057,-0.758872,-0.933237
4,0.683509,1.978757,2.605967
5,-1.159119,1.693723,-1.706086
6,0.184502,0.390528,0.166905
7,0.329646,0.07296,0.638787
8,0.484752,-0.75407,-0.943406
9,1.996652,1.901755,0.238127


In [29]:
# Resetting a second time moves the default integer index into a new column named 'index',
# so every extra run of this cell adds another column to df.
df.reset_index(inplace=True)

In [30]:
# The frame now carries the extra 'index' column created by the second reset.
df

Unnamed: 0,index,Z,X,Y
0,0,0.503826,0.628133,0.907969
1,1,0.605965,-0.319318,-0.848077
2,2,-0.589001,0.740122,0.528813
3,3,0.955057,-0.758872,-0.933237
4,4,0.683509,1.978757,2.605967
5,5,-1.159119,1.693723,-1.706086
6,6,0.184502,0.390528,0.166905
7,7,0.329646,0.07296,0.638787
8,8,0.484752,-0.75407,-0.943406
9,9,1.996652,1.901755,0.238127


**Convertendo valores de um dataframe para String**

In [31]:
# to_string() renders the frame as plain text; all 20 rows are printed, with the row labels on the left.
print(df.to_string())

    index         Z         X         Y
0       0  0.503826  0.628133  0.907969
1       1  0.605965 -0.319318 -0.848077
2       2 -0.589001  0.740122  0.528813
3       3  0.955057 -0.758872 -0.933237
4       4  0.683509  1.978757  2.605967
5       5 -1.159119  1.693723 -1.706086
6       6  0.184502  0.390528  0.166905
7       7  0.329646  0.072960  0.638787
8       8  0.484752 -0.754070 -0.943406
9       9  1.996652  1.901755  0.238127
10     10  0.000366  0.196800 -1.136645
11     11  0.649826 -0.156598 -0.031579
12     12 -0.346419 -0.610259 -0.755325
13     13  1.024810 -0.479448  0.558769
14     14  0.610478  1.862864 -1.133817
15     15  0.230336  2.084019 -0.376519
16     16  1.939932  1.035125 -0.031160
17     17 -0.732845 -0.741790  0.187125
18     18 -2.141212  1.482495  0.961458
19     19  1.292765  1.192241 -1.046780


In [32]:
# index=False omits the row labels from the text. The 'index' column that still appears is the
# regular data column created by the earlier reset_index call, not the frame's index.
print(df.to_string(index=False))

 index         Z         X         Y
     0  0.503826  0.628133  0.907969
     1  0.605965 -0.319318 -0.848077
     2 -0.589001  0.740122  0.528813
     3  0.955057 -0.758872 -0.933237
     4  0.683509  1.978757  2.605967
     5 -1.159119  1.693723 -1.706086
     6  0.184502  0.390528  0.166905
     7  0.329646  0.072960  0.638787
     8  0.484752 -0.754070 -0.943406
     9  1.996652  1.901755  0.238127
    10  0.000366  0.196800 -1.136645
    11  0.649826 -0.156598 -0.031579
    12 -0.346419 -0.610259 -0.755325
    13  1.024810 -0.479448  0.558769
    14  0.610478  1.862864 -1.133817
    15  0.230336  2.084019 -0.376519
    16  1.939932  1.035125 -0.031160
    17 -0.732845 -0.741790  0.187125
    18 -2.141212  1.482495  0.961458
    19  1.292765  1.192241 -1.046780
