In [21]:
import numpy as np
import pandas as pd
import random

### Series 
* é basicamente uma matriz rotulada e é uma das estruturas de dados fundamentais no pandas. 

In [22]:
serie = pd.Series(
    data = np.array([np.nan,1,2,3,5]),
    name = 'Coluna'
)
serie 

0    NaN
1    1.0
2    2.0
3    3.0
4    5.0
Name: Coluna, dtype: float64

&nbsp;

### data_range

In [23]:
datas = pd.date_range(start='2018/01/01', periods=6)
datas

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [24]:
pd.date_range(start='2018/01/09', periods=6, freq='Y')

  pd.date_range(start='2018/01/09', periods=6, freq='Y')


DatetimeIndex(['2018-12-31', '2019-12-31', '2020-12-31', '2021-12-31',
               '2022-12-31', '2023-12-31'],
              dtype='datetime64[ns]', freq='YE-DEC')

In [25]:
#?pd.date_range

&nbsp;

### Dataframe

In [26]:
# criando um data frame
pd.DataFrame(
    data  = np.random.random(size=(4,3)),
    index = ['A','B','C','D'],
    columns = ['A','B','C'] # Ou use list("ABC")
)

Unnamed: 0,A,B,C
A,0.992529,0.897954,0.423913
B,0.037584,0.436806,0.150944
C,0.224543,0.444708,0.706652
D,0.565201,0.614312,0.777079


In [27]:
pd.DataFrame(
    data  = np.random.random(size=(6,3)),
    index = datas,
    columns = ['A','B','C'] # Ou use list("ABC")
)

Unnamed: 0,A,B,C
2018-01-01,0.739817,0.930934,0.358365
2018-01-02,0.757135,0.590365,0.877801
2018-01-03,0.549522,0.440292,0.092039
2018-01-04,0.000488,0.952354,0.653826
2018-01-05,0.662101,0.368135,0.516591
2018-01-06,0.054822,0.819236,0.414252


In [28]:
# Usando um dicionario
data = pd.DataFrame(
    data = dict(
        nome  = ['Maria','Amanda','Katia'],
        idade = np.array([16,18,29],dtype=np.int64),
        rg    = [ random.randint(10000000000,99999999999) for i in range(3)],
        categoria = pd.Categorical(values=['mina','gamer','mulher'])
    ),
    index = ['U1','U2','U3']
)
display(data)
display(data.info())

Unnamed: 0,nome,idade,rg,categoria
U1,Maria,16,31107790765,mina
U2,Amanda,18,13874347467,gamer
U3,Katia,29,88516672545,mulher


<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, U1 to U3
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   nome       3 non-null      object  
 1   idade      3 non-null      int64   
 2   rg         3 non-null      int64   
 3   categoria  3 non-null      category
dtypes: category(1), int64(2), object(1)
memory usage: 231.0+ bytes


None

In [29]:
data['classe'] = pd.Categorical(
    values=['media','media-alta','rica']
)
data

Unnamed: 0,nome,idade,rg,categoria,classe
U1,Maria,16,31107790765,mina,media
U2,Amanda,18,13874347467,gamer,media-alta
U3,Katia,29,88516672545,mulher,rica


In [30]:
data.head(3)

Unnamed: 0,nome,idade,rg,categoria,classe
U1,Maria,16,31107790765,mina,media
U2,Amanda,18,13874347467,gamer,media-alta
U3,Katia,29,88516672545,mulher,rica


In [31]:
# vendo coluna no dataframe
display(data.columns)

#vendo index no dataframe
display(data.index)

Index(['nome', 'idade', 'rg', 'categoria', 'classe'], dtype='object')

Index(['U1', 'U2', 'U3'], dtype='object')

In [32]:
data.to_numpy()

array([['Maria', 16, 31107790765, 'mina', 'media'],
       ['Amanda', 18, 13874347467, 'gamer', 'media-alta'],
       ['Katia', 29, 88516672545, 'mulher', 'rica']], dtype=object)

#### Concatenado

In [33]:
df1 = pd.DataFrame(
    data = np.random.random(size=(3,1)),
    columns = ['a']
)
df2 = pd.DataFrame(
    data = np.random.random(size=(3,1)),
    columns = ['b']
)
display(df1)
display(df2)

Unnamed: 0,a
0,0.572774
1,0.157281
2,0.543726


Unnamed: 0,b
0,0.134034
1,0.269426
2,0.613313


In [34]:
pd.concat([df1,df2] , axis=1)

Unnamed: 0,a,b
0,0.572774,0.134034
1,0.157281,0.269426
2,0.543726,0.613313


In [35]:
pd.concat([df1,df2] , keys=['1','2'],axis=1 )

Unnamed: 0_level_0,1,2
Unnamed: 0_level_1,a,b
0,0.572774,0.134034
1,0.157281,0.269426
2,0.543726,0.613313


### merge

In [36]:
data1 = pd.read_csv(r'dataset/table1.csv')
data2 = pd.read_csv(r'dataset/table2.csv')

In [37]:
merged = pd.merge(
    left  = data1,
    right = data2,
    how='inner',
    on ='order_id',
    suffixes=('X','Y'),
)
merged

Unnamed: 0,order_id,customer_id,order_date,total_amount,order_detail_id,product_id,quantity,unit_price
0,1,101,2024-03-15,150.0,1,1,2,50.0
1,1,101,2024-03-15,150.0,2,2,1,25.0
2,2,102,2024-03-16,200.0,3,3,3,40.0
3,3,103,2024-03-17,100.0,4,1,1,50.0
4,3,103,2024-03-17,100.0,5,2,2,30.0
5,4,101,2024-03-18,180.0,6,1,3,50.0
6,4,101,2024-03-18,180.0,7,3,1,40.0
7,5,104,2024-03-18,220.0,8,2,4,30.0
8,5,104,2024-03-18,220.0,9,3,2,40.0


In [38]:
coluna = [(np.random.choice(['2','M','4'])) for i in range(9)]
merged['NewColumn'] = coluna
merged.drop_duplicates(inplace=True)
merged.head(2)

Unnamed: 0,order_id,customer_id,order_date,total_amount,order_detail_id,product_id,quantity,unit_price,NewColumn
0,1,101,2024-03-15,150.0,1,1,2,50.0,4
1,1,101,2024-03-15,150.0,2,2,1,25.0,M


In [40]:
#pd.pivot(data=merged, index='order_id',columns='NewColumn',values='NewColumn')

### stack

In [41]:
data = pd.read_csv(r'dataset/nba.csv')

data.stack()

0    Name         Avery Bradley
     Team        Boston Celtics
     Number                 0.0
     Position                PG
     Age                   25.0
                      ...      
456  Age                   26.0
     Height                 7-0
     Weight               231.0
     College             Kansas
     Salary            947276.0
Length: 4018, dtype: object

In [42]:
data.stack().unstack().head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [43]:
data.isna().nunique()

Name        2
Team        2
Number      2
Position    2
Age         2
Height      2
Weight      2
College     2
Salary      2
dtype: int64

### loc

In [44]:
data.loc[data['Name'] != 'A']

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


### inputeando valores

In [45]:
data.filter(items=['Boston Celtics'])

0
1
2
3
4
...
453
454
455
456
457
