# Criação de DataFrames

In [1]:
import pandas as pd
import numpy as np

## Series

In [2]:
# Build a pandas Series from a plain Python list. np.nan marks the missing
# entry, and the resulting Series is displayed with dtype float64.
series = pd.Series(
    data=[7, 3, 2, np.nan, 6, 9],
)

In [3]:
series

0    7.0
1    3.0
2    2.0
3    NaN
4    6.0
5    9.0
dtype: float64

In [4]:
type(series)

pandas.core.series.Series

## Date Range

In [5]:
# Six consecutive calendar days starting on 2018-01-01 (daily frequency)
datas = pd.date_range(start='20180101', periods=6, freq='D')

In [8]:
datas

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
type(datas)

pandas.core.indexes.datetimes.DatetimeIndex

In [7]:
# freq="ME" is the month-end frequency: the result holds the last day of six
# consecutive months (2018-01-31 through 2018-06-30), not the first day.
pd.date_range('20180101', periods=6, freq="ME") # 6 months

DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
               '2018-05-31', '2018-06-30'],
              dtype='datetime64[ns]', freq='ME')

## DataFrames

In [8]:
list("ABCD")

['A', 'B', 'C', 'D']

In [9]:
# 6x4 frame of random draws from np.random.randn: the dates in `datas` become
# the row index and the letters A-D become the column labels.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=datas,
    columns=list("ABCD"),
)

In [10]:
df

Unnamed: 0,A,B,C,D
2018-01-01,0.759099,-0.139208,1.185623,-0.1648
2018-01-02,1.122273,-0.310766,1.08091,2.123221
2018-01-03,0.001628,1.770904,0.877313,0.609363
2018-01-04,1.409029,-1.743649,1.405578,1.450324
2018-01-05,-0.423286,-0.652694,0.283375,-1.18753
2018-01-06,0.294183,0.965611,0.199598,1.223889


In [11]:
type(df)

pandas.core.frame.DataFrame

In [12]:
# Build a frame from a dict: each key becomes a column. Scalar entries
# (A, B, F) are repeated on every row, while the array-like entries (C, D, E)
# supply the 4 rows. Each column keeps its own dtype.
df2 = pd.DataFrame(
    data={
        "A": 2.7,
        "B": pd.Timestamp('20190101'),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        "D": np.full(4, 3, dtype='int32'),  # four int32 threes
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "Python",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,2.7,2019-01-01,1.0,3,test,Python
1,2.7,2019-01-01,1.0,3,train,Python
2,2.7,2019-01-01,1.0,3,test,Python
3,2.7,2019-01-01,1.0,3,train,Python


In [13]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Criação de DataFrame e adição de colunas

In [15]:
# 60 daily timestamps starting on 2019-01-01 index a 60x5 frame of random
# values with columns A-E.
datas = pd.date_range(start='20190101', periods=60, freq='D')
df = pd.DataFrame(
    data=np.random.randn(len(datas), 5),
    index=datas,
    columns=list('ABCDE'),
)

In [17]:
df

Unnamed: 0,A,B,C,D,E
2019-01-01,-0.701834,0.742164,0.94327,-0.619141,-2.058841
2019-01-02,1.236186,1.089333,0.861889,1.536135,-0.051553
2019-01-03,-0.370802,-0.233948,0.477841,-0.328171,-0.429833
2019-01-04,-0.154116,-0.599043,-0.228383,0.422999,-0.540941
2019-01-05,1.022604,-2.167304,-1.035693,-0.64629,-0.446798
2019-01-06,-0.20677,0.344137,-1.265969,0.330962,0.394707
2019-01-07,0.543265,-0.563407,-0.394699,0.612866,1.280333
2019-01-08,-0.471911,-0.979601,-1.595537,-0.603199,0.554465
2019-01-09,0.092755,-1.121101,0.731914,-0.633408,1.501153
2019-01-10,-0.658107,-1.465328,-0.305171,-0.747241,-0.993178


In [18]:
df.shape

(60, 5)

In [19]:
df.head(3)

Unnamed: 0,A,B,C,D,E
2019-01-01,-0.701834,0.742164,0.94327,-0.619141,-2.058841
2019-01-02,1.236186,1.089333,0.861889,1.536135,-0.051553
2019-01-03,-0.370802,-0.233948,0.477841,-0.328171,-0.429833


In [20]:
# Add a new column F: the scalar 1 is repeated on every row
df['F'] = 1

In [21]:
df.head(10)

Unnamed: 0,A,B,C,D,E,F
2019-01-01,-0.701834,0.742164,0.94327,-0.619141,-2.058841,1
2019-01-02,1.236186,1.089333,0.861889,1.536135,-0.051553,1
2019-01-03,-0.370802,-0.233948,0.477841,-0.328171,-0.429833,1
2019-01-04,-0.154116,-0.599043,-0.228383,0.422999,-0.540941,1
2019-01-05,1.022604,-2.167304,-1.035693,-0.64629,-0.446798,1
2019-01-06,-0.20677,0.344137,-1.265969,0.330962,0.394707,1
2019-01-07,0.543265,-0.563407,-0.394699,0.612866,1.280333,1
2019-01-08,-0.471911,-0.979601,-1.595537,-0.603199,0.554465,1
2019-01-09,0.092755,-1.121101,0.731914,-0.633408,1.501153,1
2019-01-10,-0.658107,-1.465328,-0.305171,-0.747241,-0.993178,1


In [23]:
# Add column G as a running counter 0..59 (60 matches the frame's row count)
df['G'] = range(60)

In [24]:
df.head(10)

Unnamed: 0,A,B,C,D,E,F,G
2019-01-01,-0.701834,0.742164,0.94327,-0.619141,-2.058841,1,0
2019-01-02,1.236186,1.089333,0.861889,1.536135,-0.051553,1,1
2019-01-03,-0.370802,-0.233948,0.477841,-0.328171,-0.429833,1,2
2019-01-04,-0.154116,-0.599043,-0.228383,0.422999,-0.540941,1,3
2019-01-05,1.022604,-2.167304,-1.035693,-0.64629,-0.446798,1,4
2019-01-06,-0.20677,0.344137,-1.265969,0.330962,0.394707,1,5
2019-01-07,0.543265,-0.563407,-0.394699,0.612866,1.280333,1,6
2019-01-08,-0.471911,-0.979601,-1.595537,-0.603199,0.554465,1,7
2019-01-09,0.092755,-1.121101,0.731914,-0.633408,1.501153,1,8
2019-01-10,-0.658107,-1.465328,-0.305171,-0.747241,-0.993178,1,9


In [25]:
# Derived column: row-by-row product of columns A and B
df['Produto'] = df['A'] * df['B']

In [26]:
df.head(10)

Unnamed: 0,A,B,C,D,E,F,G,Produto
2019-01-01,-0.701834,0.742164,0.94327,-0.619141,-2.058841,1,0,-0.520876
2019-01-02,1.236186,1.089333,0.861889,1.536135,-0.051553,1,1,1.346618
2019-01-03,-0.370802,-0.233948,0.477841,-0.328171,-0.429833,1,2,0.086748
2019-01-04,-0.154116,-0.599043,-0.228383,0.422999,-0.540941,1,3,0.092322
2019-01-05,1.022604,-2.167304,-1.035693,-0.64629,-0.446798,1,4,-2.216294
2019-01-06,-0.20677,0.344137,-1.265969,0.330962,0.394707,1,5,-0.071157
2019-01-07,0.543265,-0.563407,-0.394699,0.612866,1.280333,1,6,-0.306079
2019-01-08,-0.471911,-0.979601,-1.595537,-0.603199,0.554465,1,7,0.462284
2019-01-09,0.092755,-1.121101,0.731914,-0.633408,1.501153,1,8,-0.103988
2019-01-10,-0.658107,-1.465328,-0.305171,-0.747241,-0.993178,1,9,0.964343


In [27]:
# Assigning to an existing column overwrites it: every value in D becomes 88
df['D'] = 88

In [28]:
df.head(3)

Unnamed: 0,A,B,C,D,E,F,G,Produto
2019-01-01,-0.701834,0.742164,0.94327,88,-2.058841,1,0,-0.520876
2019-01-02,1.236186,1.089333,0.861889,88,-0.051553,1,1,1.346618
2019-01-03,-0.370802,-0.233948,0.477841,88,-0.429833,1,2,0.086748


# Visualização de DataFrames

In [33]:
# Fresh small example for the visualization section: six daily timestamps
# from 2019-01-01 index a 6x4 frame of random values with columns A-D.
datas = pd.date_range(start='20190101', periods=6, freq='D')
df = pd.DataFrame(
    data=np.random.randn(len(datas), 4),
    index=datas,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2019-01-01,-0.655656,0.999554,-0.805681,0.628005
2019-01-02,-0.57345,0.881671,-0.011596,1.180308
2019-01-03,-0.571897,0.116632,0.693235,1.140049
2019-01-04,1.249813,-0.279108,-1.004478,1.320853
2019-01-05,0.444857,0.828192,-0.353371,-0.448061
2019-01-06,1.387221,0.041618,0.009393,1.877769


In [32]:
df2

Unnamed: 0,A,B,C,D,E,F
0,2.7,2019-01-01,1.0,3,test,Python
1,2.7,2019-01-01,1.0,3,train,Python
2,2.7,2019-01-01,1.0,3,test,Python
3,2.7,2019-01-01,1.0,3,train,Python


In [34]:
df.head(2) # Show the first 2 rows

Unnamed: 0,A,B,C,D
2019-01-01,-0.655656,0.999554,-0.805681,0.628005
2019-01-02,-0.57345,0.881671,-0.011596,1.180308


In [35]:
df.tail(2) # show the last 2 rows

Unnamed: 0,A,B,C,D
2019-01-05,0.444857,0.828192,-0.353371,-0.448061
2019-01-06,1.387221,0.041618,0.009393,1.877769


In [37]:
df.index # Show the index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [38]:
#Show the columns
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [39]:
c = df.columns

In [40]:
c

Index(['A', 'B', 'C', 'D'], dtype='object')

In [41]:
# Get just the values as a NumPy array (the index and column labels are dropped)
df.to_numpy()

array([[-0.65565609,  0.99955354, -0.80568074,  0.62800533],
       [-0.57344957,  0.88167136, -0.01159568,  1.18030787],
       [-0.5718973 ,  0.11663195,  0.6932352 ,  1.14004914],
       [ 1.2498133 , -0.2791083 , -1.00447806,  1.32085344],
       [ 0.44485736,  0.82819237, -0.35337059, -0.44806092],
       [ 1.38722122,  0.04161809,  0.00939291,  1.87776941]])

In [42]:
df.T # transpose: the dates become columns and A-D become the rows

Unnamed: 0,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06
A,-0.655656,-0.57345,-0.571897,1.249813,0.444857,1.387221
B,0.999554,0.881671,0.116632,-0.279108,0.828192,0.041618
C,-0.805681,-0.011596,0.693235,-1.004478,-0.353371,0.009393
D,0.628005,1.180308,1.140049,1.320853,-0.448061,1.877769


In [43]:
df2.T

Unnamed: 0,0,1,2,3
A,2.7,2.7,2.7,2.7
B,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,Python,Python,Python,Python


In [44]:
df2.to_numpy()

array([[2.7, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'test', 'Python'],
       [2.7, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'train', 'Python'],
       [2.7, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'test', 'Python'],
       [2.7, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'train', 'Python']],
      dtype=object)

In [45]:
df2.to_json()

'{"A":{"0":2.7,"1":2.7,"2":2.7,"3":2.7},"B":{"0":1546,"1":1546,"2":1546,"3":1546},"C":{"0":1.0,"1":1.0,"2":1.0,"3":1.0},"D":{"0":3,"1":3,"2":3,"3":3},"E":{"0":"test","1":"train","2":"test","3":"train"},"F":{"0":"Python","1":"Python","2":"Python","3":"Python"}}'

In [47]:
df2.to_dict()

{'A': {0: 2.7, 1: 2.7, 2: 2.7, 3: 2.7},
 'B': {0: Timestamp('2019-01-01 00:00:00'),
  1: Timestamp('2019-01-01 00:00:00'),
  2: Timestamp('2019-01-01 00:00:00'),
  3: Timestamp('2019-01-01 00:00:00')},
 'C': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0},
 'D': {0: 3, 1: 3, 2: 3, 3: 3},
 'E': {0: 'test', 1: 'train', 2: 'test', 3: 'train'},
 'F': {0: 'Python', 1: 'Python', 2: 'Python', 3: 'Python'}}

In [49]:
df2.to_csv()

',A,B,C,D,E,F\n0,2.7,2019-01-01,1.0,3,test,Python\n1,2.7,2019-01-01,1.0,3,train,Python\n2,2.7,2019-01-01,1.0,3,test,Python\n3,2.7,2019-01-01,1.0,3,train,Python\n'

In [50]:
df2.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>A</th>\n      <th>B</th>\n      <th>C</th>\n      <th>D</th>\n      <th>E</th>\n      <th>F</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2.7</td>\n      <td>2019-01-01</td>\n      <td>1.0</td>\n      <td>3</td>\n      <td>test</td>\n      <td>Python</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2.7</td>\n      <td>2019-01-01</td>\n      <td>1.0</td>\n      <td>3</td>\n      <td>train</td>\n      <td>Python</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2.7</td>\n      <td>2019-01-01</td>\n      <td>1.0</td>\n      <td>3</td>\n      <td>test</td>\n      <td>Python</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2.7</td>\n      <td>2019-01-01</td>\n      <td>1.0</td>\n      <td>3</td>\n      <td>train</td>\n      <td>Python</td>\n    </tr>\n  </tbody>\n</table>'

# Combinando DataFrames

In [52]:
def _make_block(first_label):
    """Return a 4x4 frame with columns A-D and rows first_label..first_label+3.

    Every cell is the column letter followed by its row label (for example
    'B5'), and the integer row labels are used as the index.
    """
    row_labels = list(range(first_label, first_label + 4))
    return pd.DataFrame(
        {letter: [f"{letter}{row}" for row in row_labels] for letter in "ABCD"},
        index=row_labels,
    )


# Three frames with identical columns and consecutive, non-overlapping indexes
df1 = _make_block(0)   # rows 0-3
df2 = _make_block(4)   # rows 4-7
df3 = _make_block(8)   # rows 8-11


In [53]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [54]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [55]:
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [57]:
frames = [df1, df2, df3] # NOT a concatenation: just a Python list holding the three frames
frames 

[    A   B   C   D
 0  A0  B0  C0  D0
 1  A1  B1  C1  D1
 2  A2  B2  C2  D2
 3  A3  B3  C3  D3,
     A   B   C   D
 4  A4  B4  C4  D4
 5  A5  B5  C5  D5
 6  A6  B6  C6  D6
 7  A7  B7  C7  D7,
       A    B    C    D
 8    A8   B8   C8   D8
 9    A9   B9   C9   D9
 10  A10  B10  C10  D10
 11  A11  B11  C11  D11]

In [59]:
type(frames)

list

In [58]:
frames_combinados = pd.concat(frames) # Correct way: pd.concat joins the frames into a single DataFrame
frames_combinados

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [60]:
type(frames_combinados)

pandas.core.frame.DataFrame

In [63]:
frames_combinados2 = pd.concat([df1, df2, df3])
frames_combinados2

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [65]:
# Passing keys labels each input frame with an outer index level (x, y, z),
# so the combined frame can later be split back into subgroups
grupo = pd.concat([df1, df2, df3], keys=['x', 'y', 'z'])
grupo

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7
z,8,A8,B8,C8,D8
z,9,A9,B9,C9,D9


In [66]:
grupo['A']

x  0      A0
   1      A1
   2      A2
   3      A3
y  4      A4
   5      A5
   6      A6
   7      A7
z  8      A8
   9      A9
   10    A10
   11    A11
Name: A, dtype: object

In [69]:
grupo.loc['x'] # returns only the rows of subgroup x

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [70]:
grupo.loc['y']

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [71]:
grupo.loc['z']

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [73]:
# Another way to concatenate used to be chaining DataFrame.append:
#     df1.append(df2).append(df3)
# That method was deprecated in pandas 1.4.0 and removed in pandas 2.0, where
# calling it raises
#     AttributeError: 'DataFrame' object has no attribute 'append'
# pd.concat is the supported replacement and stacks the frames the same way.
g2 = pd.concat([df1, df2, df3])