## Importação das bibliotecas

In [106]:
import numpy as np
import pandas as pd

## Criação dos objetos

In [107]:
# Series with a default integer index; np.nan marks the missing entry, which
# makes the whole Series float64.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [108]:
# Six consecutive days starting on 2013-01-01 (daily frequency).
dates = pd.date_range(
    start="20130101",
    periods=6,
)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [109]:
# 6x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# NOTE(review): no seed is set in the visible cells, so the values presumably
# differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-2.640543,0.678012,-0.539839,-1.774136
2013-01-02,-0.124694,0.363536,0.544696,-0.639071
2013-01-03,-0.672799,-0.74775,-1.5764,-0.680069
2013-01-04,0.29081,0.826253,-1.378022,1.492247
2013-01-05,-0.737309,2.054823,-1.395033,0.596742
2013-01-06,-0.906625,0.166054,1.691705,1.619999


In [110]:
# Frame built from a mapping of column name -> values, one dtype per column.
# The scalars (A, B, F) are broadcast to the four rows implied by the
# array-like columns (C, D, E).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [111]:
# Each column of df2 keeps its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Visualizando os dados

In [112]:
# Top of the frame: with no argument head() shows the first five rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-2.640543,0.678012,-0.539839,-1.774136
2013-01-02,-0.124694,0.363536,0.544696,-0.639071
2013-01-03,-0.672799,-0.74775,-1.5764,-0.680069
2013-01-04,0.29081,0.826253,-1.378022,1.492247
2013-01-05,-0.737309,2.054823,-1.395033,0.596742


In [113]:
# Bottom of the frame: the last three rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.29081,0.826253,-1.378022,1.492247
2013-01-05,-0.737309,2.054823,-1.395033,0.596742
2013-01-06,-0.906625,0.166054,1.691705,1.619999


In [114]:
# Row labels: the DatetimeIndex that was passed as `index` when df was built.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [115]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [116]:
# Underlying values as a NumPy array; index and column labels are not included.
df.to_numpy()

array([[-2.64054344,  0.67801207, -0.53983905, -1.77413596],
       [-0.12469383,  0.36353624,  0.54469637, -0.63907102],
       [-0.67279918, -0.74774956, -1.57639994, -0.68006889],
       [ 0.29081   ,  0.8262532 , -1.37802248,  1.49224722],
       [-0.73730871,  2.05482255, -1.3950332 ,  0.59674213],
       [-0.9066248 ,  0.1660541 ,  1.69170491,  1.61999885]])

In [117]:
# df2 mixes column dtypes, so the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [118]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.798527,0.556821,-0.442149,0.102619
std,1.006476,0.919049,1.31203,1.353705
min,-2.640543,-0.74775,-1.5764,-1.774136
25%,-0.864296,0.215425,-1.390781,-0.669819
50%,-0.705054,0.520774,-0.958931,-0.021164
75%,-0.26172,0.789193,0.273563,1.268371
max,0.29081,2.054823,1.691705,1.619999


In [119]:
# Transpose: the column labels become the row index and the dates become columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-2.640543,-0.124694,-0.672799,0.29081,-0.737309,-0.906625
B,0.678012,0.363536,-0.74775,0.826253,2.054823,0.166054
C,-0.539839,0.544696,-1.5764,-1.378022,-1.395033,1.691705
D,-1.774136,-0.639071,-0.680069,1.492247,0.596742,1.619999


In [120]:
# Sort along the column axis (axis=1) in descending label order: D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.774136,-0.539839,0.678012,-2.640543
2013-01-02,-0.639071,0.544696,0.363536,-0.124694
2013-01-03,-0.680069,-1.5764,-0.74775,-0.672799
2013-01-04,1.492247,-1.378022,0.826253,0.29081
2013-01-05,0.596742,-1.395033,2.054823,-0.737309
2013-01-06,1.619999,1.691705,0.166054,-0.906625


In [121]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-03,-0.672799,-0.74775,-1.5764,-0.680069
2013-01-06,-0.906625,0.166054,1.691705,1.619999
2013-01-02,-0.124694,0.363536,0.544696,-0.639071
2013-01-01,-2.640543,0.678012,-0.539839,-1.774136
2013-01-04,0.29081,0.826253,-1.378022,1.492247
2013-01-05,-0.737309,2.054823,-1.395033,0.596742


## Seleção

In [122]:
# Selecting a single column yields a Series named after that column.
df["A"]

2013-01-01   -2.640543
2013-01-02   -0.124694
2013-01-03   -0.672799
2013-01-04    0.290810
2013-01-05   -0.737309
2013-01-06   -0.906625
Freq: D, Name: A, dtype: float64

In [123]:
# An integer slice inside [] slices rows: here the first three.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-2.640543,0.678012,-0.539839,-1.774136
2013-01-02,-0.124694,0.363536,0.544696,-0.639071
2013-01-03,-0.672799,-0.74775,-1.5764,-0.680069


In [124]:
# Slicing rows by date label; both endpoints are included in the result.
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.124694,0.363536,0.544696,-0.639071
2013-01-03,-0.672799,-0.74775,-1.5764,-0.680069
2013-01-04,0.29081,0.826253,-1.378022,1.492247


In [125]:
# Select the row labelled with the first date; the result is a Series indexed
# by column name.
df.loc[dates[0]]

A   -2.640543
B    0.678012
C   -0.539839
D   -1.774136
Name: 2013-01-01 00:00:00, dtype: float64

In [126]:
# All rows (:) for the columns labelled A and B.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-2.640543,0.678012
2013-01-02,-0.124694,0.363536
2013-01-03,-0.672799,-0.74775
2013-01-04,0.29081,0.826253
2013-01-05,-0.737309,2.054823
2013-01-06,-0.906625,0.166054


In [127]:
# Label slice on the rows (both endpoints included) combined with a column list.
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,-0.124694,0.363536
2013-01-03,-0.672799,-0.74775
2013-01-04,0.29081,0.826253


In [128]:
# A single row label with a column list reduces the result to a Series.
df.loc["20130102", ["A", "B"]]

A   -0.124694
B    0.363536
Name: 2013-01-02 00:00:00, dtype: float64

In [129]:
# One row label plus one column label returns a scalar.
df.loc[dates[0], "A"]

-2.6405434432429256

In [130]:
# Same scalar lookup through .at, which addresses exactly one row/column label pair.
df.at[dates[0], "A"]

-2.6405434432429256

### Seleção por posição

In [131]:
# Select by integer position: the fourth row (position 3), returned as a Series.
df.iloc[3]

A    0.290810
B    0.826253
C   -1.378022
D    1.492247
Name: 2013-01-04 00:00:00, dtype: float64

In [132]:
# Integer slices exclude the end position: rows 3-4 and columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.29081,0.826253
2013-01-05,-0.737309,2.054823


In [133]:
# Lists of integer positions: rows 1, 2 and 4, columns 0 and 2.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.124694,0.544696
2013-01-03,-0.672799,-1.5764
2013-01-05,-0.737309,-1.395033


In [134]:
# Slice rows only (positions 1-2), keeping every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.124694,0.363536,0.544696,-0.639071
2013-01-03,-0.672799,-0.74775,-1.5764,-0.680069


In [135]:
# Slice columns only (positions 1-2), keeping every row.
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.678012,-0.539839
2013-01-02,0.363536,0.544696
2013-01-03,-0.74775,-1.5764
2013-01-04,0.826253,-1.378022
2013-01-05,2.054823,-1.395033
2013-01-06,0.166054,1.691705


In [136]:
# A single value addressed by row and column position.
df.iloc[1, 1]

0.3635362371684921

In [137]:
df.iat[1, 1] # Fast access to a scalar

0.3635362371684921

### Indexação booleana

In [138]:
# Boolean row filter: keep only the rows where column A is positive.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-04,0.29081,0.826253,-1.378022,1.492247


In [139]:
# Element-wise mask: the shape is kept and entries that are not positive show as NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.678012,,
2013-01-02,,0.363536,0.544696,
2013-01-03,,,,
2013-01-04,0.29081,0.826253,,1.492247
2013-01-05,,2.054823,,0.596742
2013-01-06,,0.166054,1.691705,1.619999


In [140]:
# Work on a copy so that the column added next does not end up in df.
df3 = df.copy()

In [141]:
# Add a string column E, one value per row of df3.
df3["E"] = ["one", "one", "two", "three", "four", "three"]

In [142]:
# Display df3 with the new column E.
df3

Unnamed: 0,A,B,C,D,E
2013-01-01,-2.640543,0.678012,-0.539839,-1.774136,one
2013-01-02,-0.124694,0.363536,0.544696,-0.639071,one
2013-01-03,-0.672799,-0.74775,-1.5764,-0.680069,two
2013-01-04,0.29081,0.826253,-1.378022,1.492247,three
2013-01-05,-0.737309,2.054823,-1.395033,0.596742,four
2013-01-06,-0.906625,0.166054,1.691705,1.619999,three


In [143]:
# isin() builds a boolean mask: keep the rows whose E is "two" or "four".
df3[df3["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.672799,-0.74775,-1.5764,-0.680069,two
2013-01-05,-0.737309,2.054823,-1.395033,0.596742,four


### Setting

In [144]:
# Integer Series 1..6 labelled by six consecutive days starting 2013-01-02.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start="20130102", periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [145]:
# Overwrite one cell in place: first date, column A.
df.at[dates[0], "A"] = 0 # Setting values by label

In [146]:
# Overwrite one cell in place: first row, second column (B).
df.iat[0, 1] = 0 # Setting values by position

In [147]:
# Replace every row of column D with 5, using an array as long as the frame.
df.loc[:, "D"] = np.array([5] * len(df)) # Setting by assigning with a NumPy array

In [148]:
# df after the in-place assignments: A and B of the first row are 0, D is 5 everywhere.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.539839,5
2013-01-02,-0.124694,0.363536,0.544696,5
2013-01-03,-0.672799,-0.74775,-1.5764,5
2013-01-04,0.29081,0.826253,-1.378022,5
2013-01-05,-0.737309,2.054823,-1.395033,5
2013-01-06,-0.906625,0.166054,1.691705,5


In [149]:
# Copy df so the sign flip in the next cell leaves df untouched.
df4 = df.copy()

In [150]:
# Conditional assignment: every positive entry is replaced by its negation,
# so no value in df4 remains above zero.
df4[df4 > 0] = -df4
df4

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.539839,-5
2013-01-02,-0.124694,-0.363536,-0.544696,-5
2013-01-03,-0.672799,-0.74775,-1.5764,-5
2013-01-04,-0.29081,-0.826253,-1.378022,-5
2013-01-05,-0.737309,-2.054823,-1.395033,-5
2013-01-06,-0.906625,-0.166054,-1.691705,-5


### Missing data

In [151]:
# Build df1 from the first four dates of df plus an extra column E; E does not
# exist in df, so it starts out missing.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])

In [152]:
# Fill E for the first two dates only; the remaining rows stay NaN.
df1.loc[dates[0] : dates[1], "E"] = 1

In [153]:
# Display df1: E is set for two rows and missing for the other two.
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.539839,5,1.0
2013-01-02,-0.124694,0.363536,0.544696,5,1.0
2013-01-03,-0.672799,-0.74775,-1.5764,5,
2013-01-04,0.29081,0.826253,-1.378022,5,


In [154]:
df1.dropna(how="any") # Drop any row that has missing data

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.539839,5,1.0
2013-01-02,-0.124694,0.363536,0.544696,5,1.0


In [155]:
df1.fillna(value=5) # Fill missing data with 5

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.539839,5,1.0
2013-01-02,-0.124694,0.363536,0.544696,5,1.0
2013-01-03,-0.672799,-0.74775,-1.5764,5,5.0
2013-01-04,0.29081,0.826253,-1.378022,5,5.0


In [156]:
pd.isna(df1) # Boolean mask that is True where a value is NaN

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


## Operações

In [157]:
# Mean of each column.
df.mean()

A   -0.358436
B    0.443819
C   -0.442149
D    5.000000
dtype: float64

In [159]:
# Mean of each row (axis=1).
df.mean(axis=1)

2013-01-01    1.115040
2013-01-02    1.445885
2013-01-03    0.500763
2013-01-04    1.184760
2013-01-05    1.230620
2013-01-06    1.487784
Freq: D, dtype: float64

In [160]:
# Series on the same dates as df, shifted down by two rows: the first two
# positions become NaN and the last two original values fall off the end.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [161]:
# Subtract s from every column, matching on the row index; rows where s is
# NaN come out as NaN.
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,-1.672799,-1.74775,-2.5764,4.0
2013-01-04,-2.70919,-2.173747,-4.378022,2.0
2013-01-05,-5.737309,-2.945177,-6.395033,0.0
2013-01-06,,,,


In [167]:
# apply() runs the function on each column: a running total down the rows.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.539839,5
2013-01-02,-0.124694,0.363536,0.004857,10
2013-01-03,-0.797493,-0.384213,-1.571543,15
2013-01-04,-0.506683,0.44204,-2.949565,20
2013-01-05,-1.243992,2.496862,-4.344598,25
2013-01-06,-2.150617,2.662917,-2.652893,30


## String Methods

In [168]:
# Mixed-case strings with one missing value, used to demonstrate the .str accessor.
s = pd.Series(
    data=["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"],
)

In [169]:
# Vectorised lower-casing through the .str accessor; the NaN entry stays NaN.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Merge

### Concat

In [170]:
# 10x4 frame of standard-normal draws with default integer row/column labels.
df = pd.DataFrame(
    data=np.random.randn(10, 4),
)
df

Unnamed: 0,0,1,2,3
0,0.455279,-0.353539,-1.939826,-0.673179
1,1.145391,-1.299141,1.096182,-1.440932
2,1.304393,-0.250829,0.842551,1.565212
3,0.401913,0.545182,-0.505721,1.507442
4,0.873443,0.378443,-1.695082,0.151044
5,0.187202,-0.964037,0.427724,1.083089
6,0.062777,1.302959,-0.821462,-1.496101
7,-0.840384,-0.344479,-0.410182,0.37311
8,-1.656694,-0.703026,0.081799,-0.448612
9,0.204248,-1.778737,-0.467957,0.58955


In [171]:
# Break the frame into three row chunks: rows 0-2, 3-6 and 7-9.
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0  0.455279 -0.353539 -1.939826 -0.673179
 1  1.145391 -1.299141  1.096182 -1.440932
 2  1.304393 -0.250829  0.842551  1.565212,
           0         1         2         3
 3  0.401913  0.545182 -0.505721  1.507442
 4  0.873443  0.378443 -1.695082  0.151044
 5  0.187202 -0.964037  0.427724  1.083089
 6  0.062777  1.302959 -0.821462 -1.496101,
           0         1         2         3
 7 -0.840384 -0.344479 -0.410182  0.373110
 8 -1.656694 -0.703026  0.081799 -0.448612
 9  0.204248 -1.778737 -0.467957  0.589550]

In [174]:
# pieces is a plain Python list of DataFrames.
type(pieces)

list

In [172]:
# Stack the chunks back together row-wise, which reproduces the original frame.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.455279,-0.353539,-1.939826,-0.673179
1,1.145391,-1.299141,1.096182,-1.440932
2,1.304393,-0.250829,0.842551,1.565212
3,0.401913,0.545182,-0.505721,1.507442
4,0.873443,0.378443,-1.695082,0.151044
5,0.187202,-0.964037,0.427724,1.083089
6,0.062777,1.302959,-0.821462,-1.496101
7,-0.840384,-0.344479,-0.410182,0.37311
8,-1.656694,-0.703026,0.081799,-0.448612
9,0.204248,-1.778737,-0.467957,0.58955


### Join

In [175]:
# Left-hand table for the merge example; both rows share the key "foo".
left = pd.DataFrame(dict(key=["foo"] * 2, lval=[1, 2]))

In [176]:
# Right-hand table for the merge example; both rows share the key "foo".
right = pd.DataFrame(dict(key=["foo"] * 2, rval=[4, 5]))

In [177]:
# Display the left-hand table.
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [178]:
# Display the right-hand table.
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [179]:
# SQL-style join on "key". The key is duplicated on both sides, so every left
# row pairs with every right row (2 x 2 = 4 rows).
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


### Grouping

In [180]:
# Two string key columns (A, B) plus two columns of random floats (C, D),
# used for the groupby examples.
df = pd.DataFrame(
    dict(
        A=["foo", "bar"] * 3 + ["foo", "foo"],
        B=["one", "one", "two", "three", "two", "two", "one", "three"],
        C=np.random.randn(8),
        D=np.random.randn(8),
    )
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0.668722,-1.951482
1,bar,one,0.773172,-0.884064
2,foo,two,-0.245448,1.045923
3,bar,three,1.414062,1.073127
4,foo,two,1.402399,1.277285
5,bar,two,-0.110706,-0.005571
6,foo,one,0.585307,-0.768538
7,foo,three,-0.347617,0.62423


In [182]:
# Sum the numeric columns within each group of A. C and D are selected
# explicitly because B holds strings: whether sum() drops or concatenates such
# a column depends on the pandas version.
df.groupby("A")[["C", "D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,2.076528,0.183492
foo,2.063363,0.227418


In [183]:
# Grouping by two columns produces a hierarchical (A, B) index on the result.
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.773172,-0.884064
bar,three,1.414062,1.073127
bar,two,-0.110706,-0.005571
foo,one,1.254029,-2.72002
foo,three,-0.347617,0.62423
foo,two,1.156951,2.323208


## Reshaping

In [184]:
# Pair the two label lists element-wise into (first, second) tuples.
tuples = list(
    zip(
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two"] * 4,
    )
)
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [185]:
# Turn the tuples into a two-level MultiIndex with named levels.
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [186]:
# Random 8x2 frame whose rows are labelled by the two-level index.
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.02274,0.222971
bar,two,0.184842,-1.326296
baz,one,-0.24386,1.615342
baz,two,0.766726,1.364219
foo,one,1.354768,0.194439
foo,two,-2.299775,-1.258639
qux,one,0.391554,1.151382
qux,two,0.028243,-0.350878


In [187]:
# First four rows only: the "bar" and "baz" groups.
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.02274,0.222971
bar,two,0.184842,-1.326296
baz,one,-0.24386,1.615342
baz,two,0.766726,1.364219


In [188]:
# stack() moves the column labels (A, B) into a new innermost index level,
# turning the frame into a Series.
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.022740
               B    0.222971
       two     A    0.184842
               B   -1.326296
baz    one     A   -0.243860
               B    1.615342
       two     A    0.766726
               B    1.364219
dtype: float64

In [189]:
# unstack() with no argument pivots the innermost level (A/B) back into
# columns, undoing the stack above.
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.02274,0.222971
bar,two,0.184842,-1.326296
baz,one,-0.24386,1.615342
baz,two,0.766726,1.364219


In [191]:
# Pivot index level 1 ("second") into the columns instead.
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.02274,0.184842
bar,B,0.222971,-1.326296
baz,A,-0.24386,0.766726
baz,B,1.615342,1.364219


In [193]:
# Pivot index level 0 ("first") into the columns.
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.02274,-0.24386
one,B,0.222971,1.615342
two,A,0.184842,0.766726
two,B,-1.326296,1.364219


## Pivot tables

In [194]:
# Twelve rows: three string key columns built from repeated patterns (A, B, C)
# and two columns of random floats (D, E), used for the pivot table example.
df = pd.DataFrame(
    dict(
        A=["one", "one", "two", "three"] * 3,
        B=["A", "B", "C"] * 4,
        C=(["foo"] * 3 + ["bar"] * 3) * 2,
        D=np.random.randn(12),
        E=np.random.randn(12),
    )
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.885677,1.453199
1,one,B,foo,1.615757,0.346432
2,two,C,foo,0.482248,-0.075302
3,three,A,bar,-2.043788,-0.324668
4,one,B,bar,0.577455,2.463588
5,one,C,bar,-0.115125,-0.984013
6,two,A,foo,-1.450102,0.521137
7,three,B,foo,-0.277557,-0.428759
8,one,C,foo,0.559254,-0.619454
9,one,A,bar,-1.667516,-0.166843


In [195]:
# Spread D into a table with (A, B) as the row index and the values of C as
# columns; combinations that never occur are NaN.
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-1.667516,0.885677
one,B,0.577455,1.615757
one,C,-0.115125,0.559254
three,A,-2.043788,
three,B,,-0.277557
three,C,1.912631,
two,A,,-1.450102
two,B,1.015756,
two,C,,0.482248


## Time Series

In [200]:
# 100 timestamps, one per second, starting at 2012-01-01 00:00:00. The
# lowercase "s" alias is used because recent pandas releases deprecate the
# uppercase "S" spelling.
rng = pd.date_range("1/1/2012", periods=100, freq="s")

In [197]:
# One random integer in [0, 500) per timestamp in rng.
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

In [198]:
# Downsample the per-second data into 5-minute bins and sum each bin; the 100
# seconds of data all fall into a single bin.
ts.resample("5Min").sum()

2012-01-01    22786
Freq: 5T, dtype: int64

In [201]:
# Five consecutive days starting at midnight on 2012-03-06.
rng = pd.date_range(
    start="3/6/2012 00:00",
    periods=5,
    freq="D",
)

In [202]:
# One standard-normal draw per day; rng is passed positionally as the index.
ts = pd.Series(np.random.randn(len(rng)), rng)
ts

2012-03-06   -0.518154
2012-03-07    0.007982
2012-03-08    1.815282
2012-03-09    0.127397
2012-03-10   -0.450394
Freq: D, dtype: float64

In [203]:
# Attach the UTC time zone to the naive timestamps; the values are unchanged.
ts_utc = ts.tz_localize("UTC") # Time zone representation
ts_utc

2012-03-06 00:00:00+00:00   -0.518154
2012-03-07 00:00:00+00:00    0.007982
2012-03-08 00:00:00+00:00    1.815282
2012-03-09 00:00:00+00:00    0.127397
2012-03-10 00:00:00+00:00   -0.450394
Freq: D, dtype: float64

In [204]:
# The same instants expressed in US/Eastern: the displayed wall-clock times
# shift by the UTC offset (-05:00 for these dates).
ts_utc.tz_convert("US/Eastern") # Converting to another time zone

2012-03-05 19:00:00-05:00   -0.518154
2012-03-06 19:00:00-05:00    0.007982
2012-03-07 19:00:00-05:00    1.815282
2012-03-08 19:00:00-05:00    0.127397
2012-03-09 19:00:00-05:00   -0.450394
Freq: D, dtype: float64

In [205]:
# Five month-end timestamps starting from January 2012.
# NOTE(review): recent pandas releases appear to deprecate the "M" alias for
# month-end timestamps in favour of "ME"; confirm against the pandas version
# this notebook is pinned to before changing it.
rng = pd.date_range("1/1/2012", periods=5, freq="M")

In [206]:
# One standard-normal draw per month-end timestamp.
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2012-01-31    1.123558
2012-02-29    0.214311
2012-03-31   -0.060715
2012-04-30   -0.578930
2012-05-31   -1.231251
Freq: M, dtype: float64

In [207]:
# Convert the month-end timestamps into monthly periods (2012-01, 2012-02, ...).
ps = ts.to_period()
ps

2012-01    1.123558
2012-02    0.214311
2012-03   -0.060715
2012-04   -0.578930
2012-05   -1.231251
Freq: M, dtype: float64

In [208]:
# Back to timestamps: each period is mapped to its first day.
ps.to_timestamp()

2012-01-01    1.123558
2012-02-01    0.214311
2012-03-01   -0.060715
2012-04-01   -0.578930
2012-05-01   -1.231251
Freq: MS, dtype: float64

In [209]:
# Quarterly periods from 1990Q1 through 2000Q4, with the year ending in November.
prng = pd.period_range(
    start="1990Q1",
    end="2000Q4",
    freq="Q-NOV",
)

In [210]:
# One standard-normal draw per quarter; prng is passed positionally as the index.
ts = pd.Series(np.random.randn(len(prng)), prng)

In [211]:
# Re-anchor each quarter to 09:00 on the first day of the month after the
# quarter ends: take the quarter's last month ("e"), step one month forward,
# switch to hourly periods at the start ("s") of that month, then add nine
# hours. The lowercase "h" alias replaces "H", which recent pandas releases
# deprecate.
ts.index = (prng.asfreq("M", "e") + 1).asfreq("h", "s") + 9

In [212]:
# First five entries, now labelled with the hourly periods assigned above.
ts.head()

1990-03-01 09:00   -1.155460
1990-06-01 09:00    0.285660
1990-09-01 09:00   -1.359415
1990-12-01 09:00    0.667700
1991-03-01 09:00   -1.058927
Freq: H, dtype: float64

## Categoricals

In [213]:
# Six records, each with an id and a letter grade stored as a plain string.
df = pd.DataFrame(
    dict(
        id=list(range(1, 7)),
        raw_grade=["a", "b", "b", "a", "a", "e"],
    )
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [214]:
# The categories are taken from the distinct raw values: 'a', 'b', 'e'.
df["grade"] = df["raw_grade"].astype("category") # Convert the raw grades to a categorical data type

In [215]:
# Display the categorical column together with its category list.
df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [216]:
# Rename the categories to more meaningful names. The mapping is positional:
# 'a' -> "very good", 'b' -> "good", 'e' -> "very bad". rename_categories
# returns a new Series, so the result is assigned back to the column;
# assigning to `.cat.categories` directly is no longer supported in pandas 2.x.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])

In [217]:
# Define the full grading scale from worst to best; "bad" and "medium" do not
# occur in the data and are added as empty categories.
df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"]
) # Reorder the categories and simultaneously add the missing categories

In [218]:
# The values are unchanged; only the category list is now the five-step scale.
df["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

In [219]:
# Rows come out as "very bad" < "good" < "very good", following the category list.
df.sort_values(by="grade") # Sorting is per order in the categories, not lexical order

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [220]:
# Grouping by a categorical column. observed=False is passed explicitly so the
# empty categories ("bad", "medium") are still listed with a count of 0,
# independent of the library's default for this option.
df.groupby("grade", observed=False).size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64