# Chapter 04 - The pandas Library

In [1]:
import pandas as pd
import numpy as np

### The Series

In [2]:
# A pandas Series represents one-dimensional data: it is similar to an array,
# but every value is paired with an index label. When no index is given,
# pandas assigns the default integer labels 0..n-1.
s = pd.Series(data=[12, -4, 7, 9])
s


0    12
1    -4
2     7
3     9
dtype: int64

In [4]:
# To use your own labels instead of the default integer index, pass them
# through the index option:
s = pd.Series(data=[12, -4, 7, 9], index=list("abcd"))
s

a    12
b    -4
c     7
d     9
dtype: int64

In [None]:
# A Series is made of two arrays that can be inspected separately:
# the labels (index) and the data (values).
print(s.index)
print(f"Values: {s.values}")

Index(['a', 'b', 'c', 'd'], dtype='object')
Values: [12 -4  7  9]


#### Defining a Series from NumPy Arrays and Other Series

In [None]:
# A Series can be built directly from an existing NumPy array; since no index
# is given, the default integer labels are assigned.
arr = np.array([12, -4, 7, 9])
s = pd.Series(data=arr)
s

0    12
1    -4
2     7
3     9
dtype: int64

#### Filtering Values

In [14]:
# Boolean-mask filtering: keep only the elements whose value is greater than 8.
s [s > 8]

0    12
3     9
dtype: int64

#### Operations and Mathematical Functions

In [7]:
# Arithmetic operators are applied element by element; the division yields floats.
s / 2

0    6.0
1   -2.0
2    3.5
3    4.5
dtype: float64

In [16]:
# NumPy mathematical functions are called through np, with the Series passed
# as the argument; the result is again a Series with the same index.
# Note: s contains -4, whose logarithm is undefined, so that element comes out
# as NaN (NumPy may also emit a RuntimeWarning for it).

np.log(s)

0    2.484907
1         NaN
2    1.945910
3    2.197225
dtype: float64

#### Evaluating Values

In [17]:
# Index labels do not have to be unique: 'white' and 'green' each appear twice.
serd = pd.Series(
    data=[1, 0, 2, 1, 2, 3],
    index=["white", "white", "blue", "green", "green", "yellow"],
)
serd

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64

In [18]:
# unique() returns the distinct values contained in the Series (duplicates
# removed) as a NumPy array.

serd.unique()

array([1, 0, 2, 3])

In [19]:
# value_counts() returns the distinct values together with the number of
# times each one occurs in the Series.
serd.value_counts()

1    2
2    2
0    1
3    1
Name: count, dtype: int64

In [23]:
# isin() tests each element for membership in the given list of values and
# returns a boolean Series (True where the element is one of the listed values).

serd.isin([0,3])

white     False
white      True
blue      False
green     False
green     False
yellow     True
dtype: bool

In [24]:
# Use the boolean mask produced by isin() to keep only the matching elements.
serd[serd.isin([0,3])]

white     0
yellow    3
dtype: int64

#### NaN Values

In [26]:
# np.nan marks a missing value; the resulting Series is stored as float64.
s2 = pd.Series(data=[5, -3, np.nan, 14])
s2

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64

In [27]:
# isnull() and notnull() return boolean masks that identify the positions
# with a missing value (NaN) and with a present value, respectively.
s2.isnull()


0    False
1    False
2     True
3    False
dtype: bool

In [28]:
# Complementary mask: True where a value is present, False where it is NaN.
s2.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [29]:
# Keep only the elements that have a value (the NaN entry is filtered out).
s2[s2.notnull()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [31]:
# Keep only the elements whose value is missing.
s2[s2.isnull()]

2   NaN
dtype: float64

In [32]:
# A Series can also be built from a dict: the keys become the index labels
# and the values become the data.
mydict = dict(red=2000, blue=1000, yellow=500, orange=1000)
mySeries = pd.Series(data=mydict)
mySeries

red       2000
blue      1000
yellow     500
orange    1000
dtype: int64

### The DataFrame

In [34]:
# Column-oriented data: each key is a column name and each list holds the
# values of that column.
data = {
    "color": ["blue", "green", "yellow", "red", "white"],
    "object": ["ball", "pen", "pencil", "paper", "mug"],
    "price": [1.2, 1.0, 0.6, 0.9, 1.7],
}
data

{'color': ['blue', 'green', 'yellow', 'red', 'white'],
 'object': ['ball', 'pen', 'pencil', 'paper', 'mug'],
 'price': [1.2, 1.0, 0.6, 0.9, 1.7]}

In [35]:
# Build a DataFrame from the dict: the keys become the column names and,
# since no index is given, the rows get the default integer labels.
df = pd.DataFrame(data)
df

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [43]:
# If the labels are not explicitly specified, pandas automatically assigns
# a numeric sequence starting from 0. To assign your own labels to the rows
# of a DataFrame, pass them through the index option.
df2 = pd.DataFrame(data, index=['one','two','three','four','five'])
df2

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


In [80]:
# np.arange(16).reshape((4, 4)) is a quick way to generate a 4x4 matrix of the
# integers 0 to 15; here it fills a DataFrame with labeled rows and columns.
# Note: this rebinds the name df, replacing the DataFrame built earlier.
df = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["red", "blue", "yellow", "white"],
    columns=["ball", "pen", "pencil", "paper"],
)
df

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [84]:
# Column labels of the DataFrame.
df.columns

Index(['ball', 'pen', 'pencil', 'paper'], dtype='object')

In [85]:
# Row labels (index) of the DataFrame.
df.index

Index(['red', 'blue', 'yellow', 'white'], dtype='object')

In [87]:
# The data of the DataFrame as a two-dimensional NumPy array, without the labels.
df.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [91]:
# A single column can be selected by name with brackets...
df['paper']
# ...or as an attribute. Only this last expression is displayed by the cell.
df.paper

red        3
blue       7
yellow    11
white     15
Name: paper, dtype: int64

In [94]:
# Rows are selected by index label through the loc attribute; the row comes
# back as a Series whose index holds the column names.

df.loc['red']

ball      0
pen       1
pencil    2
paper     3
Name: red, dtype: int64

In [96]:
# Replace the row labels of df in place with the integers 1..4 (the data are
# unchanged). The string labels 'red', 'blue', ... are no longer available.
df.index = [1,2,3,4]
df


Unnamed: 0,ball,pen,pencil,paper
1,0,1,2,3
2,4,5,6,7
3,8,9,10,11
4,12,13,14,15


In [97]:
# loc looks rows up by label, not by position: 1 is the label of the first
# row of df here.
df.loc[1]

ball      0
pen       1
pencil    2
paper     3
Name: 1, dtype: int64