import pandas as pd
# ^^^ pyforest auto-imports - don't write above this line
# Importing convetion

In [8]:
import pandas as pd
import numpy as np

In [14]:
!pip install pandas --user



In [15]:
pd.__version__

'1.0.1'

# What is Pandas?

Pandas can be thought as an enhanced version of numpy arrays. In this case, the rows and columns can be identified with labels instead of just simple integer indices.

There are **three** main pandas elements we **need** to understand.
1. Pandas Series
2. Pandas DataFrame
3. Index

# The Pandas Series

A pandas series is a one-dimensional (**1-D**) indexed array.

In [16]:
pd

<module 'pandas' from 'C:\\Users\\josep\\anaconda3\\lib\\site-packages\\pandas\\__init__.py'>

In [17]:
pd.Series()

  """Entry point for launching an IPython kernel.


Series([], dtype: float64)

## Creating a pandas Series from a list

You'll start to recognize a pandas Series by its visual 

In [18]:
pd.Series([5,8,3])

0    5
1    8
2    3
dtype: int64

`dtype` means the `data type` of what is inside your pandas Series.

In [19]:
# As in lists, you don't need to have all of the same type inside a pandas series

pd.Series(['a', 2, 3])

0    a
1    2
2    3
dtype: object

When you see `dtype: object`, it usually means you have a `str` inside your `Series`

In [20]:
data = pd.Series([10,23,3,43,25,136])

In [21]:
data

0     10
1     23
2      3
3     43
4     25
5    136
dtype: int64

In [22]:
type(data)

pandas.core.series.Series

So, the `type` of `data` is a `pandas...Series` and the types of the data inside the `pandas.Series` is `int`

## Accessing elements 

Can be done like a numpy array. 

In [23]:
data

0     10
1     23
2      3
3     43
4     25
5    136
dtype: int64

In [24]:
data[0]

10

In [26]:
data[4:]

4     25
5    136
dtype: int64

Em resumo: pandas series pode ser considerado uma numpy array de 1-D

### What is the difference then? Numpy array vs Pandas Series

Mostly the index notation.

Numpy arrays only have the **implicit** index associated with its location. By using a **explicit** index notation, Pandas Series are much more flexible. For example:

## Index don't need to be numbers.

In [27]:
my_series = pd.Series(data=[1,2,3,5,7,9], ) #index argument
my_series

0    1
1    2
2    3
3    5
4    7
5    9
dtype: int64

In [40]:
my_series = pd.Series(data=[1,2,3,5,7,9], index= ['josep', 'camila', 'naosei', 'nao', 'ai', 'andre']) #index argument
my_series

josep     1
camila    2
naosei    3
nao       5
ai        7
andre     9
dtype: int64

In [44]:
my_series["josep"]

1

## Index don't need to be in sequence

In [58]:
data = pd.Series(data=[1,2,3,4], 
                 index=[1,7,4313,19])

In [59]:
data

1       1
7       2
4313    3
19      4
dtype: int64

### Then how can I access these pandas series?

In [60]:
my_series

josep     1
camila    2
naosei    3
nao       5
ai        7
andre     9
dtype: int64

**NOTE:** One can think of a pandas series, then, as a form of dictionary, in which the indexes are keys and the rows are the values

## Creating a pandas series from a dict.

In [61]:
my_dict = {'RODRIGO': 20, 
           'ANDRE':10}

In [62]:
my_dict

{'RODRIGO': 20, 'ANDRE': 10}

In [63]:
pd.Series(my_dict)

RODRIGO    20
ANDRE      10
dtype: int64


But what about > 1-D?


# Pandas DataFrame




Pandas Dataframes can be thought as a generalization of **2-D** numpy arrays. However, again, they bring flexibility on both the indices and column names.

In [64]:


pd.DataFrame()


In [65]:
type(pd.DataFrame())

pandas.core.frame.DataFrame

## Pandas DataFrame can be thought as a group of Pandas Series

In [66]:
my_dict = {'RODRIGO': 26, 
           'ANDRE':28}

data = pd.Series(my_dict)

In [67]:
data

RODRIGO    26
ANDRE      28
dtype: int64

In [68]:
another_dict = {'RODRIGO': 178,'ANDRE': 175}

data_2 = pd.Series(another_dict)

In [69]:
data_2

RODRIGO    178
ANDRE      175
dtype: int64

# Create dataframe as a collection of Series

In [70]:
{'idade':data, 'altura':data_2}

{'idade': RODRIGO    26
 ANDRE      28
 dtype: int64,
 'altura': RODRIGO    178
 ANDRE      175
 dtype: int64}

In [71]:
my_dataframe = pd.DataFrame({'idade':data, 'altura':data_2})
my_dataframe 

Unnamed: 0,idade,altura
RODRIGO,26,178
ANDRE,28,175


**NOTE:**: So a dataframe can be thought of as a dictionary, in which `keys` are the `column names` and `values` are the `pandas Series` themselves

In [72]:
my_dataframe['idade']

RODRIGO    26
ANDRE      28
Name: idade, dtype: int64

# `Access` Methods: Accessing dataframes rows and columns

In [73]:
my_dataframe

Unnamed: 0,idade,altura
RODRIGO,26,178
ANDRE,28,175


## `dataframe.loc[row_name, col_name]`

In [74]:
my_dataframe.loc['RODRIGO', 'idade']

26

In [75]:
my_dataframe.loc['ANDRE', 'altura']

175

In [76]:
my_dataframe.

SyntaxError: invalid syntax (<ipython-input-76-0063534a61b4>, line 1)

## `dataframe.iloc[row_number, col_number]`

In [77]:
my_dataframe.iloc[0, 0]

26

In [78]:
my_dataframe.iloc[1, 1]

175

In [79]:
my_dataframe.iloc[-1, 1]

175

# Creating dataframes

## From a list in 1-D

In [80]:
my_list = [1,2,3]

In [81]:
np.array(my_list)

array([1, 2, 3])

In [82]:
pd.DataFrame(data=my_list)

Unnamed: 0,0
0,1
1,2
2,3


In [83]:
pd.DataFrame(data=my_list, columns=['notas'], index=['Andre','Rai','Rodrigo'])

Unnamed: 0,notas
Andre,1
Rai,2
Rodrigo,3


## From a list in > 1-D (let's remember numpy arrays here!)

In [84]:
my_list = [[1,2,3],[-5,-6,-7]]

In [85]:
np.array(my_list).shape

(2, 3)

In [86]:
df = pd.DataFrame(data=[[1,2,3],[-5,-6,-7]], columns=['idade','peso','altura'])
df

Unnamed: 0,idade,peso,altura
0,1,2,3
1,-5,-6,-7


In [87]:
df.shape

(2, 3)

## From a dictionary composed by lists

In [88]:
pd.DataFrame({'ironhack_students': ['a','b','c'],
              'NOTA':[10, 10, 0]})

Unnamed: 0,ironhack_students,NOTA
0,a,10
1,b,10
2,c,0


## From a numpy array

In [89]:
a = np.random.random(size=(5, 3))
a

array([[0.8705972 , 0.55990858, 0.35031866],
       [0.60853753, 0.15521884, 0.11410907],
       [0.40225686, 0.61678235, 0.09163417],
       [0.09649529, 0.32580611, 0.24511518],
       [0.61383461, 0.85940097, 0.77283359]])

In [90]:
data = pd.DataFrame(a, columns=['a', 'b', 'c'])

In [91]:
data

Unnamed: 0,a,b,c
0,0.870597,0.559909,0.350319
1,0.608538,0.155219,0.114109
2,0.402257,0.616782,0.091634
3,0.096495,0.325806,0.245115
4,0.613835,0.859401,0.772834


### Accessing rows and columns:

#### `.loc`

In [92]:
data.loc[0, 'a']

0.8705972020794788

In [93]:
# the whole column
data.loc[:, 'a']

0    0.870597
1    0.608538
2    0.402257
3    0.096495
4    0.613835
Name: a, dtype: float64

In [65]:
data.loc[2:4, 'b':'c']

Unnamed: 0,b,c
2,0.385277,0.114059
3,0.324834,0.652996
4,0.060475,0.22675


#### `.iloc`

In [66]:
data.iloc[0, 0]

0.23142622677119784

In [67]:
data.iloc[0, 0:2]

a    0.231426
b    0.517063
Name: 0, dtype: float64

In [68]:
data.iloc[:, 2]

0    0.281033
1    0.289039
2    0.114059
3    0.652996
4    0.226750
Name: c, dtype: float64

## Math operations

In [71]:
data = np.random.random(size=(8, 4))

In [73]:
df = pd.DataFrame(data, columns=['Andre','Rai','Rodrigo','Vamp'])

In [74]:
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7
Andre,0.138144,0.706192,0.324577,0.175163,0.556243,0.424498,0.111416,0.163319
Rai,0.60836,0.182484,0.70561,0.169785,0.573794,0.330713,0.18102,0.671962
Rodrigo,0.971235,0.336923,0.562461,0.399329,0.022689,0.935024,0.68595,0.28029
Vamp,0.587279,0.592032,0.777561,0.447378,0.800874,0.952296,0.717392,0.104736


In [76]:
df.mean()

Andre      0.324944
Rai        0.427966
Rodrigo    0.524238
Vamp       0.622443
dtype: float64

In [77]:
df.mean(axis=1)

0    0.576254
1    0.454408
2    0.592552
3    0.297914
4    0.488400
5    0.660633
6    0.423945
7    0.305077
dtype: float64

In [78]:
df.std()

Andre      0.219677
Rai        0.235368
Rodrigo    0.329049
Vamp       0.260370
dtype: float64

In [79]:
df.describe()

Unnamed: 0,Andre,Rai,Rodrigo,Vamp
count,8.0,8.0,8.0,8.0
mean,0.324944,0.427966,0.524238,0.622443
std,0.219677,0.235368,0.329049,0.26037
min,0.111416,0.169785,0.022689,0.104736
25%,0.157025,0.182118,0.322765,0.552304
50%,0.24987,0.452254,0.480895,0.654712
75%,0.457434,0.624261,0.748219,0.783389
max,0.706192,0.70561,0.971235,0.952296


In [80]:
df['Andre']

0    0.138144
1    0.706192
2    0.324577
3    0.175163
4    0.556243
5    0.424498
6    0.111416
7    0.163319
Name: Andre, dtype: float64

# Pandas Index

In [None]:
pd.Index([1,2,3])

In [None]:
data.index