In [2]:
import numpy as np
import pandas as pd
pd.__version__

'0.21.1'

# `Series`

### simple, índices numéricos

In [3]:
# Build a Series from a plain list; with no index given, pandas supplies
# the default integer labels 0..3.
values = [0.25, 0.5, 0.75, 1.0]
data = pd.Series(values)
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
# .values exposes the underlying data as a NumPy array.
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [5]:
# .index holds the labels; a Series built without an explicit index
# gets a RangeIndex.
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# The index here is the integers 0-3, so 2 is looked up as a label
# (which also happens to be position 2).
data[2]

0.75

In [7]:
# keys() is the dict-style accessor for the index; it shows the same
# RangeIndex as .index.
data.keys()

RangeIndex(start=0, stop=4, step=1)

### lista, índices personalizados

In [8]:
# Same values as before, now with explicit string labels as the index.
labels = ['a', 'b', 'c', 'd']
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=labels)
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [9]:
# Positional access on a string-labelled Series: use .iloc explicitly.
# A bare integer key in [] only works here through a positional fallback
# that newer pandas versions deprecate.
data.iloc[2]

0.75

In [10]:
# Label-based lookup through the custom string index.
data['c']

0.75

In [11]:
# With explicit string labels the index is a generic Index of dtype object
# rather than a RangeIndex.
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

### diccionario, índices personalizados

In [12]:
# State populations keyed by state name; the dict keys become the index labels.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
# Sort the labels explicitly: whether pandas orders dict keys alphabetically
# on construction depends on the pandas version, and the positional and
# label-slice examples that follow assume alphabetical order.
population = pd.Series(population_dict).sort_index()
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [13]:
# Third entry by position. .iloc makes the positional intent explicit;
# a bare integer key in [] on a string-labelled Series relies on a
# fallback that newer pandas versions deprecate.
population.iloc[2]

12882135

In [14]:
# First two entries by position (stop is exclusive). .iloc states
# explicitly that the slice is positional rather than label-based.
population.iloc[0:2]

California    38332521
Florida       19552860
dtype: int64

In [15]:
# Dictionary-style lookup by state label.
population['California']

38332521

In [16]:
# Label-based slice: unlike a positional slice, the end label ('Illinois')
# is included. Which rows fall in between depends on the index order.
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

In [17]:
# Boolean masking: keep the entries strictly between 0.3 and 0.8.
# NOTE: `data` is the a-d float Series defined earlier, not `population`.
in_range = (data > 0.3) & (data < 0.8)
data[in_range]

b    0.50
c    0.75
dtype: float64

### `loc` e `iloc`

In [18]:
# Odd integer labels (1, 3, 5) make labels and positions differ, so the
# .loc / .iloc examples below give visibly different answers.
odd_labels = [1, 3, 5]
data = pd.Series(['a', 'b', 'c'], index=odd_labels)
data

1    a
3    b
5    c
dtype: object

##### `loc`: references the explicit index

In [19]:
# .loc uses the explicit labels: label 1 is the first element.
data.loc[1]

'a'

In [20]:
# Label slice 1:3 - both end labels are included, so this returns
# the entries labelled 1 and 3.
data.loc[1:3]

1    a
3    b
dtype: object

##### `iloc`: references the implicit Python-style index

In [21]:
# .iloc uses 0-based positions: position 1 is the second element (label 3).
data.iloc[1]

'b'

In [22]:
# Positional slice 1:3 - the stop is exclusive, so this returns
# positions 1 and 2 (labels 3 and 5).
data.iloc[1:3]

3    b
5    c
dtype: object

# inexistencia de valores

In [23]:
# A mixed-type Series holding two kinds of missing marker (np.nan and None);
# the mix of int, float, str and None gives it dtype object.
mixed_values = [1, np.nan, 'hello', None]
data = pd.Series(mixed_values)
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [24]:
# Boolean mask that is True where the value is missing; both NaN and None
# count as missing.
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [25]:
# Keep only the non-missing entries by indexing with a boolean mask.
data[data.notnull()]

0        1
2    hello
dtype: object

In [26]:
# Returns a new Series with the missing values replaced by 0;
# `data` itself is left unchanged.
data.fillna(0)

0        1
1        0
2    hello
3        0
dtype: object

##### dropna()

In [27]:
# Drop the missing entries; gives the same result as data[data.notnull()].
data.dropna()

0        1
2    hello
dtype: object

# `DataFrame`

# Creación

### forma 1

Creamos una serie

In [28]:
# State areas keyed by state name; the dict keys become the index labels.
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
# Sort the labels explicitly so the Series is in alphabetical order on every
# pandas version (whether dict keys are sorted on construction is
# version-dependent).
area = pd.Series(area_dict).sort_index()
area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

Creamos un dataframe:

In [29]:
# Combine the two Series into one frame, one column per Series.
# Column and row order are pinned explicitly (columns=..., sort_index())
# because the order pandas derives from dict keys is version-dependent.
df = pd.DataFrame({'population': population, 'area': area},
                  columns=['area', 'population']).sort_index()
df

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


### forma 2

In [30]:
# Single-column frame from a Series. to_frame(name=...) sets the column
# name directly and keeps working if the Series already carries a name,
# whereas DataFrame(series, columns=[...]) depends on it being unnamed.
population.to_frame(name='population')

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


### forma 3

In [59]:
# Frame built from a 2-D NumPy array with explicit row and column labels.
# A dedicated seeded generator makes the displayed values reproducible on
# every Restart & Run All; an unseeded np.random.rand differs on each run.
rng = np.random.RandomState(42)
random_frame = pd.DataFrame(rng.rand(3, 2),
                            columns=['foo', 'bar'],
                            index=['a', 'b', 'c'])
random_frame

Unnamed: 0,foo,bar
a,0.372804,0.729756
b,0.534925,0.155666
c,0.521956,0.075576


In [87]:
# NOTE(review): the output recorded below (execution count 87) shows the
# 3x3 frame with NaNs that `df` is rebound to further down, not the
# area/population frame defined above. The cell was run out of order;
# re-run with Restart & Run All to refresh the output.
df.head()

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [33]:
# tail() shows the last rows; the recorded output is the five-row
# area/population frame in full.
df.tail()

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,area,population
count,5.0,5.0
mean,316246.6,23373370.0
std,242437.411951,9640386.0
min,141297.0,12882140.0
25%,149995.0,19552860.0
50%,170312.0,19651130.0
75%,423967.0,26448190.0
max,695662.0,38332520.0


# columna como atributo

In [35]:
# Per-state area and population, rebuilt here with Spanish column names.
# sort_index() pins alphabetical label order on every pandas version
# (whether dict keys are sorted on construction is version-dependent);
# the row-slicing examples below rely on that order.
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995}).sort_index()
poblacion = pd.Series({'California': 38332521, 'Texas': 26448193,
                       'New York': 19651127, 'Florida': 19552860,
                       'Illinois': 12882135}).sort_index()
# NOTE: `data` is rebound here to a DataFrame with one column per Series.
data = pd.DataFrame({'area': area, 'poblacion': poblacion})
data

Unnamed: 0,area,poblacion
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [36]:
# Dictionary-style access: returns the 'area' column as a Series.
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [37]:
# Attribute-style access; gives the same Series as data['area'].
data.area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

# operaciones

In [38]:
# Add a derived column: poblacion divided element-wise by area.
data['densidad'] = data['poblacion'].div(data['area'])
data

Unnamed: 0,area,poblacion,densidad
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [39]:
# The whole frame as a 2-D NumPy array; the recorded output shows every
# column as floats, including the integer ones.
data.values

array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

##### filas

In [40]:
# Indexing the underlying array with a single integer returns one row.
data.values[0]  # row 0

array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])

In [41]:
# Row slice by label (end label included). .loc states the row-wise intent
# explicitly; a bare [] normally selects columns and only switches to rows
# when given a slice.
data.loc['Florida':'Illinois']

Unnamed: 0,area,poblacion,densidad
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


##### columnas

In [42]:
# A single column label inside [] selects that column as a Series.
data['area']  # the 'area' column

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [43]:
# NOTE: an integer slice selects ROWS by position (rows 1 and 2), not
# columns. .iloc makes that explicit instead of relying on the
# slice special case of [].
data.iloc[1:3]

Unnamed: 0,area,poblacion,densidad
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


#### ambas

In [44]:
# Select rows and columns in one .loc[rows, columns] call instead of
# chaining [] and .loc; this avoids the intermediate frame and matches
# the form used in the filtering example.
data.loc[['Florida', 'California'], ['poblacion', 'densidad']]

Unnamed: 0,poblacion,densidad
Florida,19552860,114.806121
California,38332521,90.413926


### filtrado

In [45]:
# .loc with a boolean row mask and a list of column labels:
# rows whose densidad exceeds 100, restricted to two columns.
data.loc[data.densidad > 100, ['poblacion', 'densidad']]

Unnamed: 0,poblacion,densidad
Florida,19552860,114.806121
New York,19651127,139.076746


### inexistencia de valores

In [46]:
# 3x3 frame with one missing value in column 0 and one in column 1.
# NOTE: this rebinds `df`, which earlier held the area/population frame.
rows = [
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
]
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [47]:
# By default dropna() removes every ROW that contains a missing value;
# only row 1 is complete.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [48]:
# axis='columns' removes every COLUMN that contains a missing value;
# only column 2 is complete.
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


### concatenación

In [49]:
# Two frames with the same columns and disjoint index labels (1-2 and 3-4).
df1 = pd.DataFrame(
    dict(A=['A1', 'A2'], B=['B1', 'B2']),
    index=[1, 2],
)
df2 = pd.DataFrame(
    dict(A=['A3', 'A4'], B=['B3', 'B4']),
    index=[3, 4],
)

In [50]:
df1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2


In [51]:
df2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4


In [52]:
# Stack the frames vertically; the original index labels (1-4) are kept.
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


#### mismo índice, ignorar

In [53]:
# Same data as before, but both frames now share the index labels 1 and 2.
shared_index = [1, 2]
df1 = pd.DataFrame(dict(A=['A1', 'A2'], B=['B1', 'B2']), index=shared_index)
df2 = pd.DataFrame(dict(A=['A3', 'A4'], B=['B3', 'B4']), index=shared_index)

In [54]:
# ignore_index=True discards the duplicated labels (1 and 2 in both frames)
# and renumbers the result 0..3.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,A,B
0,A1,B1
1,A2,B2
2,A3,B3
3,A4,B4


### merge

In [55]:
# Two frames that share an 'employee' column, listed in a different row
# order in each frame.
df1 = pd.DataFrame(dict(
    employee=['Bob', 'Jake', 'Lisa', 'Sue'],
    group=['Accounting', 'Engineering', 'Engineering', 'HR'],
))
df2 = pd.DataFrame(dict(
    employee=['Lisa', 'Bob', 'Jake', 'Sue'],
    hire_date=[2004, 2008, 2012, 2014],
))

In [56]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [57]:
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [58]:
# Join the two frames on their shared 'employee' column. The key and the
# join type are spelled out rather than left to merge's defaults (all
# common columns, inner join), so an extra shared column cannot silently
# change the join. Other values for `how`: 'left', 'right', 'outer'.
df3 = pd.merge(df1, df2, on='employee', how='inner')
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014
