# Preprocesamiento de datos con Python
## FES Aragón
Contreras Ortiz Miguel

In [136]:
import pandas as pd
import numpy as np
from datetime import datetime

# Series

In [23]:
s = pd.Series({'a':10, 'b':20, 'c':30})
print(type(s))
print(s)

<class 'pandas.core.series.Series'>
a    10
b    20
c    30
dtype: int64


In [17]:
s.index

Index(['a', 'b', 'c'], dtype='object')

In [11]:
s1 = pd.Series([10, 20, 30, 40])
print(s1)

0    10
1    20
2    30
3    40
dtype: int64


In [19]:
s1.index

RangeIndex(start=0, stop=4, step=1)

In [22]:
s1.max()

40

In [25]:
s1.describe()

count     4.000000
mean     25.000000
std      12.909944
min      10.000000
25%      17.500000
50%      25.000000
75%      32.500000
max      40.000000
dtype: float64

In [29]:
# Generamos una serie de números aleatorios
s2 = pd.Series(np.random.randn(10))
s2

0    0.641962
1   -1.629561
2    0.011053
3    1.071895
4    1.215066
5    1.046533
6   -1.072195
7   -0.275130
8   -0.800915
9   -1.873307
dtype: float64

In [31]:
s3 = pd.Series([[10, 20],\
                [30, 40.5,'series'],\
                [50, 55],\
                {'Name':'Tess','Org':'Packt'}])

s3

0                            [10, 20]
1                  [30, 40.5, series]
2                            [50, 55]
3    {'Name': 'Tess', 'Org': 'Packt'}
dtype: object

In [36]:
s3.describe()

count            4
unique           4
top       [10, 20]
freq             1
dtype: object

In [37]:
s3[1][1]

40.5

In [38]:
# shape nos da el número de renglones, columnas de un arreglo
s3.shape

(4,)

# DataFrame

In [40]:
df = pd.DataFrame([30,50,20])
print(type(df))
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
(3, 1)


In [41]:
df

Unnamed: 0,0
0,30
1,50
2,20


In [42]:
df.columns

RangeIndex(start=0, stop=1, step=1)

In [44]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [46]:
list(df.index)

[0, 1, 2]

In [47]:
print("Las columnas del DF son: ", list(df.columns))
print("Los indices de renglones del DF son: ", list(df.index))

Las columnas del DF son:  [0]
Los indices de renglones del DF son:  [0, 1, 2]


In [49]:
df.columns = ["C1"]
df.index = ["R1", "R2", "R3"]
df

Unnamed: 0,C1
R1,30
R2,50
R3,20


In [53]:
df["C1"]

R1    30
R2    50
R3    20
Name: C1, dtype: int64

In [55]:
l = [(0, 1, 2), (1, 2, 3), (4, 5, 6), (7, 8, 9)]
df1 = pd.DataFrame(data = l)
print(df1.shape)
df1

(4, 3)


Unnamed: 0,0,1,2
0,0,1,2
1,1,2,3
2,4,5,6
3,7,8,9


In [58]:
l1 = [(0, 1, 2), (1, 2, 3), (4, 5, 6), (7, 8, 9, 10)]
df12 = pd.DataFrame(data = l1)
df12

Unnamed: 0,0,1,2,3
0,0,1,2,
1,1,2,3,
2,4,5,6,
3,7,8,9,10.0


In [72]:
l = [(0, 1, 2), (1, 2, 3), (4, 5, 6), (7, 8, 9)]
df1 = pd.DataFrame(data = l,
                   columns = ("C1", "C2", "C3"),
                   index = ["R1", "R2", "R3"])
df1

Unnamed: 0,C1,C2,C3
R1,0,1,2
R2,1,2,3
R3,4,5,6


In [64]:
df1.describe()

Unnamed: 0,0,1,2
count,4.0,4.0,4.0
mean,3.0,4.0,5.0
std,3.162278,3.162278,3.162278
min,0.0,1.0,2.0
25%,0.75,1.75,2.75
50%,2.5,3.5,4.5
75%,4.75,5.75,6.75
max,7.0,8.0,9.0


In [66]:
df12.describe()

Unnamed: 0,0,1,2,3
count,4.0,4.0,4.0,1.0
mean,3.0,4.0,5.0,10.0
std,3.162278,3.162278,3.162278,
min,0.0,1.0,2.0,10.0
25%,0.75,1.75,2.75,10.0
50%,2.5,3.5,4.5,10.0
75%,4.75,5.75,6.75,10.0
max,7.0,8.0,9.0,10.0


In [76]:
d = {'py101':[10, 5, 33 ,45, 25, 22], 
     'py111':[0, 15, 21 , 30, 31, 11], 
     'py121':[15, 5, 1 ,10, 42, 21], 
     'py301':[20, 35, 3 ,15, 0, 0], }

meses = ('enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio')

df4 = pd.DataFrame(data = d, index = meses)
df4

Unnamed: 0,py101,py111,py121,py301
enero,10,0,15,20
febrero,5,15,5,35
marzo,33,21,1,3
abril,45,30,10,15
mayo,25,31,42,0
junio,22,11,21,0


In [81]:
# Arange genera una lista de numeros consecutivos
m= np.arange(9).reshape(3,3)
m

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [83]:
df5 = pd.DataFrame(m, index = ["uno", "dos", "tres"], columns = ["C1", 2, "C3"])
df5

Unnamed: 0,C1,2,C3
uno,0,1,2
dos,3,4,5
tres,6,7,8


In [86]:
df5.columns

Index(['C1', 2, 'C3'], dtype='object')

In [88]:
d

{'py101': [10, 5, 33, 45, 25, 22],
 'py111': [0, 15, 21, 30, 31, 11],
 'py121': [15, 5, 1, 10, 42, 21],
 'py301': [20, 35, 3, 15, 0, 0]}

In [91]:
df6 = pd.DataFrame(data=d, index=meses, columns=['mesas', 'py101', 'bancos', 'puertas'])
df6

Unnamed: 0,mesas,py101,bancos,puertas
enero,,10,,
febrero,,5,,
marzo,,33,,
abril,,45,,
mayo,,25,,
junio,,22,,


In [93]:
df7 = pd.DataFrame(data= d, index =meses)
df7

Unnamed: 0,py101,py111,py121,py301
enero,10,0,15,20
febrero,5,15,5,35
marzo,33,21,1,3
abril,45,30,10,15
mayo,25,31,42,0
junio,22,11,21,0


In [95]:
# Renombramiento de columnas
df7.columns = ['BD', 'TEBD', 'UML', 'JAVA']
df7

Unnamed: 0,BD,TEBD,UML,JAVA
enero,10,0,15,20
febrero,5,15,5,35
marzo,33,21,1,3
abril,45,30,10,15
mayo,25,31,42,0
junio,22,11,21,0


In [97]:
df7.rename(columns = {'UML' : 'SI', 'JAVA' : 'POO'}, inplace = True)
df7

Unnamed: 0,BD,TEBD,SI,POO
enero,10,0,15,20
febrero,5,15,5,35
marzo,33,21,1,3
abril,45,30,10,15
mayo,25,31,42,0
junio,22,11,21,0


In [99]:
#Cambiar todos los indices del DF
df7.index = ['ene', 'feb', 'mar', 'abr', 'may', 'jun']
df7

Unnamed: 0,BD,TEBD,SI,POO
ene,10,0,15,20
feb,5,15,5,35
mar,33,21,1,3
abr,45,30,10,15
may,25,31,42,0
jun,22,11,21,0


In [102]:
#Cambiar el nombre de algunos indices del DF
df7.rename(index={'feb':'febrero', 'may':'mayo'},
           columns= {'SI' : 'PROG1'}, inplace=True)
df7

Unnamed: 0,BD,TEBD,PROG1,POO
ene,10,0,15,20
febrero,5,15,5,35
mar,33,21,1,3
abr,45,30,10,15
mayo,25,31,42,0
jun,22,11,21,0


In [106]:
df7['POO'].describe()

count     6.000000
mean     12.166667
std      13.934370
min       0.000000
25%       0.750000
50%       9.000000
75%      18.750000
max      35.000000
Name: POO, dtype: float64

In [108]:
df7[['POO', 'BD']]

Unnamed: 0,POO,BD
ene,20,10
febrero,35,5
mar,3,33
abr,15,45
mayo,0,25
jun,0,22


In [112]:
dfPOOBD = df7[['POO', 'BD']]
dfPOOBD

Unnamed: 0,POO,BD
ene,20,10
febrero,35,5
mar,3,33
abr,15,45
mayo,0,25
jun,0,22


In [115]:
df7['mar' : 'mayo']

Unnamed: 0,BD,TEBD,PROG1,POO
mar,33,21,1,3
abr,45,30,10,15
mayo,25,31,42,0


In [120]:
df7['mar' : 'mayo'][['POO', 'BD']]

Unnamed: 0,POO,BD
mar,3,33
abr,15,45
mayo,0,25


In [126]:
df7[:3]

Unnamed: 0,BD,TEBD,PROG1,POO
ene,10,0,15,20
febrero,5,15,5,35
mar,33,21,1,3


In [128]:
df7[1:4]

Unnamed: 0,BD,TEBD,PROG1,POO
febrero,5,15,5,35
mar,33,21,1,3
abr,45,30,10,15


In [131]:
df7[::]

Unnamed: 0,BD,TEBD,PROG1,POO
ene,10,0,15,20
febrero,5,15,5,35
mar,33,21,1,3
abr,45,30,10,15
mayo,25,31,42,0
jun,22,11,21,0


In [133]:
df7[::2]

Unnamed: 0,BD,TEBD,PROG1,POO
ene,10,0,15,20
mar,33,21,1,3
mayo,25,31,42,0


In [135]:
df7['BD'][::2]

ene     10
mar     33
mayo    25
Name: BD, dtype: int64

In [139]:
df = pd.DataFrame({'nombres':('Juan Pérez',
                                 'María Sánchez'
                                 , 'Jorge Vargas',
                                 'Rodrigo Martínez'),
            'fechas':(datetime(1995,12,21), 
                      datetime(1989,1,13), 
                      datetime(1992,9,14), 
                      datetime(1993,7,8)),
            'saldo': (2500, 
                      5345, 
                      np.NaN, 
                      11323.2),
            'al_corriente':(True, 
                            True, 
                            False, 
                            True)})

df

Unnamed: 0,nombres,fechas,saldo,al_corriente
0,Juan Pérez,1995-12-21,2500.0,True
1,María Sánchez,1989-01-13,5345.0,True
2,Jorge Vargas,1992-09-14,,False
3,Rodrigo Martínez,1993-07-08,11323.2,True


In [141]:
df.dtypes

nombres                 object
fechas          datetime64[ns]
saldo                  float64
al_corriente              bool
dtype: object

In [143]:
df.describe()

Unnamed: 0,saldo
count,3.0
mean,6389.4
std,4503.36464
min,2500.0
25%,3922.5
50%,5345.0
75%,8334.1
max,11323.2


In [146]:
df['saldo'].describe()

count        3.00000
mean      6389.40000
std       4503.36464
min       2500.00000
25%       3922.50000
50%       5345.00000
75%       8334.10000
max      11323.20000
Name: saldo, dtype: float64

In [150]:
df.astype(str).dtypes

nombres         object
fechas          object
saldo           object
al_corriente    object
dtype: object

In [152]:
df.astype(str).dtypes
df['saldo'].astype(str)

0     2500.0
1     5345.0
2        nan
3    11323.2
Name: saldo, dtype: object

In [156]:
df['fechas'].astype("int64")

0    819504000000000000
1    600652800000000000
2    716428800000000000
3    742089600000000000
Name: fechas, dtype: int64

In [157]:
df['saldo'] * 1.16

0     2900.000
1     6200.200
2          NaN
3    13134.912
Name: saldo, dtype: float64

In [159]:
df['saldo_iva'] = df['saldo'] *1.16
df

Unnamed: 0,nombres,fechas,saldo,al_corriente,saldo_iva
0,Juan Pérez,1995-12-21,2500.0,True,2900.0
1,María Sánchez,1989-01-13,5345.0,True,6200.2
2,Jorge Vargas,1992-09-14,,False,
3,Rodrigo Martínez,1993-07-08,11323.2,True,13134.912


In [161]:
df.dtypes

nombres                 object
fechas          datetime64[ns]
saldo                  float64
al_corriente              bool
saldo_iva              float64
dtype: object

In [163]:
df['saldo_iva'] = df['saldo_iva'].astype(str)
df.dtypes

nombres                 object
fechas          datetime64[ns]
saldo                  float64
al_corriente              bool
saldo_iva               object
dtype: object

In [164]:
df['saldo_iva'] * 12

0    2900.02900.02900.02900.02900.02900.02900.02900...
1    6200.26200.26200.26200.26200.26200.26200.26200...
2                 nannannannannannannannannannannannan
3    13134.91213134.91213134.91213134.91213134.9121...
Name: saldo_iva, dtype: object

In [165]:
df['saldo'] * 2

0     5000.0
1    10690.0
2        NaN
3    22646.4
Name: saldo, dtype: float64