In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("who.csv")
df.isna().sum()

# df = df[["Country",df.columns[-2]]]
# print(df[:5])

Country                            0
CountryID                          0
Continent                          0
Adolescent fertility rate (%)     25
Adult literacy rate (%)           71
                                  ..
Under_five_mortality_from_IHME    32
Under_five_mortality_rate         21
Urban_population                  14
Urban_population_growth           14
Urban_population_pct_of_total     14
Length: 358, dtype: int64

In [5]:
# Como ya sabéis através de la API se puede obtener una descripción más detallada de las posibilidades de cada método de Python, y en especial
# de los métodos de Pandas.
# Para cargar un fichero de tamaño elevado es recomendable cargar aquellos atributos que nos interesen desde un principio usando el argumento: usecols
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
df = pd.read_csv("who.csv", usecols=["Country","Urban_population_growth"])
print(df[:5])

       Country  Urban_population_growth
0  Afghanistan                     5.44
1      Albania                     2.21
2      Algeria                     2.61
3      Andorra                      NaN
4       Angola                     4.14


In [6]:
# ¿Qué valor corresponde a un NA del dataframe?
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html
df.isna().sum()

Country                     0
Urban_population_growth    14
dtype: int64

In [8]:
#¿Qué columnas tienen datos sin valor: NaN, NaT, None?
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.any.html

print(df.columns[df.isna().any()])

# Equivale a preguntar si ¿existe algún valor positivo dentro de esas series?
print("-"*30)
print(df.any())isna()

SyntaxError: invalid syntax (201822211.py, line 8)

In [9]:
#No dudéis en ejecutar "partes" (dividamos la instrucción para comprenderla)
print(df.isna()[:5])

   Country  Urban_population_growth
0    False                    False
1    False                    False
2    False                    False
3    False                     True
4    False                    False


In [10]:
#¿Cuántas muestras son correctas?
df.notna().sum()
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.notna.html
# y de cuantas muestras?

Country                    202
Urban_population_growth    188
dtype: int64

In [11]:
df.notnull().sum() #ambas funcionas son equivalentes en Pandas, no en numpy

Country                    202
Urban_population_growth    188
dtype: int64

In [None]:
#Tratando la ausencia de datos. reemplazo por 0, por la media, elimino la fila, y si la columna tuviese muchos NaN puedo eliminar la col

In [12]:
#La manera más optima de remplazar estos valores es con la función: fillna
print(df.fillna(0)[:5]) #reemplazo por 0

       Country  Urban_population_growth
0  Afghanistan                     5.44
1      Albania                     2.21
2      Algeria                     2.61
3      Andorra                     0.00
4       Angola                     4.14


In [13]:
import numpy as np

np.random.seed(20)

#Creamos un dataframe
df = pd.DataFrame(np.random.randn(5, 3),
                     index=['a', 'b', 'c', 'd', 'e'],
                     columns=['one', 'two', 'three'])
print(df)

        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262 -1.084833  0.559696
c  0.939469 -0.978481  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017 -0.842368 -1.279503


In [14]:
#Creamos valores NaN para testear
df.two[df.two<0]=np.nan
print(df)

        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262       NaN  0.559696
c  0.939469       NaN  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017       NaN -1.279503


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df.two[df.two<0]=np.nan


In [15]:
print(df)
print("-"*33)
print(df.fillna(df.mean())) #reemplazo por la media

        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262       NaN  0.559696
c  0.939469       NaN  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017       NaN -1.279503
---------------------------------
        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262  0.259663  0.559696
c  0.939469  0.259663  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017  0.259663 -1.279503


In [16]:
#Con un valor en concreto del propio dataframe
print(df.fillna("HOLA"))
print("-"*33)
print(df.fillna(df.loc["a", ["one"]].values[0]))

        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262      HOLA  0.559696
c  0.939469      HOLA  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017      HOLA -1.279503
---------------------------------
        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262  0.883893  0.559696
c  0.939469  0.883893  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017  0.883893 -1.279503


In [None]:
#Podemos rellenar con datos interpolados

In [17]:
print(df)
print("-"*35)
print(df.interpolate())

        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262       NaN  0.559696
c  0.939469       NaN  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017       NaN -1.279503
-----------------------------------
        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262  0.238397  0.559696
c  0.939469  0.280929  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017  0.323461 -1.279503


In [18]:
print(df.interpolate(axis=1)) # Tomemos como referencia el valor NA de (b,"two")
print("--"*35)
print(df.mean(axis=1).b)

        one       two     three
a  0.883893  0.195865  0.357537
b -2.343262 -0.891783  0.559696
c  0.939469  0.721283  0.503097
d  0.406414  0.323461 -0.493411
e -0.792017 -1.035760 -1.279503
----------------------------------------------------------------------
-0.8917828081181468


In [19]:
# Para usar otro tipo de interpolaciones es recomendable tener un índice numérico por cuestiones de frecuencia en el método de interpolación
df.index = range(len(df))
print(df.two.interpolate(method="pad"))

0    0.195865
1    0.195865
2    0.195865
3    0.323461
4    0.323461
Name: two, dtype: float64


  print(df.two.interpolate(method="pad"))


In [23]:
print(df.two.interpolate(method="nearest"))

ImportError: Missing optional dependency 'scipy'. nearest interpolation requires SciPy. Use pip or conda to install scipy.

In [22]:
print("Valores interpolados:" + str(df.two.interpolate().count()-df.two.count()))

Valores interpolados:3


In [24]:
#Eliminación de valores NA

In [26]:
print(df)
print("-"*35)
print(df.dropna()) #elimina cualquier fila con NaN

# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html

        one       two     three
0  0.883893  0.195865  0.357537
1 -2.343262       NaN  0.559696
2  0.939469       NaN  0.503097
3  0.406414  0.323461 -0.493411
4 -0.792017       NaN -1.279503
-----------------------------------
        one       two     three
0  0.883893  0.195865  0.357537
3  0.406414  0.323461 -0.493411


In [28]:
#O bien, podemos borrar cambiando el eje AXIS=0 o 1
df.dropna(axis=1) #elimina columnas

Unnamed: 0,one,three
0,0.883893,0.357537
1,-2.343262,0.559696
2,0.939469,0.503097
3,0.406414,-0.493411
4,-0.792017,-1.279503


In [None]:
#Ejercicios

In [29]:
df = pd.read_csv("who.csv")
df.isna()

Unnamed: 0,Country,CountryID,Continent,Adolescent fertility rate (%),Adult literacy rate (%),Gross national income per capita (PPP international $),Net primary school enrolment ratio female (%),Net primary school enrolment ratio male (%),Population (in thousands) total,Population annual growth rate (%),...,Total_CO2_emissions,Total_income,Total_reserves,Trade_balance_goods_and_services,Under_five_mortality_from_CME,Under_five_mortality_from_IHME,Under_five_mortality_rate,Urban_population,Urban_population_growth,Urban_population_pct_of_total
0,False,False,False,False,False,True,True,True,False,False,...,False,True,True,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,True,True,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
198,False,False,False,True,True,True,True,True,True,True,...,False,False,True,True,False,False,False,False,False,False
199,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
200,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
df.isna().sum()

Country                            0
CountryID                          0
Continent                          0
Adolescent fertility rate (%)     25
Adult literacy rate (%)           71
                                  ..
Under_five_mortality_from_IHME    32
Under_five_mortality_rate         21
Urban_population                  14
Urban_population_growth           14
Urban_population_pct_of_total     14
Length: 358, dtype: int64

In [31]:
df.isna('Country')

TypeError: DataFrame.isna() takes 1 positional argument but 2 were given

In [34]:
df1 = pd.read_csv("who.csv", usecols=["Country"])
df1.isna()

Unnamed: 0,Country
0,False
1,False
2,False
3,False
4,False
...,...
197,False
198,False
199,False
200,False
