# Pandas
- Solve short hands-on challenges to perfect your data manipulation skills.
- https://www.kaggle.com/learn/pandas

## 5.- Data Types and Missing Values
- Deal with the most common progress-blocking problems  

In [2]:
import numpy as np
import pandas as pd

# Report the library versions this notebook was executed with.
for label, module in (('np.__version__:', np), ('pd.__version__:', pd)):
    print(label, module.__version__)

#pd.set_option('display.max_rows', 5)

np.__version__: 1.23.5
pd.__version__: 1.5.3


In [3]:
# Load the wine reviews ('Red.csv' is a relative path, resolved against the
# working directory) and preview the first two rows.
reviews = pd.read_csv(filepath_or_buffer='Red.csv')
reviews.head(n=2)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017


In [4]:
## Derive a twitter_region column: the Region name prefixed with '@'
reviews['twitter_region'] = '@' + reviews['Region']
reviews.head(2)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011,@Pomerol
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017,@Lirac


### Dtypes

In [5]:
# dtype of individual columns: one numeric (Price) and one text (Country)
for column in ('Price', 'Country'):
    print(reviews[column].dtype)


float64
object


In [6]:
# dtype of every column at once, returned as a Series indexed by column name
reviews.dtypes

Name                object
Country             object
Region              object
Winery              object
Rating             float64
NumberOfRatings      int64
Price              float64
Year                object
twitter_region      object
dtype: object

In [7]:
# Summary of the frame: index range, per-column non-null counts, dtypes and memory usage
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8666 entries, 0 to 8665
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8666 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8666 non-null   object 
 3   Winery           8666 non-null   object 
 4   Rating           8666 non-null   float64
 5   NumberOfRatings  8666 non-null   int64  
 6   Price            8666 non-null   float64
 7   Year             8666 non-null   object 
 8   twitter_region   8666 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 609.5+ KB


It's possible to convert a column of one type into another, wherever such a conversion makes sense, by using the `astype()` function.

In [8]:
# astype() hands back a converted copy; it is only displayed here, not stored
reviews['NumberOfRatings'].astype('float32')

0       100.0
1       100.0
2       100.0
3       100.0
4       100.0
        ...  
8661    994.0
8662    995.0
8663    996.0
8664    998.0
8665    999.0
Name: NumberOfRatings, Length: 8666, dtype: float32

In [9]:
## The frame is unchanged: astype() returned a new Series and nothing was assigned back.
## astype() has no 'inplace' parameter; reassign the column to keep a conversion.
reviews.dtypes

Name                object
Country             object
Region              object
Winery              object
Rating             float64
NumberOfRatings      int64
Price              float64
Year                object
twitter_region      object
dtype: object

In [10]:
## A DataFrame or Series index has its own dtype, too:
reviews.index.dtype
# Pandas also supports more exotic data types, such as categorical data and timeseries data

dtype('int64')

### Missing data
Entries missing values are given the value NaN, short for "Not a Number". For technical reasons these NaN values are always of the float64 dtype.

Pandas provides some methods specific to missing data. To select NaN entries you can use `pd.isnull()` (or its companion `pd.notnull()`).


In [11]:
# Rows whose Country is missing (the raw data has none, so the result is empty)
reviews.loc[pd.isnull(reviews['Country'])]

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region


### No NaN values found, so I will insert some new rows!
Ways to add a row to a DataFrame (generic examples on a frame named `df`):

- Add a row at the end with `DataFrame.loc[]`:    
`list_row = ["Hyperion", 27000, "60days", 2000]`    
`df.loc[len(df)] = list_row`

- Use `pandas.concat()` to add a row (here at the beginning):    
`new_row = pd.DataFrame({'Courses':'Hyperion', 'Fee':24000, 'Duration':'55days', 'Discount':1800}, index=[0])`    
`df2 = pd.concat([new_row,df.loc[:]]).reset_index(drop=True)`

- Add a row with a specific index label using `DataFrame.loc[]`:    
`df.loc['7', :] = ['Hive',25000,'45days',2000]`

- The same, without the explicit column slice:    
`df.loc['7'] = ['Hive',25000,'45days',2000]`

#### `.append()` is DEPRECATED (since pandas 1.4, removed in pandas 2.0) — use `pd.concat()` instead

- Insert a dict into the DataFrame using `DataFrame.append()`:    
`new_row = {'Courses':'Hyperion', 'Fee':24000, 'Duration':'55days', 'Discount':1800}`    
`df2 = df.append(new_row, ignore_index=True)`

- Append a named row (a Series) to the DataFrame:    
`df2 = df.append(pd.Series(new_row, index=df.columns, name='7'))`

In [12]:
# Preview the untouched frame and its length before adding synthetic rows.
display(reviews.head(2))
print(len(reviews))

# DataFrame.append() is deprecated, so the new rows are attached with pd.concat().
# Each single-row frame deliberately has a missing Country (np.nan).
ndf_row1 = pd.DataFrame(
    {'Name': 'Name1', 'Country': np.nan, 'Region': 'Reg1',
     'Winery': 'Win1', 'Rating': 1, 'NumberOfRatings': 101,
     'Price': 10.1, 'Year': 2021, 'twitter_region': '@Reg1'},
    index=[0],
)
ndf_row2 = pd.DataFrame(
    {'Name': 'Name2', 'Country': np.nan, 'Region': 'Reg2',
     'Winery': 'Win2', 'Rating': 2, 'NumberOfRatings': 202,
     'Price': 20., 'Year': 2022, 'twitter_region': '@Reg2'},
    index=[0],
)

# Result order: ndf_row2 first, then every original review, then ndf_row1 last.
# ignore_index=True renumbers the rows 0..n-1 in that order.
revs = pd.concat([ndf_row2, reviews, ndf_row1], ignore_index=True)


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011,@Pomerol
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017,@Lirac


8666


In [13]:
# Show the extended frame (pandas truncates the display to the first and last rows)
revs

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Name2,,Reg2,Win2,2.0,202,20.00,2022,@Reg2
1,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.00,2011,@Pomerol
2,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.50,2017,@Lirac
3,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015,@Toscana
4,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019,@Bardolino
...,...,...,...,...,...,...,...,...,...
8663,Botrosecco Maremma Toscana 2016,Italy,Maremma Toscana,Le Mortelle,4.0,995,20.09,2016,@Maremma Toscana
8664,Haut-Médoc 2010,France,Haut-Médoc,Château Cambon La Pelouse,3.7,996,23.95,2010,@Haut-Médoc
8665,Shiraz 2019,Australia,South Eastern Australia,Yellow Tail,3.5,998,6.21,2019,@South Eastern Australia
8666,Portillo Cabernet Sauvignon 2016,Argentina,Tunuyán,Salentein,3.4,999,7.88,2016,@Tunuyán


### OK, ready to find NaN values rows

In [14]:
# The two synthetic rows (first and last position) are the ones without a Country
revs.loc[pd.isnull(revs['Country'])]

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Name2,,Reg2,Win2,2.0,202,20.0,2022,@Reg2
8667,Name1,,Reg1,Win1,1.0,101,10.1,2021,@Reg1


In [15]:
# Insert a row between labels 3423 and 3424 by giving it a fractional label.
# NOTE: re-running this cell inserts the same row again.
display(revs.loc[3422:3426])                        # neighbourhood before the insert

lst_row3 = [
    'Name3', np.nan, 'Reg3', 'Win3', np.nan, 303, np.nan, 2023, '@Reg3',
]
# Assigning to a label that does not exist yet adds the row; the fractional
# label turns the whole index into float64.
revs.loc[3423.5] = lst_row3
print('revs.index.dtype now:', revs.index.dtype)

# Sorting places 3423.5 between its neighbours; renumbering then restores a
# plain integer index, so the new row ends up at label 3424.
revs = revs.sort_index()
revs = revs.reset_index(drop=True)
display(revs.loc[3422:3426])                        # neighbourhood after the insert
revs.index.dtype


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
3422,Les Saint-Georges Nuits-Saint-Georges 1er Cru ...,France,Nuits-Saint-Georges Premier Cru,Domaine Robert Chevillon,4.1,28,113.39,2015,@Nuits-Saint-Georges Premier Cru
3423,Castel Firmian Riserva Teroldego Rotaliano 2015,Italy,Teroldego Rotaliano,Mezzacorona,3.7,28,9.37,2015,@Teroldego Rotaliano
3424,Barolo 2016,Italy,Barolo,Mauro Molino,3.8,28,36.5,2016,@Barolo
3425,Primitivo (Since 1913) 2017,Italy,Primitivo di Manduria,Torrevento,3.9,28,14.35,2017,@Primitivo di Manduria
3426,Blauer Zweigelt Klassik 2016,Austria,Wagram,Leth,3.3,28,7.31,2016,@Wagram


revs.index.dtype now: float64


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
3422,Les Saint-Georges Nuits-Saint-Georges 1er Cru ...,France,Nuits-Saint-Georges Premier Cru,Domaine Robert Chevillon,4.1,28,113.39,2015,@Nuits-Saint-Georges Premier Cru
3423,Castel Firmian Riserva Teroldego Rotaliano 2015,Italy,Teroldego Rotaliano,Mezzacorona,3.7,28,9.37,2015,@Teroldego Rotaliano
3424,Name3,,Reg3,Win3,,303,,2023,@Reg3
3425,Barolo 2016,Italy,Barolo,Mauro Molino,3.8,28,36.5,2016,@Barolo
3426,Primitivo (Since 1913) 2017,Italy,Primitivo di Manduria,Torrevento,3.9,28,14.35,2017,@Primitivo di Manduria


dtype('int64')

In [16]:
# Three rows now have a missing Country.
revs.loc[pd.isnull(revs['Country'])]

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Name2,,Reg2,Win2,2.0,202,20.0,2022,@Reg2
3424,Name3,,Reg3,Win3,,303,,2023,@Reg3
8668,Name1,,Reg1,Win1,1.0,101,10.1,2021,@Reg1


In [17]:
# Only one row has a missing Rating.
revs.loc[pd.isnull(revs['Rating'])]

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
3424,Name3,,Reg3,Win3,,303,,2023,@Reg3


In [18]:
revs.info()
# Missing values show up as a non-null count below the number of entries:
# Country has 8666 non-null out of the 8669 entries reported by 'RangeIndex:'
# (Rating and Price have 8668 each).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8669 entries, 0 to 8668
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8669 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8669 non-null   object 
 3   Winery           8669 non-null   object 
 4   Rating           8668 non-null   float64
 5   NumberOfRatings  8669 non-null   int64  
 6   Price            8668 non-null   float64
 7   Year             8669 non-null   object 
 8   twitter_region   8669 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 609.7+ KB


In [19]:
# Null check for one whole row (one boolean per column), then for a single cell
print(pd.isnull(revs.loc[3424]))
print("\n --> isnull? .loc[3424, 'Country']:", pd.isnull(revs.at[3424, 'Country']))

Name               False
Country             True
Region             False
Winery             False
Rating              True
NumberOfRatings    False
Price               True
Year               False
twitter_region     False
Name: 3424, dtype: bool

 --> isnull? .loc[3424, 'Country']: True


In [20]:
# Inverse of the previous cell: notnull() for the single cell, then for the whole row
print("--> notnull? .loc[3424, 'Country']:", pd.notnull(revs.at[3424, 'Country']))
pd.notnull(revs.loc[3424])

--> notnull? .loc[3424, 'Country']: False


Name                True
Country            False
Region              True
Winery              True
Rating             False
NumberOfRatings     True
Price              False
Year                True
twitter_region      True
Name: 3424, dtype: bool

In [27]:
# Boolean mask for an entire column: one True/False per row, len(revs) == 8669 values
pd.isnull(revs['Country'])

0        True
1       False
2       False
3       False
4       False
        ...  
8664    False
8665    False
8666    False
8667    False
8668     True
Name: Country, Length: 8669, dtype: bool

### Continuing with missing values
Replacing missing values is a common operation. Pandas provides a really handy method for this problem: fillna(). fillna() provides a few different strategies for mitigating such data.

In [21]:
# fillna() strategy 1: replace every NaN in Price with the placeholder "Unknown".
# Mixing strings into the numeric column makes it object dtype.
revs['Price'] = revs['Price'].fillna('Unknown')
revs.loc[pd.isnull(revs['Country'])]

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Name2,,Reg2,Win2,2.0,202,20.0,2022,@Reg2
3424,Name3,,Reg3,Win3,,303,Unknown,2023,@Reg3
8668,Name1,,Reg1,Win1,1.0,101,10.1,2021,@Reg1


In [22]:
# Price now reports 8669 non-null values, but its dtype has become object
revs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8669 entries, 0 to 8668
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8669 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8669 non-null   object 
 3   Winery           8669 non-null   object 
 4   Rating           8668 non-null   float64
 5   NumberOfRatings  8669 non-null   int64  
 6   Price            8669 non-null   object 
 7   Year             8669 non-null   object 
 8   twitter_region   8669 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 609.7+ KB


- __Backfill strategy:__ fill each missing value with the first non-null value that appears sometime after the given record in the database. Its counterpart, forward fill, uses the last non-null value that appears before the record.

In [24]:
# Undo the previous fillna('Unknown') before trying the backfill / forward-fill
# strategies. replace() swaps the placeholder back to NaN; the explicit astype()
# guarantees Price is float64 again instead of relying on replace() silently
# downcasting the object column (that implicit downcast is version-dependent).
revs.Price = revs.Price.replace('Unknown', np.nan).astype('float64')
revs[pd.isnull(revs.Country)]


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Name2,,Reg2,Win2,2.0,202,20.0,2022,@Reg2
3424,Name3,,Reg3,Win3,,303,,2023,@Reg3
8668,Name1,,Reg1,Win1,1.0,101,10.1,2021,@Reg1


In [25]:
revs.info()
# Price is float64 again now that the 'Unknown' strings are gone.
# NOTE(review): observed with pandas 1.5.3; newer versions may keep object dtype
# after replace() unless the column is cast explicitly - confirm when upgrading.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8669 entries, 0 to 8668
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8669 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8669 non-null   object 
 3   Winery           8669 non-null   object 
 4   Rating           8668 non-null   float64
 5   NumberOfRatings  8669 non-null   int64  
 6   Price            8668 non-null   float64
 7   Year             8669 non-null   object 
 8   twitter_region   8669 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 609.7+ KB


In [26]:
## Fill strategies, continued: forward fill carries the last valid value downwards
# Series.ffill() replaces the deprecated fillna(method='ffill').
# revs.Price.bfill() would instead take the next valid value (36.50 for row 3424).
revs.Price = revs.Price.ffill()
display(revs[pd.isnull(revs.Country)])
revs.iloc[3422:3427]

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Name2,,Reg2,Win2,2.0,202,20.0,2022,@Reg2
3424,Name3,,Reg3,Win3,,303,9.37,2023,@Reg3
8668,Name1,,Reg1,Win1,1.0,101,10.1,2021,@Reg1


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
3422,Les Saint-Georges Nuits-Saint-Georges 1er Cru ...,France,Nuits-Saint-Georges Premier Cru,Domaine Robert Chevillon,4.1,28,113.39,2015,@Nuits-Saint-Georges Premier Cru
3423,Castel Firmian Riserva Teroldego Rotaliano 2015,Italy,Teroldego Rotaliano,Mezzacorona,3.7,28,9.37,2015,@Teroldego Rotaliano
3424,Name3,,Reg3,Win3,,303,9.37,2023,@Reg3
3425,Barolo 2016,Italy,Barolo,Mauro Molino,3.8,28,36.5,2016,@Barolo
3426,Primitivo (Since 1913) 2017,Italy,Primitivo di Manduria,Torrevento,3.9,28,14.35,2017,@Primitivo di Manduria


### Using `.replace()` to replace values (including NaN)
`df.col.replace(old_val, new_val)`

In [29]:
# Suppose a region changes its Twitter handle (@Tunuyán -> "@TyOK").
# Assign the result of replace() back to the column: calling replace(..., inplace=True)
# on a column pulled out of the frame is chained assignment, which warns in recent
# pandas and does not update the frame once Copy-on-Write is enabled.
revs['twitter_region'] = revs['twitter_region'].replace('@Tunuyán', '@TyOK')
revs.loc[revs.Region == 'Tunuyán']


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
2562,Numina Gran Corte 2016,Argentina,Tunuyán,Salentein,4.2,2202,19.4,2016,@TyOK
3647,Portillo Malbec 2019,Argentina,Tunuyán,Salentein,3.6,2905,7.5,2019,@TyOK
6168,Portillo Malbec 2016,Argentina,Tunuyán,Salentein,3.4,5267,7.99,2016,@TyOK
6867,Portillo Malbec 2018,Argentina,Tunuyán,Salentein,3.6,6209,7.89,2018,@TyOK
7483,Primus Malbec 2016,Argentina,Tunuyán,Salentein,4.3,726,42.0,2016,@TyOK
8580,Primus Malbec 2017,Argentina,Tunuyán,Salentein,4.4,97,45.95,2017,@TyOK
8667,Portillo Cabernet Sauvignon 2016,Argentina,Tunuyán,Salentein,3.4,999,7.88,2016,@TyOK


In [43]:
# Replace a NaN value, but only for one row ('Name2').
display(revs[pd.isnull(revs.Country)])

# Pitfall: revs.loc[mask].Country.replace(np.nan, 'Argentina', inplace=True) has no
# effect on revs, because revs.loc[mask] is a temporary copy. See
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Select rows and column in a single .loc call and assign the replaced values back.
is_name2 = revs.Name == 'Name2'
revs.loc[is_name2, 'Country'] = revs.loc[is_name2, 'Country'].replace(np.nan, 'Argentina')

display(revs[pd.isnull(revs.Country)])
revs.head(2)

# Tip: revs.loc[is_name2].replace('Name2', 'Replaced') would return a copy of the row
# with the 'Name2' cell replaced, again without modifying revs.

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
3424,Name3,,Reg3,Win3,,303,9.37,2023,@Reg3
8668,Name1,,Reg1,Win1,1.0,101,10.1,2021,@Reg1


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
3424,Name3,,Reg3,Win3,,303,9.37,2023,@Reg3
8668,Name1,,Reg1,Win1,1.0,101,10.1,2021,@Reg1


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Name2,Argentina,Reg2,Win2,2.0,202,20.0,2022,@Reg2
1,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011,@Pomerol


In [48]:
## replace() pays off for column-wide substitutions of one value;
# for a single cell, direct assignment through .loc is simpler:
revs.loc[revs['Name'] == 'Name2', 'Country'] = 'Uruguay'
revs.loc[revs['Country'] == 'Uruguay']

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,twitter_region
0,Name2,Uruguay,Reg2,Win2,2.0,202,20.0,2022,@Reg2
1166,Estate Tannat - Merlot (Varietales) 2016,Uruguay,San José,Bodega Garzón,3.5,142,14.29,2016,@San José
2131,Río de Los Pájaros Reserve Tannat 2016,Uruguay,Progreso,Pisano,3.9,190,13.09,2016,@Progreso
2767,Estate Cabernet Franc - Tannat (Varietales) 2016,Uruguay,Maldonado,Bodega Garzón,3.8,238,14.29,2016,@Maldonado
4372,Reserva Tannat 2018,Uruguay,Maldonado,Bodega Garzón,4.1,3435,15.9,2018,@Maldonado


## JM-future .dropna() ?