# Conditional Manipulation

## New Columns created

##### If condition - Set of Numbers

In [None]:
# General structure:
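# (template, not runnable as-is: substitute a real column name and a real condition such as `<= 4`)
# df.loc[df['column name'] <condition>, 'new column name'] = 'value if condition is met'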

In [139]:
import numpy as np
import pandas as pd
# Build a one-column frame holding the integers 1..10.
numbers = {'set_of_numbers': [1,2,3,4,5,6,7,8,9,10]}
# Only `pd` is imported, so the constructor has to be reached as pd.DataFrame
# (a bare `DataFrame` raises NameError on a fresh kernel).
df = pd.DataFrame(numbers, columns=['set_of_numbers'])

# Label every row with the text 'True' / 'False' (strings, not booleans).
# The first assignment creates the new column; the second fills the remaining rows.
df.loc[df['set_of_numbers'] <= 4, 'equal_or_lower_than_4?'] = 'True'
df.loc[df['set_of_numbers'] > 4, 'equal_or_lower_than_4?'] = 'False'
print(df)

   set_of_numbers equal_or_lower_than_4?
0               1                   True
1               2                   True
2               3                   True
3               4                   True
4               5                  False
5               6                  False
6               7                  False
7               8                  False
8               9                  False
9              10                  False


##### If condition – set of numbers and lambda

In [None]:
# General structure:
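# (template, not runnable as-is: substitute a real column name and a real condition such as `<= 4`)
# df['new colname'] = df['col name'].apply(lambda x: 'value if ok' if x <condition> else 'value if nok')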

In [140]:
# Same 1..10 frame as the .loc example above, rebuilt so this cell stands alone.
numbers = {'set_of_numbers': [1,2,3,4,5,6,7,8,9,10]}
# pd.DataFrame, not bare DataFrame: only `pd` is imported in this notebook.
df = pd.DataFrame(numbers, columns=['set_of_numbers'])

# Row-by-row labelling: the lambda returns the text 'True' for values <= 4, else 'False'.
df['equal_or_lower_than_4?'] = df['set_of_numbers'].apply(lambda x: 'True' if x <= 4 else 'False')
print(df)

   set_of_numbers equal_or_lower_than_4?
0               1                   True
1               2                   True
2               3                   True
3               4                   True
4               5                  False
5               6                  False
6               7                  False
7               8                  False
8               9                  False
9              10                  False


##### IF condition – for strings

In [None]:
# If we are dealing with strings

In [141]:
# Four first names; the goal is to flag the rows equal to 'Bill'.
names = {'First_name': ['Jon','Bill','Maria','Emma']}
# pd.DataFrame, not bare DataFrame: only `pd` is imported in this notebook.
df = pd.DataFrame(names, columns=['First_name'])

# String comparison works exactly like the numeric case: one .loc per outcome.
df.loc[df['First_name'] == 'Bill', 'name_match'] = 'Match'
df.loc[df['First_name'] != 'Bill', 'name_match'] = 'Mismatch'

print(df)

  First_name name_match
0        Jon   Mismatch
1       Bill      Match
2      Maria   Mismatch
3       Emma   Mismatch


##### IF condition – strings and lambda 

In [142]:
# Same four names as the .loc example, rebuilt so this cell stands alone.
names = {'First_name': ['Jon','Bill','Maria','Emma']}
# pd.DataFrame, not bare DataFrame: only `pd` is imported in this notebook.
df = pd.DataFrame(names, columns=['First_name'])

# The lambda is evaluated once per name and returns the label for that row.
df['name_match'] = df['First_name'].apply(lambda x: 'Match' if x == 'Bill' else 'Mismatch')

print(df)

  First_name name_match
0        Jon   Mismatch
1       Bill      Match
2      Maria   Mismatch
3       Emma   Mismatch


##### IF condition with OR

In [143]:
names = {'First_name': ['Jon','Bill','Maria','Emma']}
# pd.DataFrame, not bare DataFrame: only `pd` is imported in this notebook.
df = pd.DataFrame(names, columns=['First_name'])

# `|` is element-wise OR and `&` is element-wise AND. Each comparison is wrapped in
# parentheses because `|` / `&` bind more tightly than `==` / `!=`.
df.loc[(df['First_name'] == 'Bill') | (df['First_name'] == 'Emma'), 'name_match'] = 'Match'
# The complement of "Bill OR Emma" is "not Bill AND not Emma".
df.loc[(df['First_name'] != 'Bill') & (df['First_name'] != 'Emma'), 'name_match'] = 'Mismatch'

print(df)

  First_name name_match
0        Jon   Mismatch
1       Bill      Match
2      Maria   Mismatch
3       Emma      Match


## In Existing Dataframe column

##### Selecting rows based on conditions

###### Boolean Variables

In [None]:
# Template cell: assumes `df` has 'nationality' and 'age' columns. The frames built in the
# cells above only hold First_name / name_match, so substitute your own data before running.

# Boolean Series: True on the rows where nationality equals "USA"
american = df['nationality'] == "USA"

# Boolean Series: True on the rows where age is greater than 50
elderly = df['age'] > 50

# Combine both masks with element-wise AND and keep only the rows where both are True
df[american & elderly]

###### Variable Attributes

In [None]:
# Template cell: assumes `df` has 'first_name' and 'nationality' columns (not built in this notebook).
# Select all rows where the first name is not missing and nationality is USA.
# The comparison is parenthesised because `&` binds more tightly than `==`.
df[df['first_name'].notnull() & (df['nationality'] == "USA")]

##### Changing the cell value directly with 'at'

In [212]:
# Small frame indexed by country with a float column (Population) and an int column (Economy).
df = pd.DataFrame([[1.4, 8], [1.2, 5], [0.3, 10]],
     index=['China', 'India', 'USA'],
     columns=['Population', 'Economy'])

# Avoid the chained form  df['Population']['India'] = 10  ([column][row]): it assigns through an
# intermediate object, which is chained indexing and is not a reliable way to write into df.
df.at['India', 'Population'] = 11   # preferred: .at[row_label, column_label] writes a single cell directly
df

Unnamed: 0,Population,Economy
China,1.4,8
India,11.0,5
USA,0.3,10


##### A simple way to modify a column values

In [150]:
numbers = {'set_of_numbers': [1,2,3,4,5,6,7,8,9,10,0,0]}
# pd.DataFrame, not bare DataFrame: only `pd` is imported in this notebook.
# The column is cast to object up front because text is about to be written into it;
# writing a str into an int64 column in place relies on a deprecated silent upcast.
df = pd.DataFrame(numbers, columns=['set_of_numbers']).astype(object)

# `df > 5` is a boolean frame of the same shape; every cell where it is True is overwritten.
df[df > 5] = 'True'
df

Unnamed: 0,set_of_numbers
0,1
1,2
2,3
3,4
4,5
5,True
6,True
7,True
8,True
9,True


##### Using 'where'

###### Syntax

In [None]:
df.where(cond, other=nan, inplace=False, axis=None,
         level=None, errors=’raise’, try_cast=False)

cond     bool Series/DataFrame, array-like, or callable
         Where cond is True, keep the original value. 
         Where False, replace with corresponding value from other. 
         If cond is callable, it is computed, should return boolean Series/DataFrame or array. 
         The callable must not change input Series/DataFrame (though pandas doesn’t check it).

other   scalar, Series/DataFrame, or callable
        Entries where cond is False are replaced with corresponding value from other. 
        If other is callable, it is computed on the Series/DataFrame and should return scalar or Series/DataFrame. The callable must not change input Series/DataFrame (though pandas doesn’t check it).

inplace  bool, default False
         Whether to perform the operation in place on the data.

axis     int, default None
level    int, default None
errors   str, {‘raise’, ‘ignore’}, default ‘raise’
try_cast bool, default False

###### Example 1

In [None]:
# `|` is the element-wise OR operator (`&` is the element-wise AND operator).
# Template cell: assumes `df` has numeric columns 'A' and 'B'.
# Where the condition (A < 0 OR B > 0) is True the sum A + B is kept;
# where it is False the value is replaced by A / B.
(df['A'] + df['B']).where((df['A'] < 0) | (df['B'] > 0), df['A'] / df['B'])

###### Example 2 

In [None]:
# Example using NumPy's where(condition, value_if_true, value_if_false):
# rows whose Currency equals '$' get Budget * 0.78125, all other rows keep Budget unchanged.
# NOTE(review): 0.78125 looks like a currency conversion rate - confirm and name the constant.
# Template cell: assumes `df` has 'Currency' and 'Budget' columns.
df['Normalized'] = np.where(df['Currency'] == '$', df['Budget'] * 0.78125, df['Budget'])

##### Using loc

In [145]:
numbers = {'set_of_numbers': [1,2,3,4,5,6,7,8,9,10,0,0]}
# pd.DataFrame, not bare DataFrame: only `pd` is imported in this notebook.
df = pd.DataFrame(numbers, columns=['set_of_numbers'])

# .loc[row_condition, column] = value overwrites the matching cells of an existing column.
df.loc[df['set_of_numbers'] == 0, 'set_of_numbers'] = 999
df.loc[df['set_of_numbers'] == 5, 'set_of_numbers'] = 555

print(df)

    set_of_numbers
0                1
1                2
2                3
3                4
4              555
5                6
6                7
7                8
8                9
9               10
10             999
11             999


In [162]:
# Seed the legacy NumPy global RNG so the random frame is the same on every run.
np.random.seed(101)
a = pd.DataFrame(np.random.randn(5,4),index=['A','B','C','D','E'],columns=['W','X','Y','Z'])

# 'Y' starts as float64. Cast it to object before writing text into it: assigning a str
# into a float column through .loc relies on a silent upcast that pandas has deprecated.
a['Y'] = a['Y'].astype(object)
# Overwrite 'Y' with the label on every row where 'W' is positive.
a.loc[(a['W'] > 0), 'Y'] = 'AAA'
a

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,AAA,0.503826
B,0.651118,-0.319318,AAA,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,AAA,0.955057
E,0.190794,1.978757,AAA,0.683509


In [146]:
# The two trailing NaN entries make the column float64.
numbers = {'set_of_numbers': [1,2,3,4,5,6,7,8,9,10,np.nan,np.nan]}
# pd.DataFrame, not bare DataFrame: only `pd` is imported in this notebook.
df = pd.DataFrame(numbers, columns=['set_of_numbers'])

# isnull() marks the missing rows; .loc then overwrites just those cells with 0.
df.loc[df['set_of_numbers'].isnull(), 'set_of_numbers'] = 0
print(df)

    set_of_numbers
0              1.0
1              2.0
2              3.0
3              4.0
4              5.0
5              6.0
6              7.0
7              8.0
8              9.0
9             10.0
10             0.0
11             0.0


##### Mask

The pandas `DataFrame.mask()` method returns an object of the same shape as the calling DataFrame (`self`).
- Corresponding entries are from self where cond is False and otherwise are from other object. 
- The other object could be a scalar, series, dataframe or could be a callable. 


The mask method is an application of the if-then idiom. 
- For each element in the calling DataFrame, 
   - if cond is False the element is used
   - otherwise the corresponding element from the DataFrame other is used.

###### Syntax

In [None]:
DataFrame.mask(cond, other=nan, inplace=False, axis=None, level=None, errors=’raise’, 
               try_cast=False, raise_on_error=None)

cond : Where cond is False, keep the original value. 
       Where True, replace with corresponding value from other. 
       If cond is callable, it is computed on the NDFrame and should return boolean NDFrame or array

other : Entries where cond is True are replaced with corresponding value from other. 
        If other is callable, it is computed on the NDFrame and should return scalar or NDFrame. 
        
inplace :  Whether to perform the operation in place on the data
    
axis :     alignment axis if needed, default None
level :    alignment level if needed, default None
    
errors :   str, {‘raise’, ‘ignore’}, default ‘raise’
           raise allow exceptions to be raised and ignore suppress exceptions. 
           On error return original object. 
        
try_cast : try to cast the result back to the input type (if possible),

###### Mask - Example 1

In [199]:
# Five rows of integers in four columns; several entries are above 10.
df = pd.DataFrame(
    dict(
        A=[12, 4, 5, 44, 1],
        B=[5, 2, 54, 3, 2],
        C=[20, 16, 7, 3, 8],
        D=[14, 3, 17, 2, 6],
    )
)
df

Unnamed: 0,A,B,C,D
0,12,5,20,14
1,4,2,16,3
2,5,54,7,17
3,44,3,3,2
4,1,2,8,6


In [200]:
# Every cell greater than 10 is replaced by -25; the result is a new frame and df is untouched.
df.mask(df.gt(10), other=-25)

Unnamed: 0,A,B,C,D
0,-25,5,-25,-25
1,4,2,-25,3
2,5,-25,7,-25
3,-25,3,3,2
4,1,2,8,6


###### Mask - Example 2

In [202]:
# Same layout as Example 1, but with one missing value (None) in each of A, B and D.
df = pd.DataFrame(
    dict(
        A=[12, 4, 5, None, 1],
        B=[7, 2, 54, 3, None],
        C=[20, 16, 11, 3, 8],
        D=[14, 3, None, 2, 6],
    )
)
df

Unnamed: 0,A,B,C,D
0,12.0,7.0,20,14.0
1,4.0,2.0,16,3.0
2,5.0,54.0,11,
3,,3.0,3,2.0
4,1.0,,8,6.0


In [204]:
# Every missing cell is replaced by 1000; the result is a new frame and df is untouched.
df.mask(df.isnull(), other=1000)

Unnamed: 0,A,B,C,D
0,12.0,7.0,20,14.0
1,4.0,2.0,16,3.0
2,5.0,54.0,11,1000.0
3,1000.0,3.0,3,2.0
4,1.0,1000.0,8,6.0


###### Mask - Example 3

In [206]:
numbers = {'set_of_numbers': [1,2,3,4,5,6,7,8,9,10,0,0]}
df = pd.DataFrame(numbers,columns=['set_of_numbers'])

# Assign the masked Series back to the column. Calling .mask(..., inplace=True) on
# df['set_of_numbers'] is a chained in-place call: it acts on the selected Series and,
# under Copy-on-Write, never reaches df.
df['set_of_numbers'] = df['set_of_numbers'].mask(df['set_of_numbers'] == 0, 99999)
print(df)

    set_of_numbers
0                1
1                2
2                3
3                4
4                5
5                6
6                7
7                8
8                9
9               10
10           99999
11           99999


###### Mask - Example 4 - Using where

In [211]:
# Small integer frame with a few zeros scattered through it.
sl = pd.DataFrame(dict(a=[1, 0, 3], b=[0, 8, 9], c=[2, 0, 4]))
display(sl)

# where() keeps the cells for which the condition is True (non-zero here)
# and substitutes `other` everywhere else.
m = sl.ne(0)
sl = sl.where(m, other='blah')
sl

Unnamed: 0,a,b,c
0,1,0,2
1,0,8,0
2,3,9,4


Unnamed: 0,a,b,c
0,1,blah,2
1,blah,8,blah
2,3,9,4


###### Mask - Example 5 - Apply a Mask on selected columns only

In [215]:
# Seed the legacy NumPy global RNG so the random frame is the same on every run.
np.random.seed(101)
a = pd.DataFrame(np.random.randn(5,4),index=['A','B','C','D','E'],columns=['W','X','Y','Z'])

# Mask only the selected columns and assign the result back to them. The previous form,
# a.mask(a[['W','Y']] < 0, 'False', inplace=True), passed a condition that lacks columns
# X and Z and wrote text into float columns in place; restricting the operation to the
# selected columns keeps X and Z untouched without depending on either behaviour.
cols = ['W', 'Y']
a[cols] = a[cols].mask(a[cols] < 0, 'False')
a


Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,False,0.605965
C,False,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,False,0.955057
E,0.190794,1.978757,2.60597,0.683509


##### Applications of loc for data replacement

In [209]:
# Country-indexed frame: Population is a float column, Economy an integer column.
df = pd.DataFrame(
    {'Population': [1.4, 1.2, 0.3], 'Economy': [8, 5, 10]},
    index=['China', 'India', 'USA'],
)
df

Unnamed: 0,Population,Economy
China,1.4,8
India,1.2,5
USA,0.3,10


In [153]:
# Overwrite every column of the row labelled 'India' with the same scalar.
df.loc['India', :] = 10
df

Unnamed: 0,Population,Economy
China,1.4,8
India,10.0,10
USA,0.3,10


In [154]:
# Set one value for an entire column: `:` selects every row, 'Economy' selects the column.
df.loc[:, 'Economy'] = 30
df

Unnamed: 0,Population,Economy
China,1.4,30
India,10.0,30
USA,0.3,30


In [210]:
# Zero out every column of the rows whose Population is below 1.
df.loc[df['Population'].lt(1), :] = 0
df

Unnamed: 0,Population,Economy
China,1.4,8
India,1.2,5
USA,0.0,0


## Filter based on conditions

### Filter based on Values

##### Simple way

In [None]:
# VERY IMPORTANT CONCEPT HERE
# Template cell: assumes `df` has columns 'Col1' and 'Col2'.
# Get the values of 'Col2' on the rows where 'Col1' is greater than zero:
#   df['Col1'] > 0   is the row condition
#   ['Col2']         is the column we want to display
df[df['Col1'] > 0]['Col2']

# Note: the single-step form df.loc[df['Col1'] > 0, 'Col2'] returns the same values and is
# the better choice, especially when assigning, because it avoids chained indexing.

In [None]:
# To understand the syntax (template cell: assumes `df` has a column 'Col1'):
df['Col1'] > 0          # the output is a Series of booleans, one per row
df[df['Col1'] > 0]      # the output is a DataFrame holding only the rows where the boolean is True


##### Using loc

In [None]:
             df.loc[df['Col1']>0]['Col2']     
                 # condition      # Col2 is the column we want to display

##### Multiple Conditions

In [None]:
# Template cell: assumes `df` has columns 'Col1' and 'Col2'.
# Every comparison must be wrapped in parentheses: `|` and `&` bind more tightly than
# `>` and `==`, so  df['Col1'] > 200 | df['Col2'] > 500  would be parsed as
# df['Col1'] > (200 | df['Col2']) > 500  and fail.
df[(df['Col1'] > 200) | (df['Col2'] > 500)]
df[(df['Col1'] == 'AAA') | (df['Col2'] == 'BBB')]

df[df['Col1'].isin(['AAA','BBB'])]      # membership test: rows whose Col1 is any of the listed values
df[~df['Col1'].isin(['AAA','BBB'])]     # ~ is the negation operator: rows whose Col1 is none of them

### Filter based on Frequency

##### Filter by largest categories (meaning the most frequent categories in a given column)

In [None]:
# Template cell: assumes `df` has a categorical column 'col1' and that `n` (how many
# categories to keep) has been set beforehand.

# Start with the number of rows per category:
counts = df['col1'].value_counts()

counts.nlargest(n)               # the n most frequent categories together with their counts
counts.nlargest(n).index         # only the labels of those categories

# The formula is:
df[df['col1'].isin(counts.nlargest(n).index)]    # only rows from the n largest categories are kept

### Filter based on data type

In [None]:
# Keep only the columns whose dtype is numeric or object.
df.select_dtypes(include=['number','object'])
# Drop the numeric and object columns and keep every other dtype.
df.select_dtypes(exclude=['number','object'])

# MultiIndex

In [167]:
# Index levels - a basic example.
# `outside` is the first (group) level and `inside` the second; pairing them element by
# element yields one (group, number) tuple per row of the index.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [168]:
# Build a 6x2 frame of random normal draws whose rows are labelled by the two-level hier_index.
# NOTE(review): no seed is set in this cell, so the values depend on the RNG state left by
# earlier cells - confirm whether a fixed seed is wanted here.
a = pd.DataFrame(np.random.randn(6,2),index = hier_index,columns=['A','B'])
a

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


# Missing Values

## Finding Missing Values

In [None]:
df.isnull()             # returns a same-shaped frame of booleans: True where the value is missing (NaN), False otherwise
df.isna().sum()         # sums that boolean mask column by column, i.e. counts the missing values in each column

## Drop Values

##### dropna syntax


In [None]:
DataFrame.dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False)

Parameters
- axis{0 or ‘index’, 1 or ‘columns’}, default 0 (rows)
    - Pass tuple or list to drop on multiple axes. 
- how{‘any’, ‘all’}, default ‘any’
    - Determine if row or column is removed from DataFrame, when we have at least one NA or all NA.
    - ‘any’ : If any NA values are present, drop that row or column.
    - ‘all’ : If all values are NA, drop that row or column.
- thresh int, optional
    - Require that many non-NA values.
- subset array-like, optional
    - Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include.
- inplace bool, default False
    - If True, do operation inplace and return None.


##### Drop All Rows with any Null/NaN/NaT Values (this is actually the default behaviour)

In [184]:
# Sample data mixing the three missing-value markers: None, np.nan and pd.NaT.
d1 = dict(
    Name=['Pankaj', 'Meghna', 'David', 'Lisa'],
    ID=[1, 2, 3, 4],
    Salary=[100, 200, np.nan, pd.NaT],
    Role=['CEO', None, pd.NaT, pd.NaT],
)
df = pd.DataFrame(d1)
print(df)
print('\n')

# With no arguments dropna() removes every row that has at least one missing value.
df1 = df.dropna()
print(df1)

     Name  ID Salary  Role
0  Pankaj   1    100   CEO
1  Meghna   2    200  None
2   David   3    NaN   NaT
3    Lisa   4    NaT   NaT


     Name  ID Salary Role
0  Pankaj   1    100  CEO


##### Drop All Columns With Any Missing Value

In [None]:
# Column-wise variant: every column holding at least one missing value is removed.
df1 = df.dropna(axis='columns')

##### Drop Row / Column Only If All Values Are Null

In [177]:
# 'Role' is missing on every row, and the last row is missing in every column.
d1 = dict(
    Name=['Pankaj', 'Meghna', 'David', np.nan],
    ID=[1, 2, 3, np.nan],
    Salary=[100, 200, np.nan, np.nan],
    Role=[np.nan] * 4,
)

df = pd.DataFrame(d1)
print(df)
print('\n')

# how='all' removes a column only when all of its values are missing.
df1 = df.dropna(axis='columns', how='all')
print(df1)
print('\n')

# The same rule applied to rows: a row is removed only when all of its values are missing.
df1 = df.dropna(axis='index', how='all')
print(df1)

     Name   ID  Salary  Role
0  Pankaj  1.0   100.0   NaN
1  Meghna  2.0   200.0   NaN
2   David  3.0     NaN   NaN
3     NaN  NaN     NaN   NaN


     Name   ID  Salary
0  Pankaj  1.0   100.0
1  Meghna  2.0   200.0
2   David  3.0     NaN
3     NaN  NaN     NaN


     Name   ID  Salary  Role
0  Pankaj  1.0   100.0   NaN
1  Meghna  2.0   200.0   NaN
2   David  3.0     NaN   NaN


##### Define Labels to look for null values

In [179]:
# Only the last row has a missing ID; other columns have missing values on more rows.
d1 = dict(
    Name=['Pankaj', 'Meghna', 'David', 'Lisa'],
    ID=[1, 2, 3, pd.NaT],
    Salary=[100, 200, np.nan, pd.NaT],
    Role=['CEO', np.nan, pd.NaT, pd.NaT],
)

df = pd.DataFrame(d1)
print(df)
print('\n')

# subset restricts the missing-value check to the listed columns,
# so gaps in Salary or Role do not cause a row to be dropped.
df1 = df.dropna(subset=['ID'])
print(df1)

     Name   ID Salary Role
0  Pankaj    1    100  CEO
1  Meghna    2    200  NaN
2   David    3    NaN  NaT
3    Lisa  NaT    NaT  NaT


     Name ID Salary Role
0  Pankaj  1    100  CEO
1  Meghna  2    200  NaN
2   David  3    NaN  NaT


##### Dropping Rows with NA inplace

We can pass inplace=True to change the source DataFrame itself. 

It’s useful when the DataFrame size is huge and we want to save some memory.

In [182]:
# Same sample data as the subset example: only the first row is complete.
d1 = dict(
    Name=['Pankaj', 'Meghna', 'David', 'Lisa'],
    ID=[1, 2, 3, pd.NaT],
    Salary=[100, 200, np.nan, pd.NaT],
    Role=['CEO', np.nan, pd.NaT, pd.NaT],
)
df = pd.DataFrame(d1)
print(df)
print('\n')

# inplace=True modifies df itself and returns None, so there is nothing to assign.
df.dropna(inplace=True)
print(df)

     Name   ID Salary Role
0  Pankaj    1    100  CEO
1  Meghna    2    200  NaN
2   David    3    NaN  NaT
3    Lisa  NaT    NaT  NaT


     Name ID Salary Role
0  Pankaj  1    100  CEO


## Fill Values

##### Replacing NaN Values


In [192]:
# The two trailing NaN entries make the column float64.
numbers = {'set_of_numbers': [1,2,3,4,5,6,7,8,9,10,np.nan,np.nan]}
# pd.DataFrame, not bare DataFrame: only `pd` is imported in this notebook.
df = pd.DataFrame(numbers, columns=['set_of_numbers'])

# isnull() marks the missing rows; .loc then overwrites just those cells with 0.
df.loc[df['set_of_numbers'].isnull(), 'set_of_numbers'] = 0
print(df)

    set_of_numbers
0              1.0
1              2.0
2              3.0
3              4.0
4              5.0
5              6.0
6              7.0
7              8.0
8              9.0
9             10.0
10             0.0
11             0.0


##### Using Mask

In [None]:
numbers = {'set_of_numbers': [1,2,3,4,5,6,7,8,9,10,0,0]}
df = pd.DataFrame(numbers,columns=['set_of_numbers'])

# Assign the masked Series back to the column. Calling .mask(..., inplace=True) on
# df['set_of_numbers'] is a chained in-place call: it acts on the selected Series and,
# under Copy-on-Write, never reaches df.
df['set_of_numbers'] = df['set_of_numbers'].mask(df['set_of_numbers'] == 0, 99999)
print(df)

In [193]:
# Small integer frame with a few zeros scattered through it.
sl = pd.DataFrame(dict(a=[1, 0, 3], b=[0, 8, 9], c=[2, 0, 4]))
display(sl)

# ~ negates the boolean frame: m is True on every non-zero cell.
m = ~sl.eq(0)
# where() keeps the cells for which m is True and substitutes `other` everywhere else.
sl = sl.where(m, other='blah')
sl

Unnamed: 0,a,b,c
0,1,0,2
1,0,8,0
2,3,9,4


Unnamed: 0,a,b,c
0,1,blah,2
1,blah,8,blah
2,3,9,4


##### Using Replace  ** Preferred **

# Duplicates

In [None]:

# Template cell: assumes `df` has the columns named below.
df['col_name'].duplicated()             # returns a boolean Series: True where the value repeats an earlier one
df['col_name'].duplicated().sum()       # returns the number of duplicate values
df.duplicated(subset=['col1','col2'])   # flags duplicates based on the 2 listed columns only
df.duplicated()                         # returns True/False per row: is the entire row a duplicate

# display
# `keep` takes the boolean False or the strings 'first' / 'last' (bare first/last/false are undefined names).
df.loc[df.duplicated(keep=False),:]     # shows all the duplicated rows, every occurrence included
df.loc[df.duplicated(keep='first'),:]   # shows the duplicated rows except the first occurrence (the original)
df.loc[df.duplicated(keep='last'),:]    # shows the duplicated rows except the last occurrence

df.drop_duplicates(keep='first')        # drops the duplicates, keeping the first occurrence
