##### <b> Modifying Columns </b></br> 

In [1]:
import pandas as pd
import numpy as np

##### <b> Renaming Modifying Columns </b></br> Can rename columns in places via assignment by using `.columns = ['list of column name(s)]` </br> Columns names must be listed in correct order for assignment </br> can programmatically change column case using `.columns = [col.upper() for col in pd.columns]` using list comprehension </br> .rename() method </br> Using a dictionary to map new names to old names </br>`pd.rename(columns={'old_name':'new_name'})`

In [2]:
# create DataFrame using dictionary {key:values}
products_df = pd.DataFrame(
    {'product': ['Dairy', 'Dairy', 'Dairy', 'Vegetable', 'Fruits'],
    'price': [2.56, 2.56, 4.55, 2.74, 5.44]        
    }
)
products_df

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


In [3]:
# assign product_name and cost to columns
products_df.columns = ['product_name', 'cost']
products_df

Unnamed: 0,product_name,cost
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


In [4]:
# change column labels to be uppercase using list comprehension
products_df.columns = [col.upper() for col in products_df.columns]
products_df

Unnamed: 0,PRODUCT_NAME,COST
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


In [5]:
# recreate DataFrame using dictionary {key:values}
products_df = pd.DataFrame(
    {'product': ['Dairy', 'Dairy', 'Dairy', 'Vegetable', 'Fruits'],
    'price': [2.56, 2.56, 4.55, 2.74, 5.44]        
    }
)
products_df

Unnamed: 0,product,price
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


In [6]:
# use .rename() method to change label names, creates new DataFrame to be assigned
products_df.rename(columns= {'product':'product_name', 'price':'cost'})

Unnamed: 0,product_name,cost
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


In [7]:
# can use lambda function to clean/standardize column names, creates new DataFrame
products_df.rename(columns=lambda x: x.upper())


Unnamed: 0,PRODUCT,PRICE
0,Dairy,2.56
1,Dairy,2.56
2,Dairy,4.55
3,Vegetable,2.74
4,Fruits,5.44


##### **Reorder Columns**</br> .reindex() method: use this when sorting won't suffice </br>
`pd.reindex(labels=[list of columns is specified order], axis=1)`

In [8]:
products_df = pd.DataFrame(
    {'product': ['Dairy', 'Dairy', 'Dairy', 'Vegetable', 'Fruits'],
    'price': [2.56, 2.56, 4.55, 2.74, 5.44],
    'product_id': [1, 2, 3, 4, 5]     
    }
)
products_df

Unnamed: 0,product,price,product_id
0,Dairy,2.56,1
1,Dairy,2.56,2
2,Dairy,4.55,3
3,Vegetable,2.74,4
4,Fruits,5.44,5


In [9]:
# reindex the columns for as required, returns a new DataFrame
products_df.reindex(labels = ['product_id', 'product', 'price'], axis=1)

Unnamed: 0,product_id,product,price
0,1,Dairy,2.56
1,2,Dairy,2.56
2,3,Dairy,4.55
3,4,Vegetable,2.74
4,5,Fruits,5.44


##### **Arithmetic Column Creation** </b></br> Can create columns with arithmetic by assigning them Series operations </br> Specify new column name and assign operation required

In [10]:
path_retail = 'Pandas Course Resources/retail/retail_2016_2017.csv'
retail_df = pd.read_csv(path_retail)

retail_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,1945944,2016-01-01,1,AUTOMOTIVE,0.0,0
1,1945945,2016-01-01,1,BABY CARE,0.0,0
2,1945946,2016-01-01,1,BEAUTY,0.0,0
3,1945947,2016-01-01,1,BEVERAGES,0.0,0
4,1945948,2016-01-01,1,BOOKS,0.0,0


In [11]:
# filter retail_df for 'BABY CARE' and BOOKS
mask = (
    retail_df['family'].isin(['BABY CARE', 'BOOKS']) &
    (retail_df['sales'] > 0)
)
baby_books = retail_df.loc[mask]
baby_books = baby_books.sample(5, random_state=2022)
baby_books

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
1021351,2967295,2017-07-28,17,BABY CARE,1.0,0
587170,2533114,2016-11-25,34,BABY CARE,2.0,0
592618,2538562,2016-11-28,37,BOOKS,2.0,0
824377,2770321,2017-04-08,4,BOOKS,2.0,0
397321,2343265,2016-08-10,8,BABY CARE,1.0,0


In [12]:
# Create tax_amount column = sales columns * 0.05 (use .loc[:, new_column])
baby_books.loc[:,'tax_amount'] = baby_books['sales'] * 0.05
baby_books.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount
1021351,2967295,2017-07-28,17,BABY CARE,1.0,0,0.05
587170,2533114,2016-11-25,34,BABY CARE,2.0,0,0.1
592618,2538562,2016-11-28,37,BOOKS,2.0,0,0.1
824377,2770321,2017-04-08,4,BOOKS,2.0,0,0.1
397321,2343265,2016-08-10,8,BABY CARE,1.0,0,0.05


In [13]:
# Create total_amount column = tax_amount + sales columns (use .loc[:, new_column])
baby_books.loc[:,'total_amount'] = baby_books['tax_amount'] + baby_books['sales']
baby_books.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount,total_amount
1021351,2967295,2017-07-28,17,BABY CARE,1.0,0,0.05,1.05
587170,2533114,2016-11-25,34,BABY CARE,2.0,0,0.1,2.1
592618,2538562,2016-11-28,37,BOOKS,2.0,0,0.1,2.1
824377,2770321,2017-04-08,4,BOOKS,2.0,0,0.1,2.1
397321,2343265,2016-08-10,8,BABY CARE,1.0,0,0.05,1.05


##### **Boolean Column Creation** </b></br> Can create columns using logical test

In [14]:
# create new columns taxable_category using logical test
baby_books.loc[:, 'taxable_category'] = baby_books['family'] != 'BABY CARE'
baby_books

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount,total_amount,taxable_category
1021351,2967295,2017-07-28,17,BABY CARE,1.0,0,0.05,1.05,False
587170,2533114,2016-11-25,34,BABY CARE,2.0,0,0.1,2.1,False
592618,2538562,2016-11-28,37,BOOKS,2.0,0,0.1,2.1,True
824377,2770321,2017-04-08,4,BOOKS,2.0,0,0.1,2.1,True
397321,2343265,2016-08-10,8,BABY CARE,1.0,0,0.05,1.05,False


##### **Column Creation based on Boolean Arithmetic** </b></br> Can create columns using logical test that will preform arithmetic is true

In [15]:
# if baby_books['family'] != 'BABY CARE' is True then multiply by 1, if false multiple by 0 so the value for the column will be zero because it failed the logical test
baby_books.loc[:,'tax_amount_bool'] = baby_books['sales'] * 0.05 * (baby_books['family'] != 'BABY CARE')
baby_books

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount,total_amount,taxable_category,tax_amount_bool
1021351,2967295,2017-07-28,17,BABY CARE,1.0,0,0.05,1.05,False,0.0
587170,2533114,2016-11-25,34,BABY CARE,2.0,0,0.1,2.1,False,0.0
592618,2538562,2016-11-28,37,BOOKS,2.0,0,0.1,2.1,True,0.1
824377,2770321,2017-04-08,4,BOOKS,2.0,0,0.1,2.1,True,0.1
397321,2343265,2016-08-10,8,BABY CARE,1.0,0,0.05,1.05,False,0.0


In [16]:
# Create integer Date Columns based on the 'date' column
# change datatype of 'date' to datetime64 so it can be parsed
baby_books['date'] = baby_books['date'].astype('datetime64[ns]')

In [17]:
# parse the 'month' from the date 'column'
baby_books["month"] = baby_books["date"].dt.month
# parse the 'day' from the date 'column'
baby_books["day_of_week"] = baby_books["date"].dt.day

baby_books

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount,total_amount,taxable_category,tax_amount_bool,month,day_of_week
1021351,2967295,2017-07-28,17,BABY CARE,1.0,0,0.05,1.05,False,0.0,7,28
587170,2533114,2016-11-25,34,BABY CARE,2.0,0,0.1,2.1,False,0.0,11,25
592618,2538562,2016-11-28,37,BOOKS,2.0,0,0.1,2.1,True,0.1,11,28
824377,2770321,2017-04-08,4,BOOKS,2.0,0,0.1,2.1,True,0.1,4,8
397321,2343265,2016-08-10,8,BABY CARE,1.0,0,0.05,1.05,False,0.0,8,10


##### **Advanced column Creation with NumPy Select() method** </b></br> Can create columns based on multiple conditions </br> More flexible than np.where() or pd.where() methods </br> the output can be categories or calculations/values

In [18]:
# use np.select() to select from conditions and choices
conditions = [
    # condition 1 links with choices list 1
    (baby_books['date'] == '2017-07-28') & (baby_books['family'] == 'BABY CARE'),
    # condition 2 links with choices list 2
    (baby_books['date'] == '2016-11-28') & (baby_books['family'] == 'BOOKS'),
    # condition 3 links with choices list 3
    (baby_books['date'] == '2016-11-25') & (baby_books['store_nbr'] > 28),    
]

choices = ['Winter Clearance', 'Christmas Eve', 'New Store Special']

# Creation of new column using np.select from conditions and choices and if no match default value is output
baby_books['Sales_Name'] = np.select(conditions, choices, default='No Sale')

baby_books

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount,total_amount,taxable_category,tax_amount_bool,month,day_of_week,Sales_Name
1021351,2967295,2017-07-28,17,BABY CARE,1.0,0,0.05,1.05,False,0.0,7,28,Winter Clearance
587170,2533114,2016-11-25,34,BABY CARE,2.0,0,0.1,2.1,False,0.0,11,25,New Store Special
592618,2538562,2016-11-28,37,BOOKS,2.0,0,0.1,2.1,True,0.1,11,28,Christmas Eve
824377,2770321,2017-04-08,4,BOOKS,2.0,0,0.1,2.1,True,0.1,4,8,No Sale
397321,2343265,2016-08-10,8,BABY CARE,1.0,0,0.05,1.05,False,0.0,8,10,No Sale


In [19]:
# counts the values in the new categorical column
baby_books['Sales_Name'].value_counts()

Sales_Name
No Sale              2
Winter Clearance     1
New Store Special    1
Christmas Eve        1
Name: count, dtype: int64

##### **Mapping Values to Columns** </b></br> Can map values to columns or an entire DataFame </br> can pass a dictionary with existing values as the key and new values as values `{old_value:new_value}` </br> - may not use.... but better than multiple conditions if using np.select() method </br> can also use `lambda function` to map formatting to values </br> **`NOTE`** If re-categorizing smaller catergories to major categories, if the category value is not included in the dictionary it will be `NaN`. However map may be better to use for this instead of select due to less logical conditions needing to be created

In [20]:
products_df

Unnamed: 0,product,price,product_id
0,Dairy,2.56,1
1,Dairy,2.56,2
2,Dairy,4.55,3
3,Vegetable,2.74,4
4,Fruits,5.44,5


In [21]:
# mapping values from existing values
#create mapped values dictionary
mapping_dict = {'Dairy':'Non-Vegan', 'Vegetable':'Vegan', 'Fruits':'Vegan'}

# create new column and use .map(dictionary_variable) on column of interest to map values to new column. Creates new DataFrame
products_df['Vegan?'] = products_df['product'].map(mapping_dict)
products_df

Unnamed: 0,product,price,product_id,Vegan?
0,Dairy,2.56,1,Non-Vegan
1,Dairy,2.56,2,Non-Vegan
2,Dairy,4.55,3,Non-Vegan
3,Vegetable,2.74,4,Vegan
4,Fruits,5.44,5,Vegan


In [22]:
# use lambda function to assign currency formatting to price values, Creates new DataFrame
products_df['price'] = products_df['price'].map(lambda x: f'${x}')

products_df

Unnamed: 0,product,price,product_id,Vegan?
0,Dairy,$2.56,1,Non-Vegan
1,Dairy,$2.56,2,Non-Vegan
2,Dairy,$4.55,3,Non-Vegan
3,Vegetable,$2.74,4,Vegan
4,Fruits,$5.44,5,Vegan


In [25]:
retail_df['family'].value_counts()

family
AUTOMOTIVE                    31968
HOME APPLIANCES               31968
SCHOOL AND OFFICE SUPPLIES    31968
PRODUCE                       31968
PREPARED FOODS                31968
POULTRY                       31968
PLAYERS AND ELECTRONICS       31968
PET SUPPLIES                  31968
PERSONAL CARE                 31968
MEATS                         31968
MAGAZINES                     31968
LIQUOR,WINE,BEER              31968
LINGERIE                      31968
LAWN AND GARDEN               31968
LADIESWEAR                    31968
HOME CARE                     31968
HOME AND KITCHEN II           31968
BABY CARE                     31968
HOME AND KITCHEN I            31968
HARDWARE                      31968
GROCERY II                    31968
GROCERY I                     31968
FROZEN FOODS                  31968
EGGS                          31968
DELI                          31968
DAIRY                         31968
CLEANING                      31968
CELEBRATION          

In [29]:
# for analytics purposes, re-categorize subject families into major family's but anything not included in dictionary will become NaN in new column
family_category = {
    'PRODUCE':'Grocery',
    'PREPARED FOODS':'Grocery',
    'POULTRY':'Grocery',
    'MEATS':'Grocery',
    'GROCERY II':'Grocery',
    'GROCERY I':'Grocery',
    'FROZEN FOODS':'Grocery',
    'EGGS':'Grocery',
    'DELI':'Grocery',
    'DAIRY':'Grocery',
    'BREAD/BAKERY':'Grocery',
    'BEVERAGES':'Grocery',
    'SEAFOOD':'Grocery'
}

retail_df['product'] = retail_df['family'].map(family_category)
retail_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,product
0,1945944,2016-01-01,1,AUTOMOTIVE,0.000,0,
1,1945945,2016-01-01,1,BABY CARE,0.000,0,
2,1945946,2016-01-01,1,BEAUTY,0.000,0,
3,1945947,2016-01-01,1,BEVERAGES,0.000,0,Grocery
4,1945948,2016-01-01,1,BOOKS,0.000,0,
...,...,...,...,...,...,...,...
1054939,3000883,2017-08-15,9,POULTRY,438.133,0,Grocery
1054940,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,Grocery
1054941,3000885,2017-08-15,9,PRODUCE,2419.729,148,Grocery
1054942,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,


##### **Column Creation with Assign** </br> `.assign()` Method creates multiple columns at once and returns a DataFrame that can be assigned</br> Can be Chained together with of data processing methods </br> new column does not need to be in quotations </br> boolean, arithmetic, map can be used within assign() methods </br> can also create columns based on columns created in the same .assign() method but `MUST USE LAMBDA FUNCTION TO DO ARITHMETIC/BOOLEAN Logic` </br> can be chained with `.query()` at end to filter newly assigned columns

In [40]:
# Create random sample from retail_df
sample_df = retail_df.sample(10, random_state=2022)
sample_df = sample_df.drop(columns='product')
sample_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
86825,2032769,2016-02-18,45,BEAUTY,7.0,0
487764,2433708,2016-09-30,44,MEATS,1718.515,54
358165,2304109,2016-07-19,9,HOME AND KITCHEN II,10.0,0
770952,2716896,2017-03-09,40,CELEBRATION,7.0,0
239109,2185053,2016-05-14,18,MEATS,116.755,0
550589,2496533,2016-11-04,8,HOME APPLIANCES,0.0,0
260289,2206233,2016-05-26,12,HOME CARE,122.0,3
393399,2339343,2016-08-08,47,CELEBRATION,23.0,0
311328,2257272,2016-06-23,44,CELEBRATION,30.0,0
869484,2815428,2017-05-03,6,AUTOMOTIVE,4.0,0


In [41]:
# using .assign() method new column does not need to be in quotes
sample_df.assign(tax_amount=sample_df['sales']*0.05).round(2)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount
86825,2032769,2016-02-18,45,BEAUTY,7.0,0,0.35
487764,2433708,2016-09-30,44,MEATS,1718.52,54,85.93
358165,2304109,2016-07-19,9,HOME AND KITCHEN II,10.0,0,0.5
770952,2716896,2017-03-09,40,CELEBRATION,7.0,0,0.35
239109,2185053,2016-05-14,18,MEATS,116.76,0,5.84
550589,2496533,2016-11-04,8,HOME APPLIANCES,0.0,0,0.0
260289,2206233,2016-05-26,12,HOME CARE,122.0,3,6.1
393399,2339343,2016-08-08,47,CELEBRATION,23.0,0,1.15
311328,2257272,2016-06-23,44,CELEBRATION,30.0,0,1.5
869484,2815428,2017-05-03,6,AUTOMOTIVE,4.0,0,0.2


In [52]:
# using .assign() method to create complex column creations 
sample_df.assign(
    tax_amount = (sample_df['sales']*0.05).round(2).map(lambda x: f'${x}'),
    on_promotion_flag = sample_df['onpromotion'] > 0,
    year = sample_df['date'].str[:4].astype('int'),
)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount,on_promotion_flag,year
86825,2032769,2016-02-18,45,BEAUTY,7.0,0,$0.35,False,2016
487764,2433708,2016-09-30,44,MEATS,1718.515,54,$85.93,True,2016
358165,2304109,2016-07-19,9,HOME AND KITCHEN II,10.0,0,$0.5,False,2016
770952,2716896,2017-03-09,40,CELEBRATION,7.0,0,$0.35,False,2017
239109,2185053,2016-05-14,18,MEATS,116.755,0,$5.84,False,2016
550589,2496533,2016-11-04,8,HOME APPLIANCES,0.0,0,$0.0,False,2016
260289,2206233,2016-05-26,12,HOME CARE,122.0,3,$6.1,True,2016
393399,2339343,2016-08-08,47,CELEBRATION,23.0,0,$1.15,False,2016
311328,2257272,2016-06-23,44,CELEBRATION,30.0,0,$1.5,False,2016
869484,2815428,2017-05-03,6,AUTOMOTIVE,4.0,0,$0.2,False,2017


In [53]:
# using .assign() method to create complex column creations chained with .query() method
sample_df.assign(
    tax_amount = (sample_df['sales']*0.05).round(2).map(lambda x: f'${x}'),
    on_promotion_flag = sample_df['onpromotion'] > 0,
    year = sample_df['date'].str[:4].astype('int'),
).query('on_promotion_flag == True')

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount,on_promotion_flag,year
487764,2433708,2016-09-30,44,MEATS,1718.515,54,$85.93,True,2016
260289,2206233,2016-05-26,12,HOME CARE,122.0,3,$6.1,True,2016


##### **If using columns in .assign() method to create columns in same .assign() method </br> `lambda function must be used, otherwise it will not work`**

In [57]:
# MUST USE LAMBDA FUNCTION TO APPLY ARITHMETIC AND BOOLEAN LOGIC IF CREATING COLUMNS BASED ON COLUMNS IN ASSIGN METHOD

# using .assign() method to create complex column creations chained with .query() method
sample_df.assign(
    tax_amount = (sample_df['sales']*0.05).round(2).map(lambda x: f'${x}'),
    on_promotion_flag = sample_df['onpromotion'] > 0,
    year = sample_df['date'].str[:4].astype('int'),
    onpromotion_ratio = sample_df['sales'] / sample_df['onpromotion'],
    # this has to use lambda function to work with columns within .assign method
    sales_onpromo_target = lambda x: x['onpromotion_ratio'] > 100
)


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,tax_amount,on_promotion_flag,year,onpromotion_ratio,sales_onpromo_target
86825,2032769,2016-02-18,45,BEAUTY,7.0,0,$0.35,False,2016,inf,True
487764,2433708,2016-09-30,44,MEATS,1718.515,54,$85.93,True,2016,31.824352,False
358165,2304109,2016-07-19,9,HOME AND KITCHEN II,10.0,0,$0.5,False,2016,inf,True
770952,2716896,2017-03-09,40,CELEBRATION,7.0,0,$0.35,False,2017,inf,True
239109,2185053,2016-05-14,18,MEATS,116.755,0,$5.84,False,2016,inf,True
550589,2496533,2016-11-04,8,HOME APPLIANCES,0.0,0,$0.0,False,2016,,False
260289,2206233,2016-05-26,12,HOME CARE,122.0,3,$6.1,True,2016,40.666667,False
393399,2339343,2016-08-08,47,CELEBRATION,23.0,0,$1.15,False,2016,inf,True
311328,2257272,2016-06-23,44,CELEBRATION,30.0,0,$1.5,False,2016,inf,True
869484,2815428,2017-05-03,6,AUTOMOTIVE,4.0,0,$0.2,False,2017,inf,True
