# 0. Imports and Data Preparation

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample employee table: one list per column, five rows in total.
employee_columns = {
    "first_name": ["Jane", "John", "Max", "Emily", "Ashley"],
    "last_name": ["Doe", "Doe", "Dune", "Smith", "Fox"],
    "id": [101, 103, 143, 118, 128],
}
df = pd.DataFrame(employee_columns)

df

Unnamed: 0,first_name,last_name,id
0,Jane,Doe,101
1,John,Doe,103
2,Max,Dune,143
3,Emily,Smith,118
4,Ashley,Fox,128


# 1. Use a constant value

In [3]:
# a single scalar is broadcast to every row of the new column
df["department"] = "engineering"

df

Unnamed: 0,first_name,last_name,id,department
0,Jane,Doe,101,engineering
1,John,Doe,103,engineering
2,Max,Dune,143,engineering
3,Emily,Smith,118,engineering
4,Ashley,Fox,128,engineering


# 2. Use array-like structure

In [4]:
# five values, one for each of the five rows
df["salary"] = [45000, 43000, 42000, 45900, 54000]

df

Unnamed: 0,first_name,last_name,id,department,salary
0,Jane,Doe,101,engineering,45000
1,John,Doe,103,engineering,43000
2,Max,Dune,143,engineering,42000
3,Emily,Smith,118,engineering,45900
4,Ashley,Fox,128,engineering,54000


In [5]:
# Use of Numpy

# Replace the salaries with five random integers drawn from [40000, 55000).
# The column already exists at this point, so plain column assignment is used:
# it swaps in the new array as a whole column, whereas writing through
# `.loc[:, "salary"]` can trigger a pandas warning about in-place setting
# when the new array's dtype differs from the existing column's.
# NOTE: the generator is not seeded, so the values change on every run.
df["salary"] = np.random.randint(40000, 55000, size=5)

df

  df.loc[:, "salary"] = np.random.randint(40000, 55000, size=5)


Unnamed: 0,first_name,last_name,id,department,salary
0,Jane,Doe,101,engineering,50962
1,John,Doe,103,engineering,42427
2,Max,Dune,143,engineering,42702
3,Emily,Smith,118,engineering,53675
4,Ashley,Fox,128,engineering,44701


# 3. Derive from other columns

In [6]:
# element-wise string concatenation: first name, a space, then last name
df["name"] = df["first_name"] + " " + df["last_name"]

df

Unnamed: 0,first_name,last_name,id,department,salary,name
0,Jane,Doe,101,engineering,50962,Jane Doe
1,John,Doe,103,engineering,42427,John Doe
2,Max,Dune,143,engineering,42702,Max Dune
3,Emily,Smith,118,engineering,53675,Emily Smith
4,Ashley,Fox,128,engineering,44701,Ashley Fox


We can also use the `cat` function under the `str` accessor to combine (i.e. concatenate) strings.

In [7]:
# build the full name with the string accessor, then overwrite the name column
full_name = df["first_name"].str.cat(df["last_name"], sep=" ")
df.loc[:, "name"] = full_name

df

Unnamed: 0,first_name,last_name,id,department,salary,name
0,Jane,Doe,101,engineering,50962,Jane Doe
1,John,Doe,103,engineering,42427,John Doe
2,Max,Dune,143,engineering,42702,Max Dune
3,Emily,Smith,118,engineering,53675,Emily Smith
4,Ashley,Fox,128,engineering,44701,Ashley Fox


# 4. The insert function

By default, a new column is added at the end, so it becomes the last column. If we need to add the new column at a specific location (e.g. as the first one), we can use the `insert` function.

For instance, in the previous example, having the `name` column as last while the `first_name` and `last_name` are at the beginning doesn’t seem nice.

Let’s drop the `name` column and add it back again but as the first column.

In [8]:
# remove the name column that currently sits at the end
df = df.drop(columns=["name"])

# rebuild the full name and place it at position 0, i.e. as the first column
full_name = df["first_name"].str.cat(df["last_name"], sep=" ")
df.insert(loc=0, column="name", value=full_name)

df

Unnamed: 0,name,first_name,last_name,id,department,salary
0,Jane Doe,Jane,Doe,101,engineering,50962
1,John Doe,John,Doe,103,engineering,42427
2,Max Dune,Max,Dune,143,engineering,42702
3,Emily Smith,Emily,Smith,118,engineering,53675
4,Ashley Fox,Ashley,Fox,128,engineering,44701


The `insert` function takes three required arguments:

- The first one is the position (index) of the new column (0 for the first column, 1 for the second, and so on).
- The second one is the column name.
- The third one is the column values.

# 5. Pandas where function

The `where` function of Pandas allows for adding a new column with values determined using a condition based on other columns.

In [9]:
# initialize the column with all 0s
df.loc[:, "high_salary"] = 0

# update the values as 1 for the rows that do not fit the given condition
# `where` is called on the `high_salary` column itself (a Series), not on the
# whole DataFrame: the DataFrame version would overwrite every column of the
# affected rows with 1 in a temporary copy and only work through implicit
# column alignment during the assignment.
df.loc[:, "high_salary"] = df["high_salary"].where(df["salary"] <= 48000, 1)

df

Unnamed: 0,name,first_name,last_name,id,department,salary,high_salary
0,Jane Doe,Jane,Doe,101,engineering,50962,1
1,John Doe,John,Doe,103,engineering,42427,0
2,Max Dune,Max,Dune,143,engineering,42702,0
3,Emily Smith,Emily,Smith,118,engineering,53675,1
4,Ashley Fox,Ashley,Fox,128,engineering,44701,0


We added a new column called `high_salary` which takes a value of 1 if the `salary` is more than 48000 and 0 otherwise.

The `where` function updates the values that do not fit the condition. This is the reason why we specify the condition as being equal to or less than 48000.

The values for the rows that fit the condition remain the same.

# 6. NumPy where function

We can also use the `where` function of NumPy for adding new columns. It’s more flexible than Pandas’ `where` because it lets us specify the value both for rows that fit the condition and for rows that do not.

In [10]:
# start again from a frame without the high_salary column
df = df.drop(columns=["high_salary"])

# 0 where the salary is at most 48000, 1 everywhere else
is_not_high = df["salary"] <= 48000
df["high_salary"] = np.where(is_not_high, 0, 1)

df

Unnamed: 0,name,first_name,last_name,id,department,salary,high_salary
0,Jane Doe,Jane,Doe,101,engineering,50962,1
1,John Doe,John,Doe,103,engineering,42427,0
2,Max Dune,Max,Dune,143,engineering,42702,0
3,Emily Smith,Emily,Smith,118,engineering,53675,1
4,Ashley Fox,Ashley,Fox,128,engineering,44701,0


We did not have to initialize the column with all 0s because we can directly create it with 1s and 0s depending on the given condition.

The first parameter of NumPy's `where` function specifies the condition. The second one is the value to be used for rows that fit the condition and the third one is for rows that do not fit the condition.

# 7. NumPy select function

The `select` function of NumPy can evaluate multiple conditions and assign a separate value for each one. Thus, we can use it for creating conditional columns as well.

The conditions and the associated values are written in two separate Python lists of equal length, which we then pass as arguments to the `select` function. We can also define a default value (via the `default` parameter) to be used for rows that do not fit any of the given conditions; if it is omitted, the default is 0.

Let’s create a `salary_cond` column with values high, mid, and low. The values are determined according to the values in the `salary` column.

In [11]:
# create conditions list (one boolean mask per salary band)
conditions = [
    (df["salary"] > 50000),
    (df["salary"] <= 50000) & (df["salary"] > 45000),
    (df["salary"] <= 45000),
]

# create values list (the label for the mask at the same position)
values = ["high", "mid", "low"]

# create the column
# An explicit string default is passed because `np.select` otherwise falls
# back to the integer 0, which has no common dtype with the string labels
# (recent NumPy versions raise a TypeError for that mix).
df.loc[:, "salary_cond"] = np.select(conditions, values, default="unknown")

df

Unnamed: 0,name,first_name,last_name,id,department,salary,high_salary,salary_cond
0,Jane Doe,Jane,Doe,101,engineering,50962,1,high
1,John Doe,John,Doe,103,engineering,42427,0,low
2,Max Dune,Max,Dune,143,engineering,42702,0,low
3,Emily Smith,Emily,Smith,118,engineering,53675,1,high
4,Ashley Fox,Ashley,Fox,128,engineering,44701,0,low


# 8. Pandas assign function

We can use the `assign` function for creating multiple columns in a single operation. They can be derived from the existing ones or created from scratch.

In [12]:
# drop the columns first
df = df.drop(columns=["department", "high_salary", "salary_cond"])

# create the columns
# NOTE: `conditions` and `values` are the lists built for the np.select
# example earlier in the notebook.
# A string default is passed to `np.select` because the implicit integer
# default 0 has no common dtype with the string labels (recent NumPy versions
# raise a TypeError for that mix).
df = df.assign(
    department="engineering",
    high_salary=np.where(df["salary"] <= 48000, 0, 1),
    salary_condition=np.select(conditions, values, default="unknown"),
)

# display DataFrame
df

Unnamed: 0,name,first_name,last_name,id,salary,department,high_salary,salary_condition
0,Jane Doe,Jane,Doe,101,50962,engineering,1,high
1,John Doe,John,Doe,103,42427,engineering,0,low
2,Max Dune,Max,Dune,143,42702,engineering,0,low
3,Emily Smith,Emily,Smith,118,53675,engineering,1,high
4,Ashley Fox,Ashley,Fox,128,44701,engineering,0,low


# 9. Pandas apply function

The `apply` function, as its name suggests, applies a function along an axis (i.e. columns or rows), which can be used for adding a new column to a DataFrame.

In [13]:
# build a 4x5 frame of random integers below 10, with columns A through E
random_digits = np.random.randint(10, size=(4, 5))
df = pd.DataFrame(random_digits, columns=list("ABCDE"))

df

Unnamed: 0,A,B,C,D,E
0,8,5,4,2,2
1,9,3,4,9,9
2,5,6,7,8,2
3,9,1,2,4,9


In [14]:
# create a new column called total
# Sum only the original value columns so that re-running this cell does not
# fold an already existing `total` column into the row sums.
value_columns = ["A", "B", "C", "D", "E"]
df["total"] = df[value_columns].apply(np.sum, axis=1)

df

Unnamed: 0,A,B,C,D,E,total
0,8,5,4,2,2,21
1,9,3,4,9,9,34
2,5,6,7,8,2,28
3,9,1,2,4,9,25


# 10. Lambda expressions

In [15]:
# sample items, each paired with a list of three rates
item_labels = ["A", "B", "C", "D"]
rate_lists = [[11, 15, 12], [5, 7, 4], [24, 18, 22], [42, 39, 27]]
rates = pd.DataFrame({"item": item_labels, "rates": rate_lists})

rates

Unnamed: 0,item,rates
0,A,"[11, 15, 12]"
1,B,"[5, 7, 4]"
2,C,"[24, 18, 22]"
3,D,"[42, 39, 27]"


Let’s say we need to create a new column that has the minimum value of the lists in the `rates` column. We can do this task with the `apply` function and a lambda expression as follows:

In [16]:
# create the min_rate column
# The built-in `min` works directly on each list, so there is no need to
# build a temporary Series for every row just to take its minimum.
rates["min_rate"] = rates["rates"].apply(lambda x: min(x))

rates

Unnamed: 0,item,rates,min_rate
0,A,"[11, 15, 12]",11
1,B,"[5, 7, 4]",4
2,C,"[24, 18, 22]",18
3,D,"[42, 39, 27]",27
