# Modifying DataFrames - Intro

This chapter covers various methods to modify `pandas.DataFrame`s.

# Preparations

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw financial data set and preview its first three rows
csv_path = "../../data/raw/financial_data_intro.csv"
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False
2,14651,2007,British American Tobacco PLC,312230,GBR,2007-12-31,110448107,37161.97,4226.559,False


# Inserting Columns

In [3]:
# Insert a new column from a scalar value - pandas broadcasts the scalar,
# so every row of the new column receives the same value
df["new column"] = "something new"
# Preview the first three rows to confirm the new column is present
df.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,new column
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,something new
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,something new
2,14651,2007,British American Tobacco PLC,312230,GBR,2007-12-31,110448107,37161.97,4226.559,False,something new


In [4]:
# Add a boolean column: True for every row whose u_year is later than 2007
df["is_after_2007"] = df["u_year"].gt(2007)
df.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,new column,is_after_2007
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,something new,False
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,something new,False
2,14651,2007,British American Tobacco PLC,312230,GBR,2007-12-31,110448107,37161.97,4226.559,False,something new,False


In [5]:
# Combine several columns in one calculation. On the right-hand side of the `=`
# a column can be accessed either with `[]` or with `.` (attribute access).
numerator = df["cb_ni"]
denominator = df["cb_at"]
df["roa"] = numerator / denominator
df.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,new column,is_after_2007,roa
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,something new,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,something new,False,0.106661
2,14651,2007,British American Tobacco PLC,312230,GBR,2007-12-31,110448107,37161.97,4226.559,False,something new,False,0.113733


# Modify values with `loc`

`loc` can not only be used for "reading" a subset of rows and/or columns from a DataFrame but can also be used to change the values of such a subset.

[https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html)

In [6]:
# Set u_year to 9999 for all firm-years from 2005:
is_2005 = df["u_year"] == 2005
df.loc[is_2005, "u_year"] = 9999
df.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,new column,is_after_2007,roa
0,14651,9999,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,something new,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,something new,False,0.106661
2,14651,2007,British American Tobacco PLC,312230,GBR,2007-12-31,110448107,37161.97,4226.559,False,something new,False,0.113733


In [7]:
# Inspect a few of the rows that were just changed
changed_rows = df[df["u_year"] == 9999]
changed_rows.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,new column,is_after_2007,roa
0,14651,9999,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,something new,False,0.08269
15,13722,9999,Bob Evans Farms Inc.,311612,USA,2006-04-30,96761101,1209.183,54.774,False,something new,False,0.045298
27,10161,9999,BP PLC,324110,GBR,2005-12-31,55622104,206914.0,22341.0,False,something new,False,0.107972


In [8]:
# Sanity check: no rows with u_year == 2005 should remain
still_2005 = df["u_year"] == 2005
df.loc[still_2005]

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,new column,is_after_2007,roa


In [9]:
# Revert the change: turn the 9999 placeholder back into 2005
is_placeholder = df["u_year"] == 9999
df.loc[is_placeholder, "u_year"] = 2005
df.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,new column,is_after_2007,roa
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,something new,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,something new,False,0.106661
2,14651,2007,British American Tobacco PLC,312230,GBR,2007-12-31,110448107,37161.97,4226.559,False,something new,False,0.113733


# Deleting columns

`DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')`

Deletes columns by label or rows by index label. Note that, by default, rows are dropped (`axis=0`)! To drop columns, either pass `axis=1` together with `labels=...`, or use the `columns=...` argument instead of `labels=...` (see below).

Furthermore, `.drop()` returns a changed copy of the DataFrame. Specify `inplace=True` when the original DataFrame is to be modified.

[https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html)

In [10]:
# Pass columns=[...] so that pandas removes the column "roa" instead of
# searching for a row whose index label is "roa".
# The result is a copy; df itself still contains "roa".
df_without_roa = df.drop(columns=["roa"])
df_without_roa.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,new column,is_after_2007
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,something new,False
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,something new,False
2,14651,2007,British American Tobacco PLC,312230,GBR,2007-12-31,110448107,37161.97,4226.559,False,something new,False


In [11]:
# Several labels can be dropped at once by passing a list; assigning the
# result back to df replaces the DataFrame with the reduced copy
columns_to_drop = ["new column", "is_after_2007"]
df = df.drop(columns=columns_to_drop)
df.head(3)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,roa
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,0.106661
2,14651,2007,British American Tobacco PLC,312230,GBR,2007-12-31,110448107,37161.97,4226.559,False,0.113733


# Exercise 1

1. Load the file 'financial_data_intro.csv' into a `pd.DataFrame`.
2. Replace *u_iso3* with "UK" for all rows that have *u_iso3* == "GBR".
3. Convert *cb_naics* to data type `str`.
4. Create a new column *industry_label* that takes the value "special" if *cb_naics* starts with "6", and "normal" otherwise.
5. Replace *cb_ni* with `pd.NA` if *u_year* == 2005.
6. Calculate a new column taking the value 1 if *cb_ni* is negative and 0 if *cb_ni* is >= 0.
7. Bonus: Create a new column *industry_label_2* that takes the value True if *cb_naics* starts with "6", and False otherwise.
8. Bonus: delete all numeric columns in-place.
9. Bonus: Convert the column *u_fye* to a `datetime` data type.

# The `replace` method

`pd.Series`, and therefore each column, offers the `replace` method to substitute specific values.

Even though you can use `loc`, `replace` is more convenient for certain use cases, such as replacing multiple values at once.

>**Note:** `replace` is also a method of `pd.DataFrame` where you can replace values across the entire DataFrame or a subset of columns. But I recommend you proceed one column at a time!

In [12]:
# Replace every occurrence of "GBR" in the "u_iso3" column with "UK"
df["u_iso3"] = df["u_iso3"].replace(to_replace="GBR", value="UK")
preview_columns = ["u_iso3", "u_company_name"]
df[preview_columns].head(3)

Unnamed: 0,u_iso3,u_company_name
0,UK,British American Tobacco PLC
1,UK,British American Tobacco PLC
2,UK,British American Tobacco PLC


In [13]:
# "Undo" the replacement by mapping "UK" back to "GBR"
df["u_iso3"] = df["u_iso3"].replace(to_replace="UK", value="GBR")
preview_columns = ["u_iso3", "u_company_name"]
df[preview_columns].head(3)

Unnamed: 0,u_iso3,u_company_name
0,GBR,British American Tobacco PLC
1,GBR,British American Tobacco PLC
2,GBR,British American Tobacco PLC


In [14]:
# The summary statistics reveal that the "roa" column contains -inf values
roa_summary = df["roa"].describe()
roa_summary

  sqr = _ensure_numeric((avg - values) ** 2)


count    824.000000
mean           -inf
std             NaN
min            -inf
25%       -0.002574
50%        0.036855
75%        0.064930
max        0.650325
Name: roa, dtype: float64

In [15]:
# Replace the -inf values with NaN so that they are treated as missing values.
# Note: both inf and nan are defined in the numpy module (imported as np at the
# top of the notebook).
df["roa"] = df["roa"].replace(-np.inf, np.nan)
df["roa"].describe()

count     823.000000
mean       -4.644628
std        97.648108
min     -2749.000000
25%        -0.002505
50%         0.036916
75%         0.064992
max         0.650325
Name: roa, dtype: float64

In [16]:
# When it is unclear whether positive infinity values are present as well,
# both signs can be replaced in a single call:
infinite_values = [np.inf, -np.inf]
df["roa"] = df["roa"].replace(infinite_values, np.nan)
df["roa"].describe()

count     823.000000
mean       -4.644628
std        97.648108
min     -2749.000000
25%        -0.002505
50%         0.036916
75%         0.064992
max         0.650325
Name: roa, dtype: float64

# Rename columns

See [https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html#pandas.DataFrame.rename](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html#pandas.DataFrame.rename)

In [17]:
# Rename a subset of the columns via an {old name: new name} mapping
new_names = {"u_year": "the year", "u_company_name": "the Name of the Company"}
df.rename(columns=new_names).head(2)

Unnamed: 0,u_company_name_id,the year,the Name of the Company,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,roa
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,0.106661


In [18]:
# rename() returned a modified copy that was not assigned to anything,
# so the original df still has its old column names
df.head(2)

Unnamed: 0,u_company_name_id,u_year,u_company_name,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,roa
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,0.106661


In [19]:
# Assign the renamed copy back to df to make the new column names permanent
renamed_columns = {"u_year": "the year", "u_company_name": "the Name of the Company"}
df = df.rename(columns=renamed_columns)
df.head(2)

Unnamed: 0,u_company_name_id,the year,the Name of the Company,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,roa
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,0.106661


In [20]:
# There are also methods to add prefixes or suffixes to the column names
# (add_prefix / add_suffix) - these cannot make changes in-place, so the
# result is assigned back to df
df = df.add_prefix("my_prefix_")
df.head(2)

Unnamed: 0,my_prefix_u_company_name_id,my_prefix_the year,my_prefix_the Name of the Company,my_prefix_cb_naics,my_prefix_u_iso3,my_prefix_u_fye,my_prefix_cb_cusip,my_prefix_cb_at,my_prefix_cb_ni,my_prefix_cb_financial_industry,my_prefix_roa
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,0.106661


In [21]:
# Remove the prefix again by assigning directly to the df.columns property.
# The pattern is anchored with ^ and passed as an explicit regex so that only a
# leading "my_prefix_" is stripped; an unanchored replace would also delete the
# text if it occurred in the middle of a column name.
df.columns = df.columns.str.replace(r"^my_prefix_", "", regex=True)
df.head(2)

Unnamed: 0,u_company_name_id,the year,the Name of the Company,cb_naics,u_iso3,u_fye,cb_cusip,cb_at,cb_ni,cb_financial_industry,roa
0,14651,2005,British American Tobacco PLC,312230,GBR,2005-12-31,110448107,32737.984,2707.11,False,0.08269
1,14651,2006,British American Tobacco PLC,312230,GBR,2006-12-31,110448107,34816.074,3713.506,False,0.106661


# Exercise 2

1. Load the file 'bank-additional-full.csv' into a `pd.DataFrame` (note that you must specify the correct separator with `pd.read_csv(..., sep=";")`).
2. Rename the column 'y' to 'target'.
3. Drop the column 'duration'.
4. Give all columns, other than 'target' (the renamed 'y' column), the prefix 'feature_'.
5. Replace the value 999 in column 'feature_pdays' with `pd.NA`.