# **Previous Content**

# Pandas Dataframes

In [None]:
# Create an alias with the as keyword while importing
# Now you can refer to the Pandas package as pd instead of pandas
import pandas as pd

In [None]:
# A Pandas DataFrame is a two-dimensional, table-like structure made of rows and columns
# Each key below becomes a column label, and each list holds that column's values
songs_data = dict(
    name=["Flowers", "As it Was", "Anti-Hero", "Eyes Closed", "Just the Way You Are"],
    length_in_seconds=[201, 163, 201, 194, 221],
    release_year=[2023, 2022, 2022, 2023, 2010],
)
# Build a DataFrame object from songs_data
songs_df = pd.DataFrame(data=songs_data)
# No index was given, so rows are labeled with default integer indices (0 for the first row, 1 for the second, etc.)
print(songs_df)

                   name  length_in_seconds  release_year
0               Flowers                201          2023
1             As it Was                163          2022
2             Anti-Hero                201          2022
3           Eyes Closed                194          2023
4  Just the Way You Are                221          2010


In [None]:
# Add a list of labels to give each row a label with the index argument
songs_df_letter_label = pd.DataFrame(songs_data, index = ["a", "b", "c", "d", "e"])
print(songs_df_letter_label)

                   name  length_in_seconds  release_year
a               Flowers                201          2023
b             As it Was                163          2022
c             Anti-Hero                201          2022
d           Eyes Closed                194          2023
e  Just the Way You Are                221          2010


# Locate Rows and Columns Using loc to Specify **Labels**

In [None]:
# Use loc to return one or more row(s) by labels
print(songs_df.loc[0])

In [None]:
# Use a list of labels to return Row 0, 1 and 2
print(songs_df.loc[[0, 1, 2]])

In [None]:
# Specify multiple rows of the DataFrame with from and to labels separated by a colon
print(songs_df.loc[0: 2]) # Notice: both the from and to labels are included in the result when using loc (i.e., to specify labels)

In [None]:
# Use loc to specify columns by including their labels in another list
print(songs_df.loc[[0, 1],['name', 'release_year'] ])

In [None]:
# Use : to represent all the row labels
print(songs_df.loc[:, ['name', 'release_year']])

In [None]:
print(songs_df.loc[[0, 1], :])

In [None]:
# Alternatively, use [] to access one column by its label
print(songs_df['name'])

In [None]:
print(songs_df[['name', 'release_year']])

**Previous Practice: Explain why the following code returns an error and debug it with loc**

In [None]:
print(songs_df_letter_label.loc[[0, 1]])

In [None]:
print(songs_df_letter_label.loc[["a", "b"]])

**Previous Practice: Explain why the following code returns an error and fix it**

In [None]:
print(songs_df['name', 'release_year'])

In [None]:
print(songs_df[['name', 'release_year']])

In [None]:
# The following will also give an error
print(songs_df.loc['name', 'release_year'])

**Discussion: Explain the difference between df['name'], df[['name']], df.loc[:, 'name'] and df.loc[:, ['name']]**

**Hint: Check their data types**

In [None]:
songs_df

In [None]:
print(type(songs_df['name']))
print(songs_df['name'].shape) # a Series has the shape (n,), where n is the number of elements
songs_df['name'][0] # this is the same as songs_df.loc[0, 'name']

In [None]:
print(type(songs_df[['name']]))
print(songs_df[['name']].shape) # this dataframe has the shape (n, 1), where n is the number of rows, and 1 indicates a single column
# songs_df[['name']][0] # This line will return an error because songs_df[['name']] is not a pandas series
songs_df[['name']].loc[0, 'name']

In [None]:
print(type(songs_df.loc[:, 'name']))

In [None]:
print(type(songs_df.loc[:, ['name']]))

# Locate Rows and Columns Using iloc to Specify **Integer Indices**

In [None]:
# Use iloc to return one row by integer index
print(songs_df_letter_label.iloc[0])

In [None]:
# Use a list of integer indices to return Row 0 and 1 of the songs_df_letter_label dataframe
print(songs_df_letter_label.iloc[[0, 1]])

In [None]:
# Specify multiple rows of the DataFrame with from and to integer indices separated by a colon
print(songs_df_letter_label.iloc[0: 2]) # Notice: the to index is excluded from the result (ATTENTION: this is DIFFERENT from loc!!!)

**Practice 1: Explain why the following code returns an error and debug it using iloc and loc**

In [None]:
print(songs_df_letter_label.iloc[[0, 1], ['name', 'release_year']])

In [None]:
songs_df_letter_label

In [None]:
print(songs_df_letter_label.loc[['a','b'], ['name', 'release_year']])

In [None]:
print(songs_df_letter_label.iloc[[0, 1], [0, 2]])

# Read CSV

In [None]:
# Load the CSV into a dataframe
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
print(sales) # If the dataframe has many rows, Pandas will only return the first 5 rows and the last 5 rows

In [None]:
# Use to_string() to print the entire dataframe
# This is helpful for viewing a large dataset in its entirety
print(sales.to_string())

In [None]:
# Colab Notebook gives a nice grid view of the dataframe
sales

In [None]:
# The head() method returns the headers and a specified number of rows, starting from the top
print(sales.head(5))

In [None]:
# Use info() to get more information about the dataframe
# info() prints its summary directly and returns None, so wrapping it in print() would add a stray "None" line
sales.info()

In [None]:
sales['Price'].describe() # use the describe() to get a statistical description of a column

In [None]:
# sales['Price'].describe() # use the describe() to get a statistical description of a column
sales.describe() # get a statistical description of the entire dataframe

# Correct Wrong Values

In [None]:
# Suppose after double checking, we conclude the price in Row 0 should not be 2099.00
# One way to fix wrong values is to replace them with correct values
sales.loc[0, 'Price'] = 20.99 # loc locates values by labels

You may not be able to replace the wrong data one by one for big datasets. To replace wrong data for larger data sets you can create some rules. For example, you can set some boundaries for legal values, and replace any values that are outside of the boundaries.

In [None]:
sales.loc[sales['Price'] > 15, 'Price'] = 15
print(sales)

In [None]:
sales['Price'] > 15  # returns a pandas series of boolean values that indicate which rows satisfy the condition (every value is False if the capping cell above has already been run, because no price above 15 remains)

In [None]:
sales.loc[sales['Price'] > 15] # returns a pandas dataframe with rows corresponding to the True values in the previous pandas series

In [None]:
sales.loc[sales['Price'] > 15, 'Price']

**Practice 2: Change the price of Product P001 to 12 if the quantity of a transaction is greater than 1**

**Hint: View how to filter with multiple conditions at https://www.geeksforgeeks.org/filter-pandas-dataframe-with-multiple-conditions/**

# Clean Empty Cells

In [None]:
sales

In [None]:
# One way to deal with empty cells is to remove rows that contain empty cells
# This is usually OK if the dataset is big and removing a few rows will not have a big impact on the analysis results
sales_drop_na = sales.dropna()
print(sales_drop_na) # sales_drop_na does not have the row with index 6

In [None]:
print(sales) # By default, dropna() returns a new dataframe and will not change the original dataframe
# So the row with index 6 still exists in sales

In [None]:
sales.dropna(inplace = True)
print(sales) # dropna(inplace = True) will NOT return a new DataFrame. Instead, it will remove all rows containing NULL values from the original dataframe
# The row with index 6 is now removed from the sales dataframe

In [None]:
# Reload the CSV into a dataframe
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
print(sales) # sales now has a null value in the Price column

In [None]:
sales.dropna(subset=['Price'], inplace = True) # Remove rows with a NULL value in the Price column
print(sales)

In [None]:
# Reload the CSV into a dataframe
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
print(sales) # sales has a null value in the Price column

In [None]:
# The following code looks correct, but it only returns a new pandas Series with the missing value replaced by 20; sales itself is not modified
sales["Price"].fillna(20)

In [None]:
print(sales) # sales still has the null value in the Price column

In [None]:
# Replace NULL values in the Price column with the number 20
sales["Price"].fillna(20, inplace = True) # We need to set inplace = True so that sales is modified; NOTE: this chained in-place call triggers a pandas warning -- the following cells show the suggested alternatives
print(sales)

In [None]:
# Reload the dataframe and follow the suggestion in the warning message
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
sales.fillna({"Price": 20}, inplace=True)

In [None]:
# Reload the dataframe and follow the alternative suggestion in the warning message (assign the filled column back)
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
sales["Price"] = sales["Price"].fillna(20)

# **New Content**

# Handle Duplicates

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
# Use duplicated() to discover duplicates
# duplicated() returns a Boolean value for each row, i.e., True for every row that is a duplicate, otherwise False
print(df.duplicated())
# Note that Rows with index 8 and 9 are the same
# Row 8 is not a duplicate, and Row 9 is a duplicate
# Documentation at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html: the keep parameter by default is 'first'

In [None]:
# Use drop_duplicates() to remove duplicates
df.drop_duplicates(inplace = True) # We set inplace = True to make sure that the method does NOT return a new dataframe, but it will remove all duplicates from the original dataframe
print(df)
# Note that Row 8 is kept, and Row 9 is removed
# Documentation at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html: the keep parameter by default is 'first'

In [None]:
# Reload the CSV into a dataframe
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
print(sales) # sales has a null value in the Price column

In [None]:
# To drop duplicated rows based on one column's value in a pandas DataFrame, you can use the drop_duplicates() and specify the column name using the subset parameter
# By default, this method keeps the first occurrence of each duplicated row and drops the rest
sales.drop_duplicates(subset=['Transaction_ID']) # By default, inplace = False
# sales.drop_duplicates(subset=['Transaction_ID'], inplace = True)
# print(sales)

In [None]:
# Reload the CSV into a dataframe
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
# To drop duplicated rows based on one column's value in a pandas DataFrame, you can use the drop_duplicates() and specify the column name using the subset parameter
# By default, this method keeps the first occurrence of each duplicated row and drops the rest (i.e.,  keep='first')
sales.drop_duplicates(subset=['Product_ID'], keep='first')

In [None]:
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
# Use keep='last' to drop duplicates except for the last occurrence
sales.drop_duplicates(subset=['Product_ID'], keep='last')

In [None]:
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
# Use keep=False to drop all duplicates
sales.drop_duplicates(subset=['Product_ID'], keep=False)

**Practice 3: Write code to get all the unique pairs of Customer_ID and Product_ID from the "df" dataframe**

# Clean Date Format (Optional)

In [None]:
sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sales_data.csv')
# Pandas has a to_datetime() method for converting cell values to dates
sales['Date'] = pd.to_datetime(sales['Date'], format='mixed', dayfirst =False)
print(sales)
# Documentation at https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html:
# Use format='mixed' to infer the format for each element individually. This is risky, so we set dayfirst = False which indicates that we don't prefer to parse with day first.