In [None]:
# Dropping Columns and Rows
# Sometimes, your dataset includes extra information you don’t need — columns that are irrelevant to your goals, or rows that are incomplete or duplicated. Cleaning up your data often starts with dropping the parts you don’t want.

# In Pandas, the .drop() method is your main tool for this. Here are the syntax basics:

# # Drop a column
# df = df.drop('column_name', axis=1)

# # Drop multiple columns
# df = df.drop(['col1', 'col2'], axis=1)

# # Drop a row by index
# df = df.drop(0)

# # Drop rows with missing values
# df = df.dropna()


# Note:
# axis=0 means rows
# axis=1 means columns
# By default, .drop() returns a new DataFrame unless you use inplace=True.


In [None]:
# Why Drop Columns or Rows?
# Columns might be completely irrelevant to your analysis (e.g., “ID” or “notes” fields)
# Rows might be testing data, error logs, or outliers
# You may need to eliminate incomplete records with missing critical values
# Sometimes you'll filter out rows temporarily to check something — .drop() is one way to do this

In [None]:
# Examples:

# # Drop a single column
# df = df.drop('Unnamed: 0', axis=1)

# # Drop multiple columns
# df = df.drop(['email', 'phone'], axis=1)

# # Drop a row by index (e.g., row 5)
# df = df.drop(5, axis=0)

# # Drop rows 0 through 3
# df = df.drop([0, 1, 2, 3], axis=0)
# You can also drop in place, which modifies the DataFrame directly, so you don't need to assign the result back with 'df =':

# df.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
# Common Mistakes to Watch Out For
# Forgetting axis=1 when dropping columns (the default is axis=0, so it tries to drop a row instead)
# Not assigning the result back to df (unless using inplace=True)
# Dropping rows by number when your index labels aren't the default 0, 1, 2, ... — .drop() matches index labels, not row positions (consider using .reset_index() first if needed)

In [1]:
# Practice Dropping Rows and Columns


# Leave the following code, it will import pandas and create a sample dataframe.
import pandas as pd

# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['NYC', 'LA', 'Chicago', 'Miami'],
    'Email': ['a@example.com', 'b@example.com', 'c@example.com', 'd@example.com']
}

df = pd.DataFrame(data)


# 1. Drop the Email column and store in new variable
# .drop() returns a new DataFrame here, so df itself still has its Email column.
df_no_email = df.drop('Email', axis=1)
print(df_no_email)

# 2. Drop row with index 2 in place
# With inplace=True, df is modified directly and nothing is assigned back.
df.drop(2, axis=0, inplace=True)
print(df)

# Re-create DataFrame for next steps- just leave this alone
# (step 2 removed a row from df, so rebuild it from the untouched dict)
df = pd.DataFrame(data)

# 3. Drop City and Email columns, assign back to df
df = df.drop(['City', 'Email'], axis=1)
print(df)

# Re-create again for next step
df = pd.DataFrame(data)

# 4. Drop first and last row, store in df_trimmed
# df.index[0] and df.index[-1] look up the first and last row labels,
# so this does not depend on the labels being 0 and 3.
df_trimmed = df.drop([df.index[0], df.index[-1]])
print(df_trimmed)

      Name  Age     City
0    Alice   25      NYC
1      Bob   30       LA
2  Charlie   35  Chicago
3    David   40    Miami
    Name  Age   City          Email
0  Alice   25    NYC  a@example.com
1    Bob   30     LA  b@example.com
3  David   40  Miami  d@example.com
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40


In [None]:
# Replacing Values

# Sometimes, the data you're working with has placeholder or incorrect values — like "N/A", "?", "none", or "missing" instead of actual nulls (NaN). Other times, values are simply mislabeled or need to be standardized (e.g., "yes" and "Yes" should be treated the same).
# The .replace() method in Pandas is a quick and flexible way to update those values.

# Here is the basic syntax:
# df['column_name'].replace(to_replace, value)
# You can replace a single value, multiple values (using a dictionary), or across the entire DataFrame. See the following examples:

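# df['Gender'] = df['Gender'].replace('M', 'Male') # Replaces a single value
# df['Gender'] = df['Gender'].replace({'M': 'Male', 'F': 'Female'}) # Replaces multiple values
# df = df.replace('?', np.nan) # Replaces all ? in the DataFrame with NaN (requires: import numpy as np)
# Note: like .drop(), .replace() returns a new object, so assign the result back to keep the change.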
# Pro tip: Replacing is often a first step before handling missing values — for example, converting "N/A" or -1 into np.nan so you can later drop or fill those values cleanly.

In [None]:
# Handling Missing Values

# In the real world, data is rarely perfect. It's common to have missing or incomplete values in your dataset. These could be empty cells in a spreadsheet, null entries in a database, or NaN (Not a Number) values in a DataFrame.
# Missing values matter because they can break your analysis or skew results — especially if they exist in important columns you're trying to calculate things on.


# How do we detect missing values in pandas?
# df.isnull() # Returns a DataFrame of True/False values showing where data is missing
# But an easier way to view it is by summing the values of True:
# df.isnull().sum() # This will tell you how many missing values are in each column.


# How do we handle missing values?
# There are a few common strategies:

# 1. Drop rows or columns with missing values
# df.dropna()
# By default, drops any row with at least one missing value. If you want to drop an entire column with any missing values, set axis = 1:
# df.dropna(axis=1)


# 2. Fill missing values with something else
# df.fillna(0)                      # Replace every missing value in the DataFrame with 0
# df['City'] = df['City'].fillna('Unknown')             # Replace missing strings in one column with a placeholder
# df['Age'] = df['Age'].fillna(df['Age'].mean())      # Replace with that column's mean (or median, or mode)


# Best Practices
# There’s no one-size-fits-all answer - your approach depends on how much is missing and what kind of analysis you're doing. But here are some helpful guidelines:
# Drop the column if it's mostly missing and not essential
# Drop the row if only a few rows are affected and you have enough data left
# Fill in the value if only a small share is missing and you can justify what to use


# Common fill strategies:
# Use the mean or median for numbers
# Use the most common value for categories
# Use "Unknown" or "Not Provided" for missing strings
# Use 0 or a flag like False for missing booleans
# Try not to overthink it - missing values are extremely common in real-world data. But use caution: how you handle them can significantly impact your results. Always consider why the data might be missing and what your end goals are.
# The approach you take should make sense for your context, and be something you can clearly explain later.

In [None]:
# Practice Handling Missing Data


# Leave this code alone
import pandas as pd
import numpy as np

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, np.nan, 35, 40, None],
    'City': ['NYC', 'LA', None, 'Miami', 'Boston']
}
df = pd.DataFrame(data)

# Answer the questions below!
# 1. View the DataFrame
print(df)

# 2. Show where values are missing
# isna() is the same check as isnull(): True wherever a cell is missing.
missing_mask = df.isna()
print(missing_mask)

# 3. Count of missing values by column
# Summing the True/False mask counts the True values in each column.
missing_per_column = missing_mask.sum()
print(missing_per_column)

# 4. Drop rows with missing data
complete_rows = df.dropna(axis='index')
print(complete_rows)

# 5. Re-create original - leave this part alone
df = pd.DataFrame(data)

# 6. Drop columns with missing data
complete_columns = df.dropna(axis='columns')
print(complete_columns)

# 7. Re-create again - Leave this alone again
df = pd.DataFrame(data)

# 8. Fill missing Age with mean
# The mean is computed from the non-missing ages before any filling happens.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
print(df)

# 9. Fill missing City with "Unknown"
city_filled = df['City'].fillna('Unknown')
df['City'] = city_filled
print(df)


In [None]:
# Fixing Data Types
# When you load in a dataset, the column data types (called dtypes in pandas) don’t always come in the way you expect. Maybe a column full of numbers gets read as strings, or a column with dates stays as plain text. If you don’t catch these issues early, they can quietly cause bugs or errors later in your analysis.

# To view the current data types of your columns, use:
# df.dtypes
# Or, as you remember from the last lesson, the data types are shown when using df.info()


# Why Data Types Matter
# Data types affect what you can do with each column. For example:

# You can’t calculate a mean on a column of strings, even if they look like numbers.
# You can’t filter by date unless pandas knows it’s a datetime object.
# You might waste memory or processing time using a float64 column that only holds integers from 1–5.


# Common Fixes with .astype()
# To change a column’s data type, use:

# df['ColumnName'] = df['ColumnName'].astype(new_type)
# Here are some common examples:

# # Convert numeric strings to actual numbers
# df['Revenue'] = df['Revenue'].astype(float)

# # Convert integers to strings (e.g., zip codes)
# df['ZipCode'] = df['ZipCode'].astype(str)

# # Convert float to integer (use with caution)
# df['Quantity'] = df['Quantity'].astype(int)
# Note: Converting from float to int will drop decimal values (not round!), so make sure that makes sense for your context. It also raises an error if the column contains missing values (NaN), so handle those first.


# Dates Require Special Handling
# To convert a column to datetime format (so you can do time-based filtering or aggregation), use:

# df['Date'] = pd.to_datetime(df['Date'])
# Pandas is smart about most common date formats, but if it doesn’t work automatically, you can provide a format string like this:

# df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
# You’ll learn more about working with dates in the data wrangling module — for now, just know that this is how you convert plain text to actual dates.


# Best Practices
# Always check df.dtypes and df.info() right after loading your data.
# Be intentional: only convert types if you’re confident in what they should be.
# If you’re unsure whether something is numeric or not, try using .astype(float) in a test cell and see if it throws an error — it’s often faster than overthinking it.

In [None]:
# Removing Duplicates
# Sometimes, your dataset includes rows that are exact copies of one another - either due to data entry errors, accidental merges, or export quirks. These duplicate rows can skew your analysis, inflate counts, or create confusion if not handled properly.
# Luckily, Pandas makes it easy to check for and remove duplicates using:

# .duplicated() — returns a Boolean Series indicating if a row is a duplicate of a previous one
# .drop_duplicates() — returns a new DataFrame with duplicates removed

# Example:
# import pandas as pd
# # Sample data
# data = {
#     'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob'],
#     'Age': [25, 30, 35, 25, 30]
# }
# df = pd.DataFrame(data)
# print(df)

# Output:

#       Name  Age
# 0    Alice   25
# 1      Bob   30
# 2  Charlie   35
# 3    Alice   25
# 4      Bob   30

# We can clearly see that rows 0 and 3 are identical, and so are rows 1 and 4, but what about when it isn't so obvious?


# How to Find and Remove Duplicates
# # Check which rows are duplicates (compares to earlier rows)
# df.duplicated()

# # Drop duplicates (keeps first occurrence by default)
# df_cleaned = df.drop_duplicates()


# Optional Arguments for drop_duplicates()
# keep='first' (default): keeps the first occurrence, drops the rest
# keep='last': keeps the last occurrence
# keep=False: drops every row that has a duplicate, including the first occurrence, so no copy of a duplicated row is kept
# You can also specify a subset of columns to check for duplicates:

# # Only consider 'Name' column when looking for duplicates
# df.drop_duplicates(subset='Name')


# Best Practice Tips
# Use .duplicated() before .drop_duplicates() so you understand what you're removing.
# Think carefully before dropping duplicates — sometimes they’re valid!
# If you're only concerned about certain columns (e.g., name/email duplicates), use the subset parameter.

In [None]:
# Practice removing duplicates


# LEAVE THIS CODE ALONE
import pandas as pd

# Load sample data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'Diana'],
    'Age': [25, 30, 35, 25, 30, 40],
    'City': ['NYC', 'LA', 'SF', 'NYC', 'LA', 'NYC']
}

df = pd.DataFrame(data)

# ANSWER THE FOLLOWING QUESTIONS

# 1. Find which rows are duplicates (across all columns)
# Each row is compared against the rows above it, using every column.
is_repeat = df.duplicated()
print("\nDuplicated rows (True if row is a duplicate of an earlier one):")
print(is_repeat)

# 2. Drop duplicate rows (keeping the first occurrence)
# keep='first' is the default; it is spelled out here for comparison with 3 and 4.
first_kept = df.drop_duplicates(keep='first')
print("\nDataFrame with duplicates dropped (keep='first'):")
print(first_kept)

# 3. Drop duplicate rows (keeping only the last occurrence)
last_kept = df.drop_duplicates(keep='last')
print("\nDataFrame with duplicates dropped (keep='last'):")
print(last_kept)

# 4. Drop ALL duplicates (remove both original and duplicate)
only_unique = df.drop_duplicates(keep=False)
print("\nDataFrame with ALL duplicates removed (keep=False):")
print(only_unique)