##### .columns and .shape

In [None]:
import pandas as pd

# Build a small 3x3 DataFrame from a dict of equal-length lists
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago'],
}
df = pd.DataFrame(data)

# .columns holds the column labels; .shape is a (rows, columns) tuple
columns, shape = df.columns, df.shape

print(f"Column names: {columns}")
# Output: Column names: Index(['Name', 'Age', 'City'], dtype='object')

print(f"Shape of the DataFrame: {shape}")
# Output: Shape of the DataFrame: (3, 3)


##### Checking for duplicates

In [None]:
import pandas as pd

# Sample data: rows 0/2 and rows 1/4 are exact duplicates of each other
data = {
    'Name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    'Age': [25, 30, 25, 35, 30],
    'City': ['NY', 'LA', 'NY', 'SF', 'LA']
}

df = pd.DataFrame(data)

# NOTE: each title and its result are printed with sep="\n". Passing
# "title:\n" and the object to one print() call would insert the default
# separator (a space) in front of the first output line and misalign it.

# Check for duplicated rows using df.duplicated()
# By default all columns are compared and every occurrence after the first is marked True.
duplicates = df.duplicated()
print("df.duplicated() (default behavior):", duplicates, sep="\n")
# Output:
# 0    False
# 1    False
# 2    True   -> Second occurrence of 'Alice', '25', 'NY'
# 3    False
# 4    True   -> Second occurrence of 'Bob', '30', 'LA'
# dtype: bool

# Find duplicated rows considering only the 'Name' column
duplicates_by_name = df.duplicated(subset=['Name'])
print("\nDuplicates considering only 'Name':", duplicates_by_name, sep="\n")
# Output:
# 0    False
# 1    False
# 2    True   -> Duplicate 'Alice'
# 3    False
# 4    True   -> Duplicate 'Bob'
# dtype: bool

# Check for duplicates but mark all occurrences (including the first one) as True
duplicates_keep_false = df.duplicated(keep=False)
print("\nMark all duplicates (keep=False):", duplicates_keep_false, sep="\n")
# Output:
# 0    True   -> Mark all 'Alice' occurrences
# 1    True   -> Mark all 'Bob' occurrences
# 2    True
# 3    False  -> 'Charlie' is unique
# 4    True
# dtype: bool

# Drop duplicate rows, keeping the first occurrence of each duplicate
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame after drop_duplicates (keep='first'):", df_no_duplicates, sep="\n")
# Output:
#       Name  Age City
# 0    Alice   25   NY   -> Keeps the first occurrence of 'Alice'
# 1      Bob   30   LA   -> Keeps the first occurrence of 'Bob'
# 3  Charlie   35   SF   -> 'Charlie' is unique

# Drop duplicate rows but keeping the last occurrence of each duplicate
# (the surviving rows stay in their original order, so index 3 comes before 4)
df_no_duplicates_last = df.drop_duplicates(keep='last')
print("\nDataFrame after drop_duplicates (keep='last'):", df_no_duplicates_last, sep="\n")
# Output:
#       Name  Age City
# 2    Alice   25   NY   -> Keeps the last occurrence of 'Alice'
# 3  Charlie   35   SF   -> 'Charlie' is unique
# 4      Bob   30   LA   -> Keeps the last occurrence of 'Bob'

# Drop duplicates based only on the 'Name' column (the other columns are ignored)
df_no_name_duplicates = df.drop_duplicates(subset=['Name'])
print("\nDataFrame after drop_duplicates (subset='Name'):", df_no_name_duplicates, sep="\n")
# Output:
#       Name  Age City
# 0    Alice   25   NY   -> Keeps the first 'Alice'
# 1      Bob   30   LA   -> Keeps the first 'Bob'
# 3  Charlie   35   SF   -> 'Charlie' is unique

# Drop duplicates based only on 'Name' and 'City' columns
df_no_name_city_duplicates = df.drop_duplicates(subset=['Name', 'City'])
print("\nDataFrame after drop_duplicates (subset=['Name', 'City']):", df_no_name_city_duplicates, sep="\n")
# Output:
#       Name  Age City
# 0    Alice   25   NY   -> Keeps the first 'Alice', 'NY'
# 1      Bob   30   LA   -> Keeps the first 'Bob', 'LA'
# 3  Charlie   35   SF   -> 'Charlie' is unique


##### The keep parameter in duplicated() and drop_duplicates()


In [None]:
import pandas as pd

# Sample data: rows 0/2 and rows 1/4 are exact duplicates of each other
data = {
    'Name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    'Age': [25, 30, 25, 35, 30],
    'City': ['NY', 'LA', 'NY', 'SF', 'LA']
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Duplicates with keep='first' (default)
print("\nDuplicates with keep='first':")
print(df.duplicated(keep='first'))
# Output:
# 0    False  -> Keeps first 'Alice'
# 1    False  -> Keeps first 'Bob'
# 2    True   -> Second 'Alice' is marked as a duplicate
# 3    False  -> 'Charlie' is unique
# 4    True   -> Second 'Bob' is marked as a duplicate
# dtype: bool

# Duplicates with keep='last'
print("\nDuplicates with keep='last':")
print(df.duplicated(keep='last'))
# Output:
# 0    True   -> First 'Alice' is now marked as a duplicate
# 1    True   -> First 'Bob' is marked as a duplicate
# 2    False  -> Last 'Alice' is kept
# 3    False  -> 'Charlie' is unique
# 4    False  -> Last 'Bob' is kept
# dtype: bool

# Duplicates with keep=False (mark all duplicates)
print("\nDuplicates with keep=False:")
print(df.duplicated(keep=False))
# Output:
# 0    True   -> First 'Alice' marked as a duplicate
# 1    True   -> First 'Bob' marked as a duplicate
# 2    True   -> Second 'Alice' marked as a duplicate
# 3    False  -> 'Charlie' is unique
# 4    True   -> Second 'Bob' marked as a duplicate
# dtype: bool

# Drop duplicates with keep='first' (default)
df_no_duplicates_first = df.drop_duplicates(keep='first')
print("\nDataFrame after drop_duplicates (keep='first'):")
print(df_no_duplicates_first)
# Output:
#       Name  Age City
# 0    Alice   25   NY   -> Keeps the first 'Alice'
# 1      Bob   30   LA   -> Keeps the first 'Bob'
# 3  Charlie   35   SF   -> 'Charlie' is unique

# Drop duplicates with keep='last'
# (the surviving rows stay in their original order, so index 3 comes before 4)
df_no_duplicates_last = df.drop_duplicates(keep='last')
print("\nDataFrame after drop_duplicates (keep='last'):")
print(df_no_duplicates_last)
# Output:
#       Name  Age City
# 2    Alice   25   NY   -> Keeps the last 'Alice'
# 3  Charlie   35   SF   -> 'Charlie' is unique
# 4      Bob   30   LA   -> Keeps the last 'Bob'

# Drop all duplicates with keep=False (every row that has a duplicate is removed)
df_no_duplicates_all = df.drop_duplicates(keep=False)
print("\nDataFrame after drop_duplicates (keep=False):")
print(df_no_duplicates_all)
# Output:
#       Name  Age City
# 3  Charlie   35   SF   -> 'Charlie' is the only unique row left


##### notnull() and isnull()

In [None]:
import pandas as pd

# Sample frame with one missing entry in each of 'Name', 'Age' and 'Salary'
data = {
    'Name': ['Alice', 'Bob', 'Charlie', None],
    'Age': [25, 30, None, 22],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York'],
    'Salary': [50000, None, 70000, 45000],
}
df = pd.DataFrame(data)

# isnull() returns a same-shaped boolean frame: True where the value is missing
missing_values = df.isnull()
print("Missing values (True means missing):", missing_values, sep="\n")
# Output:
# Missing values (True means missing):
#    Name    Age   City  Salary
# 0  False  False  False   False
# 1  False  False  False    True
# 2  False   True  False   False
# 3   True  False  False   False

# notnull() is the element-wise opposite: True where a value is present
non_missing_values = df.notnull()
print("Non-missing values (True means not missing):", non_missing_values, sep="\n")
# Output:
# Non-missing values (True means not missing):
#    Name    Age   City  Salary
# 0   True   True   True    True
# 1   True   True   True   False
# 2   True  False   True    True
# 3  False   True   True    True


##### Usage of replace()

In [None]:
import pandas as pd

# Sample frame: 'New York' appears twice in the 'City' column
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York'],
}
df = pd.DataFrame(data)

# 1. Swap one value for another wherever it occurs; a new frame is returned
df_replaced = df.replace(to_replace='New York', value='NYC')
print("DataFrame after replacing 'New York' with 'NYC':", df_replaced, sep="\n")
# Output:
#       Name  Age         City
# 0    Alice   25          NYC
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago
# 3    David   40          NYC
print()  # Blank line between examples

# 2. Several replacements at once: a dict maps old value -> new value
replacements = {'New York': 'NYC', 'Los Angeles': 'LA'}
df_replaced_multi = df.replace(replacements)
print("DataFrame after replacing multiple values:", df_replaced_multi, sep="\n")
# Output:
#       Name  Age     City
# 0    Alice   25      NYC
# 1      Bob   30       LA
# 2  Charlie   35  Chicago
# 3    David   40      NYC
print()  # Blank line between examples

# 3. Restrict the mapping to one column with a nested dict: {column: {old: new}}
df_replaced_column = df.replace({'City': replacements})
print("DataFrame after replacing values in 'City' column:", df_replaced_column, sep="\n")
# Output:
#       Name  Age     City
# 0    Alice   25      NYC
# 1      Bob   30       LA
# 2  Charlie   35  Chicago
# 3    David   40      NYC
print()  # Blank line between examples

# 4. Pattern-based replacement: with regex=True, '^New' matches a leading "New"
df_replaced_regex = df.replace(to_replace=r'^New', value='Old', regex=True)
print("DataFrame after replacing 'New' with 'Old' in city names:", df_replaced_regex, sep="\n")
# Output:
#       Name  Age         City
# 0    Alice   25     Old York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago
# 3    David   40     Old York
print()  # Blank line between examples

# 5. inplace=True modifies df itself instead of returning a new frame
df.replace('Chicago', 'CHI', inplace=True)
print("DataFrame after replacing 'Chicago' with 'CHI' in place:", df, sep="\n")
# Output:
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35          CHI
# 3    David   40     New York


##### Usage of df.select_dtypes()

In [None]:
import pandas as pd

# Sample frame with one string, one integer, one float and one boolean column
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'Salary': [50000.0, 60000.0, 70000.0, 80000.0],
    'Employed': [True, False, True, True],
}
df = pd.DataFrame(data)

# 1. Keep numeric columns only (integers and floats)
numeric_cols = df.select_dtypes(include='number')
print("Numeric Columns:", numeric_cols, sep="\n")
# Output:
#    Age   Salary
# 0   25  50000.0
# 1   30  60000.0
# 2   35  70000.0
# 3   40  80000.0
print()  # Blank line between examples

# 2. Keep object (string) columns only
string_cols = df.select_dtypes(include='object')
print("Object/String Columns:", string_cols, sep="\n")
# Output:
#       Name
# 0    Alice
# 1      Bob
# 2  Charlie
# 3    David
print()  # Blank line between examples

# 3. Keep boolean columns only
bool_cols = df.select_dtypes(include='bool')
print("Boolean Columns:", bool_cols, sep="\n")
# Output:
#    Employed
# 0      True
# 1     False
# 2      True
# 3      True
print()  # Blank line between examples

# 4. Drop the numeric columns (integers and floats) and keep everything else
non_numeric_cols = df.select_dtypes(exclude='number')
print("Non-Numeric Columns:", non_numeric_cols, sep="\n")
# Output:
#       Name  Employed
# 0    Alice      True
# 1      Bob     False
# 2  Charlie      True
# 3    David      True
print()  # Blank line between examples

# 5. A list of dtypes selects the union: boolean plus numeric columns
bool_and_numeric_cols = df.select_dtypes(include=['bool', 'number'])
print("Boolean and Numeric Columns:", bool_and_numeric_cols, sep="\n")
# Output:
#    Age   Salary  Employed
# 0   25  50000.0      True
# 1   30  60000.0     False
# 2   35  70000.0      True
# 3   40  80000.0      True
print()  # Blank line between examples

# 6. include and exclude can be combined: numeric columns minus the floats
int_cols_only = df.select_dtypes(include='number', exclude='float')
print("Only Integer Columns (Exclude Floats):", int_cols_only, sep="\n")
# Output:
#    Age
# 0   25
# 1   30
# 2   35
# 3   40
print()  # Blank line between examples

# Advanced Use Case 1: Selecting specific numeric types
# A second frame holding integer, float and boolean columns
data_mixed = {
    'Integers': [1, 2, 3, 4],
    'Floats': [1.5, 2.5, 3.5, 4.5],
    'Booleans': [True, False, True, False],
}
df_mixed = pd.DataFrame(data_mixed)

# 'int' picks the integer column and leaves floats and booleans out
int_cols = df_mixed.select_dtypes(include='int')
print("Integer Columns:", int_cols, sep="\n")
# Output:
#    Integers
# 0         1
# 1         2
# 2         3
# 3         4
print()  # Blank line between examples

# Advanced Use Case 2: Selecting categorical columns
# Attach a categorical column to the first frame
df['Department'] = pd.Categorical(['HR', 'Engineering', 'Marketing', 'HR'])

# 'category' selects only the categorical column that was just added
cat_cols = df.select_dtypes(include='category')
print("Categorical Columns:", cat_cols, sep="\n")
# Output:
#     Department
# 0           HR
# 1  Engineering
# 2    Marketing
# 3           HR


##### astype()

In [None]:
import pandas as pd

# Example 1: Convert Strings to Dates
# 'Joining_Date' starts out as ISO-formatted date strings
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Joining_Date': ['2021-01-01', '2020-06-15', '2019-12-22', '2022-03-10'],
}
df = pd.DataFrame(data)

# astype('datetime64[ns]') turns the string column into a datetime column
df['Joining_Date'] = df['Joining_Date'].astype('datetime64[ns]')
print("DataFrame after converting 'Joining_Date' to datetime:", df, sep="\n")
# Output:
#       Name Joining_Date
# 0    Alice   2021-01-01
# 1      Bob   2020-06-15
# 2  Charlie   2019-12-22
# 3    David   2022-03-10
print()  # Blank line between examples

# Example 2: Convert Strings/Numbers to Categorical
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Department': ['HR', 'Engineering', 'Marketing', 'HR'],
}
df = pd.DataFrame(data)

# astype('category') changes the dtype; the printed values look the same
df['Department'] = df['Department'].astype('category')
print("DataFrame after converting 'Department' to categorical:", df, sep="\n")
# Output:
#       Name   Department
# 0    Alice           HR
# 1      Bob  Engineering
# 2  Charlie    Marketing
# 3    David           HR
print("Dtype of 'Department':", df['Department'].dtype)
# Output:
# Dtype of 'Department': category
print()  # Blank line between examples

# Example 3: Convert Object to Numeric
# Both numeric-looking columns are stored as strings here
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': ['25', '30', '35', '40'],
    'Salary': ['50000.0', '60000.0', '70000.0', '80000.0'],
}
df = pd.DataFrame(data)

# Column by column: 'Age' becomes int, 'Salary' becomes float
df['Age'] = df['Age'].astype(int)
df['Salary'] = df['Salary'].astype(float)

print("DataFrame after converting 'Age' to int and 'Salary' to float:", df, sep="\n")
# Output:
#       Name  Age   Salary
# 0    Alice   25  50000.0
# 1      Bob   30  60000.0
# 2  Charlie   35  70000.0
# 3    David   40  80000.0
print("Dtype of 'Age':", df['Age'].dtype)
print("Dtype of 'Salary':", df['Salary'].dtype)
# Output:
# Dtype of 'Age': int64
# Dtype of 'Salary': float64
print()  # Blank line between examples

# Advanced Use Case 1: Handling Invalid Conversions
# 'unknown' cannot be parsed as a number
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': ['25', '30', 'unknown', '40'],
}
df = pd.DataFrame(data)

# pd.to_numeric with errors='coerce' replaces unparseable entries with NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

print("DataFrame after converting 'Age' with errors coerced:", df, sep="\n")
# Output:
#       Name   Age
# 0    Alice  25.0
# 1      Bob  30.0
# 2  Charlie   NaN  # 'unknown' is converted to NaN
# 3    David  40.0
print()  # Blank line between examples

# Advanced Use Case 2: Convert Multiple Columns at Once
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': ['25', '30', '35', '40'],
    'Salary': ['50000.0', '60000.0', '70000.0', '80000.0'],
}
df = pd.DataFrame(data)

# A {column: dtype} dict converts several columns in a single astype() call
df = df.astype({'Age': int, 'Salary': float})

print("DataFrame after converting 'Age' to int and 'Salary' to float using a dictionary:", df, sep="\n")
# Output:
#       Name  Age   Salary
# 0    Alice   25  50000.0
# 1      Bob   30  60000.0
# 2  Charlie   35  70000.0
# 3    David   40  80000.0


##### Handling conversion errors with to_numeric() and to_datetime()

In [None]:
import pandas as pd

# Handling Numeric Conversion with pd.to_numeric()
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': ['25', '30', 'unknown', '40'],  # 'unknown' is not a valid number
    'Salary': ['50000', '60000', 'invalid', '80000']  # 'invalid' is not a valid number
}
df = pd.DataFrame(data)

# Convert 'Age' and 'Salary' to numeric, coercing invalid entries to NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

print("DataFrame after converting 'Age' and 'Salary' with invalid entries coerced to NaN:")
print(df)
# Output:
#       Name   Age   Salary
# 0    Alice  25.0  50000.0
# 1      Bob  30.0  60000.0
# 2  Charlie   NaN      NaN
# 3    David  40.0  80000.0
print()  # Newline for clarity

# Convert to Datetime with pd.to_datetime()
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Joining_Date': ['2021-01-01', 'June 15, 2020', '22 Dec 2019', 'March 10, 2022'],
    'Invalid_Date': ['2021-01-01', 'invalid', '22 Dec 2019', 'unknown']  # Contains invalid dates
}
df = pd.DataFrame(data)

# Convert 'Joining_Date' to datetime.
# The column mixes several date formats. pandas 2.0 and later infer ONE format
# from the first value when a whole column is converted, so a single vectorized
# call would raise on 'June 15, 2020'. Parsing each value on its own avoids that
# and also works on older pandas. errors='raise' still raises for a value that
# is not a date at all.
# (On pandas >= 2.0 the vectorized alternative is
#  pd.to_datetime(df['Joining_Date'], format='mixed').)
df['Joining_Date'] = df['Joining_Date'].apply(
    lambda value: pd.to_datetime(value, errors='raise')
)
print("DataFrame after converting 'Joining_Date' to datetime:")
print(df)
# Output ('Invalid_Date' is still the raw strings at this point):
#       Name Joining_Date Invalid_Date
# 0    Alice   2021-01-01   2021-01-01
# 1      Bob   2020-06-15      invalid
# 2  Charlie   2019-12-22  22 Dec 2019
# 3    David   2022-03-10      unknown
print()  # Newline for clarity

# Convert 'Invalid_Date' to datetime with errors='coerce'.
# Parsed value by value for the same reason as above: with one inferred format
# the valid '22 Dec 2019' would be coerced to NaT together with the bad entries.
df['Invalid_Date'] = df['Invalid_Date'].apply(
    lambda value: pd.to_datetime(value, errors='coerce')
)

print("DataFrame after converting 'Invalid_Date' to datetime (coercing invalid values):")
print(df)
# Output:
#       Name Joining_Date Invalid_Date
# 0    Alice   2021-01-01   2021-01-01
# 1      Bob   2020-06-15          NaT  # 'invalid' converted to NaT (Not a Time)
# 2  Charlie   2019-12-22   2019-12-22
# 3    David   2022-03-10          NaT  # 'unknown' converted to NaT (Not a Time)
print()  # Newline for clarity

# Explicitly specifying date format for conversion.
# Start again from the original strings: df['Joining_Date'] is already a
# datetime column, so re-converting it would leave every value unchanged and
# the effect of `format` would not be visible.
df['Joining_Date'] = pd.to_datetime(
    pd.Series(data['Joining_Date']), format='%Y-%m-%d', errors='coerce'
)

print("DataFrame after explicitly converting 'Joining_Date' with a specified format:")
print(df)
# Output (only the first string matches '%Y-%m-%d'; the others are coerced to NaT):
#       Name Joining_Date Invalid_Date
# 0    Alice   2021-01-01   2021-01-01
# 1      Bob          NaT          NaT
# 2  Charlie          NaT   2019-12-22
# 3    David          NaT          NaT


##### drop_duplicates()

In [None]:
import pandas as pd

# Creating a DataFrame with duplicate rows (rows 3 and 4 repeat rows 0 and 1)
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'David'],
    'Age': [25, 30, 35, 25, 30, 40],
    'Department': ['HR', 'Engineering', 'Marketing', 'HR', 'Engineering', 'Sales']
}
df = pd.DataFrame(data)

# Dropping duplicate rows based on all columns
df_no_duplicates = df.drop_duplicates()
print("DataFrame after dropping duplicates (considering all columns):")
print(df_no_duplicates)
# Output:
#       Name  Age   Department
# 0    Alice   25           HR
# 1      Bob   30  Engineering
# 2  Charlie   35    Marketing
# 5    David   40        Sales

print()  # Newline for clarity

# Dropping duplicates based on the 'Name' column only
df_no_duplicates_name = df.drop_duplicates(subset=['Name'])
print("DataFrame after dropping duplicates based on the 'Name' column:")
print(df_no_duplicates_name)
# Output:
#       Name  Age   Department
# 0    Alice   25           HR
# 1      Bob   30  Engineering
# 2  Charlie   35    Marketing
# 5    David   40        Sales

print()  # Newline for clarity

# Dropping duplicates based on the 'Name' column, but keeping the last occurrence
# (the surviving rows stay in their original order, so 'Charlie' at index 2 comes first)
df_no_duplicates_last = df.drop_duplicates(subset=['Name'], keep='last')
print("DataFrame after dropping duplicates based on the 'Name' column, keeping the last occurrence:")
print(df_no_duplicates_last)
# Output:
#       Name  Age   Department
# 2  Charlie   35    Marketing
# 3    Alice   25           HR
# 4      Bob   30  Engineering
# 5    David   40        Sales

print()  # Newline for clarity

# Dropping all occurrences of duplicate names
df_no_duplicates_all = df.drop_duplicates(subset=['Name'], keep=False)
print("DataFrame after dropping all duplicates in the 'Name' column:")
print(df_no_duplicates_all)
# Output:
#       Name  Age   Department
# 2  Charlie   35    Marketing
# 5    David   40        Sales

print()  # Newline for clarity

# Dropping duplicates in-place (modifies the original DataFrame)
df.drop_duplicates(subset=['Name'], inplace=True)
print("Original DataFrame after dropping duplicates in-place:")
print(df)
# Output:
#       Name  Age   Department
# 0    Alice   25           HR
# 1      Bob   30  Engineering
# 2  Charlie   35    Marketing
# 5    David   40        Sales


##### duplicated()

In [None]:
import pandas as pd

# Sample frame in which rows 3 and 5 repeat rows 0 and 1
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David', 'Bob'],
    'Age': [25, 30, 35, 25, 40, 30],
    'Department': ['HR', 'Engineering', 'Marketing', 'HR', 'Sales', 'Engineering'],
}

df = pd.DataFrame(data)

# df.duplicated() yields a boolean mask; using it as a row selector keeps the
# repeated rows (all columns compared, first occurrence not flagged)
duplicated_rows = df.loc[df.duplicated()]
print("Duplicated rows based on all columns:", duplicated_rows, sep="\n")
# Output:
#     Name  Age   Department
# 3  Alice   25           HR
# 5    Bob   30  Engineering

print()  # Blank line between examples

# Compare on the 'Name' column only
duplicated_rows_name = df.loc[df.duplicated(subset=['Name'])]
print("Duplicated rows based on the 'Name' column:", duplicated_rows_name, sep="\n")
# Output:
#     Name  Age   Department
# 3  Alice   25           HR
# 5    Bob   30  Engineering

print()  # Blank line between examples

# keep='last' leaves the last occurrence unflagged, so the earlier copies are selected
duplicated_rows_last = df.loc[df.duplicated(keep='last')]
print("Duplicated rows based on all columns, keeping the last occurrence:", duplicated_rows_last, sep="\n")
# Output:
#     Name  Age   Department
# 0  Alice   25           HR
# 1    Bob   30  Engineering

print()  # Blank line between examples

# keep=False flags every row that has a duplicate, first and last alike
all_duplicated_rows = df.loc[df.duplicated(keep=False)]
print("All duplicated rows (both first and last occurrences):", all_duplicated_rows, sep="\n")
# Output:
#     Name  Age   Department
# 0  Alice   25           HR
# 1    Bob   30  Engineering
# 3  Alice   25           HR
# 5    Bob   30  Engineering
