Given a Pandas DataFrame, remove duplicate rows and reset the index of the DataFrame.


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.stats import mstats

# 1. Given a Pandas DataFrame, remove duplicate rows and reset the index of the DataFrame.
def remove_duplicates_and_reset_index(df):
    """
    Return a de-duplicated version of ``df`` with a fresh positional index.

    Args:
        df (pd.DataFrame): Frame that may contain repeated rows.

    Returns:
        pd.DataFrame: A new frame in which every distinct row appears once,
        indexed 0..n-1. The frame passed in is not modified.
    """
    # drop=True throws the old index away instead of keeping it as a column.
    return df.drop_duplicates().reset_index(drop=True)

# Sample usage: the (2, 'b') and (4, 'd') rows each appear twice.
data = {
    'col1': [1, 2, 2, 3, 4, 4, 5],
    'col2': ['a', 'b', 'b', 'c', 'd', 'd', 'e'],
}
df = pd.DataFrame(data)
df_cleaned = remove_duplicates_and_reset_index(df)
print("DataFrame after removing duplicates and resetting index:\n", df_cleaned)


Implement a program that reads a CSV file into a Pandas DataFrame and handles missing values using imputation.


In [None]:


# 2. Implement a program that reads a CSV file into a Pandas DataFrame and handles missing values using Imputation.
def handle_missing_values_with_imputation(csv_file, strategy='mean', fill_value=None):
    """
    Reads a CSV file into a Pandas DataFrame and handles missing values using imputation.

    Numeric and non-numeric columns are imputed separately so that each group
    keeps a sensible dtype. The 'mean' and 'median' strategies are only defined
    for numbers, so non-numeric columns are left untouched for those two.
    Columns that contain no observed value at all are skipped, because there is
    nothing to derive a replacement from.

    Args:
        csv_file (str): Path to the CSV file.
        strategy (str, optional): Imputation strategy. Can be 'mean', 'median', 'most_frequent', or 'constant'.
                                 Defaults to 'mean'.
        fill_value (optional): Replacement used when `strategy` is 'constant'. It is passed to
                               every column group, so it must be valid for the columns being
                               imputed. If None, SimpleImputer's own default is used.

    Returns:
        pd.DataFrame: The DataFrame with missing values imputed, or None if the file does not exist.
    """
    # Only the read can raise FileNotFoundError, so keep the try body minimal.
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File '{csv_file}' not found.")
        return None

    numeric_cols = list(df.select_dtypes(include=np.number).columns)
    if strategy in ('mean', 'median'):
        column_groups = [numeric_cols]
    else:
        numeric_set = set(numeric_cols)
        other_cols = [col for col in df.columns if col not in numeric_set]
        column_groups = [numeric_cols, other_cols]

    for group in column_groups:
        # Skip all-empty columns: the imputer may drop them, which would make
        # the transformed array narrower than the columns being assigned.
        cols = [col for col in group if df[col].notna().any()]
        if not cols:
            continue
        imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
        df[cols] = imputer.fit_transform(df[cols])

    return df

# Sample CSV file creation: every column is missing exactly one value.
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [6, np.nan, 8, 9, 10],
    'C': [11, 12, 13, np.nan, 15],
}
df = pd.DataFrame(data)
df.to_csv('missing_data.csv', index=False)

# Read the file back and fill the gaps with each column's mean.
df_imputed = handle_missing_values_with_imputation('missing_data.csv', strategy='mean')
if df_imputed is not None:
    print("\nDataFrame after imputation:\n", df_imputed)



Create a function that takes a Pandas DataFrame and converts text data into numerical values using One-Hot Encoding.


In [None]:

# 3. Create a function that takes a Pandas DataFrame and converts text data into numerical values using One-Hot Encoding.
def one_hot_encode_categorical_data(df, columns_to_encode=None):
    """
    Replace text columns with One-Hot (dummy) indicator columns.

    Args:
        df (pd.DataFrame): Frame holding the columns to encode.
        columns_to_encode (list, optional): Names of the columns to encode.
            When None, every object-dtype column is encoded. Defaults to None.

    Returns:
        pd.DataFrame: A new frame in which each encoded column is replaced by
        its indicator columns; the first level of every column is dropped.
    """
    targets = columns_to_encode
    if targets is None:
        targets = df.select_dtypes(include=['object']).columns

    # Dropping the first level keeps the indicators from being perfectly collinear.
    return pd.get_dummies(df, columns=targets, drop_first=True)

# Sample usage: 'Product' and 'Color' are text columns, 'Price' is numeric.
data = {
    'Product': ['A', 'B', 'A', 'C', 'B'],
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Blue'],
    'Price': [10, 20, 15, 25, 18],
}
df = pd.DataFrame(data)
df_encoded = one_hot_encode_categorical_data(df)
print("\nDataFrame after One-Hot Encoding:\n", df_encoded)


Given a Pandas DataFrame, normalize the numerical features using Z-Score Normalization.


In [None]:


# 4. Given a Pandas DataFrame, normalize the numerical features using Z-Score Normalization.
def normalize_numerical_features_zscore(df, columns_to_normalize=None):
    """
    Normalizes numerical features using Z-Score Normalization (StandardScaler).

    The input frame is never modified; the scaled values are written to a copy.

    Args:
        df (pd.DataFrame): The input Pandas DataFrame.
        columns_to_normalize (list, optional): List of column names to normalize. If None, normalizes all numeric columns.
                                             Defaults to None.

    Returns:
        pd.DataFrame: A copy of `df` with the chosen columns replaced by their z-scores.
        If there is nothing to normalize, the copy is returned unchanged.
    """
    # Work on a copy so the caller's frame keeps its original values.
    result = df.copy()

    if columns_to_normalize is None:
        columns_to_normalize = result.select_dtypes(include=np.number).columns

    # StandardScaler rejects an input with zero features, so stop early.
    if len(columns_to_normalize) == 0:
        return result

    scaler = StandardScaler()
    result[columns_to_normalize] = scaler.fit_transform(result[columns_to_normalize])
    return result

# Sample usage: 'A' and 'B' are numeric, 'Category' is text.
data = {
    'A': [10, 20, 30, 40, 50],
    'B': [100, 200, 150, 250, 180],
    'Category': ['X', 'Y', 'X', 'Z', 'Y'],
}
df = pd.DataFrame(data)
df_normalized = normalize_numerical_features_zscore(df)
print("\nDataFrame after Z-Score Normalization:\n", df_normalized)


Write a Python program that uses Scikit-Learn to perform data standardization on a dataset.


In [None]:


# 5. Write a Python program that uses Scikit-Learn to perform data standardization on a dataset.
def standardize_data_sklearn(df, columns_to_standardize=None):
    """
    Performs data standardization using Scikit-Learn's StandardScaler.

    The input frame is never modified; the standardized values are written to a copy.

    Args:
        df (pd.DataFrame): The input Pandas DataFrame.
        columns_to_standardize (list, optional): List of column names to standardize. If None, standardizes all numeric columns.
                                               Defaults to None.

    Returns:
        pd.DataFrame: A copy of `df` with the chosen columns standardized.
        If there is nothing to standardize, the copy is returned unchanged.
    """
    # Work on a copy so the caller's frame keeps its original values.
    result = df.copy()

    if columns_to_standardize is None:
        columns_to_standardize = result.select_dtypes(include=np.number).columns

    # StandardScaler rejects an input with zero features, so stop early.
    if len(columns_to_standardize) == 0:
        return result

    scaler = StandardScaler()
    result[columns_to_standardize] = scaler.fit_transform(result[columns_to_standardize])
    return result

# Sample Usage: (Reusing df from previous example)
# NOTE: `df` is whatever the previous example left bound at module level. If
# that example's function changed the frame in place, the values passed in
# here are already scaled rather than the raw sample numbers.
df_standardized = standardize_data_sklearn(df)
print("\nDataFrame after Standardization (using Scikit-Learn):\n", df_standardized)


Implement a program that reads a JSON file into a Pandas DataFrame and handles outliers using Winsorization.


In [None]:


# 6. Implement a program that reads a JSON file into a Pandas DataFrame and handles outliers using Winsorization.
import json
from scipy.stats import mstats

def handle_outliers_with_winsorization(json_file, columns_to_winsorize=None, limits=(0.05, 0.95)):
    """
    Reads a JSON file into a Pandas DataFrame and handles outliers using Winsorization.

    `limits` is given as percentile positions, e.g. (0.05, 0.95) clips values below
    the 5th and above the 95th percentile. `scipy.stats.mstats.winsorize` instead
    expects the *fraction to clip in each tail*, so the upper percentile is
    converted to `1 - upper` before the call. Passing the percentiles straight
    through would clip the top 95% of every column.

    The number of values clipped per tail is `int(fraction * n)`, so very short
    columns may be left unchanged.

    Args:
        json_file (str): Path to the JSON file.
        columns_to_winsorize (list, optional): List of column names to winsorize. If None, winsorizes all numeric columns.
                                            Defaults to None.
        limits (tuple, optional): Tuple specifying the lower and upper percentiles for winsorization, each in
                                  [0, 1] with lower <= upper. Either entry may be None to leave that tail
                                  untouched. Defaults to (0.05, 0.95).

    Returns:
        pd.DataFrame: The DataFrame with outliers handled using Winsorization, or None if the
        file is missing or is not valid JSON.

    Raises:
        ValueError: If `limits` is not a valid pair of percentiles.
    """
    lower, upper = limits
    for bound in (lower, upper):
        if bound is not None and not 0.0 <= bound <= 1.0:
            raise ValueError(f"Winsorization limits must lie in [0, 1], got {limits!r}.")
    if lower is not None and upper is not None and lower > upper:
        raise ValueError(f"Lower limit must not exceed upper limit, got {limits!r}.")

    # Convert the upper percentile into the fraction scipy clips from the top.
    # Rounding removes float noise (1 - 0.9 == 0.0999...) that would otherwise
    # truncate to one fewer clipped value.
    upper_tail = None if upper is None else round(1.0 - upper, 12)
    tail_fractions = (lower, upper_tail)

    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File '{json_file}' not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{json_file}'.")
        return None

    df = pd.DataFrame(data)

    if columns_to_winsorize is None:
        columns_to_winsorize = df.select_dtypes(include=np.number).columns

    for col in columns_to_winsorize:
        clipped = mstats.winsorize(df[col].to_numpy(), limits=tail_fractions)
        # winsorize returns a masked array; store the plain values in the frame.
        df[col] = np.asarray(clipped)

    return df

# Sample JSON file creation: each column carries one value far from the rest.
json_data = {
    'A': [10, 20, 100, 40, 500],
    'B': [100, 200, 150, -50, 180],
    'C': [5, 10, 15, 20, 100],
}
with open('outlier_data.json', 'w') as f:
    f.write(json.dumps(json_data))

# Load the file back and winsorize every numeric column.
df_winsorized = handle_outliers_with_winsorization('outlier_data.json', limits=(0.1, 0.9))
if df_winsorized is not None:
    print("\nDataFrame after Winsorization:\n", df_winsorized)



Create a function that takes a Pandas DataFrame and removes irrelevant features using Feature Selection techniques.


In [None]:

# 7. Create a function that takes a Pandas DataFrame and removes irrelevant features using Feature Selection techniques.
def remove_irrelevant_features(df, target_col, k=5):
    """
    Removes irrelevant features using Feature Selection techniques (SelectKBest with f_classif for classification).

    Categorical features are one-hot encoded before scoring, so the selected
    columns are taken from the encoded feature matrix. A selected name such as
    'category1_B' does not exist in `df`, so indexing `df` with the selected
    names would raise a KeyError.

    Args:
        df (pd.DataFrame): The input Pandas DataFrame.
        target_col (str): The name of the target column.
        k (int, optional): The number of top features to select. Values larger than the
                           number of available (encoded) features are capped. Defaults to 5.

    Returns:
        pd.DataFrame: The selected (encoded) feature columns followed by the target column.
        If there are no feature columns to score, only the target column is returned.
    """
    y = df[target_col]
    features = df.drop(columns=[target_col])

    # Handle non-numeric features (important for f_classif)
    X = pd.get_dummies(features, drop_first=True) if features.shape[1] else features

    # Nothing to score: SelectKBest cannot be fitted on zero features.
    if X.shape[1] == 0:
        return df[[target_col]].copy()

    # SelectKBest raises when k exceeds the feature count, so cap it.
    effective_k = k if k == 'all' else min(k, X.shape[1])

    selector = SelectKBest(f_classif, k=effective_k)
    selector.fit(X, y)

    selected_features = X.columns[selector.get_support()]
    return pd.concat([X[selected_features], y], axis=1)  # Keep the target column

# Sample usage: two numeric features, two text features and a binary target.
data = dict(
    feature1=[1, 2, 3, 4, 5],
    feature2=[5, 4, 3, 2, 1],
    category1=['A', 'B', 'A', 'C', 'B'],
    category2=['X', 'Y', 'X', 'Z', 'Y'],
    target=[0, 1, 0, 1, 0],
)
df = pd.DataFrame(data)

# Keep the two highest-scoring features.
df_feature_selected = remove_irrelevant_features(df, 'target', k=2)
print("\nDataFrame after Feature Selection:\n", df_feature_selected)


Given a CSV file with customer details, preprocess the data for further analysis (e.g., handle missing values, scale features).


In [None]:


# 8. Given a CSV file with customer details, preprocess the data for further analysis (e.g., handle missing values, scale features).
def preprocess_customer_data(csv_file, id_columns=('CustomerID',)):
    """
    Preprocesses customer data from a CSV file for further analysis.
    Handles missing values, scales numerical features, and encodes categorical features.

    Steps, in order:
      1. Missing values: categorical columns are filled with their mode,
         all other columns with their median.
      2. Scaling: numeric columns are standardized, except those in `id_columns`.
      3. Encoding: categorical columns are one-hot encoded (first level dropped).

    A column counts as categorical when it is neither numeric nor boolean.

    Args:
        csv_file (str): Path to the CSV file.
        id_columns (iterable of str, optional): Identifier columns that must not be scaled.
            Names that are absent from the file are ignored. Defaults to ('CustomerID',).

    Returns:
        pd.DataFrame: The preprocessed Pandas DataFrame, or None if file not found.
    """
    # Only the read can raise FileNotFoundError, so keep the try body minimal.
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: File '{csv_file}' not found.")
        return None

    categorical_cols = df.select_dtypes(exclude=[np.number, 'bool']).columns.tolist()
    categorical_set = set(categorical_cols)

    # Handle Missing Values (Example - customize as needed)
    for col in df.columns:
        if not df[col].isna().any():
            continue
        if col in categorical_set:
            modes = df[col].mode()
            # mode() is empty when the column has no observed value at all.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])  # Impute categorical with mode
        else:
            df[col] = df[col].fillna(df[col].median())  # Impute numerical with median

    # Scale Numerical Features (Example - customize)
    numerical_cols = df.select_dtypes(include=np.number).columns
    numerical_cols = numerical_cols.drop(labels=list(id_columns), errors='ignore')
    # StandardScaler rejects empty input, e.g. a file with only ids and text.
    if len(numerical_cols) > 0 and len(df) > 0:
        df[numerical_cols] = StandardScaler().fit_transform(df[numerical_cols])

    # Encode Categorical Features (Example - customize)
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    return df

# Sample CSV file creation: 'Age', 'City' and 'Salary' each have one gap.
customer_data = {
    'CustomerID': [1, 2, 3, 4, 5],
    'Age': [25, 30, np.nan, 40, 35],
    'City': ['New York', 'London', 'Paris', 'New York', np.nan],
    'Salary': [50000, 60000, 55000, np.nan, 70000],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
}
customer_df = pd.DataFrame(customer_data)
customer_df.to_csv('customer_data.csv', index=False)

# Run the full preprocessing pipeline on the file that was just written.
df_preprocessed_customer = preprocess_customer_data('customer_data.csv')
if df_preprocessed_customer is not None:
    print("\nPreprocessed Customer Data:\n", df_preprocessed_customer)


Write a Python program that uses Scikit-Learn to perform data transformation using PCA (Principal Component Analysis).


In [2]:


# 9. Write a Python program that uses Scikit-Learn to perform data transformation using PCA (Principal Component Analysis).
def apply_pca(df, n_components=2, columns_to_use=None):
    """
    Applies Principal Component Analysis (PCA) to reduce the dimensionality of the data.

    Args:
        df (pd.DataFrame): The input Pandas DataFrame.
        n_components (optional): The number of principal components to retain. Passed straight to
                                 scikit-learn's PCA, so any value it accepts is supported (an int
                                 count, a float variance ratio, None or 'mle'). Defaults to 2.
        columns_to_use (list, optional): List of column names to use for PCA. If None, uses all numeric columns.
                                        Defaults to None.

    Returns:
        pd.DataFrame: A new Pandas DataFrame containing the principal components, with columns
        'PC1', 'PC2', ... and the same row index as `df`.
    """
    if columns_to_use is None:
        columns_to_use = df.select_dtypes(include=np.number).columns

    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df[columns_to_use])

    # Name the columns from the transformed array rather than from n_components,
    # which is not necessarily an int.
    n_kept = principal_components.shape[1]
    pc_cols = [f'PC{i}' for i in range(1, n_kept + 1)]

    # Reuse the input index so the components can be joined back onto df.
    return pd.DataFrame(data=principal_components, columns=pc_cols, index=df.index)

# Sample Usage: (Using the preprocessed customer data from the previous example)
# The preprocessing step returns None when its input file is missing, so guard
# against that the same way the preprocessing sample does.
if df_preprocessed_customer is not None:
    # Leave the unscaled 'CustomerID' identifier out of PCA: it carries no
    # information about the customer and would otherwise contribute variance.
    pca_input_cols = [
        col
        for col in df_preprocessed_customer.select_dtypes(include=np.number).columns
        if col != 'CustomerID'
    ]
    df_pca = apply_pca(df_preprocessed_customer, n_components=2, columns_to_use=pca_input_cols)
    print("\nDataFrame after PCA:\n", df_pca)



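Output: NameError: name 'df_preprocessed_customer' is not defined. (The cell was executed in a session where the task 8 cell, which defines `df_preprocessed_customer`, had not been run; run the cells in order to avoid this error.)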

Implement a function that takes a Pandas DataFrame and performs data discretization on a numerical feature.


In [1]:

# 10. Implement a function that takes a Pandas DataFrame and performs data discretization on a numerical feature.
def discretize_numerical_feature(df, col_to_discretize, bins=5, labels=None, method='equal_width'):
    """
    Performs data discretization on a numerical feature.

    The input frame is never modified; the binned column is added to a copy.

    Args:
        df (pd.DataFrame): The input Pandas DataFrame.
        col_to_discretize (str): The name of the numerical column to discretize.
        bins (int or list, optional): The number of bins ('equal_width' and 'equal_freq') or a list of
                                      bin edges ('custom'). Defaults to 5.
        labels (list, optional): Labels to assign to the bins. If None, uses default bin labels. Defaults to None.
        method (str, optional): Discretization method. Can be 'equal_width', 'equal_freq', or 'custom'.
                               Defaults to 'equal_width'. If 'custom', use `bins` as bin edges.

    Returns:
        pd.DataFrame: A copy of `df` with the discretized feature added as a new column
        named '<col_to_discretize>_binned'.

    Raises:
        ValueError: If `method` is not one of the supported names.
    """
    if method not in ('equal_width', 'equal_freq', 'custom'):
        raise ValueError("Invalid discretization method. Choose 'equal_width', 'equal_freq', or 'custom'.")

    values = df[col_to_discretize]

    if method == 'equal_width':
        binned = pd.cut(values, bins=bins, labels=labels)
    elif method == 'equal_freq':
        binned = pd.qcut(values, q=bins, labels=labels)
    else:
        # right=False makes each interval left-closed: bins[i-1] <= x < bins[i]
        binned = pd.cut(values, bins=bins, labels=labels, right=False)

    # Work on a copy so the caller's frame does not gain a column as a side effect.
    result = df.copy()
    result[f'{col_to_discretize}_binned'] = binned
    return result

# Sample Usage: (Using the preprocessed customer data from the task 8 example)
# The preprocessing step returns None when its input file is missing, so guard
# against that before calling .copy() on the result.
if df_preprocessed_customer is not None:
    df_discretized = discretize_numerical_feature(
        df_preprocessed_customer.copy(),
        'Age',
        bins=3,
        labels=['Young', 'Middle-Aged', 'Senior'],
        method='equal_width',
    )
    print("\nDataFrame after Discretization:\n", df_discretized)

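Output: NameError: name 'df_preprocessed_customer' is not defined. (The cell was executed in a session where the task 8 cell, which defines `df_preprocessed_customer`, had not been run; run the cells in order to avoid this error.)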