## Encoding Methods

In [2]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.5-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
Downloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Downloading statsmodels-0.14.5-cp313-cp313-win_amd64.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ------ --------------------------------- 1.6/9.6 MB 7.7 MB/s eta 0:00:02
   ------------- -------------------------- 3.1/9.6 MB 7.7 MB/s eta 0:00:01
   --------------------- ------------------ 5.2/9.6 MB 8.2 MB/s eta 0:00:01
   ------------------------------- -------- 7.6/9.6 MB 8.8 MB/s eta 0:00:01
   ---------------------------------------  9.4/9.6 MB 8.9 MB/s eta 0:00:01
   -------------------

In [3]:
# Importing necessary libraries
import pandas as pd
from category_encoders import BinaryEncoder, TargetEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder



In [4]:
# data

# ID,Color,Size,Shape
# 1,Red,Large,Circle
# 2,Green,Medium,Square
# 3,Blue,Small,Triangle
# 4,Red,Medium,Square
# 5,Green,Small,Circle
# 6,Blue,Large,Triangle


In [5]:

# Sample dataset for encoding, written row by row: (ID, Color, Size, Shape)
_column_names = ['ID', 'Color', 'Size', 'Shape']
_rows = [
    (1, 'Red', 'Large', 'Circle'),
    (2, 'Green', 'Medium', 'Square'),
    (3, 'Blue', 'Small', 'Triangle'),
    (4, 'Red', 'Medium', 'Square'),
    (5, 'Green', 'Small', 'Circle'),
    (6, 'Blue', 'Large', 'Triangle'),
]

# Pivot the rows into the column-oriented mapping {column name -> list of values}
data = {name: list(values) for name, values in zip(_column_names, zip(*_rows))}

# Build the DataFrame that every encoding cell below reads from
df = pd.DataFrame(data)

# Show the untouched input so the encoded columns can be compared against it
print("Original Dataset:")
print(df)


Original Dataset:
   ID  Color    Size     Shape
0   1    Red   Large    Circle
1   2  Green  Medium    Square
2   3   Blue   Small  Triangle
3   4    Red  Medium    Square
4   5  Green   Small    Circle
5   6   Blue   Large  Triangle


## Label Encoding

In [6]:

# --- Label Encoding (single column) ---
# Learn the set of distinct colours first, then map every row to its integer code.
label_encoder = LabelEncoder()
label_encoder.fit(df['Color'])
df['Color_Label'] = label_encoder.transform(df['Color'])

print("\nLabel Encoded 'Color' column:")
print(df[['Color', 'Color_Label']])

# Explanation:
# Label encoding replaces each distinct category with one integer.
# The classes are sorted, so the codes follow alphabetical order:
# 'Blue' -> 0, 'Green' -> 1, 'Red' -> 2
# The integers imply an ordering that colours do not really have, so this
# encoding is best kept for targets or for models that ignore numeric order.




Label Encoded 'Color' column:
   Color  Color_Label
0    Red            2
1  Green            1
2   Blue            0
3    Red            2
4  Green            1
5   Blue            0


## One-Hot Encoding

In [7]:
# --- One-Hot Encoding ---
# Only the nominal columns are expanded; despite its name, `one_hot_encoder`
# holds the resulting DataFrame of indicator columns, not an encoder object.
nominal_columns = ['Color', 'Size']
one_hot_encoder = pd.get_dummies(df[nominal_columns], drop_first=True)

print("\nOne-Hot Encoded 'Color' and 'Size' columns:")
print(one_hot_encoder)

# Explanation:
# One-hot encoding creates one indicator column per category of a feature.
# `drop_first=True` drops the first category of each feature ('Blue' and 'Large'
# here) to avoid multicollinearity; a row with all indicators False for a feature
# belongs to the dropped category.
# 'Color' -> [Color_Green, Color_Red], 'Size' -> [Size_Medium, Size_Small]




One-Hot Encoded 'Color' and 'Size' columns:
   Color_Green  Color_Red  Size_Medium  Size_Small
0        False       True        False       False
1         True      False         True       False
2        False      False        False        True
3        False       True         True       False
4         True      False        False        True
5        False      False        False       False


## Ordinal Encoding

**Explanation:** Ordinal encoding assigns a numerical value to each category based on the order provided.
Here 'Small' -> 0, 'Medium' -> 1, 'Large' -> 2.

In [9]:
# --- Ordinal Encoding (for ordered categorical data) ---
# The category list fixes the rank of each size: its position becomes its code.
size_order = ['Small', 'Medium', 'Large']
ordinal_encoder = OrdinalEncoder(categories=[size_order])

# The encoder works on 2-D input, hence the double brackets around 'Size'.
size_codes = ordinal_encoder.fit_transform(df[['Size']])
df['Size_Ordinal'] = size_codes

print("\nOrdinal Encoded 'Size' column:")
print(df[['Size', 'Size_Ordinal']])






Ordinal Encoded 'Size' column:
     Size  Size_Ordinal
0   Large           2.0
1  Medium           1.0
2   Small           0.0
3  Medium           1.0
4   Small           0.0
5   Large           2.0


## Frequency Encoding

**Explanation:** Frequency encoding replaces each category with the number of times it occurs in the dataset.
Example: 'Red' appears 2 times, so it is encoded as 2.


In [10]:
# --- Frequency Encoding ---
# Count the rows in each colour group and broadcast that count back to every row.
color_counts = df.groupby('Color')['Color'].transform('count')
df['Color_Frequency'] = color_counts

print("\nFrequency Encoded 'Color' column:")
print(df[['Color', 'Color_Frequency']])

# Explanation:
# Frequency encoding replaces a category with how often it occurs in the dataset.
# Example: 'Red' appears 2 times, so it's encoded as 2.
# In this sample every colour appears exactly twice, so the whole column is 2.



Frequency Encoded 'Color' column:
   Color  Color_Frequency
0    Red                2
1  Green                2
2   Blue                2
3    Red                2
4  Green                2
5   Blue                2


## Binary Encoding

**Explanation:** Binary encoding converts categories into binary representations.
Each category is first encoded as an integer, and that integer is then written in binary, with one output column per binary digit.


In [11]:
# --- Binary Encoding ---
# Encode only the 'Color' feature; the result is a separate frame with one
# column per binary digit (Color_0, Color_1, ...), kept apart from `df`.
color_feature = df['Color']
binary_encoder = BinaryEncoder(cols=['Color'])
df_binary = binary_encoder.fit_transform(color_feature)

print("\nBinary Encoded 'Color' column:")
print(df_binary)




Binary Encoded 'Color' column:
   Color_0  Color_1
0        0        1
1        1        0
2        1        1
3        0        1
4        1        0
5        1        1


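## Target Encoding

Target encoding needs a target variable, so we create a dummy one for this example.

**Explanation:** Target encoding replaces each category with the mean value of the target variable for that category.
For example, if 'Red' has target values of [1, 0], its mean is (1 + 0) / 2 = 0.5.
Note that `category_encoders.TargetEncoder` additionally smooths each category mean towards the global target mean, so on small datasets the encoded value can differ from the raw per-category mean.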


In [12]:
# --- Target Encoding ---
# Dummy binary target for the demonstration. Each colour ends up with one 1 and
# one 0 (rows 0/3, 1/4, 2/5), so every colour has the same target mean of 0.5.
df['Target'] = [1, 0, 1, 0, 1, 0]

target_encoder = TargetEncoder(cols=['Color'])

# Supervised encoding: the encoder sees the feature together with the target.
color_target_means = target_encoder.fit_transform(df['Color'], df['Target'])
df['Color_TargetEncoded'] = color_target_means

print("\nTarget Encoded 'Color' column based on 'Target' variable:")
print(df[['Color', 'Color_TargetEncoded']])



Target Encoded 'Color' column based on 'Target' variable:
   Color  Color_TargetEncoded
0    Red                  0.5
1  Green                  0.5
2   Blue                  0.5
3    Red                  0.5
4  Green                  0.5
5   Blue                  0.5


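## Hashing Encoding

Hashing encoding is particularly useful when we have a large number of categories.

**Explanation:** Hashing encoding applies a hash function to map each category onto a fixed number of numeric columns.
It is memory efficient because no category-to-column dictionary is stored, but different categories can collide in the same column.
scikit-learn's `FeatureHasher` uses a signed hash, so the encoded values can be -1 as well as +1 (see the output below).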


In [15]:
# --- Hashing Encoding ---
# Requires `from sklearn.feature_extraction import FeatureHasher` in the imports
# cell; without it this cell fails with a NameError on a fresh kernel.

# Width of the hashed representation. Fewer columns save memory but make it more
# likely that two different categories land in the same column (a collision).
N_HASH_FEATURES = 3

hasher = FeatureHasher(input_type='string', n_features=N_HASH_FEATURES)

# With input_type='string' each sample must be an iterable of strings; the 2-D
# array from df[['Color']] supplies one single-element row per sample.
hashed_features = hasher.transform(df[['Color']].astype(str).values)

# Column names are derived from N_HASH_FEATURES so they cannot drift out of sync
# with the hasher, and df's index is reused so the later column-wise concat
# lines the hashed rows up with the original rows.
df_hashed = pd.DataFrame(
    hashed_features.toarray(),
    columns=[f'hash{i}' for i in range(1, N_HASH_FEATURES + 1)],
    index=df.index,
)

print("\nHashing Encoded 'Color' column:")
print(df_hashed)




Hashing Encoded 'Color' column:
   hash1  hash2  hash3
0    0.0    0.0    1.0
1    0.0    1.0    0.0
2   -1.0    0.0    0.0
3    0.0    0.0    1.0
4    0.0    1.0    0.0
5   -1.0    0.0    0.0


## Output the final dataset with all encodings applied

In [13]:
# Assemble the final dataset: the original frame (which already carries the
# label, ordinal, frequency and target columns) placed side by side with the
# encodings that were built as separate frames.
encoded_frames = [df, one_hot_encoder, df_binary, df_hashed]
final_df = pd.concat(encoded_frames, axis='columns')

print("\nFinal Dataset with Encodings:")
print(final_df)


NameError: name 'df_hashed' is not defined