In [16]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Integer-encode the 'Country' column of the CSV and display the resulting frame.
encoder = LabelEncoder()

# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, which looks
# like a saved index; consider read_csv(..., index_col=0) - confirm against the file.
df = pd.read_csv('countries_encode.csv')

# Fit and transform exactly once. Re-fitting the encoder on the already-encoded
# integers would overwrite encoder.classes_ with 0..k-1, so
# encoder.inverse_transform could no longer recover the original country names.
df['Country'] = encoder.fit_transform(df['Country'])

df

Unnamed: 0,Participants,Country
0,14,0
1,17,2
2,20,1
3,24,2
4,42,0
5,34,2
6,53,0
7,2,1


In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

# Toy dataset used by the encoding demonstrations: three categorical
# columns (education, color, size) plus one numeric column (rating).
data = dict(
    education=['High School', 'Bachelor', 'Master', 'PhD', 'High School', 'Bachelor'],
    color=['Red', 'Blue', 'Green', 'Red', 'Blue', 'Orange'],
    size=['Small', 'Medium', 'Large', 'Medium', 'Small', 'Large'],
    rating=[3, 4, 5, 4, 2, 5],
)
df = pd.DataFrame(data)

# Show the untouched frame before any encoded columns are added.
print("Original Data:", df, sep="\n")

Original Data:
     education   color    size  rating
0  High School     Red   Small       3
1     Bachelor    Blue  Medium       4
2       Master   Green   Large       5
3          PhD     Red  Medium       4
4  High School    Blue   Small       2
5     Bachelor  Orange   Large       5


In [18]:
# Label Encoder - maps each distinct category to an integer code.
# The codes follow the sorted order of the unique values, so they carry
# no domain meaning (e.g. 'Bachelor' sorts before 'High School').
le = LabelEncoder()        # encoder for the education column
le_color = LabelEncoder()  # separate encoder so each column keeps its own classes_

label_jobs = (
    (le, 'education', 'education_label', "\nLabel Encoding (Education):"),
    (le_color, 'color', 'color_label', "\nLabel Encoding (Color):"),
)

for fitted_encoder, source_col, target_col, heading in label_jobs:
    # Fit on the raw strings and store the integer codes next to them.
    df[target_col] = fitted_encoder.fit_transform(df[source_col])
    print(heading)
    print(df[[source_col, target_col]])


Label Encoding (Education):
     education  education_label
0  High School                1
1     Bachelor                0
2       Master                2
3          PhD                3
4  High School                1
5     Bachelor                0

Label Encoding (Color):
    color  color_label
0     Red            3
1    Blue            0
2   Green            1
3     Red            3
4    Blue            0
5  Orange            2


In [19]:
# Ordinal Encoder - encodes categories as their position in an explicitly
# supplied ordering, so the numeric codes preserve the intended rank.
size_order = [['Small', 'Medium', 'Large']]
education_order = [['High School', 'Bachelor', 'Master', 'PhD']]

# Size: Small < Medium < Large
oe = OrdinalEncoder(categories=size_order)
size_codes = oe.fit_transform(df[['size']])  # encoder expects a 2-D input, hence df[['size']]
df['size_ordinal'] = size_codes

print("\nOrdinal Encoding (Size - has natural order):")
print(df[['size', 'size_ordinal']])

# Education: High School < Bachelor < Master < PhD
oe_edu = OrdinalEncoder(categories=education_order)
education_codes = oe_edu.fit_transform(df[['education']])
df['education_ordinal'] = education_codes

print("\nOrdinal Encoding (Education - has progression):")
print(df[['education', 'education_ordinal']])


Ordinal Encoding (Size - has natural order):
     size  size_ordinal
0   Small           0.0
1  Medium           1.0
2   Large           2.0
3  Medium           1.0
4   Small           0.0
5   Large           2.0

Ordinal Encoding (Education - has progression):
     education  education_ordinal
0  High School                0.0
1     Bachelor                1.0
2       Master                2.0
3          PhD                3.0
4  High School                0.0
5     Bachelor                1.0


In [None]:
# One-Hot Encoder - creates separate binary indicator columns per category
ohe = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' avoids multicollinearity

# Apply to color (no natural order)
color_encoded = ohe.fit_transform(df[['color']])

# Let the encoder report its own output column names: this stays correct if the
# `drop` strategy changes, unlike manually slicing ohe.categories_.
# .tolist() gives a plain list so it can be concatenated with ['color'] below.
color_columns = ohe.get_feature_names_out(['color']).tolist()

# Reuse df's index so the concat below aligns row-for-row even when df does not
# have a default RangeIndex.
color_df = pd.DataFrame(color_encoded, columns=color_columns, index=df.index)

# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not append duplicate columns to df.
df = pd.concat([df.drop(columns=color_columns, errors='ignore'), color_df], axis=1)

print("\nOne-Hot Encoding (Color - no natural order):")
print(df[['color'] + color_columns])


One-Hot Encoding (Color - no natural order):
    color  color_Green  color_Orange  color_Red
0     Red          0.0           0.0        1.0
1    Blue          0.0           0.0        0.0
2   Green          1.0           0.0        0.0
3     Red          0.0           0.0        1.0
4    Blue          0.0           0.0        0.0
5  Orange          0.0           1.0        0.0


In [21]:
# Summary of encoding methods: one row per encoder, describing when to use it,
# what it produces, and a small example.
guide_columns = ['Encoder', 'Use Case', 'Output', 'Example']
guide_rows = [
    (
        'Label Encoder',
        'Target variable, single column preprocessing',
        'Single column with integers',
        'Species: [cat=0, dog=1, bird=2]',
    ),
    (
        'Ordinal Encoder',
        'Categories with natural order (Small→Medium→Large)',
        'Single column with ordered integers',
        'Education: [HS=0, Bachelor=1, Master=2, PhD=3]',
    ),
    (
        'One-Hot Encoder',
        'Categories without order (Color, Country, etc.)',
        'Multiple binary columns',
        'Color: [Red=[1,0,0], Blue=[0,1,0], Green=[0,0,1]]',
    ),
]
encoding_guide = pd.DataFrame(guide_rows, columns=guide_columns)

# Banner followed by one indented stanza per encoder.
banner = "=" * 80
print("\n" + banner)
print("ENCODING METHODS GUIDE:")
print(banner)
for encoder_name, use_case, output_desc, example in encoding_guide.itertuples(index=False, name=None):
    print(f"\n{encoder_name}:")
    print(f"  Use Case: {use_case}")
    print(f"  Output: {output_desc}")
    print(f"  Example: {example}")


ENCODING METHODS GUIDE:

Label Encoder:
  Use Case: Target variable, single column preprocessing
  Output: Single column with integers
  Example: Species: [cat=0, dog=1, bird=2]

Ordinal Encoder:
  Use Case: Categories with natural order (Small→Medium→Large)
  Output: Single column with ordered integers
  Example: Education: [HS=0, Bachelor=1, Master=2, PhD=3]

One-Hot Encoder:
  Use Case: Categories without order (Color, Country, etc.)
  Output: Multiple binary columns
  Example: Color: [Red=[1,0,0], Blue=[0,1,0], Green=[0,0,1]]
