<a href="https://colab.research.google.com/github/kilos11/Machine_Learning-Tensorflow-/blob/main/OneHotEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

In [1]:
# Mount Google Drive into the Colab VM; the dataset read later in this notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the cars dataset inside the mounted Google Drive.
cars_csv_path = '/content/drive/MyDrive/100-days-of-machine-learning-main/day27-one-hot-encoding/cars.csv'
df = pd.read_csv(cars_csv_path)

In [4]:
# Preview the first rows to confirm the load: brand, km_driven, fuel, owner and the selling_price column.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [5]:
# Row count per 'owner' category; the levels are heavily imbalanced, which matters when choosing an encoding.
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

## 1. OneHotEncoding using Pandas

In [None]:
# One-hot encode the two categorical columns: one indicator column per category level.
# dtype=int pins the indicators to 0/1 integers. Without it the result depends on the
# installed pandas: releases >= 2.0 emit True/False booleans, older ones emit uint8.
pd.get_dummies(df, columns=['fuel', 'owner'], dtype=int)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


## 2. K-1 OneHotEncoding

In [7]:
# K-1 ("dummy") encoding: one indicator column per category level except the first.
# drop_first=True removes the first level of each encoded column; that level is implied
# whenever all remaining indicators of the feature are 0, which avoids perfectly collinear
# columns in regression models.
# dtype=int keeps the indicators as 0/1 integers on every pandas version (pandas >= 2.0
# would otherwise produce booleans).
encoded_df = pd.get_dummies(df, columns=['fuel', 'owner'], drop_first=True, dtype=int)


## 3. OneHotEncoding using Sklearn

In [6]:
from sklearn.model_selection import train_test_split

# Features are the first four columns of df; the target is its last column.
features = df.iloc[:, :4]
target = df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=2,
)


In [None]:
# Inspect the training features; rows keep their original labels, so the index is shuffled rather than 0..n.
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Encoder for the categorical columns.
#   drop='first'    -> omit the first level of every feature (K-1 encoding)
#   dtype=np.int32  -> emit 32-bit integer 0/1 indicators
#   dense output    -> return a NumPy array rather than a SciPy sparse matrix
# The dense-output switch was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old name was removed in 1.4, so try the current keyword first and fall back to
# the legacy one only when the installed release does not recognise it.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)


In [None]:
# Fit the encoder on the training rows only and encode their two categorical columns in the same call,
# so the category list is learned without looking at the test split.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])

In [None]:
# Encode the test rows with the already-fitted encoder (transform, not fit_transform) so train and test share columns.
# NOTE(review): the encoder is created without handle_unknown, so a category absent from X_train would
# presumably raise here — confirm.
X_test_new = ohe.transform(X_test[['fuel','owner']])

In [None]:
# Sanity check on the encoded training matrix.
# Expect 7 columns: 4 fuel levels and 5 owner levels, minus the one level dropped per feature.
X_train_new.shape

(6502, 7)

In [None]:
# Rebuild the full training matrix: the untouched 'brand' and 'km_driven' columns on the left,
# the one-hot columns produced by the encoder on the right (joined along the column axis).
passthrough_values = X_train[['brand', 'km_driven']].to_numpy()
combined_array = np.concatenate((passthrough_values, X_train_new), axis=1)


## 4. OneHotEncoding with Top Categories

In [None]:
# Number of rows per brand; used below to decide which brands are too rare to get their own column.
counts = df['brand'].value_counts()

In [None]:
# Brands with at most this many rows are pooled into a single 'uncommon' bucket further down.
threshold = 100
# Number of distinct brands. Kept as the last expression of the cell so the notebook actually
# displays it; on any earlier line the value is computed and silently discarded.
df['brand'].nunique()

In [None]:
# Labels of the brands whose row count does not exceed the threshold; these get pooled together.
is_rare = counts.le(threshold)
repl = counts.loc[is_rare].index

In [None]:
# One-hot encode 'brand' after collapsing every rare brand into one 'uncommon' level,
# then show a handful of rows.
# dtype=int keeps the indicators as 0/1 integers on every pandas version (pandas >= 2.0
# would otherwise produce booleans).
# random_state fixes which rows are drawn, so re-running the notebook shows the same preview.
brand_grouped = df['brand'].replace(repl, 'uncommon')
pd.get_dummies(brand_grouped, dtype=int).sample(5, random_state=2)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
8093,0,0,0,0,1,0,0,0,0,0,0,0,0
3274,0,0,0,0,0,0,1,0,0,0,0,0,0
2966,0,0,0,0,0,0,1,0,0,0,0,0,0
1092,1,0,0,0,0,0,0,0,0,0,0,0,0
5355,0,0,0,0,0,0,0,0,0,0,0,0,1
