# nominal or one hot encoding 

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder 

In [42]:
#create a dataframe with categorical variables
df=pd.DataFrame({
    'color':['red','blue','green','red','blue','green']
})

In [43]:
df

Unnamed: 0,color
0,red
1,blue
2,green
3,red
4,blue
5,green


# create an instance of one hot encoder

In [44]:
encoder=OneHotEncoder()

In [45]:
encoded=encoder.fit_transform(df[['color']])
#it will give sparse matrix so convert it to array
#fit the encoder to the dataframe and transform the categorical variable 

# transform it to dataframe and put the column names there

In [46]:
encoded

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [47]:
encoder.get_feature_names_out()

array(['color_blue', 'color_green', 'color_red'], dtype=object)

In [48]:
encoded_df=pd.DataFrame(encoded.toarray(),columns=encoder.get_feature_names_out())

In [49]:
encoded_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


# concat the both actual and ecoded one 

In [50]:
pd.concat([df,encoded_df],axis=1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,red,0.0,0.0,1.0
4,blue,1.0,0.0,0.0
5,green,0.0,1.0,0.0


# internal practising for better performance 

In [51]:
import seaborn as sns
df1=sns.load_dataset('tips')

In [52]:
df1

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [53]:
encoder2=OneHotEncoder()

In [56]:
data=encoder2.fit_transform(df1[['sex']])

In [57]:
data

<244x2 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

In [58]:
encoder2.get_feature_names_out()

array(['sex_Female', 'sex_Male'], dtype=object)

In [59]:
encoded_df1=pd.DataFrame(data.toarray(),columns=encoder2.get_feature_names_out())

In [60]:
encoded_df1

Unnamed: 0,sex_Female,sex_Male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0
...,...,...
239,0.0,1.0
240,1.0,0.0
241,0.0,1.0
242,0.0,1.0


In [75]:
df1.drop('sex',axis=1,inplace=True)

KeyError: "['sex'] not found in axis"

In [62]:
df1

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.50,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3
240,27.18,2.00,Yes,Sat,Dinner,2
241,22.67,2.00,Yes,Sat,Dinner,2
242,17.82,1.75,No,Sat,Dinner,2


In [73]:
df1=pd.concat([df1,encoded_df1])

In [74]:
df1.head(2)

Unnamed: 0,sex_Female,sex_Male
0,0.0,0.0
1,0.0,0.0


In [69]:
df1=df1[['sex_Female','sex_Male']].fillna(0)

In [72]:
df1.head(3)

Unnamed: 0,sex_Female,sex_Male
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
