<a href="https://colab.research.google.com/github/jyoti-kumbhar/Data-Science/blob/main/prac3b_handling_categorical_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Practical 3B


Aim: Perform feature dummification to convert categorical variables into numerical representations.


T091 Jyoti Kumbhar

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [3]:
df=pd.read_csv("/content/weather_forecast.csv")

In [4]:
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [5]:
# Encoding Nominal Categorical Features
one_hot = LabelBinarizer()
one_hot.fit_transform(df['Temperature'])

array([[0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [6]:
one_hot.classes_

array(['Cool', 'Hot', 'Mild'], dtype='<U4')

In [7]:
one_hot.inverse_transform(one_hot.transform(df['Temperature']))

array(['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild',
       'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'], dtype='<U4')

In [8]:
# Using pandas for One-Hot Encoding
df1 = pd.get_dummies(df, columns=['Outlook', 'Humidity'], drop_first=True)
df1.head()

Unnamed: 0,Temperature,Windy,Play,Outlook_Rain,Outlook_Sunny,Humidity_Normal
0,Hot,Weak,No,False,True,False
1,Hot,Strong,No,False,True,False
2,Hot,Weak,Yes,False,False,False
3,Mild,Weak,Yes,True,False,False
4,Cool,Weak,Yes,True,False,True


In [9]:
#Encoding Multi-Label Features
one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(df)

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1]])

In [12]:
#Encoding Ordinal Categorical Features
scale_mapper = {
    "Hot": 1,
    "Mild": 2,
    "Cool": 3
}

df["Temperature"].replace(scale_mapper).head()

  df["Temperature"].replace(scale_mapper).head()


Unnamed: 0,Temperature
0,1
1,1
2,1
3,2
4,3


In [14]:
# Encoding Dictionaries of Features
from sklearn.feature_extraction import DictVectorizer

dictvectorizer = DictVectorizer(sparse=False)
# converts dataframe into dictionary
df3=df.to_dict(orient='records')
features = dictvectorizer.fit_transform(df3)
dictvectorizer.get_feature_names_out()

array(['Humidity=High', 'Humidity=Normal', 'Outlook=Overcast',
       'Outlook=Rain', 'Outlook=Sunny', 'Play=No', 'Play=Yes',
       'Temperature=Cool', 'Temperature=Hot', 'Temperature=Mild',
       'Windy=Strong', 'Windy=Weak'], dtype=object)

In [19]:
df4 = df.copy()
# chooses 5% of the dataframe entries to be NaN
nan_mask = np.random.rand(*df4.shape) < 0.05
df4 = df4.mask(nan_mask)

In [24]:
# Fill Missing with Most Frequent
from sklearn.impute import SimpleImputer
X_complete = np.vstack((df4, df4))
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit_transform(X_complete)

array([['Sunny', 'Hot', 'High', 'Weak', 'No'],
       ['Sunny', 'Hot', 'High', 'Strong', 'No'],
       ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
       ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
       ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
       ['Sunny', 'Mild', 'High', 'Weak', 'No'],
       ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
       ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
       ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
       ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       ['Rain', 'Mild', 'High', 'Strong', 'No'],
       ['Sunny', 'Hot', 'High', 'Weak', 'No'],
       ['Sunny', 'Hot', 'High', 'Strong', 'No'],
       ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
       ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['Rain', 'Cool', 'Normal', 'Strong', 'No

In [37]:
# Handling Imbalanced Classes
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
df_encoded = df.apply(LabelEncoder().fit_transform)
# Split features & target
X = df_encoded.drop("Play", axis=1)
y = df_encoded["Play"]

# make binary target
target = np.where(y == 0, 0, 1)
RandomForestClassifier(class_weight="balanced")

In [39]:
# Downsampling majority class
# Indices for each class
i_class0 = np.where(y == 0)[0]
i_class1 = np.where(y == 1)[0]

# Count
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# Downsample majority class (Yes)
i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)

# Combine downsampled Yes + No
X_down = np.vstack((X.iloc[i_class0], X.iloc[i_class1_downsampled]))
y_down = np.hstack((y.iloc[i_class0], y.iloc[i_class1_downsampled]))

print("After downsampling:", np.bincount(y_down))

After downsampling: [5 5]


In [40]:
#Upsampling majority class
i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)

# Combine upsampled No + Yes
X_up = np.vstack((X.iloc[i_class0_upsampled], X.iloc[i_class1]))
y_up = np.hstack((y.iloc[i_class0_upsampled], y.iloc[i_class1]))

print("After upsampling:", np.bincount(y_up))


After upsampling: [9 9]
