<a href="https://colab.research.google.com/github/jadhav-rakesh/ML/blob/main/ds5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

* Nominal
* Ordinal

#Encoding Nominal Categorical Features

In [16]:
#feature with nominal classes that has no intrinsic ordering (e.g., apple, pear, banana), and you want to encode the feature into numerical values

from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

features = np.array([["Texas"],
                    ["California"],
                    ["Texas"],
                    ["Delaware"],
                    ["Texas"]])

one_hot = LabelBinarizer()

one_hot.fit_transform(features)


array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [17]:
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [18]:
one_hot.inverse_transform(one_hot.transform(features))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [19]:
# use pandas to one-hot encode the feature:

pd.get_dummies(features[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [24]:
multiclass_feature = [("Texas", "Florida"),
                      ("California", "Alabama"),
                      ("Texas", "Florida"),
                      ("Delaware", "Florida"),
                      ("Texas", "Alabama")]

one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [21]:
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delaware', 'Florida', 'Texas'],
      dtype=object)

* * after one-hot encoding a feature, we drop one of the one-hot encoded features in the resulting matrix to avoid linear dependence.


#Encoding Ordinal Categorical Features

In [26]:
df = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

scale_mapper = {"Low":1,
                "Medium":2,
                "High":3}

df["Score"].replace(scale_mapper)

  df["Score"].replace(scale_mapper)


Unnamed: 0,Score
0,1
1,1
2,2
3,2
4,3


In [28]:
df = pd.DataFrame({"Score": ["Low",
                                    "Low",
                                    "Medium",
                                    "Medium",
                                    "High",
                                    "Barely More Than Medium"]})

scale_mapper = {"Low":1,
                "Medium":2,
                "Barely More Than Medium":3,
                "High":4}

df["Score"].replace(scale_mapper)

  df["Score"].replace(scale_mapper)


Unnamed: 0,Score
0,1
1,1
2,2
3,2
4,4
5,3


In [30]:
scale_mapper = {"Low":1,
                "Medium":2,
                "Barely More Than Medium":2.1,
                "High":3}

df["Score"].replace(scale_mapper)

  df["Score"].replace(scale_mapper)


Unnamed: 0,Score
0,1.0
1,1.0
2,2.0
3,2.0
4,3.0
5,2.1


#Encoding Dictionaries of Features

In [34]:
from sklearn.feature_extraction import DictVectorizer

data_dict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]

dictvectorizer = DictVectorizer(sparse=False)

features = dictvectorizer.fit_transform(data_dict)
print(features)

[[4. 2. 0.]
 [3. 4. 0.]
 [0. 1. 2.]
 [0. 2. 2.]]


In [38]:
feature_names = dictvectorizer.get_feature_names_out()

feature_names



array(['Blue', 'Red', 'Yellow'], dtype=object)

In [40]:
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [41]:
doc_1_word_count = {"Red": 2, "Blue": 4}
doc_2_word_count = {"Red": 4, "Blue": 3}
doc_3_word_count = {"Red": 1, "Yellow": 2}
doc_4_word_count = {"Red": 2, "Yellow": 2}

doc_word_counts = [doc_1_word_count,
                   doc_2_word_count,
                   doc_3_word_count,
                   doc_4_word_count]

dictvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

#Imputing Missing Class Values

In [45]:
#categorical feature containing missing values that you want to replace with predicted values.

from sklearn.neighbors import KNeighborsClassifier

X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])

X_with_nan = np.array([[np.nan, 0.87, 1.31],
                       [np.nan, -0.67, -0.22]])

clf = KNeighborsClassifier(3, weights="distance")
trained_model = clf.fit(X[:, 1:], X[:, 0])

imputed_values = trained_model.predict(X_with_nan[:, 1:])

X_with_imputed = np.hstack((imputed_values.reshape(-1, 1), X_with_nan[:, 1:]))

np.vstack((X_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [46]:
from sklearn.impute import SimpleImputer

X_complete = np.vstack((X_with_nan, X))

imputer = SimpleImputer(strategy = "most_frequent")

imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

iris = load_iris()

features = iris.data

target = iris.target

features = features[40:, :]
target = target[40:]

target = np.where((target==0), 0, 1)

target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [50]:
weights = {0:0.9, 1:0.1}

RandomForestClassifier(class_weight=weights)

In [51]:
RandomForestClassifier(class_weight="balanced")

* In downsampling, we randomly sample without replacement from the majority class (i.e., the class with more observations) to create a new subset of observations equal in size to the minority class.

In [53]:
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

n_class0 = len(i_class0)
n_class1 = len(i_class1)

i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)

np.hstack((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [55]:
np.vstack((features[i_class0, :], features[i_class1_downsampled,:]))[0:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

* In upsampling, for every observation in the majority class, we randomly select an observation from the minority class with replacement

In [57]:
i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)

np.concatenate((target[i_class0_upsampled], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [58]:
np.vstack((features[i_class0_upsampled, :], features[i_class1, :]))[0:5]

array([[5.1, 3.8, 1.6, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.3, 1.4, 0.2]])

#imbalanced classes


1. simply to collect more observations
2. model evaluation metric better suited to imbalanced classes[confusion matrices, precision, recall, F1 scores, and ROC curves]
3. use the class weighing parameters included in implementations of some models
4. downsampling
5. upsampling
