In [1]:
#importing warnings filter and ingoring all future warnings

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np 
import pandas as pd


In [3]:
data = pd.DataFrame({"Name": ["P1", "P2", "P3", "P4", "P5", "P6"], "Category": ["c1", "c2", "c3", "c2", "c3", "c1"], "Price": [100, 200, 400, 100, np.nan, 400]})
data

Unnamed: 0,Name,Category,Price
0,P1,c1,100.0
1,P2,c2,200.0
2,P3,c3,400.0
3,P4,c2,100.0
4,P5,c3,
5,P6,c1,400.0


In [4]:
type(data)

pandas.core.frame.DataFrame

In [5]:
#Encoding 
pd.get_dummies(data["Category"])

Unnamed: 0,c1,c2,c3
0,True,False,False
1,False,True,False
2,False,False,True
3,False,True,False
4,False,False,True
5,True,False,False


In [6]:
# encoding as type int
pd.get_dummies(data["Category"]).astype(int)

Unnamed: 0,c1,c2,c3
0,1,0,0
1,0,1,0
2,0,0,1
3,0,1,0
4,0,0,1
5,1,0,0


In [7]:
# encoding using OneHotEncoder

onehot = OneHotEncoder()
transformed_data = onehot.fit_transform(data["Category"].to_numpy().reshape(-1,1)).toarray()
onehot_data = pd.DataFrame(transformed_data, columns = onehot.get_feature_names_out(["Category"]))

In [8]:
# imputing missing data

imputer = SimpleImputer(strategy='mean')
imputed = imputer.fit_transform(data["Price"].to_numpy().reshape(-1,1))
imputed_data = pd.DataFrame(imputed, columns = imputer.get_feature_names_out(["Price"]))
imputed_data

Unnamed: 0,Price
0,100.0
1,200.0
2,400.0
3,100.0
4,240.0
5,400.0


In [9]:
#using Pipeline, SimpleImputer and ColumnTransformer to transform dataset

numeric_features = ["Price"]
categorical_features = ["Category"]

numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy="mean")), ("scale", StandardScaler())])

categorical_pipeline = Pipeline(steps = [('impute', SimpleImputer(strategy="most_frequent")),("one_hot", OneHotEncoder(handle_unknown='ignore'))])


full_processor_data = ColumnTransformer(transformers=[('number',numeric_pipeline, numeric_features), ('category', categorical_pipeline, categorical_features)], remainder = 'passthrough')

fully_processed_data = full_processor_data.fit_transform(data)



In [10]:
#fully_processed_data is a numpy array.
type(fully_processed_data)

#Now I want to make a pandas dataframe of it.

modified_data = pd.DataFrame(fully_processed_data, columns=full_processor_data.get_feature_names_out())

In [11]:
modified_data

Unnamed: 0,number__Price,category__Category_c1,category__Category_c2,category__Category_c3,remainder__Name
0,-1.130602,1.0,0.0,0.0,P1
1,-0.323029,0.0,1.0,0.0,P2
2,1.292117,0.0,0.0,1.0,P3
3,-1.130602,0.0,1.0,0.0,P4
4,0.0,0.0,0.0,1.0,P5
5,1.292117,1.0,0.0,0.0,P6


In [19]:
sentences = ["I like apples", "I like orange", "I don't eat bananas beacuse of the amount of sugar they contain"]

#creating the set of unique words

unique_words = set(word for sentence in sentences for word in sentence.split())

#creating a dictionary of words with its indices

word_index = {word: i for i, word in enumerate(unique_words)}

# converting a sentence to One Hot Encoder vector

def sentence_to_ohe(sentence, word_index):
    oh_vector = [0] * len(word_index)
    for word in sentence.split():
        if word in word_index:
            oh_vector[word_index[word]]=1
        else:
            print(f"Word \'{word}\' is not present in a words' dictionary.")

    return f"This is one hot encoder vector for the provided sentence: {oh_vector}."

In [20]:
sentence_to_ohe("I like apple", word_index)

Word 'apple' is not present in a words' dictionary.


'This is one hot encoder vector for the provided sentence: [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0].'

In [21]:
sentence_to_ohe("Actually I do not like apples", word_index)

Word 'Actually' is not present in a words' dictionary.
Word 'do' is not present in a words' dictionary.
Word 'not' is not present in a words' dictionary.


'This is one hot encoder vector for the provided sentence: [0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0].'