In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
import seaborn as sns

In [3]:
data = pd.read_csv("data/customer.csv")
data.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [4]:
data = data.iloc[:, 2:]

In [5]:
data.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [6]:
X=data.drop("purchased", axis=1)
y=data["purchased"]

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

# ORDINAL ENCODING

In [8]:
from sklearn.preprocessing import OrdinalEncoder

oenc = OrdinalEncoder(categories=[['Poor','Average','Good'],['School','UG','PG']])

In [9]:
oenc.fit(X_train)

OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

In [10]:
X_train = oenc.transform(X_train)
X_test = oenc.transform(X_test)
X_train

array([[2., 0.],
       [1., 1.],
       [2., 0.],
       [1., 1.],
       [0., 1.],
       [0., 0.],
       [1., 2.],
       [0., 1.],
       [2., 0.],
       [2., 1.],
       [0., 2.],
       [2., 1.],
       [1., 0.],
       [1., 0.],
       [2., 2.],
       [2., 1.],
       [1., 1.],
       [2., 2.],
       [0., 0.],
       [2., 2.],
       [2., 2.],
       [0., 2.],
       [2., 2.],
       [0., 0.],
       [0., 2.],
       [0., 2.],
       [1., 0.],
       [0., 2.],
       [2., 1.],
       [2., 1.],
       [1., 0.],
       [2., 0.],
       [0., 2.],
       [1., 1.],
       [0., 0.],
       [2., 0.],
       [1., 2.],
       [2., 1.],
       [0., 1.],
       [1., 1.]])

# LABEL ENCODING 
 * Only to use on target variable i.e. y if the target variable is categorical

In [11]:
from sklearn.preprocessing import LabelEncoder

lenc = LabelEncoder()
lenc.fit(y_train)
lenc.classes_

array(['No', 'Yes'], dtype=object)

In [12]:
y_train= lenc.transform(y_train)
y_test=lenc.transform(y_test)

# ONE-HOT ENCODING

In [13]:
data = pd.read_csv("data/cars.csv") 
data.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [14]:
data["fuel"].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [15]:
data["brand"].nunique()

32

# using Pandas

In [16]:
pd.get_dummies(data, columns = ['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


# K-1 one hot encoding

In [17]:
pd.get_dummies(data,columns= ['fuel','owner'],drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


* We don't use pandas in machine learning projects for one hot because it doesn't remember the order of encoding or classification into dummy categories 

# using Scikit learn

In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [19]:
X = data.drop("selling_price", axis=1)
y = data["selling_price"]

In [20]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2)

In [21]:
ohe = OneHotEncoder()
X_train_new = ohe.fit_transform(X_train[['fuel','owner']]).toarray()

In [22]:
X_test_new = ohe.fit_transform(X_test[['fuel','owner']]).toarray()

In [23]:
np.hstack((X_train[['brand','km_driven']].values,X_train_new))

array([['Hyundai', 120000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 60000, 1.0, ..., 1.0, 0.0, 0.0],
       ['Mahindra', 80000, 0.0, ..., 1.0, 0.0, 0.0],
       ...,
       ['Hyundai', 123278, 0.0, ..., 1.0, 0.0, 0.0],
       ['Ford', 40000, 0.0, ..., 1.0, 0.0, 0.0],
       ['Nissan', 90000, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

# One hot encoding with Top Categories

In [24]:
counts = data["brand"].value_counts()

In [25]:
data['brand'].nunique()
threshold = 100

In [26]:
repl=counts[counts<=threshold].index

In [27]:
pd.get_dummies(data['brand'].replace(repl, 'others'))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


# ColumnTransformer

In [28]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [29]:
data = pd.read_csv("data/covid_toy.csv")
data.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [30]:
X = data.drop("has_covid", axis=1)
y = data["has_covid"]

In [31]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =  train_test_split(X,y,test_size=0.2)

In [32]:
data.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

* age - numerical 
* gender - nominal categorical - OneHotEncoder
* fever - numerical - missing values - SimpleImputer
* cough - ordinal categorical - OrdinalEncoder
* city - nominal categorical - OneHotEncoder
* has_covid - categorical target - LabelEncoder

# Aam Zindagi - 
 ## without Column Transformer

In [33]:
# GENDER, CITY
ohe = OneHotEncoder(drop='first',sparse=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])
X_test_gender_city = ohe.fit_transform(X_test[['gender','city']])
X_train_gender_city.shape

(80, 4)

In [34]:
# COUGH
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])
X_test_cough = oe.fit_transform(X_test[['cough']])
X_train_cough.shape

(80, 1)

In [43]:
# FEVER
se = SimpleImputer()
X_train_fever = se.fit_transform(X_train[['fever']])
X_test_fever = se.fit_transform(X_test[['fever']])
X_train_fever.shape

(80, 1)

In [36]:
# get AGE
X_train_age = X_train.drop(columns=['gender','fever','cough','city'],axis=1).values
X_test_age = X_test.drop(columns=['gender','fever','cough','city'],axis=1).values
X_train_age.shape

(80, 1)

In [37]:
X_train_transformed= np.concatenate((X_train_age,X_train_fever,X_train_gender_city,X_train_cough),axis=1)
X_test_tranformed = np.concatenate((X_test_age,X_test_fever,X_test_gender_city,X_test_cough), axis=1)
X_train_transformed.shape

(80, 7)

# Mentos Zindagi -
## using ColumnTransformer

In [68]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder


In [69]:
data = pd.read_csv("data/covid_toy.csv")
data.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [70]:
X = data.drop("has_covid", axis=1)
y = data["has_covid"]

In [71]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =  train_test_split(X,y,test_size=0.2)

In [72]:
from sklearn.compose import ColumnTransformer

In [73]:
transformer = ColumnTransformer(transformers=[
    ('sim_imp',SimpleImputer(),['fever'])],
     remainder='passthrough')

In [74]:
X_train_f = transformer.fit_transform(X_train)


In [75]:
X_train_f

array([[102.0, 64, 'Male', 'Mild', 'Bangalore'],
       [99.0, 65, 'Male', 'Mild', 'Bangalore'],
       [99.0, 49, 'Female', 'Strong', 'Bangalore'],
       [104.0, 51, 'Male', 'Mild', 'Kolkata'],
       [100.0, 13, 'Female', 'Strong', 'Kolkata'],
       [102.0, 5, 'Male', 'Mild', 'Kolkata'],
       [102.0, 82, 'Female', 'Strong', 'Kolkata'],
       [99.0, 66, 'Male', 'Strong', 'Bangalore'],
       [99.0, 14, 'Female', 'Mild', 'Mumbai'],
       [100.88888888888889, 38, 'Male', 'Mild', 'Delhi'],
       [100.0, 11, 'Female', 'Strong', 'Kolkata'],
       [99.0, 72, 'Male', 'Mild', 'Bangalore'],
       [102.0, 69, 'Female', 'Mild', 'Bangalore'],
       [100.88888888888889, 84, 'Female', 'Mild', 'Bangalore'],
       [102.0, 20, 'Male', 'Strong', 'Delhi'],
       [98.0, 10, 'Female', 'Strong', 'Kolkata'],
       [100.88888888888889, 23, 'Male', 'Mild', 'Mumbai'],
       [104.0, 34, 'Female', 'Strong', 'Delhi'],
       [102.0, 74, 'Male', 'Mild', 'Mumbai'],
       [100.0, 5, 'Female', 'Mild', 

In [41]:
categorical_features = ['Sex', 'Embarked']
one_hot = OneHotEncoder(sparse=False , drop='first')
one_hot_transformer = ColumnTransformer([
   ('one_hot',one_hot,['Sex'])],remainder='passthrough')
X_train = one_hot_transformer.fit_transform(X_train)
X_test = one_hot_transformer.transform(X_test)

ValueError: A given column is not a column of the dataframe