In [1]:
import pandas as pd

- [Encoding of categorical variables](#Encoding-of-categorical-variables)
- [Random variables](#Random-variables)
- [Extracting functions to python modules](#Extracting-functions-to-python-modules)

# Encoding of categorical variables

In [2]:
# Toy dataset: one categorical feature (`color`) and an integer class label.
df_master = pd.DataFrame(
    dict(
        color=['yellow', 'blue', 'red', 'yellow', 'red', 'red'],
        label=[1, 2, 3, 1, 3, 3],
    )
)
df_master

Unnamed: 0,color,label
0,yellow,1
1,blue,2
2,red,3
3,yellow,1
4,red,3
5,red,3


In [3]:
# Features stay a one-column DataFrame (double brackets); the target is a Series.
X = df_master[['color']]
y = df_master['label']
X

Unnamed: 0,color
0,yellow
1,blue
2,red
3,yellow
4,red
5,red


In [4]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier; the same instance is fitted and
# queried by the cells that follow.
model = SVC(kernel='linear')
model

SVC(kernel='linear')

## With pandas.get_dummies

### Encode all data and then split to train and test sets

In [5]:
# One-hot encode the full feature frame in one go, before any train/test split.
X_encoded = pd.get_dummies(X)
X_encoded

Unnamed: 0,color_blue,color_red,color_yellow
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0
5,0,1,0


In [6]:
# Positional split: the first `split_index` rows are the training set and the
# remaining rows are the test set.
split_index = 3
X_train, y_train = X_encoded[:split_index], y[:split_index]
X_test, y_test = X_encoded[split_index:], y[split_index:]
display(X_train)
display(X_test)

Unnamed: 0,color_blue,color_red,color_yellow
0,0,0,1
1,1,0,0
2,0,1,0


Unnamed: 0,color_blue,color_red,color_yellow
3,0,0,1
4,0,1,0
5,0,1,0


- <span style="color:green;">Works fine</span>

### Split data to train and test and then encode each of them

In [7]:
# Split the raw (not yet encoded) features and the labels at the same position.
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]
display(X_train)
display(X_test)

Unnamed: 0,color
0,yellow
1,blue
2,red


Unnamed: 0,color
3,yellow
4,red
5,red


In [8]:
# Encode only the training rows; the dummy columns therefore reflect only the
# colors present in the training split.
X_train_encoded = pd.get_dummies(X_train)
X_train_encoded

Unnamed: 0,color_blue,color_red,color_yellow
0,0,0,1
1,1,0,0
2,0,1,0


In [9]:
# Fit on the training features together with the *training* labels.
# (Passing y_test here would pair every training row with a test-set label.)
model.fit(X_train_encoded, y_train)

SVC(kernel='linear')

In [10]:
# Encode the test rows independently of the training rows.
X_test_encoded = pd.get_dummies(X_test)
X_test_encoded

Unnamed: 0,color_red,color_yellow
3,0,1
4,1,0
5,1,0


In [11]:
# The independently encoded test frame has no `color_blue` column, so it does
# not match what the model was fitted on and prediction is expected to fail.
# The error is caught and printed so that "Restart & Run All" can continue
# past this deliberate demonstration.
try:
    display(model.predict(X_test_encoded))
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

Feature names seen at fit time, yet now missing:
- color_blue



ValueError: X has 2 features, but SVC is expecting 3 features as input.

- <span style="color:red;">The shape of the train and test color features is not the same (3 for train and 2 for test)</span>
- <span style="color:red;">The encoding is not the same: [1, 0] = red in test and [0, 1, 0] in train</span>

### New data

- **Known feature values**

In [None]:
# New samples whose colors all occur in the training split.
new_data_1 = pd.DataFrame(['red', 'yellow', 'blue'], columns=['color'])
new_data_1

In [None]:
# Encode the new samples on their own with get_dummies.
new_data_1_encoded = pd.get_dummies(new_data_1)
new_data_1_encoded

<span style="color:green;">Works fine</span>

In [None]:
# Predict labels for the encoded new samples with the previously fitted model.
model.predict(new_data_1_encoded)

- **Unknown feature values**

In [None]:
# Distinct colors present in the training split.
X_train['color'].unique()

In [None]:
# New samples including 'purple', a color that does not appear in df_master.
new_data_2 = pd.DataFrame({'color': ['purple', 'blue', 'yellow']})
display(new_data_2)

In [None]:
# Distinct colors present in the new samples.
new_data_2['color'].unique()

In [None]:
# Colors that occur in the new samples but not in the training split.
set(new_data_2['color']).difference(X_train['color'])

In [None]:
# get_dummies derives its columns from the values it is given, so this frame
# is encoded without any reference to the training columns.
new_data_2_encoded = pd.get_dummies(new_data_2)
new_data_2_encoded

In [None]:
# Predict on the independently encoded frame.
# NOTE(review): its columns presumably differ from the ones used at fit time
# (a `color_purple` column instead of `color_red`) — confirm how the installed
# scikit-learn version reacts.
model.predict(new_data_2_encoded)

<span style="color:red;">The encoding ran without error even though the value 'purple' was never seen during training, so the mismatch with the training columns goes unnoticed</span>

## With OneHotEncoder

In [None]:
# Start again from the raw features: positional split into train and test.
split_index = 3
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]
display(X_train)
display(X_test)

### Known features

- Train

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Encoder that raises on categories not seen during fit and returns a dense
# integer array.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# — confirm against the installed version.
encoder = OneHotEncoder(handle_unknown='error', dtype=int, sparse=False)
# The encoder needs 2-D input, hence the reshape of the color values to a single column.
encoder.fit(X_train.color.values.reshape(-1,1))

In [None]:
# Categories the encoder learned from the training colors.
encoder.categories_

In [None]:
# One-hot encode the training colors with the fitted encoder.
encoded_colors = encoder.transform(
    X_train['color'].to_numpy().reshape(-1, 1)
)
encoded_colors

In [None]:
# Column names for the encoded output, built from the input column name and
# each learned category. `get_feature_names_out` replaces the deprecated
# `get_feature_names`, which later scikit-learn releases no longer provide.
encoded_colors_columns = encoder.get_feature_names_out(X_train.columns)
encoded_colors_columns

In [None]:
# Wrap the encoded array in a DataFrame that shares the training index so it
# can be joined back onto X_train.
encoded_colors_df = pd.DataFrame(
    encoded_colors,
    index=X_train.index,
    columns=encoded_colors_columns,
)
encoded_colors_df

In [None]:
# Attach the one-hot columns to a copy of the training features (index-aligned join).
X_encoded = X_train.copy().join(encoded_colors_df)
X_encoded

In [None]:
# Remove the original categorical column now that its one-hot columns exist.
X_encoded = X_encoded.drop(columns='color')
X_encoded

In [None]:
# Refit the classifier on the OneHotEncoder-based training features.
model.fit(X_encoded, y_train)

- Predict

In [None]:
# Encode the test colors with the encoder that was fitted on the training split.
encoded_colors_test = encoder.transform(X_test.color.values.reshape(-1,1))
encoded_colors_test

In [None]:
# Column names for the encoded test output. `get_feature_names_out` replaces
# the deprecated `get_feature_names`, which later scikit-learn releases no
# longer provide.
encoded_colors_columns = encoder.get_feature_names_out(X_test.columns)
encoded_colors_columns

In [None]:
# Wrap the encoded test array in a DataFrame that shares the test index.
encoded_colors_test_df = pd.DataFrame(
    encoded_colors_test,
    index=X_test.index,
    columns=encoded_colors_columns,
)
encoded_colors_test_df

In [None]:
# Join the one-hot columns onto a copy of the test features, then drop the
# original categorical column.
X_encoded_test = (
    X_test.copy()
    .join(encoded_colors_test_df)
    .drop(columns='color')
)
X_encoded_test

In [None]:
# Predict on the test features encoded with the training-fitted encoder.
model.predict(X_encoded_test)

### Unknown features

In [None]:
# Distinct colors the encoder was fitted on.
X_train.color.unique()

In [None]:
# New samples including 'purple', a color that does not appear in df_master.
new_data = pd.DataFrame({'color': ['purple', 'blue', 'yellow']})
display(new_data)

- Raise an error

In [None]:
# `encoder` was created with handle_unknown='error', so transforming the
# unseen value 'purple' is expected to fail. The error is caught and printed
# so that "Restart & Run All" can continue past this deliberate demonstration.
try:
    display(encoder.transform(new_data.color.values.reshape(-1,1)))
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

- Ignore unknown categories (`handle_unknown='ignore'`)

In [None]:
# Second encoder, configured to ignore categories not seen during fit instead
# of raising.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# — confirm against the installed version.
encoder_ignore = OneHotEncoder(handle_unknown='ignore', dtype=int, sparse=False)
encoder_ignore.fit(X_train.color.values.reshape(-1,1))

In [None]:
# Transform the new samples, including the unseen 'purple', with the lenient encoder.
# NOTE(review): the unknown value presumably becomes an all-zero row — confirm in the output.
encoder_ignore.transform(new_data.color.values.reshape(-1,1))

In [None]:
# Feed the leniently encoded new samples straight into the fitted model.
model.predict(encoder_ignore.transform(new_data.color.values.reshape(-1,1)))

# Random variables

In [None]:
# fixing seed

# Extracting functions to python modules

- Python path
- Re-importing functions after their code has changed

## Python path

In [None]:
from app.import_example import say_hello

In [None]:
# List the contents of the notebook's working directory.
!ls

In [None]:
import sys

# Directories Python searches when resolving imports.
sys.path

In [None]:
# List the contents of the parent directory.
!ls ..

In [None]:
# Add the parent directory to the import search path.
# NOTE(review): presumably the `app` package lives there — confirm.
sys.path.append('..')

In [None]:
# Show the search path again; '..' is now the last entry.
sys.path

In [None]:
from app.import_example import say_hello

In [None]:
# Call the function imported from app.import_example.
say_hello()

In [None]:
# Made change in say_hello function

In [None]:
# Call again after the source file was edited.
# NOTE(review): presumably still runs the previously imported version — confirm.
say_hello()

In [None]:
# Repeat the import statement and call the function again.
# NOTE(review): presumably shows that a plain re-import does not pick up the
# edit — confirm.
from app.import_example import say_hello

say_hello()

In [None]:
# Load IPython's autoreload extension and select mode 2, which reloads modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
# Same path setup as earlier in the notebook.
# NOTE(review): presumably repeated so this part also works from a fresh
# kernel — confirm.
import sys

sys.path.append('..')

In [None]:
# Import and call say_hello with autoreload enabled.
from app.import_example import say_hello

say_hello()

## Importing from python modules while changing code 

In [None]:
# Load the autoreload extension (already loaded by an earlier cell in this notebook).
%load_ext autoreload

In [None]:
# With no arguments, %aimport lists the modules marked for (or excluded from) automatic reloading.
%aimport

In [None]:
# Switch to mode 2 (reload modules before each execution) and show the resulting status.
%autoreload 2
%aimport

In [None]:
# Mode 0 turns automatic reloading off; show the resulting status.
%autoreload 0
%aimport