# Machine Learning with Python - Feature Engineering
The basics of feature engineering

### sklearn
Provides lots of tools to help!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the Titanic training data from its CSV file.
titanic_csv = 'data/train.csv'
df_og = pd.read_csv(titanic_csv)

## one hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Pull out the Embarked column and tally how often each value occurs.
df_emb = df_og.loc[:, 'Embarked']
df_emb.value_counts()

In [None]:
# OneHotEncoder expects a 2-D input of shape (n_samples, n_features), so the
# single Embarked column is turned into an (n, 1) array.
# np.asarray accepts both the original Series and an already-reshaped array,
# which keeps this cell safe to run more than once.
df_emb = np.asarray(df_emb).reshape(-1, 1)

# create an encoder and fit it on the Embarked values
# The encoder is left at its default (sparse) output and densified explicitly:
# the keyword that requests dense output was renamed from `sparse` to
# `sparse_output` between scikit-learn releases, so passing either one breaks
# on some versions, while `.toarray()` works on all of them.
enc = OneHotEncoder().fit(df_emb)
encoded = enc.transform(df_emb).toarray()

# convert it to a dataframe, one indicator column per category seen during fit
ohe_df = pd.DataFrame(
    encoded,
    columns=enc.get_feature_names_out()
)
print(ohe_df.head())
print(ohe_df.shape)

## Text Embedding
[OpenAI embedding service](https://beta.openai.com/docs/guides/embeddings/what-are-embeddings)

## Feature Scaling

In [None]:
# Histogram of the raw Fare values, before any scaling is applied.
df_og['Fare'].hist()

In [None]:
# Log-transform Fare into a new `logFare` column.
fare = df_og['Fare'].to_numpy(dtype=float)

# log10 is only evaluated where fare > 0 (see `where=` below). Every other
# position keeps whatever the output buffer already holds, so the buffer must
# be initialised explicitly -- without `out=` those entries would be
# uninitialised memory.
# Chosen fill: 0.0 for non-positive fares, NaN where the fare itself is missing.
log_fare = np.where(np.isnan(fare), np.nan, 0.0)
np.log10(fare, out=log_fare, where=fare > 0)

df_og['logFare'] = log_fare

In [None]:
# Summary statistics of the log-transformed fare column.
df_og['logFare'].describe()

## Principal Component Analysis
Work derived, in part, from the example in [this blog post](https://towardsdatascience.com/image-compression-using-principal-component-analysis-pca-253f26740a9f). I have reduced the size of the data in order to include it in the GitHub repo.

In [None]:
# Load the MNIST digits CSV and preview its first ten rows.
mnist_csv = 'data/mnist.csv'
mnist = pd.read_csv(mnist_csv)
mnist.head(n=10)

In [None]:
# Dimensions of the loaded frame as (rows, columns); when the notebook is run
# top to bottom the `label` column is still present at this point.
mnist.shape

In [None]:
import matplotlib.pyplot as plt
# Remove the `label` column so that only the remaining columns are used below.
# Reassigning (instead of inplace=True) together with errors='ignore' makes
# this cell idempotent: running it a second time no longer raises a KeyError
# because the column is already gone.
mnist = mnist.drop(columns='label', errors='ignore')

In [None]:
# Reshape one flattened sample back into a 28x28 pixel grid and display it.
# NOTE: despite its name, `second_image` holds row index 9, i.e. the tenth image.
second_image = mnist.iloc[9].to_numpy().reshape(28, 28)
plt.imshow(second_image, cmap='gray_r')
plt.title('Tenth image: Digit 4', fontsize=15, pad=15)

The next step would be to scale the features so they are on the same or similar ranges. PCA is very sensitive to scale, as the method is based on explained variance: features with larger scaled values would produce much greater variance. However, in an image all values are already on the same scale at each pixel (0 to 255 in this case).

In [None]:
# Print the smallest and largest value found in one sample (row index 1).
sample_pixels = mnist.iloc[1]
print(sample_pixels.min())
print(sample_pixels.max())

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps 784 components (784 = 28 * 28, the image size used when
# plotting a sample) to see how the explained variance accumulates.
pca_784 = PCA(n_components=784)
pca_784.fit(mnist)

# Cumulative share of the variance, in percent, captured by the first k components.
cumulative_variance_pct = np.cumsum(pca_784.explained_variance_ratio_ * 100)
# Start the x values at 1 so that x = k really means "k components"; plotting
# against the default 0-based index shifts the whole curve by one component.
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)

plt.grid()
plt.plot(component_counts, cumulative_variance_pct)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance (%)')

In [None]:
# let's just use the first 25 components
# random_state pins the solver's random number generator so that re-running
# the notebook reproduces the same projection and explained-variance figure.
pca_25 = PCA(n_components=25, random_state=0)
# Project onto 25 components, then map back to the original pixel space.
mnist_pca_25_reduced = pca_25.fit_transform(mnist)
mnist_pca_25_recovered = pca_25.inverse_transform(mnist_pca_25_reduced)

# Show row 9, the same sample that is plotted uncompressed as the
# "Tenth image" earlier in this notebook, so the two pictures are comparable.
image_pca_25 = mnist_pca_25_recovered[9,:].reshape([28,28])
plt.imshow(image_pca_25, cmap='gray_r')
plt.title('Compressed image with 25 components', fontsize=15, pad=15)

In [None]:
# let's expand to 200 components
# random_state pins the solver's random number generator so that re-running
# the notebook reproduces the same projection and explained-variance figure.
pca_200 = PCA(n_components=200, random_state=0)
# Project onto 200 components, then map back to the original pixel space.
mnist_pca_200_reduced = pca_200.fit_transform(mnist)
mnist_pca_200_recovered = pca_200.inverse_transform(mnist_pca_200_reduced)

# Show row 9, the same sample that is plotted uncompressed as the
# "Tenth image" earlier in this notebook, so the two pictures are comparable.
image_pca_200 = mnist_pca_200_recovered[9,:].reshape([28,28])
plt.imshow(image_pca_200, cmap='gray_r')
plt.title('Compressed image with 200 components', fontsize=15, pad=15)

In [None]:
#explained variance
print(np.cumsum(pca_25.explained_variance_ratio_ * 100)[-1])
print(np.cumsum(pca_200.explained_variance_ratio_ * 100)[-1])