# Data And Features   
## Feature Representation 

In [2]:
import pandas as pd

## Encoding Categorical Features 

### Ordinal Categirical Feature  

In [6]:
ordered_satisfaction = ['Very Unhappy', 'Unhappy', 'Neutral', 'Happy', 'Very Happy']
df = pd.DataFrame({'satisfaction': ['Mad', 'Happy', 'Unhappy', 'Neutral']})
df

Unnamed: 0,satisfaction
0,Mad
1,Happy
2,Unhappy
3,Neutral


In [9]:
# Deprecated way (still works, though)
df['satisfaction2'] = df.satisfaction.astype('category', ordered=True, categories=ordered_satisfaction).cat.codes
df

  """Entry point for launching an IPython kernel.


Unnamed: 0,satisfaction,satisfaction2
0,Mad,-1
1,Happy,3
2,Unhappy,1
3,Neutral,2


In [28]:
# New way 
from pandas.api.types import CategoricalDtype
t = CategoricalDtype(categories=ordered_satisfaction, ordered=True)
df['satisfaction3'] = df.satisfaction.astype(dtype=t).cat.codes
df

Unnamed: 0,satisfaction,satisfaction2,satisfaction3
0,Mad,-1,-1
1,Happy,3,3
2,Unhappy,1,1
3,Neutral,2,2


### Nominal Categorical Feature  

In [38]:
df = pd.DataFrame({'vertebrates': [
    'Bird',
    'Bird'
    'Mammal',
    'Fish',
    'Amphibian',
    'Reptile',
    'Mammal',
]})
df

Unnamed: 0,vertebrates
0,Bird
1,BirdMammal
2,Fish
3,Amphibian
4,Reptile
5,Mammal


In [39]:
# One simple way, but may not be adequate for a certaion ML algorithm! 
df['vertebrates2'] = df.vertebrates.astype('category').cat.codes
df

Unnamed: 0,vertebrates,vertebrates2
0,Bird,1
1,BirdMammal,2
2,Fish,3
3,Amphibian,0
4,Reptile,5
5,Mammal,4


In [40]:
# Binary N encoding 
df2 = pd.get_dummies(df, columns=['vertebrates'])
df2

Unnamed: 0,vertebrates2,vertebrates_Amphibian,vertebrates_Bird,vertebrates_BirdMammal,vertebrates_Fish,vertebrates_Mammal,vertebrates_Reptile
0,1,0,1,0,0,0,0
1,2,0,0,1,0,0,0
2,3,0,0,0,1,0,0
3,0,1,0,0,0,0,0
4,5,0,0,0,0,0,1
5,4,0,0,0,0,1,0


---
# Pure Textual Features  
 Count the number of occurance of words. 

In [42]:
# Count Vectorizer from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
corpus = [
    "Authman ran faster than Harry because he is an athlete.", 
    "Authman and Harry ran faster and faster."
]

In [57]:
# bow: Bag of Words
bow = CountVectorizer()
X = bow.fit_transform(corpus)

print(bow.get_feature_names())
print(X)             # sparse metrix
print(X.toarray())   # regular array

type(X)              

['an', 'and', 'athlete', 'authman', 'because', 'faster', 'harry', 'he', 'is', 'ran', 'than']
  (0, 2)	1
  (0, 0)	1
  (0, 8)	1
  (0, 7)	1
  (0, 4)	1
  (0, 6)	1
  (0, 10)	1
  (0, 5)	1
  (0, 9)	1
  (0, 3)	1
  (1, 1)	2
  (1, 6)	1
  (1, 5)	2
  (1, 9)	1
  (1, 3)	1
[[1 0 1 1 1 1 1 1 1 1 1]
 [0 2 0 1 0 2 1 0 0 1 0]]


scipy.sparse.csr.csr_matrix

---  
# Image Feature Encoding  

Scipy.misc no longer supports imread(). Recommended to use imageio.imread(). 
However, imageio depends on Pillow (PIL), and PIL from Conda64bit is for Win32. 
Pillow-3.4.2-cp36-cp36m-win_amd64.whl is downloaded from the following side. 
https://www.lfd.uci.edu/~gohlke/pythonlibs/   
Then, the following command was exectuted in anaconda prompt window. 
>`pip install Pillow-3.4.2-cp36-cp36m-win_amd64.whl` 

In [4]:
# Online video is as follows, but this no longer works with latest sciply 
#from scipy import misc

import imageio  # imageio is dependent on PIL (Pillow), which needs to be updated to Win64-version

In [5]:
img = imageio.imread('./data/Desert.jpg')
type(img)

imageio.core.util.Image

In [6]:
img.shape

(768, 1024, 3)

In [8]:
img.dtype

dtype('uint8')

In [10]:
# decimate 1/2 
img2 = img[::2,::2]
img2.shape

(384, 512, 3)

In [12]:
# normalize, and convert to 1D x color(3)
img2 = (img2/255.).reshape(-1,3)
img2.shape

(196608, 3)

In [14]:
red = img2[:,0]
green = img2[:,1]
blue = img2[:,2]

In [16]:
# gray scaling by manual scaling factors
gray = 0.299*red + 0.587*green + 0.114*blue

In [18]:
gray.shape

(196608,)

In [19]:
# Do something 

---  
# Audio Feature Encoding  

In [2]:
import scipy.io.wavfile as wavfile

In [3]:
sample_rate, audio_data = wavfile.read("./data/se_moa03.wav")
sample_rate

22050

In [30]:
type(audio_data)

numpy.ndarray

In [32]:
audio_data.shape

(25318,)

In [34]:
audio_data.dtype

dtype('uint8')

In [35]:
# Do something 