## Functions used in Machine Learning

In [1]:
from sklearn import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# Path of the CSV file to load.
iris_path = 'iris.data'
# Read the file without a header row (columns get integer labels) and
# treat the strings 'NA' and '?' as missing values.
df = pd.read_csv(iris_path, na_values=['NA', '?'], header=None)

In [3]:
# One-hot encode the classes, e.g. (setosa, versicolor, virginica) become
# [1,0,0], [0,1,0], [0,0,1], stored in n new fields (n denotes the number of classes).
def encode_text_dummy(df, name):
    """Replace column ``name`` of ``df`` with one indicator column per class.

    The new columns are named ``'<name>-<class>'`` and are appended to ``df``
    in the order produced by ``pd.get_dummies``; the original column is then
    dropped. ``df`` is modified in place and nothing is returned.
    """
    indicators = pd.get_dummies(df[name])

    for category in indicators.columns:
        column_label = '-'.join((str(name), str(category)))
        df[column_label] = indicators[category]

    # The source column is fully represented by the indicators now.
    df.drop(name, axis=1, inplace=True)
    
# Encode the classes as binary flags: 1 if the value is a target class, otherwise 0.
def encode_text_single_dummy(df, name, target_vals):
    """Add one binary indicator column per value in ``target_vals``.

    For every ``val`` in ``target_vals`` a column named ``'dummy-<val>'`` is
    added to ``df`` (in place). A row gets 1 when the string form of its value
    in column ``name`` equals the string form of ``val``, otherwise 0. The
    source column is kept. Nothing is returned.
    """
    for val in target_vals:
        val_name = 'dummy-%s' % val
        # Compare string forms so that e.g. 1 and '1' count as the same class.
        wanted = str(val)
        # Fix: the original assigned to the undefined name `dummy_name`.
        df[val_name] = df[name].apply(lambda x: 1 if str(x) == wanted else 0)
    
# Encode the classes (setosa, versicolor, virginica) as integer codes.
def encode_text_index(df, name):
    """Label-encode column ``name`` into a new column ``'le-<name>'``.

    The codes come from scikit-learn's ``LabelEncoder.fit_transform`` (which
    numbers classes from 0, not 1). ``df`` gains the new column in place; the
    original column is kept.

    Returns the encoder's ``classes_`` array, i.e. the class label for each code.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df['le-%s' % name] = codes
    return encoder.classes_

# Encode a numeric column as z-scores.
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardize column ``name`` of ``df`` in place: ``(x - mean) / sd``.

    Args:
        df: DataFrame to modify.
        name: Label of the numeric column.
        mean: Mean to subtract; defaults to the column mean when ``None``.
        sd: Standard deviation to divide by; defaults to the column's
            ``std()`` when ``None``.

    Passing ``mean``/``sd`` lets a test set be scaled with the statistics of
    the training set. Nothing is returned.
    """
    # Use `is None` rather than `or`: an explicit mean of 0 is a legitimate
    # value and must not be replaced by the column mean.
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

# Replace every missing value in the given column with that column's median.
def convert_median(df, name):
    """Fill the missing entries of ``df[name]`` with the column median, in place."""
    column = df[name]
    df[name] = column.fillna(column.median())

# Replace every missing value in the given column with a default value.
def convert_default(df, name, default_value):
    """Fill the missing entries of ``df[name]`` with ``default_value``, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled
    
# Convert a pandas DataFrame to the x, y inputs that TensorFlow needs.
def to_xy(df, target):
    """Split ``df`` into a feature matrix and a target matrix.

    Args:
        df: Source DataFrame; every column except ``target`` is a feature.
        target: Label of the target column.

    Returns:
        A tuple ``(x, y)`` of ``float32`` NumPy arrays. ``x`` has one column
        per feature. When the target column has an int32/int64 dtype it is
        treated as a classification label and ``y`` is its one-hot encoding
        (one column per class); otherwise ``y`` is the target itself as an
        ``(n, 1)`` column for regression.
    """
    feature_columns = [c for c in df.columns if c != target]

    # `dtypes` is a single dtype for a Series, but a Series of dtypes when
    # df[target] selects several columns; in that case use the first one.
    target_dtypes = df[target].dtypes
    if hasattr(target_dtypes, '__iter__'):
        target_type = target_dtypes.iloc[0]
    else:
        target_type = target_dtypes

    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` is the
    # equivalent that works on old and new versions alike.
    x = df[feature_columns].values.astype(np.float32)

    # Encode to int for classification, otherwise float; TensorFlow prefers 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        y = pd.get_dummies(df[target]).values.astype(np.float32)
    else:
        # Regression
        y = df[[target]].values.astype(np.float32)
    return x, y

# Plot a chart of regression results: expected vs. predicted output.
def chart_regression(pred, y, sort=True):
    """Plot predicted and expected values on one line chart and show it.

    Args:
        pred: Predicted values; flattened to 1-D before plotting.
        y: Expected values; flattened to 1-D before plotting.
        sort: When true, order the points by expected value so the expected
            curve is monotonic and deviations of the prediction stand out.
    """
    # Flatten both inputs so (n, 1) arrays are accepted as well as 1-D ones.
    df = pd.DataFrame({'pred': np.ravel(pred), 'y': np.ravel(y)})
    if sort:
        # Fix: the keyword was misspelled `inpalce`, which raised TypeError.
        df.sort_values(by=['y'], inplace=True)
    plt.plot(df['y'].tolist(), label='expected')
    plt.plot(df['pred'].tolist(), label='predict')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows whose value deviates from the mean by sd standard deviations or more.
def remove_outliers(df, name, sd):
    """Drop, in place, the rows of ``df`` that are outliers in column ``name``.

    A row is an outlier when the absolute distance of its value from the
    column mean is at least ``sd`` times the column's standard deviation.

    Returns the same (mutated) ``df`` object.
    """
    column = df[name]
    distance = np.abs(column - column.mean())
    limit = sd * column.std()
    outlier_labels = df.index[distance >= limit]
    df.drop(outlier_labels, axis=0, inplace=True)
    return df

# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column ``name`` of ``df`` in place.

    Values are mapped so that ``data_low`` becomes ``normalized_low`` and
    ``data_high`` becomes ``normalized_high``.

    Args:
        df: DataFrame to modify.
        name: Label of the numeric column.
        normalized_low: Lower bound of the output range.
        normalized_high: Upper bound of the output range.
        data_low: Input value mapped to ``normalized_low``; defaults to the
            column minimum when ``None``.
        data_high: Input value mapped to ``normalized_high``; defaults to the
            column maximum when ``None``.

    Returns:
        The rescaled column (``df[name]``).
    """
    # Default each bound independently: the original only filled data_high
    # when data_low was missing, which overwrote a caller-supplied data_high
    # and left data_high as None when only data_low was given.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = (df[name]-data_low)*(normalized_high-normalized_low)/(data_high-data_low) + normalized_low

    return df[name]


        

In [14]:
# Label-encode column 4 (adds an 'le-4' column to df) and display the class labels.
encode_text_index(df, 4)

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [17]:
# Store, for every row, the values of columns 1 and 2 as a two-element list in column 5.
df[5] = df.apply(lambda row: [row[1], row[2]], axis=1)

In [40]:
# Rebind df to a small three-row example frame with columns 'int' and 'float', then display it.
df = pd.DataFrame([[1, 1.5], [2, 4], [5, 6]], columns=['int', 'float'])
df

Unnamed: 0,int,float
0,1,1.5
1,2,4.0
2,5,6.0


In [41]:
# One-hot encode the 'int' column: one indicator column per distinct value (1, 2, 5).
pd.get_dummies(df['int'])


Unnamed: 0,1,2,5
0,1,0,0
1,0,1,0
2,0,0,1
