In [1]:
import pandas as pd
import numpy as np
import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing

import warnings
warnings.filterwarnings('ignore')

import acquire

# The end product of this exercise should be the specified functions in a python script named prepare.py. Do these in your classification_exercises.ipynb first, then transfer to the prepare.py file.

### 1.) Iris Data

 a. Use the function defined in acquire.py to load the iris data.

In [2]:
# Load the iris data with the get_iris_data function from the project's acquire module.
df = acquire.get_iris_data()

b. Drop the species_id and measurement_id columns.

In [3]:
# Step b: remove the two identifier columns.
id_columns = ['species_id', 'measurement_id']
df = df.drop(columns=id_columns)

c. Rename the species_name column to just species.

In [4]:
# Rebind the renamed frame instead of mutating with inplace=True; rename
# returns a new DataFrame, which keeps this step chainable and side-effect free.
df = df.rename(columns={'species_name': 'species'})

d. Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?

In [5]:
# Split first; the encoder in the next cell is fit on the training portion only.
train, test = sklearn.model_selection.train_test_split(
    df,
    train_size=.8,
    random_state=830,
)

In [6]:
# Keep the encoder on its default (sparse) output and densify the result here.
# The `sparse=False` keyword was renamed to `sparse_output` and later removed
# from scikit-learn, so passing it fails on current releases.
encoder = sklearn.preprocessing.OneHotEncoder()

encoder.fit(train[['species']])

m = encoder.transform(train[['species']]).toarray()
m

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0

In [7]:
# Categories learned by the fit above; the next cell uses them as column names for `m`.
encoder.categories_

[array(['setosa', 'versicolor', 'virginica'], dtype=object)]

In [8]:
# Put the original labels next to their indicator columns to eyeball the encoding.
species_indicators = pd.DataFrame(
    m,
    columns=encoder.categories_[0],
    index=train.index,
)
pd.concat([train.species, species_indicators], axis=1)

Unnamed: 0,species,setosa,versicolor,virginica
135,virginica,0.0,0.0,1.0
58,versicolor,0.0,1.0,0.0
125,virginica,0.0,0.0,1.0
138,virginica,0.0,0.0,1.0
61,versicolor,0.0,1.0,0.0
...,...,...,...,...
41,setosa,1.0,0.0,0.0
3,setosa,1.0,0.0,0.0
116,virginica,0.0,0.0,1.0
51,versicolor,0.0,1.0,0.0


In [9]:
# Sanity check: each encoded row should sum to exactly 1.
row_totals = pd.DataFrame(m, columns=encoder.categories_[0]).sum(axis=1)
row_totals.eq(1).all()

True

In [42]:
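# inverse_transform maps the encoded values back to the original species labels,
# which lets us report results by name instead of by code.
# it would also be useful if we added more data that
# would potentially create a curse of dimensionality,
# or if we needed to change from label encoding to a one-hot encoder, for similar reasons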

e. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [10]:
def drop_columns(df):
    """Return the iris frame without its id columns and with `species_name` renamed.

    Drops ``species_id`` and ``measurement_id`` and renames ``species_name``
    to ``species``. A new DataFrame is returned; the frame passed in is not
    modified.
    """
    id_columns = ['species_id', 'measurement_id']
    new_names = {'species_name': "species"}
    return df.drop(columns=id_columns).rename(columns=new_names)


def encode_species(train, test):
    """One-hot encode the ``species`` column of an iris train/test pair.

    The encoder is fit on ``train`` only and then applied to both frames. In
    each returned frame ``species`` is replaced by one ``'species <category>'``
    indicator column per category found in ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain a ``species`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` as new frames with the indicator columns appended.
    """
    # Keep the encoder on its default (sparse) output and densify below. The
    # `sparse=False` keyword was renamed to `sparse_output` and later removed
    # from scikit-learn, so passing it fails on current releases.
    encoder = sklearn.preprocessing.OneHotEncoder()
    encoder.fit(train[['species']])
    # nice columns for display
    cols = ['species ' + c for c in encoder.categories_[0]]

    def _encode(frame):
        # Reuse the frame's own index so the indicator rows line up in concat.
        indicators = pd.DataFrame(
            encoder.transform(frame[['species']]).toarray(),
            columns=cols,
            index=frame.index,
        )
        return pd.concat([frame, indicators], axis=1).drop(columns='species')

    return _encode(train), _encode(test)





def prep_iris(df, train_size=.8, random_state=830):
    """Prepare the raw iris data and return it as an encoded train/test pair.

    Steps, in order: ``drop_columns`` on the raw frame, a train/test split,
    then ``encode_species`` on the two splits.

    Parameters
    ----------
    df : pandas.DataFrame
        The untransformed iris data.
    train_size : float, default .8
        Passed to ``train_test_split`` as ``train_size``.
    random_state : int, default 830
        Passed to ``train_test_split`` as ``random_state``. The defaults
        reproduce the split this function always used before these
        parameters existed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` as returned by ``encode_species``.
    """
    df = drop_columns(df)
    train, test = sklearn.model_selection.train_test_split(
        df, random_state=random_state, train_size=train_size
    )
    return encode_species(train, test)
   
    
    

In [11]:
#train, test = prep_iris(df)

#train.head()

### 2.) Titanic Data

a. Use the function you defined in acquire.py to load the titanic data set.

In [34]:
# Load the titanic data with the get_titanic_data function from the project's acquire module.
df = acquire.get_titanic_data()

b. Handle the missing values in the embark_town and embarked columns.

In [14]:
# Split the passengers, then count the missing embark_town values in train.
train, test = sklearn.model_selection.train_test_split(
    df,
    train_size=.8,
    random_state=123,
)

train['embark_town'].isna().sum()

2

In [15]:
# Fill missing embark_town values in both splits with 'Southampton'.
train['embark_town'] = train['embark_town'].fillna('Southampton')
test['embark_town'] = test['embark_town'].fillna('Southampton')

In [16]:
# Fill missing embarked codes in both splits with 'S'.
train['embarked'] = train['embarked'].fillna('S')
test['embarked'] = test['embarked'].fillna('S')

c. Remove the deck column.

In [17]:
# Remove the deck column from both splits.
train, test = (split.drop(columns=['deck']) for split in (train, test))

d. Use a label encoder to transform the embarked column.

In [18]:
# Keep the encoder on its default (sparse) output and densify the result here.
# The `sparse=False` keyword was renamed to `sparse_output` and later removed
# from scikit-learn, so passing it fails on current releases.
encoder = sklearn.preprocessing.OneHotEncoder()

encoder.fit(train[['embarked']])

m = encoder.transform(train[['embarked']]).toarray()

# Show the original codes next to their indicator columns.
pd.concat([
    train.embarked,
    pd.DataFrame(m, columns=encoder.categories_[0], index=train.index)
], axis=1)

Unnamed: 0,embarked,C,Q,S
329,C,1.0,0.0,0.0
749,Q,0.0,1.0,0.0
203,C,1.0,0.0,0.0
421,Q,0.0,1.0,0.0
97,C,1.0,0.0,0.0
...,...,...,...,...
98,S,0.0,0.0,1.0
322,Q,0.0,1.0,0.0
382,S,0.0,0.0,1.0
365,S,0.0,0.0,1.0


f. Fill the missing values in age. The way you fill these values is up to you. Consider the tradeoffs of different methods.

In [25]:
# if we drop them we lose lots of data, if we impute them we risk having 
# variables that are not accurate

# Learn the mean age from the training rows only, then fill both splits with it.
imputer = sklearn.impute.SimpleImputer(strategy='mean').fit(train[['age']])

train['age'] = imputer.transform(train[['age']])
test['age'] = imputer.transform(test[['age']])

e. Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?

In [30]:
# this would be beneficial when the units of measurement are unfamiliar, which in this case
# was true of the amount of money used (fare)

def scale_minmax(train, test, column_list):
    """Add min-max scaled copies of the given columns to a train/test pair.

    A ``MinMaxScaler`` is fit on ``train[column_list]`` only and then applied
    to both frames. For every name in ``column_list`` a ``'<name>_scaled'``
    column is appended; the unscaled columns are kept.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column in ``column_list``.
    column_list : list of str
        Names of the columns to scale.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` as new frames with the scaled columns appended.
    """
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(train[column_list])
    column_list_scaled = [col + '_scaled' for col in column_list]

    def _add_scaled(frame):
        scaled = pd.DataFrame(scaler.transform(frame[column_list]),
                              columns=column_list_scaled,
                              index=frame.index)
        # Drop scaled columns left by an earlier call before joining; otherwise
        # re-running on already-scaled frames makes join raise on the
        # overlapping column names.
        return frame.drop(columns=column_list_scaled, errors='ignore').join(scaled)

    return _add_scaled(train), _add_scaled(test)



In [31]:
# Append age_scaled and fare_scaled to both splits.
train, test = scale_minmax(train, test, column_list=['age', 'fare'])

In [32]:
# Preview the first five training rows (head's default row count).
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,age_scaled,fare_scaled
329,329,1,1,female,16.0,0,1,57.9792,C,First,Cherbourg,0,0.195778,0.113168
749,749,0,3,male,31.0,0,0,7.75,Q,Third,Queenstown,1,0.384267,0.015127
203,203,0,3,male,45.5,0,0,7.225,C,Third,Cherbourg,1,0.566474,0.014102
421,421,0,3,male,21.0,0,0,7.7333,Q,Third,Queenstown,1,0.258608,0.015094
97,97,1,1,male,23.0,0,1,63.3583,C,First,Cherbourg,0,0.28374,0.123667


g. Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [38]:
def embark_titanic(df):
    """Fill the missing embarkation values and drop the ``deck`` column.

    Missing ``embark_town`` values become ``"Southampton"`` and missing
    ``embarked`` values become ``"S"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Titanic data containing ``embark_town``, ``embarked`` and ``deck``.

    Returns
    -------
    pandas.DataFrame
        A new frame. Unlike the earlier attribute assignments, this does not
        write the filled values back into the caller's ``df``.
    """
    return (
        df.drop(columns='deck')
        .assign(
            embark_town=lambda frame: frame['embark_town'].fillna("Southampton"),
            embarked=lambda frame: frame['embarked'].fillna("S"),
        )
    )
    
def impute_titanic(df, train_size=.8, random_state=123):
    """Split the titanic data, then mean-impute ``age`` from the training rows.

    The imputer is fit on the training split only and then applied to both
    splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Titanic data containing an ``age`` column.
    train_size : float, default .8
        Passed to ``train_test_split`` as ``train_size``.
    random_state : int, default 123
        Passed to ``train_test_split`` as ``random_state``. The defaults
        reproduce the split this function always used before these
        parameters existed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with missing ``age`` values filled.
    """
    train, test = sklearn.model_selection.train_test_split(
        df, random_state=random_state, train_size=train_size)
    # Take explicit copies so the assignments below write to frames we own,
    # not to objects pandas may still treat as slices of `df`.
    train, test = train.copy(), test.copy()

    imputer = sklearn.impute.SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    # transform returns an (n, 1) array; flatten it to assign a plain column.
    train['age'] = imputer.transform(train[['age']]).ravel()
    test['age'] = imputer.transform(test[['age']]).ravel()
    return train, test

def encode_titanic(train, test):
    """One-hot encode the ``embarked`` column of a titanic train/test pair.

    The encoder is fit on ``train`` only and then applied to both frames. In
    each returned frame ``embarked`` is replaced by one
    ``'embarked <category>'`` indicator column per category found in ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain an ``embarked`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` as new frames with the indicator columns appended.
    """
    # Keep the encoder on its default (sparse) output and densify below. The
    # `sparse=False` keyword was renamed to `sparse_output` and later removed
    # from scikit-learn, so passing it fails on current releases.
    encoder = sklearn.preprocessing.OneHotEncoder()
    encoder.fit(train[['embarked']])
    #getting those nice columns
    cols = ['embarked ' + c for c in encoder.categories_[0]]

    def _encode(frame):
        # Reuse the frame's own index so the indicator rows line up in concat.
        indicators = pd.DataFrame(
            encoder.transform(frame[['embarked']]).toarray(),
            columns=cols,
            index=frame.index,
        )
        return pd.concat([frame, indicators], axis=1).drop(columns='embarked')

    return _encode(train), _encode(test)
    
def scale_minmax(train, test, column_list):
    """Add min-max scaled copies of the given columns to a train/test pair.

    A ``MinMaxScaler`` is fit on ``train[column_list]`` only and then applied
    to both frames. For every name in ``column_list`` a ``'<name>_scaled'``
    column is appended; the unscaled columns are kept.

    NOTE(review): a ``scale_minmax`` is also defined in an earlier cell; this
    later definition shadows it once this cell has run.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column in ``column_list``.
    column_list : list of str
        Names of the columns to scale.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` as new frames with the scaled columns appended.
    """
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(train[column_list])
    column_list_scaled = [col + '_scaled' for col in column_list]

    def _add_scaled(frame):
        scaled = pd.DataFrame(scaler.transform(frame[column_list]),
                              columns=column_list_scaled,
                              index=frame.index)
        # Drop scaled columns left by an earlier call before joining; otherwise
        # re-running on already-scaled frames makes join raise on the
        # overlapping column names.
        return frame.drop(columns=column_list_scaled, errors='ignore').join(scaled)

    return _add_scaled(train), _add_scaled(test)
    

def prep_titanic(df):
    """Run the full titanic preparation and return the train/test pair.

    Applies, in order: ``embark_titanic``, ``impute_titanic`` (which also
    performs the train/test split), ``encode_titanic``, and ``scale_minmax``
    on the ``age`` and ``fare`` columns.
    """
    cleaned = embark_titanic(df)
    train, test = encode_titanic(*impute_titanic(cleaned))
    return scale_minmax(train, test, column_list=['age', 'fare'])
    

In [41]:
# Run the whole titanic pipeline on the loaded data and preview the training split.
train, test = prep_titanic(df)

train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,class,embark_town,alone,embarked C,embarked Q,embarked S,age_scaled,fare_scaled
329,329,1,1,female,16.0,0,1,57.9792,First,Cherbourg,0,1.0,0.0,0.0,0.195778,0.113168
749,749,0,3,male,31.0,0,0,7.75,Third,Queenstown,1,0.0,1.0,0.0,0.384267,0.015127
203,203,0,3,male,45.5,0,0,7.225,Third,Cherbourg,1,1.0,0.0,0.0,0.566474,0.014102
421,421,0,3,male,21.0,0,0,7.7333,Third,Queenstown,1,0.0,1.0,0.0,0.258608,0.015094
97,97,1,1,male,23.0,0,1,63.3583,First,Cherbourg,0,1.0,0.0,0.0,0.28374,0.123667
