# Entry 11 notebook - Consolidate Pre-processing - QSAR Fish Toxicity

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import category_encoders as ce

### Custom functions

In [2]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

def define_reserve(X_test, y_test):
    X_test, X_reserve, y_test, y_reserve = train_test_split(X_test, y_test, test_size = 0.5, random_state=12)
    return X_test, X_reserve, y_test, y_reserve

In [3]:
def feature_corr_coll(train_df, target, test_df, corr_type='spearman'):
    target_corr = pd.DataFrame(train_df.corrwith(target, axis=0, method=corr_type).reset_index()).rename(columns={0:corr_type})
    target_corr['abs'] = target_corr[corr_type].abs()
    top_features = target_corr[target_corr['abs'] >= 0.5].sort_values('abs', ascending=False)['index'].tolist()
    df_top = train_df[top_features]
    feature_corr = df_top.corr(method=corr_type)
    
    collinear_features = set()
    for i in range(len(feature_corr.columns)):
        sliced_matrix = feature_corr.iloc[i, :i]
        if sum(sliced_matrix[abs(sliced_matrix) > 0.9]):
            colname = feature_corr.columns[i]
            collinear_features.add(colname)
    collinear_features = list(collinear_features)
    df_train = df_top.drop(collinear_features, axis=1)
    
    select_features = df_train.columns.tolist()
    df_test = test_df[select_features]
    
    return df_train, df_test

In [4]:
def preprocess_data(train_df, test_df, scaler=StandardScaler(), encoder=ce.OrdinalEncoder):
    index = train_df.index.tolist()
    test_index = test_df.index.tolist()
    
    num_features = train_df.select_dtypes('number').columns.tolist()    
    num_scale = scaler.fit_transform(train_df[num_features])
    train_num_df = pd.DataFrame(num_scale, columns=num_features, index=index)
    
    test_num_scale = scaler.transform(test_df[num_features])
    test_num_df = pd.DataFrame(test_num_scale, columns=num_features, index=test_index)
    
    cat_features = train_df.select_dtypes('object').columns.tolist()
    cat_encoder = encoder(cols=cat_features)
    cat_encode = cat_encoder.fit_transform(train_df[cat_features])
    train_cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)
    
    test_cat_encode = cat_encoder.transform(test_df[cat_features])
    test_cat_df = pd.DataFrame(test_cat_encode, columns=cat_features, index=test_index)
    
    train_df = pd.concat([train_num_df, train_cat_df], axis=1).fillna(-1)
    test_df = pd.concat([test_num_df, test_cat_df], axis=1).fillna(-1)
    
    return train_df, test_df

In [5]:
def train_and_predict(X_train, y_train, X_test, model=LinearRegression()):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return model, preds

# Get the Data

## Data

[QSAR fish toxicity Data Set](http://archive.ics.uci.edu/ml/datasets/QSAR+fish+toxicity) from UCI Machine Learning Repository.

Number of observations: 908

Number of Attributes (all integers): 7

6 molecular descriptors and 1 quantitative experimental response: 

1. CIC0 
2. SM1_Dz(Z) 
3. GATS1i 
4. NdsCH 
5. NdssC 
6. MLOGP 
7. quantitative response, LC50 [-LOG(mol/L)]

In [6]:
columns = ['cico', 'sm1_dz', 'gats1i', 'ndsch', 'ndssc', 'mlogp', 'response']
raw_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/00504/qsar_fish_toxicity.csv', sep=';', header=None, names=columns)
raw_df.head()

Unnamed: 0,cico,sm1_dz,gats1i,ndsch,ndssc,mlogp,response
0,3.26,0.829,1.676,0,1,1.453,3.77
1,2.189,0.58,0.863,0,0,1.348,3.115
2,2.125,0.638,0.831,0,0,1.348,3.531
3,3.027,0.331,1.472,1,0,1.807,3.51
4,2.094,0.827,0.86,0,0,1.886,5.39


In [7]:
X_train, X_test, y_train, y_test = split_data(raw_df, 'response', 0.6)
X_train, X_test = preprocess_data(X_train, X_test)
X_test, X_reserve, y_test, y_reserve = define_reserve(X_test, y_test)
model, preds = train_and_predict(X_train, y_train, X_test)
preds[:5]

array([[3.40293761],
       [4.26797301],
       [3.9204561 ],
       [4.29247837],
       [3.01448359]])