# Entry 11 notebook - Consolidate Pre-processing - QSAR Aquatic Toxicity

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import category_encoders as ce

### Custom functions

In [5]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

def define_reserve(X_test, y_test):
    X_test, X_reserve, y_test, y_reserve = train_test_split(X_test, y_test, test_size = 0.5, random_state=12)
    return X_test, X_reserve, y_test, y_reserve

In [6]:
def feature_corr_coll(train_df, target, test_df, corr_type='spearman'):
    target_corr = pd.DataFrame(train_df.corrwith(target, axis=0, method=corr_type).reset_index()).rename(columns={0:corr_type})
    target_corr['abs'] = target_corr[corr_type].abs()
    top_features = target_corr[target_corr['abs'] >= 0.5].sort_values('abs', ascending=False)['index'].tolist()
    df_top = train_df[top_features]
    feature_corr = df_top.corr(method=corr_type)
    
    collinear_features = set()
    for i in range(len(feature_corr.columns)):
        sliced_matrix = feature_corr.iloc[i, :i]
        if sum(sliced_matrix[abs(sliced_matrix) > 0.9]):
            colname = feature_corr.columns[i]
            collinear_features.add(colname)
    collinear_features = list(collinear_features)
    df_train = df_top.drop(collinear_features, axis=1)
    
    select_features = df_train.columns.tolist()
    df_test = test_df[select_features]
    
    return df_train, df_test

In [7]:
def preprocess_data(train_df, test_df, scaler=StandardScaler(), encoder=ce.OrdinalEncoder):
    index = train_df.index.tolist()
    test_index = test_df.index.tolist()
    
    num_features = train_df.select_dtypes('number').columns.tolist()    
    num_scale = scaler.fit_transform(train_df[num_features])
    train_num_df = pd.DataFrame(num_scale, columns=num_features, index=index)
    
    test_num_scale = scaler.transform(test_df[num_features])
    test_num_df = pd.DataFrame(test_num_scale, columns=num_features, index=test_index)
    
    cat_features = train_df.select_dtypes('object').columns.tolist()
    cat_encoder = encoder(cols=cat_features)
    cat_encode = cat_encoder.fit_transform(train_df[cat_features])
    train_cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)
    
    test_cat_encode = cat_encoder.transform(test_df[cat_features])
    test_cat_df = pd.DataFrame(test_cat_encode, columns=cat_features, index=test_index)
    
    train_df = pd.concat([train_num_df, train_cat_df], axis=1).fillna(-1)
    test_df = pd.concat([test_num_df, test_cat_df], axis=1).fillna(-1)
    
    return train_df, test_df

In [8]:
def train_and_predict(X_train, y_train, X_test, model=LinearRegression()):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return model, preds

# Get the Data

## Data

[QSAR aquatic toxicity Data Set](http://archive.ics.uci.edu/ml/datasets/QSAR+aquatic+toxicity) from UCI Machine Learning Repository.

Number of observations: 546

Number of Attributes (all integers): 9

6 molecular descriptors and 1 quantitative experimental response: 

8 molecular descriptors and 1 quantitative experimental response:

1. TPSA(Tot) 
2. SAacc 
3. H-050 
4. MLOGP 
5. RDCHI 
6. GATS1p 
7. nN 
8. C-040 
9. quantitative response, LC50 [-LOG(mol/L)]

In [9]:
columns = ['tpsa', 'saacc', 'h050', 'mlogp', 'rdchi', 'gats1p', 'nn', 'c040', 'response']
raw_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/00505/qsar_aquatic_toxicity.csv', sep=';', header=None, names=columns)
raw_df.head()

Unnamed: 0,tpsa,saacc,h050,mlogp,rdchi,gats1p,nn,c040,response
0,0.0,0.0,0,2.419,1.225,0.667,0,0,3.74
1,0.0,0.0,0,2.638,1.401,0.632,0,0,4.33
2,9.23,11.0,0,5.799,2.93,0.486,0,0,7.019
3,9.23,11.0,0,5.453,2.887,0.495,0,0,6.723
4,9.23,11.0,0,4.068,2.758,0.695,0,0,5.979


In [10]:
X_train, X_test, y_train, y_test = split_data(raw_df, 'response', 0.6)
X_train, X_test = preprocess_data(X_train, X_test)
X_test, X_reserve, y_test, y_reserve = define_reserve(X_test, y_test)
model, preds = train_and_predict(X_train, y_train, X_test)
preds[:5]

array([[3.72541537],
       [4.39116437],
       [6.09011062],
       [5.90546361],
       [5.33519898]])