# Python test for machine-learning

### Importing data

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVR
import seaborn as sns
import io
import urllib.request
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the dataset from 'product_adme.csv' (path is relative to the working directory)
data=pd.read_csv('product_adme.csv')

In [None]:
# Raise pandas' display limits (up to 500 columns, 1000 characters wide) so wide frames are not truncated
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

**Evaluating the dataframe created** 
* evaluating datatypes
* evaluating certain columns 
* filtering data
* histogram for numerical values

In [None]:
# 1. dtype of every column
data.dtypes
# 2. summary statistics of the published_type column
data['published_type'].describe()
# 3. rows whose published_type is "logP"
data[data["published_type"] == "logP"]
# 4. histogram of the standard_value column
data.hist(column="standard_value")

### Filtrar os `published_type` pelo número de ocorrências (count)

In [None]:
# Keep only the rows that belong to a published_type group with more than 2000 rows
teste=data.groupby("published_type").filter(lambda x: len(x) > 2000)
# Bar plot of the number of rows per remaining published_type
teste["published_type"].value_counts().plot(kind='bar')
# Labels of the remaining published_type values
teste["published_type"].value_counts().index

In [None]:
# Summary statistics of standard_value restricted to the "logP" rows.
# The cell's value is a plain list in the order: sum, mean, median, nunique, max, min.
data_eval=data[data["published_type"] == "logP"]
[data_eval['standard_value'].sum(), # Total sum of the column values
 data_eval['standard_value'].mean(), # Mean of the column values
 data_eval['standard_value'].median(), # Median of the column values
 data_eval['standard_value'].nunique(), # Number of unique entries
 data_eval['standard_value'].max(), # Maximum of the column values
 data_eval['standard_value'].min()] # Minimum of the column values

In [None]:
# Exploratory view: the logP rows restricted to the descriptor columns of interest.
#
# The result is bound to a NEW name on purpose. The cells below pass the raw
# `data` frame to data_cleaning() / evaluation_test_train(), which still need the
# `published_type` column and every other published_type; overwriting `data` here
# made those cells fail on a fresh "Restart & Run All".
#
# No explicit drop of the duplicated/unwanted columns is needed: selecting the
# whitelist below already leaves them out.
columns_of_interest=["max_phase","dosed_ingredient", "structure_type",  "molecule_type",
"oral", "parenteral", "topical", "black_box_warning",
"natural_product", "first_in_class", "chirality", "prodrug",
"inorganic_flag", "usan_year", "availability_type", "usan_stem",
"polymer_flag", "usan_substem", "usan_stem_definition",
"indication_class", "withdrawn_flag", "withdrawn_year",
"withdrawn_country", "withdrawn_reason", "mw_freebase", "alogp", "hba",
"hbd", "psa", "rtb", "ro3_pass", "num_ro5_violations", "acd_most_apka",
"acd_most_bpka", "acd_logp", "acd_logd", "molecular_species",
"full_mwt", "aromatic_rings", "heavy_atoms", "num_alerts",
"qed_weighted", "mw_monoisotopic",  "hba_lipinski",
"hbd_lipinski", "num_lipinski_ro5_violations","assay_type", "relationship_type", "confidence_score","standard_value"]

# Select the logP rows and the whitelisted columns in one step
data_logp=data.loc[data["published_type"] == 'logP', columns_of_interest]
data_logp.dtypes

## Cleaning data
#### filtering for pka
pka_data_cleaning from [here](https://www.kaggle.com/mnoori/feature-selection-for-mlr-with-python)


In [None]:
def data_cleaning(df,null_cutoff,published_type):
    """Build a fully numeric, null-free frame for one ``published_type``.

    Steps, in order:
      1. keep the rows whose ``published_type`` column equals ``published_type``
         and only the whitelisted descriptor columns plus ``standard_value``;
      2. for ``"pKa"`` only, keep rows with ``standard_value`` < 400;
      3. drop columns whose null count exceeds ``null_cutoff * number_of_rows``;
      4. fill nulls of non-numeric columns with the column's most frequent value;
      5. fill nulls of numeric columns with the column's (smallest) mode;
      6. one-hot encode the categorical columns;
      7. drop duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. Must contain ``published_type`` and every whitelisted column;
        it is not modified.
    null_cutoff : float
        Fraction of rows; a column with more nulls than this fraction is dropped.
    published_type : str
        Value of the ``published_type`` column to select.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (the ``published_type`` column itself is not kept).

    Raises
    ------
    KeyError
        If ``published_type`` or one of the whitelisted columns is missing.
    """
    # Whitelist of columns to keep. The duplicated/unwanted columns of the raw
    # table (e.g. "md", "molregno.1") need no explicit drop: they are simply not
    # listed here, so a frame that lacks them is accepted as well.
    keep_columns=["max_phase","dosed_ingredient", "structure_type",  "molecule_type",
    "oral", "parenteral", "topical", "black_box_warning",
    "natural_product", "first_in_class", "chirality", "prodrug",
    "inorganic_flag", "usan_year", "availability_type", "usan_stem",
    "polymer_flag", "usan_substem", "usan_stem_definition",
    "indication_class", "withdrawn_flag", "withdrawn_year",
    "withdrawn_country", "withdrawn_reason", "mw_freebase","alogp","hba",
    "hbd", "psa", "rtb", "ro3_pass", "num_ro5_violations", "acd_most_apka",
    "acd_most_bpka", "acd_logp", "acd_logd", "molecular_species",
    "full_mwt", "aromatic_rings", "heavy_atoms", "num_alerts",
    "qed_weighted", "mw_monoisotopic",  "hba_lipinski",
    "hbd_lipinski", "num_lipinski_ro5_violations","assay_type", "relationship_type",
    "confidence_score","standard_value"]

    # Rows of the requested measurement type
    row_mask=df["published_type"] == published_type
    if published_type == "pKa":
        # pKa only: discard values of 400 or more (treated as outliers)
        row_mask=row_mask & (df["standard_value"]<400)

    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the caller's data
    df=df.loc[row_mask, keep_columns].copy()

    # Drop columns that are mostly empty
    null_counts=df.isnull().sum()
    too_sparse=null_counts[null_counts>(null_cutoff*len(df))].index
    df=df.drop(columns=too_sparse)

    # Non-numeric columns: replace nulls with the most frequent value
    text_cols=df.select_dtypes(exclude=["number","bool"]).columns
    for col in text_cols:
        counts=df[col].value_counts()
        if counts.empty:
            # column without a single non-null value: nothing to fill with
            continue
        df[col]=df[col].fillna(counts.index[0])

    # Numeric columns: replace nulls with the mode of the column
    numeric_cols=df.select_dtypes(include=["number"]).columns
    numeric_nulls=df[numeric_cols].isnull().sum()
    fill_values={}
    for col in numeric_nulls[numeric_nulls!=0].index:
        modes=df[col].mode()
        if not modes.empty:
            fill_values[col]=modes.iloc[0] # smallest value when several modes tie
    if fill_values:
        df=df.fillna(fill_values)

    # Categorical to numerical (one-hot encoding).
    # NOTE: the single prefix is shared by all encoded columns, so the indicator
    # columns are named "is__<value>" and equal values coming from different
    # source columns end up with the same column name.
    df=pd.get_dummies(df, prefix="is_")

    # Remove duplicated rows
    df=df.drop_duplicates()

    return df

In [None]:
# Clean the "pKa" rows, dropping columns with more than 80% missing values, and display the result
pka_data=data_cleaning(data,0.8,'pKa')
pka_data

In [None]:
# Clean the "logP" rows, dropping columns with more than 80% missing values, and display the result
logp_data=data_cleaning(data,0.8,'logP')
logp_data

Analisando o gráfico em baixo, vemos que não existe correlação forte de nenhuma coluna com o alvo - **standard_value**

In [None]:
# Clean the "logD" rows, dropping columns with more than 80% missing values, and display the result
logd_data=data_cleaning(df=data,published_type='logD',null_cutoff=0.8)    
logd_data

In [None]:
def check_correlation(df,target,corr_cutoff,frac=0.7,random_state=200):
    """Count the columns that correlate with ``target`` above a threshold.

    The correlation matrix is computed on a reproducible random sample of the
    rows (``frac`` of them, drawn with ``random_state``), not on the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str
        Name of the target column.
    corr_cutoff : float
        A column is counted when the absolute value of its correlation with
        ``target`` is strictly greater than this value.
    frac : float, default 0.7
        Fraction of rows sampled before computing the correlations.
    random_state : int, default 200
        Seed of the row sample.

    Returns
    -------
    int
        Number of columns above the threshold. The target itself is part of
        the count (it correlates with itself), so a result greater than 1 means
        that at least one feature passed. The result is never below 1.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    data_train=df.sample(frac=frac,random_state=random_state)

    # Correlation of every column with the target; NaN entries (e.g. constant
    # columns) compare as False and are therefore not counted
    target_corr=data_train.corr()[target]
    n_correlated=int((target_corr.abs()>corr_cutoff).sum())

    # Callers test the result with "> 1", so "nothing but the target" is reported as 1
    return max(n_correlated, 1)

# Count the columns of the cleaned logD data whose |correlation| with standard_value exceeds 0.01
check_correlation(logd_data,'standard_value',0.01)

`VarianceThreshold` serve para retirar colunas com variância abaixo de X, mas devolve um array NumPy.  
Além disso, temos variáveis "booleanas", o que torna complicado aplicar isto porque a variância não há de ser muito grande.

In [None]:
#sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
#sel.fit_transform(pka_data_corrected_1)

In [None]:
def xMlr(df,target,frac=0.7,cv=10,random_state=200):
    """Evaluate a linear regression with repeated random train/test splits.

    For each of the ``cv`` repetitions a fraction ``frac`` of the rows is
    sampled as the training set and the remaining rows form the test set. A
    ``LinearRegression`` is fitted on the training rows only and evaluated on
    the held-out rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with the feature columns and the ``target`` column. The
        index is used to separate train and test rows, so it should be unique.
    target : str
        Name of the column to predict.
    frac : float, default 0.7
        Fraction of rows used for training in every repetition. It must leave
        at least one row for testing.
    cv : int, default 10
        Number of repetitions.
    random_state : int or None, default 200
        Seed of the first split; repetition ``i`` uses ``random_state + i`` so
        the splits differ from each other but stay reproducible. ``None``
        draws unseeded splits.

    Returns
    -------
    tuple of (float, float)
        Mean test MSE and mean test score (the value returned by the
        estimator's ``score`` method) over the repetitions.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1.
    """
    if cv<1:
        raise ValueError("cv must be at least 1, got %r" % (cv,))

    mse_total=0.0
    score_total=0.0
    for repetition in range(cv):
        seed=None if random_state is None else random_state+repetition

        # Disjoint split: the test rows are exactly the rows that were not sampled
        train=df.sample(frac=frac,random_state=seed)
        test=df.drop(train.index)

        x_train=train.drop(columns=[target])
        y_train=train[target]
        x_test=test.drop(columns=[target])
        y_test=test[target]

        regr=linear_model.LinearRegression()
        regr.fit(x_train, y_train)

        mse_total+=np.mean((regr.predict(x_test)-y_test)**2)
        score_total+=regr.score(x_test, y_test)

    return mse_total/cv, score_total/cv


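MSE is the mean squared error (lower is better).  
Score is the coefficient of determination $R^2$ returned by the estimator's `score` method:  
1 is a perfect prediction,
0 means the model does no better than always predicting the mean of y,
and the value can be negative for a model that does worse than that.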

In [None]:
# Linear regression on the cleaned logD data; the cell shows (mean MSE, mean score)
xMlr(logd_data,'standard_value')

# SVR

In [None]:
def xSVR(df,target,frac=0.7,cv=10,random_state=200):
    """Evaluate a support vector regression with repeated random train/test splits.

    For each of the ``cv`` repetitions a fraction ``frac`` of the rows is
    sampled as the training set and the remaining rows form the test set. An
    ``SVR(gamma='scale', C=1.0, epsilon=0.1)`` is fitted on the training rows
    only and evaluated on the held-out rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with the feature columns and the ``target`` column. The
        index is used to separate train and test rows, so it should be unique.
    target : str
        Name of the column to predict.
    frac : float, default 0.7
        Fraction of rows used for training in every repetition. It must leave
        at least one row for testing.
    cv : int, default 10
        Number of repetitions.
    random_state : int or None, default 200
        Seed of the first split; repetition ``i`` uses ``random_state + i`` so
        the splits differ from each other but stay reproducible. ``None``
        draws unseeded splits.

    Returns
    -------
    str
        ``"MSE is <mean test MSE>. Score is <mean test score>."``, where the
        score is the value returned by the estimator's ``score`` method.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1.
    """
    if cv<1:
        raise ValueError("cv must be at least 1, got %r" % (cv,))

    mse_total=0.0
    score_total=0.0
    for repetition in range(cv):
        seed=None if random_state is None else random_state+repetition

        # Disjoint split: the test rows are exactly the rows that were not sampled
        train=df.sample(frac=frac,random_state=seed)
        test=df.drop(train.index)

        x_train=train.drop(columns=[target])
        y_train=train[target]
        x_test=test.drop(columns=[target])
        y_test=test[target]

        clf=SVR(gamma='scale', C=1.0, epsilon=0.1)
        clf.fit(x_train, y_train)

        mse_total+=np.mean((clf.predict(x_test)-y_test)**2)
        score_total+=clf.score(x_test, y_test)

    return "MSE is %s. Score is %s." % (mse_total/cv, score_total/cv)

In [None]:
# Support vector regression on the cleaned pKa data; the cell shows the formatted MSE/score string
xSVR(pka_data,'standard_value')

In [None]:
def evaluation_test_train(df,length,cv,null_cutoff,correlation):
    """Run the linear-regression evaluation for every frequent ``published_type``.

    Every ``published_type`` value that occurs in more than ``length`` rows is
    cleaned with ``data_cleaning`` and, when ``check_correlation`` reports more
    than one column above ``correlation``, evaluated with ``xMlr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with a ``published_type`` column, as expected by ``data_cleaning``.
    length : int
        Minimum number of rows (exclusive) a ``published_type`` needs to be evaluated.
    cv : int
        Number of repetitions forwarded to ``xMlr``.
    null_cutoff : float
        Forwarded to ``data_cleaning``.
    correlation : float
        Correlation threshold forwarded to ``check_correlation``.

    Returns
    -------
    dict
        Maps each evaluated ``published_type`` to the value returned by ``xMlr``.
        Types that failed or had no correlated feature are absent.
    """
    result={}

    # Group sizes are all that is needed to pick the types to evaluate
    type_counts=df["published_type"].value_counts()
    test_list=type_counts[type_counts>length].index

    for item in test_list:
        try:
            test_df=data_cleaning(df,null_cutoff,str(item))
            if check_correlation(corr_cutoff=correlation,df=test_df,target='standard_value')>1:
                result[item]=xMlr(test_df,'standard_value',frac=0.7,cv=cv)
        except Exception as exc:
            # Best effort: one failing type must not abort the others, but the
            # reason is reported. KeyboardInterrupt/SystemExit are not caught.
            print("%s: skipped (%s: %s)" % (item, type(exc).__name__, exc))
            continue
    return result

# Evaluate every published_type with more than 1000 rows (cv=5, null cutoff 0.8, correlation cutoff 0.1)
evaluation_test_train(data,1000,5,0.8,0.1)