# Machine Learning Pipelines Project

In [11]:
import pandas as pd # Dataframes
import numpy as np # Linear alg
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import tree, pipeline, preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.externals import joblib

<center><h1>parkinsons.data, parkinsons.names</h1></center>
<h2>Dataset description:</h2>
<p>
This dataset is composed of a range of biomedical voice measurements from 
31 people, 23 with Parkinson's disease (PD). Each column in the table is a 
particular voice measure, and each row corresponds to one of 195 voice 
recordings from these individuals ("name" column). <b>The main aim of the data 
is to discriminate healthy people from those with PD, according to the "status" 
    column, which is set to 0 for healthy and 1 for PD</b>. Each row of the CSV file contains an 
instance corresponding to one voice recording. There are around six 
recordings per patient; the name of the patient is identified in the first 
column.
</p>

<h3>Attribute Information:</h3>
<table>
<caption>Matrix column entries (attributes)</caption>
    <tr>
    <td><b>name</b></td>
    <td>ASCII subject name and recording number</td>
    </tr>
    <tr>
    <td><b>MDVP:Fo(Hz)</b></td>
    <td>Average vocal fundamental frequency</td>
    </tr>
    <tr>
    <td><b>MDVP:Fhi(Hz)</b></td>
    <td>Maximum vocal fundamental frequency</td>
    </tr>
    <tr>
    <td><b>MDVP:Flo(Hz)</b></td>
    <td>Minimum vocal fundamental frequency</td>
    </tr>
    <tr>
    <td><b>MDVP:Jitter(%),MDVP:Jitter(Abs),<br> MDVP:RAP,MDVP:PPQ,Jitter:DDP</b></td>
    <td>Several measures of variation in fundamental frequency</td>
    </tr>
    <tr>
    <td><b>MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,<br>Shimmer:APQ5,MDVP:APQ,Shimmer:DDA</b></td>
    <td>Several measures of variation in amplitude</td>
    </tr>
    <tr>
    <td><b>NHR,HNR</b></td>
    <td>Two measures of ratio of noise to tonal components in the voice</td>
    </tr>
    <tr>
    <td><b>status</b></td>
    <td>Health status of the subject: Parkinson's (one), "Healthy" (zero)</td>
    </tr>
    <tr>
    <td><b>RPDE,D2</b></td>
    <td>Two nonlinear dynamical complexity measures</td>
    </tr>
    <tr>
    <td><b>DFA</b></td>
    <td>Signal fractal scaling exponent</td>
    </tr>
    <tr>
    <td><b>spread1,spread2,PPE</b></td>
    <td>Three nonlinear measures of fundamental frequency variation </td>
    </tr>
</table>

In [12]:
def show_df_info(dataframe):
    """Print a structural summary of ``dataframe`` and return its null counts.

    Printed, in order: the object's type, its ``size`` (total number of
    cells), number of dimensions, shape, axes, per-column dtypes and the
    column index, followed by a header line for the null counts.

    Returns:
        The result of ``dataframe.isnull().sum()``, i.e. the number of null
        values in each column.
    """
    print(type(dataframe))
    print("amount of entries is %s" % (dataframe.size,))
    print("dimensions= %i" % (dataframe.ndim,))
    # label and value are emitted on one line, without a separator
    print("shape is ", dataframe.shape, sep="")
    print("axes: ", dataframe.axes, sep="")
    print("data types of columns:")
    print(dataframe.dtypes)
    print("features: %s" % (dataframe.columns,))
    print('\nNulls for each attribute: \n')
    null_counts = dataframe.isnull().sum()
    return null_counts

In [13]:
# The file has 24 columns: the recording `name`, the `status` target and
# 22 voice measurements.

## PHASE 1: LOAD DATASET
data = pd.read_csv('parkinsons.data')
attributes = data.columns
data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [14]:
show_df_info(data) # prints a structural summary of data; the returned per-column null counts become the cell output

<class 'pandas.core.frame.DataFrame'>
amount of entries is 4680
dimensions= 2
shape is (195, 24)
axes: [RangeIndex(start=0, stop=195, step=1), Index(['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')]
data types of columns:
name                 object
MDVP:Fo(Hz)         float64
MDVP:Fhi(Hz)        float64
MDVP:Flo(Hz)        float64
MDVP:Jitter(%)      float64
MDVP:Jitter(Abs)    float64
MDVP:RAP            float64
MDVP:PPQ            float64
Jitter:DDP          float64
MDVP:Shimmer        float64
MDVP:Shimmer(dB)    float64
Shimmer:APQ3        float64
Shimmer:APQ5        float64
MDVP:APQ            float64
Shimmer:DDA         float64
NHR                 float64
HNR                 float64

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

In [15]:
## PHASE 2: SLICE DATASET
# Target vector y: the `status` column.
y = data['status']
# Feature frame X: every column of `data` except the target.
X = data.drop(columns=['status'])
# Replace each recording id (e.g. phon_R01_S01_1) by its subject code (S01)
# so that recordings of the same patient can be linked later.
X['name'] = X['name'].str.extract(r'_(S.*)_').values
X.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
0,S01,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.02971,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,S01,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.04368,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,S01,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0359,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,S01,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.03772,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,S01,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.04465,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [68]:
# class balance of the target: number of recordings per `status` value (1 vs. 0)
y.value_counts()

1    147
0     48
Name: status, dtype: int64

In [17]:
# Keep the subject codes aside in `names`, then take the identifier column
# out of the feature frame so only numeric voice measurements remain.
names = X['name']
X = X.drop(columns=['name'])
X.head()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,...,0.02971,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,...,0.04368,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,...,0.0359,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,...,0.03772,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.04465,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [18]:
# Identify columns containing negative values
X_df = pd.DataFrame(X.min(), columns = ['Minimum'])

# columns containing negative values assigned to a list
colsNegative = [feature for feature in X_df[X_df['Minimum'] < 0].index.values]

# Shift each of those columns so that its minimum becomes 0.
# The subtraction is vectorised: the column minimum is computed once per
# column instead of once per row, as the previous apply/lambda form did.
for column in colsNegative:
    X[column] = X[column] - X[column].min()

<p>The table at the beginning of this document (repeated below) shows that several attributes measure the same underlying quantity, so multicollinearity or redundancy among the features should also be considered.</p>

<h3>Attribute Information:</h3>
<table>
<caption>Matrix column entries (attributes)</caption>
    <tr>
    <td><b>name</b></td>
    <td>ASCII subject name and recording number</td>
    </tr>
    <tr>
    <td><b>MDVP:Fo(Hz)</b></td>
    <td>Average vocal fundamental frequency</td>
    </tr>
    <tr>
    <td><b>MDVP:Fhi(Hz)</b></td>
    <td>Maximum vocal fundamental frequency</td>
    </tr>
    <tr>
    <td><b>MDVP:Flo(Hz)</b></td>
    <td>Minimum vocal fundamental frequency</td>
    </tr>
    <tr>
    <td><b>MDVP:Jitter(%),MDVP:Jitter(Abs),<br> MDVP:RAP,MDVP:PPQ,Jitter:DDP</b></td>
    <td>Several measures of variation in fundamental frequency</td>
    </tr>
    <tr>
    <td><b>MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,<br>Shimmer:APQ5,MDVP:APQ,Shimmer:DDA</b></td>
    <td>Several measures of variation in amplitude</td>
    </tr>
    <tr>
    <td><b>NHR,HNR</b></td>
    <td>Two measures of ratio of noise to tonal components in the voice</td>
    </tr>
    <tr>
    <td><b>status ('Target')</b></td>
    <td>Health status of the subject: Parkinson's (one), "Healthy" (zero)</td>
    </tr>
    <tr>
    <td><b>RPDE,D2</b></td>
    <td>Two nonlinear dynamical complexity measures</td>
    </tr>
    <tr>
    <td><b>DFA</b></td>
    <td>Signal fractal scaling exponent</td>
    </tr>
    <tr>
    <td><b>spread1,spread2,PPE</b></td>
    <td>Three nonlinear measures of fundamental frequency variation </td>
    </tr>
</table>

<p>Therefore, we can narrow down our features by grouping columns by type of measurement and examining the variance within each.</p>  

In [49]:
# Variance of the measures of variation in fundamental frequency
# (feature columns at positions 3-7 of X), largest first.
varFundFreq = X.iloc[:, 3:8].var().sort_values(ascending=False)
varFundFreq

Jitter:DDP          7.926954e-05
MDVP:Jitter(%)      2.350440e-05
MDVP:RAP            8.807685e-06
MDVP:PPQ            7.611952e-06
MDVP:Jitter(Abs)    1.212565e-09
dtype: float64

In [52]:
# Variance of the measures of variation in amplitude
# (feature columns at positions 8-13 of X), largest first.
varAmp = X.iloc[:, 8:14].var().sort_values(ascending=False)
varAmp

MDVP:Shimmer(dB)    0.037977
Shimmer:DDA         0.000928
MDVP:Shimmer        0.000356
MDVP:APQ            0.000287
Shimmer:APQ5        0.000145
Shimmer:APQ3        0.000103
dtype: float64

In [53]:
# Variance of the two noise-to-tonal ratio measures
# (feature columns at positions 14-15 of X), largest first.
noiseToTonal = X.iloc[:, 14:16].var().sort_values(ascending=False)
noiseToTonal

HNR    19.587389
NHR     0.001634
dtype: float64

In [58]:
# Variance of the non-linear dynamical complexity measures, largest first.
dynComplex = X.loc[:, ['RPDE', 'D2']].var().sort_values(ascending=False)
dynComplex

D2      0.146535
RPDE    0.010804
dtype: float64

In [59]:
# Variance of the non-linear measures of fundamental frequency variation,
# largest first.
varFunFreq_nonLinear = (
    X.loc[:, ['spread1', 'spread2', 'PPE']]
    .var()
    .sort_values(ascending=False)
)
varFunFreq_nonLinear

spread1    1.188553
PPE        0.008121
spread2    0.006957
dtype: float64

In [66]:
# Within every measurement group, keep only the highest-variance column
# (first entry of each descending ranking) and mark the rest for removal.
colDrop = [
    column
    for ranking in (varFundFreq, varAmp, noiseToTonal, dynComplex, varFunFreq_nonLinear)
    for column in ranking.index[1:]
]
print('\n'.join(['----Columns to drop----', *colDrop]))

----Columns to drop----
MDVP:Jitter(%)
MDVP:RAP
MDVP:PPQ
MDVP:Jitter(Abs)
Shimmer:DDA
MDVP:Shimmer
MDVP:APQ
Shimmer:APQ5
Shimmer:APQ3
NHR
RPDE
PPE
spread2


In [71]:
# Feature matrix used for training: X without the columns listed in colDrop,
# i.e. one (largest-variance) representative per redundant measurement group.
training_instances = X.drop(columns=colDrop)
training_instances.head()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),Jitter:DDP,MDVP:Shimmer(dB),HNR,DFA,spread1,D2
0,119.992,157.302,74.997,0.01109,0.426,21.033,0.815285,3.151953,2.301442
1,122.4,148.65,113.819,0.01394,0.626,19.085,0.819521,3.889792,2.486855
2,116.682,131.111,111.555,0.01633,0.482,20.651,0.825288,3.521805,2.342259
3,116.676,137.871,111.366,0.01505,0.517,20.644,0.819235,3.847483,2.405554
4,116.014,141.781,110.655,0.01966,0.584,19.649,0.823484,4.217197,2.33218


# The whole program:

In [246]:
import numpy as np
import pandas as pd
from sklearn import pipeline, preprocessing
# NOTE: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; on newer versions use `import joblib` instead.
from sklearn.externals import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_val_predict

#
# DECLARATION PART
#
PIPELINEPATH = "ser_pipeline.pickle"
DATASETPATH = "parkinsons.data"


def show_df_info(dataframe):
    """Print a structural summary of ``dataframe`` and report its nulls.

    Printed, in order: the object's type, its ``size`` (total number of
    cells), number of dimensions, shape, per-column dtypes and the list of
    column labels.

    Returns:
        ``dataframe.isnull().sum()`` (null count per column) when at least
        one null is present; otherwise a "no nulls" message is printed and
        ``None`` is returned.
    """
    print(type(dataframe))
    print("amount of entries is %s" % (dataframe.size,))
    print("dimensions= %i" % (dataframe.ndim,))
    # label and value are emitted on one line, without a separator
    print("shape is ", dataframe.shape, sep="")
    print("\n-------data types of columns-------")
    print(dataframe.dtypes)
    print("features: %s" % (list(dataframe.columns),))

    null_counts = dataframe.isnull().sum()
    if null_counts.values.sum() <= 0:
        print('DataFrame passed contains no nulls\n\n')
        return None

    print('\nNulls for each attribute: \n')
    return null_counts


#
# PROGRAM BODY
#

## PHASE 1: LOAD DATASET
# Read through the DATASETPATH constant declared above rather than repeating
# the literal file name.
data = pd.read_csv(DATASETPATH)
attributes = data.columns

## PHASE 2: SLICE DATASET
# Target: the `status` column; features: every other column.
y = data['status']
X = data.drop('status', axis = 1)
# Reduce each recording id (e.g. phon_R01_S01_1) to its subject code (S01),
# keep the codes aside in `names`, and remove the identifier from the features.
X['name'] = X.name.str.extract(r'_(S.*)_').values
names = X['name']
X = X.drop('name', axis = 1)

# Identify columns containing negative values
X_df = pd.DataFrame(X.min(), columns = ['Minimum'])

# columns containing negative values assigned to a list
colsNegative = [feature for feature in X_df[X_df['Minimum'] < 0].index.values]

# Shift those columns so their minimum becomes 0: the chi2 score function used
# for feature selection below only accepts non-negative values.
# Vectorised, so the column minimum is computed once per column, not per row.
for column in colsNegative:
    X[column] = X[column] - X[column].min()

# Variance of every column inside each group of related measurements, sorted
# largest first. The first three groups are addressed by column position in X
# (i.e. after `name` and `status` have been removed).
varFundFreq = X[[*X.columns[3:8]]].var().sort_values(ascending = False)
varAmp = X[[*X.columns[8:14]]].var().sort_values(ascending = False)
noiseToTonal = X[[*X.columns[14:16]]].var().sort_values(ascending = False)
dynComplex = X[['RPDE', 'D2']].var().sort_values(ascending = False)
varFunFreq_nonLinear = X[['spread1', 'spread2', 'PPE']].var().sort_values(ascending = False)

# columns that are redundant with low variance to drop
# (everything but the top-variance column of each group)
colDrop = [*varFundFreq[1:].index, *varAmp[1:].index, *noiseToTonal[1:].index,
 *dynComplex[1:].index, *varFunFreq_nonLinear[1:].index]
training_instances = X.drop(colDrop, axis=1)


show_df_info(training_instances)

## PHASE 3: CREATE PIPELINE
log_model = LogisticRegression(max_iter=2000, solver = 'lbfgs', dual=False)

# features to select, (default = 5)
m = 5
# `k` is passed by keyword and actually driven by `m` (it was a hard-coded
# positional 5 before, which left `m` without effect on the selector).
pipe = pipeline.Pipeline(steps = [("feature_selection", SelectKBest(chi2, k=m)),
                                  ("Scale", preprocessing.StandardScaler()),
                                  ("LOG", log_model)])

## PHASE 4: Train
# The labels are `y`; no `class_labels` variable is defined in this notebook.
pipe.fit(training_instances, y)

## PHASE 5: Evaluation
# return value is array of scores
scores = cross_val_score(pipe, training_instances, y, cv=10)
# use as quality metric the average CV score
meanCvAccuracy = scores.mean()
print("Mean CV accuracy= %f" % meanCvAccuracy)
# Accuracy of the fitted pipeline on the data it was trained on (the former
# `logit` / `scaled_x` names are not defined anywhere in this notebook).
print("Mean Accuracy: %f" % pipe.score(training_instances, y))

## PHASE 6: Save the Pipeline
joblib.dump(pipe, PIPELINEPATH, compress = 1)

## PHASE 7: Load the Pipeline
pipeline_loaded = joblib.load(PIPELINEPATH)

## PHASE 8: Classify new instances
# create new random problem instance
# NOTE(review): the upper bound reuses `m` (the number of selected features);
# the integers are arbitrary placeholders, not realistic voice measurements,
# and no random seed is set, so the vector differs between runs.
vector = np.random.randint(0, m, size=len(training_instances.columns))
print(vector)

# Wrap the vector in a one-row frame carrying the training column names so the
# loaded pipeline receives the same layout it was fitted on.
new_instance = pd.DataFrame([vector], columns=training_instances.columns)
result = pipeline_loaded.predict(new_instance)
# predict returns an array with one label per row; format its single element.
print("class label is %i" % result[0])
print("--- end of execution ---")

<class 'pandas.core.frame.DataFrame'>
amount of entries is 1755
dimensions= 2
shape is (195, 9)

-------data types of columns-------
MDVP:Fo(Hz)         float64
MDVP:Fhi(Hz)        float64
MDVP:Flo(Hz)        float64
Jitter:DDP          float64
MDVP:Shimmer(dB)    float64
HNR                 float64
DFA                 float64
spread1             float64
D2                  float64
dtype: object
features: ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'Jitter:DDP', 'MDVP:Shimmer(dB)', 'HNR', 'DFA', 'spread1', 'D2']
DataFrame passed contains no nulls


Mean CV accuracy= 0.814795
Mean Accuracy: 0.861538
[1 0 3 4 4 1 1 4 0]
class label is 1
--- end of execution ---


