# Homework 2 - Logistic Regression Dataset
This notebook prepares the Parkinsons telemonitoring dataset for practicing logistic regression.

<!---
url: https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data
-->

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Location of the raw CSV on disk
filename = 'log_reg/parkinsons.csv'

# Load the file, then shuffle every row (frac=1) with a fixed seed so the
# ordering is repeatable. reset_index() keeps the pre-shuffle row labels
# as a new 'index' column.
dataset = (
    pd.read_csv(filename)
      .sample(frac=1, random_state=32)
      .reset_index()
)
dataset.head()

Unnamed: 0,index,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,2179,16,65,0,166.43,9.7017,23.105,0.00504,5.1e-05,0.00191,...,0.33,0.02017,0.02183,0.02708,0.06052,0.026663,20.372,0.6364,0.68118,0.21368
1,3630,27,57,1,182.48,12.0,18.979,0.0044,2.4e-05,0.00198,...,0.203,0.01211,0.01503,0.0185,0.03633,0.005092,24.824,0.52478,0.72423,0.2312
2,2184,16,65,0,33.426,8.6244,16.127,0.00522,5.1e-05,0.00229,...,0.196,0.01015,0.01311,0.01879,0.03044,0.019854,24.055,0.55911,0.69461,0.22907
3,2096,15,65,0,110.51,15.0,20.491,0.00263,1.5e-05,0.00123,...,0.074,0.00441,0.00481,0.0068,0.01324,0.007946,27.401,0.55112,0.54453,0.09978
4,2140,16,65,0,47.427,8.4671,16.599,0.00645,6.5e-05,0.00253,...,0.24,0.00916,0.01363,0.03271,0.02748,0.016993,20.423,0.55699,0.70011,0.25182


In [3]:
# Discard the pre-shuffle row labels ('index') plus the 'subject#' and
# 'motor_UPDRS' columns.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because the columns are already gone.
dataset = dataset.drop(columns=['index', 'subject#', 'motor_UPDRS'], errors='ignore')
dataset.head()

Unnamed: 0,age,sex,test_time,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,65,0,166.43,23.105,0.00504,5.1e-05,0.00191,0.002,0.00574,0.03772,0.33,0.02017,0.02183,0.02708,0.06052,0.026663,20.372,0.6364,0.68118,0.21368
1,57,1,182.48,18.979,0.0044,2.4e-05,0.00198,0.00237,0.00593,0.02329,0.203,0.01211,0.01503,0.0185,0.03633,0.005092,24.824,0.52478,0.72423,0.2312
2,65,0,33.426,16.127,0.00522,5.1e-05,0.00229,0.00191,0.00686,0.0218,0.196,0.01015,0.01311,0.01879,0.03044,0.019854,24.055,0.55911,0.69461,0.22907
3,65,0,110.51,20.491,0.00263,1.5e-05,0.00123,0.00125,0.00368,0.00878,0.074,0.00441,0.00481,0.0068,0.01324,0.007946,27.401,0.55112,0.54453,0.09978
4,65,0,47.427,16.599,0.00645,6.5e-05,0.00253,0.00323,0.00759,0.02675,0.24,0.00916,0.01363,0.03271,0.02748,0.016993,20.423,0.55699,0.70011,0.25182


In [4]:
# Capture the current column order as a plain Python list
cols = list(dataset.columns)
cols

['age',
 'sex',
 'test_time',
 'total_UPDRS',
 'Jitter(%)',
 'Jitter(Abs)',
 'Jitter:RAP',
 'Jitter:PPQ5',
 'Jitter:DDP',
 'Shimmer',
 'Shimmer(dB)',
 'Shimmer:APQ3',
 'Shimmer:APQ5',
 'Shimmer:APQ11',
 'Shimmer:DDA',
 'NHR',
 'HNR',
 'RPDE',
 'DFA',
 'PPE']

In [5]:
# Build a column ordering with 'total_UPDRS' moved to the end.
# The column is picked by name rather than by its position in `cols`, so the
# reordering stays correct if the upstream column order ever changes.
target_col = 'total_UPDRS'
new_cols = [c for c in cols if c != target_col] + [target_col]
new_cols

['age',
 'sex',
 'test_time',
 'Jitter(%)',
 'Jitter(Abs)',
 'Jitter:RAP',
 'Jitter:PPQ5',
 'Jitter:DDP',
 'Shimmer',
 'Shimmer(dB)',
 'Shimmer:APQ3',
 'Shimmer:APQ5',
 'Shimmer:APQ11',
 'Shimmer:DDA',
 'NHR',
 'HNR',
 'RPDE',
 'DFA',
 'PPE',
 'total_UPDRS']

In [6]:
# Same rows as `dataset`, with the columns arranged in the order given by new_cols
df = dataset.loc[:, new_cols]
df.head()

Unnamed: 0,age,sex,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,total_UPDRS
0,65,0,166.43,0.00504,5.1e-05,0.00191,0.002,0.00574,0.03772,0.33,0.02017,0.02183,0.02708,0.06052,0.026663,20.372,0.6364,0.68118,0.21368,23.105
1,57,1,182.48,0.0044,2.4e-05,0.00198,0.00237,0.00593,0.02329,0.203,0.01211,0.01503,0.0185,0.03633,0.005092,24.824,0.52478,0.72423,0.2312,18.979
2,65,0,33.426,0.00522,5.1e-05,0.00229,0.00191,0.00686,0.0218,0.196,0.01015,0.01311,0.01879,0.03044,0.019854,24.055,0.55911,0.69461,0.22907,16.127
3,65,0,110.51,0.00263,1.5e-05,0.00123,0.00125,0.00368,0.00878,0.074,0.00441,0.00481,0.0068,0.01324,0.007946,27.401,0.55112,0.54453,0.09978,20.491
4,65,0,47.427,0.00645,6.5e-05,0.00253,0.00323,0.00759,0.02675,0.24,0.00916,0.01363,0.03271,0.02748,0.016993,20.423,0.55699,0.70011,0.25182,16.599


In [7]:
# Split Into Training, Validation & Testing Sets
# - Split the reordered frame `df` (total_UPDRS last) rather than `dataset`,
#   otherwise the column reordering above is never used.
# - 30% of rows are held out for testing; 20% of the remainder for validation.
# - A fixed random_state makes the split reproducible across runs.
train, test  = train_test_split(df, test_size=0.30, random_state=32)
train, valid = train_test_split(train, test_size=0.20, random_state=32)

# Write each split to its own CSV (the validation file previously received
# the training rows by mistake).
train.to_csv('log_reg/train_parkinsons.csv', index=False)
valid.to_csv('log_reg/valid_parkinsons.csv', index=False)
test.to_csv('log_reg/test_parkinsons.csv', index=False)

In [8]:
# Report how many rows ended up in each split.
# Parenthesised single-argument print produces the same text under Python 2
# and is valid syntax under Python 3 (the bare print statement is not).
print('Number of Instances Per Set')
print('Training Set:   %d' % len(train))
print('Validation Set: %d' % len(valid))
print('Testing Set:    %d' % len(test))

Number of Instances Per Set
Training Set:   3289
Validation Set: 823
Testing Set:    1763
