In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let's start with standard data imports, alloting each one of the five .csv files from the Raman Spectroscopy study its own Pandas DataFrame. 

In [56]:
RamanAGEsData = "AGEs.csv"
RamanInnerArmData = "innerArm.csv"
RamanThumbnailData = "thumbNail.csv"
RamanEarlobeData = "earLobe.csv"
RamanVeinData = "vein.csv"

In [57]:
RamanAGEsDF = pd.read_csv("AGEs.csv")
RamanInnerArmDF = pd.read_csv("innerArm.csv")
RamanThumbnailDF = pd.read_csv("thumbNail.csv")
RamanEarlobeDF = pd.read_csv("earLobe.csv")
RamanVeinDF = pd.read_csv("vein.csv")

In [58]:
#Let's take a quick peek at some of the Inner Arm spectral data, and make a quick plot of it: 
RamanInnerArmDF.head(25)

Unnamed: 0,patientID,has_DM2,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var3152,Var3153,Var3154,Var3155,Var3156,Var3157,Var3158,Var3159,Var3160,Var3161
0,ramanShift,,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,...,3150,3151,3152,3153,3154,3155,3156,3157,3158,3159
1,DM201,1.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,...,0,0,0,0,0,0,0,0,0,0
2,DM202,1.0,214.8,214.8,214.8,214.8,214.8,214.8,214.8,214.8,...,0,0,0,0,0,0,0,0,0,0
3,DM203,1.0,457.8,457.8,457.8,457.8,457.8,457.8,457.8,457.8,...,0,0,0,0,0,0,0,0,0,0
4,DM204,1.0,181.833333,181.833333,181.833333,181.833333,181.833333,181.833333,181.833333,181.833333,...,0,0,0,0,0,0,0,0,0,0
5,DM205,1.0,179.8,179.8,179.8,179.8,179.8,179.8,179.8,179.8,...,0,0,0,0,0,0,0,0,0,0
6,DM206,1.0,237.4,237.4,237.4,237.4,237.4,237.4,237.4,237.4,...,0,0,0,0,0,0,0,0,0,0
7,DM207,1.0,217.333333,217.333333,217.333333,217.333333,217.333333,217.333333,217.333333,217.333333,...,0,0,0,0,0,0,0,0,0,0
8,DM208,1.0,89.166667,89.166667,89.166667,89.166667,89.166667,89.166667,89.166667,89.166667,...,0,0,0,0,0,0,0,0,0,0
9,DM209,1.0,336.6,336.6,336.6,336.6,336.6,336.6,336.6,336.6,...,0,0,0,0,0,0,0,0,0,0


In [59]:
#Let's select the first row of the Inner Arm DataFrame to be used as the domain in future plots or analyses: 
InnerArmWavenumbersDF = RamanInnerArmDF.iloc[0]

#Let's also drop the first row (containing the Raman wavenumbers, the spectral domain) from the original DataFrame: 
RamanInnerArmDF = RamanInnerArmDF.drop(RamanInnerArmDF.index[0])
RamanInnerArmDF = RamanInnerArmDF.reset_index(drop=True)

1) Now, starting with the Inner Arm Dataframe, I'll normalize and run PCA on the data to reduce its dimensionality down to 15. The paper associated with this dataset mentioned that to be the optimal number of components for this chunk of the data. 

In [60]:
#1a) Inner Arm Data Scaling: 
from sklearn.preprocessing import StandardScaler 

#I'm making a slice of the original dataframe, from the "Var2" column onwards, as each column is an independent feature representing a wavenumber 
#chunk of the spectral domain. I'll then make a list of this dataframe of column names. This will be an input in the StandardScaler functions:
innerArmFeaturesDataFrame = RamanInnerArmDF.iloc[:,2:3159]
innerArmFeaturesList = innerArmFeaturesDataFrame.columns.tolist()

innerArmFeaturesValues = innerArmFeaturesDataFrame.loc[:,innerArmFeaturesList].values

#Separating out target values: 
innerArmTargetValues = RamanInnerArmDF.loc[:,['has_DM2']].values

#Feature Standardization: 
RamanInnerArmStandardScalerObject = StandardScaler().fit_transform(innerArmFeaturesValues)

In [61]:
#1b) Inner Arm PCA, set to (the reccomended) 15 components: 
from sklearn.decomposition import PCA

RamanInnerArmPCA = PCA(n_components = 15) 
RamanInnerArmPrincipalComponents = RamanInnerArmPCA.fit_transform(RamanInnerArmStandardScalerObject)

RamanInnerArmPCADF = pd.DataFrame(data = RamanInnerArmPrincipalComponents
             , columns = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4', 'principal component 5', 'principal component 6', 'principal component 7', 'principal component 8', 'principal component 9', 'principal component 10', 'principal component 11', 'principal component 12', 'principal component 13', 'principal component 14', 'principal component 15'])

#Now that we have a total of fifteen new principal components, preserving as much variation from the original 3159 Raman
#wavenumber features, lets concatenate the new primary components DF with the features column in a new DataFrame:
RamanPCAInnerArmComponentsAndFeatureDF = pd.concat([RamanInnerArmPCADF, RamanInnerArmDF[['has_DM2']]], axis = 1)


In [53]:
#Let's get a little preview of that concatenated DataFrame: 
RamanPCAInnerArmComponentsAndFeatureDF.head(20)

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10,principal component 11,principal component 12,principal component 13,principal component 14,principal component 15,has_DM2
0,47.222918,7.14568,-6.683618,3.167498,-2.925279,-3.181392,-0.646655,-0.490315,0.517423,2.732065,-1.798149,-2.73127,1.666045,-2.175124,4.387751,1.0
1,-3.859357,-6.496048,-2.691251,-2.147266,-2.785565,-2.488168,-1.655383,-0.878841,1.313479,-3.785902,-1.223921,2.204281,2.714711,4.713958,1.396491,1.0
2,114.666707,16.222721,-3.342684,3.011577,-0.047054,-0.047162,-2.521048,-0.116957,1.470904,-2.16661,-1.41989,3.451132,-0.782089,-2.448261,-2.87853,1.0
3,-23.901991,-6.55951,-1.969945,-0.00137,3.113465,-0.931984,-0.225603,-0.674807,0.84473,-0.809939,-0.125858,-0.721037,-0.52577,2.061907,1.210663,1.0
4,-28.76655,2.168372,-4.174635,3.54134,3.78082,-1.356589,-1.929865,1.107052,1.97874,1.490739,-1.798523,-1.28993,-3.448901,1.933798,-0.787482,1.0
5,-22.661784,43.804084,2.117221,-1.983106,-1.357085,1.416356,0.552067,0.143583,-0.875454,-0.657263,0.527516,0.465433,0.503147,0.094374,0.435004,1.0
6,-3.418142,-5.073555,-0.169552,-0.574871,3.439057,-1.915368,0.736279,-0.059795,4.94542,0.620146,4.031459,-1.585228,4.058599,-1.207408,-2.327849,1.0
7,-28.671182,-13.595802,9.004169,4.693889,-3.205121,0.119547,-0.300286,-2.364773,-1.286036,-1.531081,-3.770842,-3.121794,1.676568,-0.584971,-2.698592,1.0
8,64.647891,-2.988138,7.785108,-4.176278,4.345674,-0.053066,2.884967,-3.592821,0.50036,4.266789,-2.250115,2.425795,0.127,0.860364,0.745772,1.0
9,-70.516281,-11.785179,-4.472854,-0.40469,3.01658,3.629371,-0.433539,-0.458165,1.326049,0.31926,-1.538527,0.433346,-1.724034,-0.935326,-0.582094,1.0


In [54]:
#Let's drop the first and last rows of the concatenated PCA+target DataFrame, as the DM2 value
#for the first participant has, for some reason, gone "NaN": 
RamanPCAInnerArmComponentsAndFeatureDF = RamanPCAInnerArmComponentsAndFeatureDF.drop(RamanPCAInnerArmComponentsAndFeatureDF.index[0])
RamanPCAInnerArmComponentsAndFeatureDF = RamanPCAInnerArmComponentsAndFeatureDF.drop(RamanPCAInnerArmComponentsAndFeatureDF.index[18])

2) Now that the Inner Arm Data has been normalized and reduced, let's implement a K-Fold cross-validation method and attempt to 
replicate the CV procedures described in the original paper for this dataset. Since the paper mentioned that their team repeated K-fold 
CV and their subsequent SVM/ANN runs one thousand times, I will attempt to repeat a similar process with CV --> Binomial Logistic Regression.

In [55]:
#2) Implementing K-Fold cross-validation and Logistic Regression, iterating one thousand times: 

#Necessary imports: 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score 

#Splitting the PCA-reduced Dataframe into Target and Feature Data: 
RamanPCAInnerArmFeaturesDF = RamanPCAInnerArmComponentsAndFeatureDF.drop(RamanPCAInnerArmComponentsAndFeatureDF[['has_DM2']], axis = 1)
RamanPCAInnerArmTargetDF = RamanPCAInnerArmComponentsAndFeatureDF.loc[:,'has_DM2']
#I'm going to drop the first rows of both Dataframes because the target value for the first partipant now registers as "NaN" for some 
#reason:
RamanPCAInnerArmFeaturesDF = RamanPCAInnerArmFeaturesDF.drop(RamanPCAInnerArmFeaturesDF.index[0])
RamanPCAInnerArmTargetDF = RamanPCAInnerArmTargetDF.drop(RamanPCAInnerArmTargetDF.index[0])

#Creating empty lists to store test accuracy results: 
avgLRScore = []
LRScoresCV5 = []

#**I keep receiving an error saying that cross_val_score cannot run correctly because there is either a NaN, infinite, or too large 
#value. I'm going to convert the Dataframes into numpy arrays to see if this is a compatibility issue**#
RamanPCAInnerArmFeaturesArray = np.asarray(RamanPCAInnerArmFeaturesDF)
RamanPCAInnerArmTargetArray = np.asarray(RamanPCAInnerArmTargetDF)

RamanPCAInnerArmComponentsAndFeatureDF.head(25)

#Outer for loop, to iterate a 1000 times. Will do this over a list over a certain integer range (1-1000)#
for i in range(1,1000):
    LRScoresCV5.append(cross_val_score(LogisticRegression(),RamanPCAInnerArmFeaturesArray,RamanPCAInnerArmTargetArray, cv = 5))

avgLRScore = np.average(LRScoresCV5)


    




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [None]:
avgLRScore

It doesn't seem like regular, two-class Logistic Regression is any more effective than just guessing by pure chance.