In [28]:
import pandas as pd

mela = pd.read_csv('Melanoma.csv')
mela['TUMOR_SIZE']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
523487   NaN
523488   NaN
523489   NaN
523490   NaN
523491   NaN
Name: TUMOR_SIZE, Length: 523492, dtype: float64

In [2]:
mela = mela.rename(columns={'CS_SITESPECIFIC_FACTOR_1': "Depth", 'CS_SITESPECIFIC_FACTOR_2': 'Ulceration',
       'CS_SITESPECIFIC_FACTOR_3': 'lymph_node_mets', 'CS_SITESPECIFIC_FACTOR_4': 'LDH'})

In [3]:
attribs = ['AGE','SEX','Depth', 'Ulceration',
       'lymph_node_mets', 'LDH', 'CS_EXTENSION', 'CS_TUMOR_SIZEEXT_EVAL', 'REGIONAL_NODES_POSITIVE', 'REGIONAL_NODES_EXAMINED', 'CS_METS_AT_DX', 'CS_METS_EVAL']

## Cleaning the Data 

We first want to remove data with missing attributes. In the future, we can do some imputing for missing features, but let's first start with the cleanest version of the data.

In [4]:
df = mela[attribs].dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 523488 entries, 0 to 523491
Data columns (total 12 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   AGE                      523488 non-null  int64  
 1   SEX                      523488 non-null  int64  
 2   Depth                    523488 non-null  float64
 3   Ulceration               523488 non-null  float64
 4   lymph_node_mets          523488 non-null  float64
 5   LDH                      523488 non-null  float64
 6   CS_EXTENSION             523488 non-null  float64
 7   CS_TUMOR_SIZEEXT_EVAL    523488 non-null  float64
 8   REGIONAL_NODES_POSITIVE  523488 non-null  int64  
 9   REGIONAL_NODES_EXAMINED  523488 non-null  int64  
 10  CS_METS_AT_DX            523488 non-null  int64  
 11  CS_METS_EVAL             523488 non-null  float64
dtypes: float64(7), int64(5)
memory usage: 51.9 MB


Now that we've dropped the rows with missing information, we should also get rid of the rows where the data was unkown (just to start with). Later, we can use interpolation to decide what values to put into these parameters.

In [5]:
new_df = df[(df != 999).all(1)]
new_df.info()

new_df = new_df[(new_df['CS_TUMOR_SIZEEXT_EVAL'] != 9)]
new_df.info()
new_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175116 entries, 4 to 523488
Data columns (total 12 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   AGE                      175116 non-null  int64  
 1   SEX                      175116 non-null  int64  
 2   Depth                    175116 non-null  float64
 3   Ulceration               175116 non-null  float64
 4   lymph_node_mets          175116 non-null  float64
 5   LDH                      175116 non-null  float64
 6   CS_EXTENSION             175116 non-null  float64
 7   CS_TUMOR_SIZEEXT_EVAL    175116 non-null  float64
 8   REGIONAL_NODES_POSITIVE  175116 non-null  int64  
 9   REGIONAL_NODES_EXAMINED  175116 non-null  int64  
 10  CS_METS_AT_DX            175116 non-null  int64  
 11  CS_METS_EVAL             175116 non-null  float64
dtypes: float64(7), int64(5)
memory usage: 17.4 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 174962 entries, 4 

Unnamed: 0,AGE,SEX,Depth,Ulceration,lymph_node_mets,LDH,CS_EXTENSION,CS_TUMOR_SIZEEXT_EVAL,REGIONAL_NODES_POSITIVE,REGIONAL_NODES_EXAMINED,CS_METS_AT_DX,CS_METS_EVAL
4,54,1,42.0,0.0,5.0,998.0,100.0,3.0,98,0,0,0.0
12,60,1,270.0,10.0,0.0,0.0,300.0,3.0,0,3,0,0.0
17,60,1,101.0,0.0,43.0,998.0,100.0,3.0,1,3,0,0.0
30,57,2,100.0,0.0,0.0,998.0,100.0,3.0,0,1,0,0.0
31,76,1,265.0,0.0,0.0,0.0,300.0,3.0,98,0,0,0.0


## Combining the features that have same meaning:
Ulceration 10.0 and Ulceration 1.0 both mean the existance of ulceration. We also need to adjust values for LDH, lymph_node_mets, cs_extension, cs_tumor_size_ext_eval

In [6]:
new_df['Ulceration'] = new_df['Ulceration'].replace(10.0, 1.0)
new_df['Ulceration'].value_counts()

0.0    145667
1.0     29295
Name: Ulceration, dtype: int64

In [7]:
new_df['LDH'].value_counts()
new_df = new_df[(new_df['LDH'] != 997)]
new_df['LDH'].value_counts()
new_df['LDH'] = new_df['LDH'].replace(2, 0)
new_df['LDH'] = new_df['LDH'].replace(10, 4)
new_df['LDH'] = new_df['LDH'].replace(20, 5)
new_df['LDH'] = new_df['LDH'].replace(30, 6)
new_df['LDH'] = new_df['LDH'].replace(998, 0)
new_df = new_df[(new_df['LDH'] != 8)] #info not in charts
new_df['LDH'].value_counts()

0.0    166002
4.0      6397
5.0      1119
6.0       323
Name: LDH, dtype: int64

In [8]:
new_df['lymph_node_mets'].value_counts()
new_df['lymph_node_mets'] = new_df['lymph_node_mets'].replace([10, 20, 5, 43, 45, 48, 50, 100, 150], [1, 2, 0, 2, 2, 2, 2, 2, 2])
#maybe not best way to encode this
new_df['lymph_node_mets'].value_counts()

0.0    156440
1.0     11470
2.0      5931
Name: lymph_node_mets, dtype: int64

In [9]:
new_df['CS_EXTENSION'].value_counts()
new_df['CS_EXTENSION'] = new_df['CS_EXTENSION'].replace(950, -100)
new_df['CS_EXTENSION'].value_counts()

 300.0    60355
 100.0    42791
 200.0    32979
 500.0     8924
 0.0       7853
 400.0     6806
 310.0     3913
 330.0     1848
 315.0     1367
 375.0     1067
-100.0      967
 350.0      834
 355.0      695
 370.0      601
 30.0       484
 335.0      463
 10.0       344
 800.0      343
 20.0       276
 320.0      232
 50.0       176
 380.0      140
 360.0      122
 340.0      110
 40.0        65
 95.0        37
 99.0        32
 80.0        17
Name: CS_EXTENSION, dtype: int64

In [10]:
new_df['CS_TUMOR_SIZEEXT_EVAL'].value_counts()
new_df['CS_TUMOR_SIZEEXT_EVAL'] = new_df['CS_TUMOR_SIZEEXT_EVAL'].replace([1,5,6,2], [0,3,3,3])
new_df['CS_TUMOR_SIZEEXT_EVAL'].value_counts()

3.0    169636
0.0      4205
Name: CS_TUMOR_SIZEEXT_EVAL, dtype: int64

In [11]:
new_df.describe()

Unnamed: 0,AGE,SEX,Depth,Ulceration,lymph_node_mets,LDH,CS_EXTENSION,CS_TUMOR_SIZEEXT_EVAL,REGIONAL_NODES_POSITIVE,REGIONAL_NODES_EXAMINED,CS_METS_AT_DX,CS_METS_EVAL
count,173841.0,173841.0,173841.0,173841.0,173841.0,173841.0,173841.0,173841.0,173841.0,173841.0,173841.0,173841.0
mean,60.598006,1.429341,143.320707,0.166997,0.134215,0.190525,231.043787,2.927434,49.033582,3.874707,1.190784,0.065554
std,16.242053,0.494983,192.854659,0.372975,0.429461,0.883337,124.848447,0.460906,48.813561,12.391659,9.264519,0.584816
min,18.0,1.0,0.0,0.0,0.0,0.0,-100.0,0.0,0.0,0.0,0.0,0.0
25%,50.0,1.0,33.0,0.0,0.0,0.0,100.0,3.0,0.0,0.0,0.0,0.0
50%,62.0,1.0,72.0,0.0,0.0,0.0,300.0,3.0,13.0,1.0,0.0,0.0
75%,73.0,2.0,161.0,0.0,0.0,0.0,300.0,3.0,98.0,2.0,0.0,0.0
max,90.0,2.0,990.0,1.0,2.0,6.0,800.0,3.0,99.0,99.0,99.0,9.0


In [12]:
new_df['REGIONAL_NODES_POSITIVE'] = new_df['REGIONAL_NODES_POSITIVE'].replace(98, 0)
#new_df['REGIONAL_NODES_POSITIVE'].value_counts()
new_df = new_df[(new_df['REGIONAL_NODES_POSITIVE'] != 99)]
new_df = new_df[(new_df['REGIONAL_NODES_EXAMINED'] != 99)]

new_df['REGIONAL_NODES_EXAMINED'].replace(0,1)
new_df['PERCENT_POS'] = new_df['REGIONAL_NODES_POSITIVE'] > 0 
new_df['PERCENT_POS'].value_counts()

False    157098
True      16340
Name: PERCENT_POS, dtype: int64

## Splitting Data into training and test sets

The "percent_pos" column will be our y_train/y_test values, the rest of the attributes will be our features.

In [13]:

attribs = ['AGE','SEX','Depth','Ulceration',
       'lymph_node_mets', 'LDH', 'CS_EXTENSION', 'CS_TUMOR_SIZEEXT_EVAL']

X = new_df[attribs]
y = new_df['PERCENT_POS']

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(new_df[attribs])
X = pd.DataFrame(X, columns=attribs)

import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Encoding Attributes

We have a few <b>numerical attributes</b>
- Age
- Depth
- LDH Maybe?
- Lymph Node Mets
- CS Extension (cardinal)


<b>Categorical Attributes</b>
- Sex
- Ulceration
- CS_TUMOR_SIZEEXT_EVAL

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names]
    
    
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X,y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X], index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["AGE","Depth", "LDH", "lymph_node_mets", "CS_EXTENSION"])),
    ("imputer", SimpleImputer(strategy="median")),
     ])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ("select_cat", DataFrameSelector(['SEX', 'Ulceration', 'CS_TUMOR_SIZEEXT_EVAL'])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False))
])

In [15]:
corr_matrix = new_df.corr()
corr_matrix["PERCENT_POS"].sort_values(ascending=False)

PERCENT_POS                1.000000
lymph_node_mets            0.837491
REGIONAL_NODES_EXAMINED    0.409011
Depth                      0.300244
REGIONAL_NODES_POSITIVE    0.274160
Ulceration                 0.249664
CS_EXTENSION               0.225428
LDH                        0.080740
CS_METS_AT_DX              0.077071
CS_METS_EVAL               0.033199
CS_TUMOR_SIZEEXT_EVAL     -0.009613
SEX                       -0.032846
AGE                       -0.056930
Name: PERCENT_POS, dtype: float64

In [16]:
train_num = num_pipeline.fit_transform(X_train)
train_num

array([[-0.34437925, -0.74352332, -0.21567508, -0.31257102, -1.85255714],
       [-0.4059391 , -0.73833885, -0.21567508, -0.31257102,  2.15499914],
       [-2.31429455, -0.53614457, -0.21567508, -0.31257102, -1.05104588],
       ...,
       [-0.09813983, -0.09546473, -0.21567508, -0.31257102,  0.55197663],
       [-0.4059391 , -0.44282414, -0.21567508, -0.31257102, -1.05104588],
       [ 0.70213826,  2.88560477, -0.21567508, -0.31257102,  1.15311007]])

In [17]:
cat_pipeline.fit_transform(X_train)
cat_pipeline

Pipeline(memory=None,
         steps=[('select_cat',
                 DataFrameSelector(attribute_names=['SEX', 'Ulceration',
                                                    'CS_TUMOR_SIZEEXT_EVAL'])),
                ('imputer', MostFrequentImputer()),
                ('cat_encoder',
                 OneHotEncoder(categories='auto', drop=None,
                               dtype=<class 'numpy.float64'>,
                               handle_unknown='error', sparse=False))],
         verbose=False)

In [18]:
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

X_train = preprocess_pipeline.fit_transform(X_train)
X_train

array([[-0.34437925, -0.74352332, -0.21567508, ...,  0.        ,
         0.        ,  1.        ],
       [-0.4059391 , -0.73833885, -0.21567508, ...,  0.        ,
         0.        ,  1.        ],
       [-2.31429455, -0.53614457, -0.21567508, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.09813983, -0.09546473, -0.21567508, ...,  0.        ,
         0.        ,  1.        ],
       [-0.4059391 , -0.44282414, -0.21567508, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70213826,  2.88560477, -0.21567508, ...,  1.        ,
         0.        ,  1.        ]])

In [19]:
from sklearn.base import BaseEstimator

class BaseLine(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self,X):
        return np.zeros((len(X),1))

In [20]:
from sklearn.model_selection import cross_val_score
base_line_clf = BaseLine()
base_scores = cross_val_score(base_line_clf,X_train,y_train,cv=10,scoring="accuracy")
base_scores.mean()

0.905553225724514

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_clf = LogisticRegression(solver="liblinear")
log_clf.fit(X_train,y_train)
log_scores = cross_val_score(log_clf, X_train,y_train,cv=10,scoring="accuracy")
y_pred = log_clf.predict(X_train)
log_score = accuracy_score(y_train,y_pred)
print(log_score)
log_scores.mean()

0.9880209633142002


0.9880037422607686

In [23]:
X_test = preprocess_pipeline.fit_transform(X_test) #running our train_set through the pipeline
y_pred = log_clf.predict(X_test)

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score


log_score = accuracy_score(y_test, y_pred)
print(log_score)
#print(y_test)
log_f1 = f1_score(y_test,y_pred)
log_recall = recall_score(y_test, y_pred)
log_precision = precision_score(y_test,y_pred)
print(f"The precision is {log_precision}, the recall is {log_recall}, and the f1 score is {log_f1}")

0.9870883200838648
The precision is 0.9083686440677966, the recall is 0.9589934762348555, and the f1 score is 0.9329948318070541


In [24]:
y_prob = log_clf.predict_proba(X_test)

In [25]:
print(y_prob)

[[0.99198464 0.00801536]
 [0.99433901 0.00566099]
 [0.98934344 0.01065656]
 ...
 [0.98796975 0.01203025]
 [0.99014811 0.00985189]
 [0.99456198 0.00543802]]


In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[51351,   519],
       [  220,  5145]])

In [27]:
sum(y_prob[:,1] > 0.5)

5664

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
gnb_score = accuracy_score(y_test, y_pred)
print(gnb_score)
#print(y_test)
gnb_f1 = f1_score(y_test,y_pred)
gnb_recall = recall_score(y_test, y_pred)
gnb_precision = precision_score(y_test,y_pred)
print(f"The precision is {gnb_precision}, the recall is {gnb_recall}, and the f1 score is {gnb_f1}")

In [None]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)
tree_score = accuracy_score(y_test, y_pred)
print(tree_score)
#print(y_test)
tree_f1 = f1_score(y_test,y_pred)
tree_recall = recall_score(y_test, y_pred)
tree_precision = precision_score(y_test,y_pred)
print(f"The precision is {tree_precision}, the recall is {tree_recall}, and the f1 score is {tree_f1}")

In [None]:
sum(y_prob[:,1] > 0.5)

In [None]:
sum(y_prob[:,0] > 0.5)