# DATA PREPROCESSING

#### Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

#### Read & Store Data

In [2]:
# Column labels to assign, in file order (the CSV is read as having no header row)
colnames = [
    'BI-RADS_ASSESSMENT',
    'AGE',
    'SHAPE',
    'MARGIN',
    'DENSITY',
    'SEVERITY',
]

# Load the raw data; header=None makes pandas treat every line as a data row
df = pd.read_csv('mammographic_masses.csv', names=colnames, header=None)

#### Copy Data

In [3]:
# Work on a separate copy so the raw frame `df` is left untouched by the cleaning cells below
df_clean = df.copy()

In [4]:
# Tag each frame with a human-readable label; later cells read it back via `.names` when printing.
# NOTE(review): this is a plain attribute set on the DataFrame object, not a column.
# Presumably it is not carried over to frames derived from these — confirm before relying on it.
df.names       = 'Mammographic Full Dataset'
df_clean.names = 'Mammographic Cleaned Dataset'

#### Display Top 5 Rows

In [5]:
# Preview the first five rows of the raw data
df.head(5)

Unnamed: 0,BI-RADS_ASSESSMENT,AGE,SHAPE,MARGIN,DENSITY,SEVERITY
0,5,67,3,5,3,1
1,4,43,1,1,?,1
2,5,58,4,5,3,1
3,4,28,1,1,3,0
4,5,74,1,5,?,1


#### Check Info

In [6]:
# Summarise the column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961 entries, 0 to 960
Data columns (total 6 columns):
BI-RADS_ASSESSMENT    961 non-null object
AGE                   961 non-null object
SHAPE                 961 non-null object
MARGIN                961 non-null object
DENSITY               961 non-null object
SEVERITY              961 non-null int64
dtypes: int64(1), object(5)
memory usage: 45.2+ KB


The dataset used is “Mammographic Masses”, a public dataset from the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass). <br>

Purpose: To predict the severity (benign or malignant) of a mammographic mass from BI-RADS attributes and the patient’s age. Number of Attributes: 6 (1 goal field: severity, 1 non-predictive: BI-RADS, 4 predictive attributes) <br>

Attribute Information:
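* BI-RADS assessment: 1 to 5 (ordinal); note that the raw file also contains the values 0, 6 and 55 as well as the missing-value marker `?` (see the unique-value check below)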
* Age: patient’s age in years (integer)
* Shape (mass shape): round=1, oval=2, lobular=3, irregular=4 (nominal)
* Margin (mass margin): circumscribed=1, microlobulated=2, obscured=3, ill-defined=4, spiculated=5 (nominal)
* Density (mass density): high=1, iso=2, low=3, fat-containing=4 (ordinal)
* Severity: benign=0 or malignant=1 (binomial)

#### Check Shape

In [7]:
# (rows, columns) of the working copy before any cleaning
df_clean.shape

(961, 6)

#### Check Missing Values per Column

In [8]:
# Count null entries per column of the working copy and report them under the frame's label
null_counts = df_clean.isnull().sum()
print("{} Features with Null Values:\n\n {}\n".format(df_clean.names, null_counts))

Mammographic Cleaned Dataset Features with Null Values:

 BI-RADS_ASSESSMENT    0
AGE                   0
SHAPE                 0
MARGIN                0
DENSITY               0
SEVERITY              0
dtype: int64



#### Check Unique Values per Column

In [9]:
# Profile every column: how many distinct values it holds, what they are,
# and how often each one occurs (values seen only once are left out of the frequency list).
for column in df_clean.columns.tolist():
    distinct = df_clean[column].unique()

    print("Analysis for column: ", column.upper(), "\n")
    print("Number of Unique values for the given column: ", len(distinct), "\n")
    print("Unique Values: \n", distinct, "\n")

    # Frequencies keyed by value, listed in ascending order of the value itself
    frequencies = dict(df_clean[column].value_counts())
    for value in sorted(frequencies):
        count = frequencies[value]
        if count > 1:
            print('Frequency of value:\t {} \t=> \t {}'.format(value, count))

    print("\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n")

Analysis for column:  BI-RADS_ASSESSMENT 

Number of Unique values for the given column:  8 

Unique Values: 
 ['5' '4' '3' '?' '2' '55' '0' '6'] 

Frequency of value:	 0 	=> 	 5
Frequency of value:	 2 	=> 	 14
Frequency of value:	 3 	=> 	 36
Frequency of value:	 4 	=> 	 547
Frequency of value:	 5 	=> 	 345
Frequency of value:	 6 	=> 	 11
Frequency of value:	 ? 	=> 	 2

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

Analysis for column:  AGE 

Number of Unique values for the given column:  74 

Unique Values: 
 ['67' '43' '58' '28' '74' '65' '70' '42' '57' '60' '76' '64' '36' '54'
 '52' '59' '40' '66' '56' '75' '63' '45' '55' '46' '39' '81' '77' '48'
 '78' '50' '61' '62' '44' '23' '80' '53' '49' '51' '25' '72' '73' '68'
 '33' '47' '29' '34' '71' '84' '24' '86' '41' '87' '21' '19' '35' '37'
 '79' '85' '69' '38' '32' '27' '83' '88' '26' '31' '?' '18' '82' '93' '30'
 '22' '96' '20'] 

Frequency of value:	 19 	=> 	 4
F

#### Define Functions for Data Cleaning

In [10]:
# Helper that turns the '?' placeholder into a proper missing value
def cleanFeature(num):
    """Return NaN when `num` is exactly the string '?'; otherwise return `num` unchanged."""
    return np.nan if num == '?' else num

#### Define Functions for Data Engineering

In [11]:
# Create Function for sorting out AGE into AGE_GROUP
# NOTE(review): this function is disabled (fully commented out).
# If it is re-enabled, the bounds below leave gaps: an age in [34, 35), [49, 50) or [64, 65)
# matches no branch (e.g. `age < 34` is followed by `age >= 35`) and falls through to np.nan.
# def getAgeGroup(age):
    
#     if age < 21:                 
#         return 'Teenager'   
#     elif age >= 21 and age < 34:   
#         return 'YoungAdult'  
#     elif age >= 35 and age < 49:   
#         return 'Adult'              
#     elif age >= 50 and age < 64: 
#         return 'OldAdult'  
#     elif age >= 65:            
#         return 'Elderly'   
#     else:
#         return np.nan

#### Clean Existing Columns

In [12]:
# Replace the '?' placeholder with NaN in every feature column that may contain it
for frame in [df_clean]:

    for column in ['BI-RADS_ASSESSMENT', 'AGE', 'SHAPE', 'MARGIN', 'DENSITY']:
        frame[column] = frame[column].apply(cleanFeature)

    # AGE is the only feature cast to float here; the other columns keep their original values
    frame['AGE'] = frame['AGE'].astype(float)

#### Engineer New Columns

In [13]:
# Disabled cell: AGE_GROUP engineering is switched off (getAgeGroup is itself commented out).
# # Create new features
# for data in [df_clean]:
    
#     # Create AGE_GROUP Feature from AGE    
#     data['AGE_GROUP'] = data['AGE'].apply(lambda x: getAgeGroup(x))

#### Check Missing Values per Column after Data Cleaning & Engineering

In [14]:
# After cleaning, '?' placeholders have become NaN, so this count shows the real gaps per column
missing_per_column = df_clean.isna().sum()
print("{} Features with Null Values:\n\n {}\n".format(df_clean.names, missing_per_column))

Mammographic Cleaned Dataset Features with Null Values:

 BI-RADS_ASSESSMENT     2
AGE                    5
SHAPE                 31
MARGIN                48
DENSITY               76
SEVERITY               0
dtype: int64



#### Drop Columns

In [15]:
# Columns to remove because they are non-predictive (see the attribute description above)
nonPredictiveVals = ['BI-RADS_ASSESSMENT']

# Currently disabled — AGE is kept in the cleaned data
# redundantInfo = ['AGE']

In [16]:
# Remove the non-predictive column(s) listed in nonPredictiveVals from the working frame
df_clean.drop(columns=nonPredictiveVals, inplace=True)

# Dropping redundant columns is currently disabled
# df_clean.drop(columns=redundantInfo, inplace=True)

#### Drop Rows

In [17]:
# Remove incomplete records from the working frame in two passes
for frame in (df_clean,):
    # Pass 1: rows that have no SEVERITY value
    frame.dropna(subset=['SEVERITY'], inplace=True)
    # Pass 2: rows with a NaN in any remaining column
    frame.dropna(axis=0, how='any', inplace=True)

#### Testing

In [18]:
# Test 1: Check Missing Values per Column after Data Cleaning & Engineering
remaining_nulls = df_clean.isnull().sum()
print("{} Features with Null Values:\n\n {}\n".format(df_clean.names, remaining_nulls))

# Fail loudly instead of relying on a visual check of the printed table
assert remaining_nulls.sum() == 0, "df_clean still contains missing values"

Mammographic Cleaned Dataset Features with Null Values:

 AGE         0
SHAPE       0
MARGIN      0
DENSITY     0
SEVERITY    0
dtype: int64



In [19]:
# Test 2: Check the dimensionality reduction
# (compare with the pre-cleaning shape shown earlier: one column and all incomplete rows were dropped)
df_clean.shape

(831, 5)

In [20]:
# Test 3: Visualize Cleaned Dataframe
# (first 10 rows; the index keeps its original labels, so gaps mark rows that were dropped)
df_clean.head(10)

Unnamed: 0,AGE,SHAPE,MARGIN,DENSITY,SEVERITY
0,67.0,3,5,3,1
2,58.0,4,5,3,1
3,28.0,1,1,3,0
8,57.0,1,5,3,1
10,76.0,1,4,3,1
11,42.0,2,1,3,1
13,36.0,3,1,2,0
14,60.0,2,1,2,0
15,54.0,1,1,3,0
16,52.0,3,4,3,0


#### Export Data

In [21]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file
df_clean.to_csv('Mammographic_Data_Cleaned.csv', index=False)

##### End of document.