In [1]:
import pandas as pd
import numpy as np
from IPython.display import display_html
%matplotlib inline

In [2]:
def discretization(data):
    print("Discretizing 'Clump Thickness' attribute of the breast cancer dataset\
    Visualizing distribution of attribute value")
    print(data['Clump Thickness'].value_counts(sort=False))
    print("For the equal width method, we can apply the cut() function to discretize the attribute)\
    into 4 bins of similar interval widths.")
    print("The value_counts() function can be used to determine the number of instances in each bin.")
    bins = pd.cut(data['Clump Thickness'], 4)
    print(bins.value_counts(sort=False))
    print ("For the equal frequency method, the qcut() function can be used to partition the\
    values into 4 bins such that each bin has nearly the same number of instances.")
    bins = pd.qcut(data['Clump Thickness'],4)
    print(bins.value_counts (sort=False))

In [3]:
def sampling(data):
    print("Displaying the first five records of the table Without Sampling.")
    display_html(data.head())
    print("A sample of size 3 is randomly selected (without replacement) from the original data.")
    sample = data.sample(n=3)
    display_html(sample)
    print(" Randomly select 1% of the data (without replacement) and display the selected samples.")
    sample = data.sample(frac=0.01, random_state=1)
    display_html(sample)
    print("A sampling with replacement to create a sample whose size is equal to 1% of the entire data. ")
    sample = data.sample(frac=0.01, replace=True, random_state=1)
    display_html(sample)

In [4]:
def remove_duplicate(data):
    dups = data.duplicated()
    print('Number of duplicate rows = %d' % (dups.sum()))
    data.loc[[11,28]]
    print('Number of rows before discarding duplicates = %d' % (data.shape[0]))
    data2 = data.drop_duplicates()
    print('Number of rows after discarding duplicates = %d' % (data2.shape[0]))

In [5]:
def outlier(data):
    data2 = data.drop(['Class'],axis=1)
    data2['Bare Nuclei'] = pd.to_numeric(data2['Bare Nuclei'])
    Z = (data2-data2.mean())/data2.std()
    Z[20:25]
    print('Number of rows before discarding outliers = %d' % (Z.shape[0]))
    Z2 = Z.loc[((Z >-3).sum(axis=1)==9) & ((Z <= 3).sum(axis=1)==9),:]
    print('Number of rows after discarding missing values = %d' % (Z2.shape[0]))

In [6]:
def remove_missing(data):
    print('Number of rows in original data = %d' % (data.shape[0]))
    data= data.dropna()
    print('Number of rows after discarding missing values = %d' % (data.shape[0]))

In [7]:
def replace_missing_value_by_median(data):
    data2 = data['Bare Nuclei']
    print('Before replacing missing values:')
    print(data2[20:25])
    data2 = data2.fillna(data2.median())
    print('\nAfter replacing missing values by median:')
    print(data2[20:25])

In [8]:
def noise_handle(data):
    data = data.drop(['Sample code'],axis=1)
    data = data.replace('?',np.NaN)
    print('Number of instances = %d' % (data.shape[0]))
    print('Number of attributes = %d' % (data.shape[1]))
    print('Number of missing values:')
    for col in data.columns:
        print('\t%s: %d' % (col,data[col].isna().sum()))
    print("To further preprocess select option:\
    0.Exit\
    1. Replace missing value by median\
    2. Remove missing value\
    3. Handle outlier\
    4. Remove duplicate\
    5. Sampling\
    6.Discretization:")
    option = int(input())
    while(option !=0):
        if option == 1:
            replace_missing_value_by_median(data)
        elif option == 2:
            remove_missing(data)
        elif option == 3:
            outlier(data)
        elif option == 4:
            remove_duplicate(data)
        elif option == 5:
            sampling(data)
        elif option == 6:
            discretization(data)
        else:
            print("Enter correct choice")
        print("Select your option again:")
        option = int(input())

In [9]:
def view(data):
    data.head()
    print('Number of instances = %d' % (data.shape[0]))
    print('Number of attributes = %d' % (data.shape[1]))
    display_html(data.head())

In [10]:
def main():
    data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)
    data.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
    print("Do you want to view data?")
    response=input();
    if response=='yes':
        view(data)
    print("Do you want to remove noise and further preprocess data?")
    response=input()
    if response=='yes':
        noise_handle(data)
    else:
        quit()
main()

Do you want to view data?


 yes


Number of instances = 699
Number of attributes = 11


Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


Do you want to remove noise and further preprocess data?


 yes


Number of instances = 699
Number of attributes = 10
Number of missing values:
	Clump Thickness: 0
	Uniformity of Cell Size: 0
	Uniformity of Cell Shape: 0
	Marginal Adhesion: 0
	Single Epithelial Cell Size: 0
	Bare Nuclei: 16
	Bland Chromatin: 0
	Normal Nucleoli: 0
	Mitoses: 0
	Class: 0
To further preprocess select option:    0.Exit    1. Replace missing value by median    2. Remove missing value    3. Handle outlier    4. Remove duplicate    5. Sampling    6.Discretization:


 1


Before replacing missing values:
20     10
21      7
22      1
23    NaN
24      1
Name: Bare Nuclei, dtype: object

After replacing missing values by median:
20     10
21      7
22      1
23    1.0
24      1
Name: Bare Nuclei, dtype: object
Select your option again:


 2


Number of rows in original data = 699
Number of rows after discarding missing values = 683
Select your option again:


 3


Number of rows before discarding outliers = 699
Number of rows after discarding missing values = 632
Select your option again:


 4


Number of duplicate rows = 236
Number of rows before discarding duplicates = 699
Number of rows after discarding duplicates = 463
Select your option again:


 5


Displaying the first five records of the table Without Sampling.


Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


A sample of size 3 is randomly selected (without replacement) from the original data.


Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
632,1,1,1,1,2,1,1,1,1,2
397,4,1,1,1,2,1,1,1,1,2
163,1,1,1,2,1,3,1,1,7,2


 Randomly select 1% of the data (without replacement) and display the selected samples.


Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
584,5,1,1,6,3,1,1,1,1,2
417,1,1,1,1,2,1,2,1,1,2
606,4,1,1,2,2,1,1,1,1,2
349,4,2,3,5,3,8,7,6,1,4
134,3,1,1,1,3,1,2,1,1,2
502,4,1,1,2,2,1,2,1,1,2
117,4,5,5,10,4,10,7,5,8,4


A sampling with replacement to create a sample whose size is equal to 1% of the entire data. 


Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
37,6,2,1,1,1,1.0,7,1,1,2
235,3,1,4,1,2,,3,1,1,2
72,1,3,3,2,2,1.0,7,2,1,2
645,3,1,1,1,2,1.0,2,1,1,2
144,2,1,1,1,2,1.0,2,1,1,2
129,1,1,1,1,10,1.0,1,1,1,2
583,3,1,1,1,2,1.0,1,1,1,2


Select your option again:


 6


Discretizing 'Clump Thickness' attribute of the breast cancer dataset    Visualizing distribution of attribute value
5     130
3     108
6      34
4      80
8      46
1     145
2      50
7      23
10     69
9      14
Name: Clump Thickness, dtype: int64
For the equal width method, we can apply the cut() function to discretize the attribute)    into 4 bins of similar interval widths.
The value_counts() function can be used to determine the number of instances in each bin.
(0.991, 3.25]    303
(3.25, 5.5]      210
(5.5, 7.75]       57
(7.75, 10.0]     129
Name: Clump Thickness, dtype: int64
For the equal frequency method, the qcut() function can be used to partition the    values into 4 bins such that each bin has nearly the same number of instances.
(0.999, 2.0]    195
(2.0, 4.0]      188
(4.0, 6.0]      164
(6.0, 10.0]     152
Name: Clump Thickness, dtype: int64
Select your option again:


 7


Enter correct choice
Select your option again:


 1


Before replacing missing values:
20     10
21      7
22      1
23    NaN
24      1
Name: Bare Nuclei, dtype: object

After replacing missing values by median:
20     10
21      7
22      1
23    1.0
24      1
Name: Bare Nuclei, dtype: object
Select your option again:


 0
