# Ingest Data

In [1]:
import matplotlib.pyplot as plt
from patsy import dmatrices
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
from sklearn.linear_model import LinearRegression

![](https://www.evernote.com/l/AAHnO3y39xlDerYJwOvrQdUoTxXpGB2dlAcB/image.png)

       #  Attribute                     Domain
       -- -----------------------------------------
       1. Sample code number            id number
       2. Clump Thickness               1 - 10
       3. Uniformity of Cell Size       1 - 10
       4. Uniformity of Cell Shape      1 - 10
       5. Marginal Adhesion             1 - 10
       6. Single Epithelial Cell Size   1 - 10
       7. Bare Nuclei                   1 - 10
       8. Bland Chromatin               1 - 10
       9. Normal Nucleoli               1 - 10
      10. Mitoses                       1 - 10
      11. Class:                        (2 for benign, 4 for malignant)

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
BC = load_breast_cancer()

In [4]:
BC.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [5]:
print(BC.DESCR)

Breast Cancer Wisconsin (Diagnostic) Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        

## Unformatted Data

```
Olvi Mangasarian provided the original database in a different format
than the one under breast-cancer-wisconsin.data, which contains only
the complete set of data that was available on 15 July 1992.  In
particular, data was separated into groups, each preceeded by some
documentation.  I've kept this information here in case you needed
more details.

--------------------------------CUT--------------------------------------------
#####  Group 1 : 367 points: 200B 167M (January 1989)
#####  Revised Jan 10, 1991: Replaced zero bare nuclei in 1080185 & 1187805 
#####  Revised Nov 22,1991: Removed 765878,4,5,9,7,10,10,10,3,8,1 no record
#####                     : Removed 484201,2,7,8,8,4,3,10,3,4,1 zero epithelial 
#####			  : Changed 0 to 1 in field 6 of sample 1219406
#####			  : Changed 0 to 1 in field 8 of following sample:
#####			  : 1182404,2,3,1,1,1,2,0,1,1,1

1000025,2,5,1,1,1,2,1,3,1,1
1002945,2,5,4,4,5,7,10,3,2,1
1015425,2,3,1,1,1,2,2,3,1,1
1016277,2,6,8,8,1,3,4,3,7,1
1017023,2,4,1,1,3,2,1,3,1,1
1017122,4,8,10,10,8,7,10,9,7,1
...

##### Group 2 : 70 points: 57B 13M (October 1989)

160296,4,5,8,8,10,5,10,8,10,3
342245,2,1,1,3,1,2,1,1,1,1
428598,2,1,1,3,1,1,1,2,1,1
492561,2,4,3,2,1,3,1,2,1,1
493452,2,1,1,3,1,2,1,1,1,1
493452,2,4,1,2,1,2,1,2,1,1
521441,2,5,1,1,2,2,1,2,1,1
...

##### Current Total 698 points
##### End
```

In [6]:
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/unformatted-data'

In [7]:
URL

'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/unformatted-data'

In [8]:
pd.read_csv(URL)

ParserError: Error tokenizing data. C error: Expected 2 fields in line 4, saw 3


In [9]:
from urllib.request import urlopen

In [10]:
data = urlopen(URL)

In [11]:
type(data)

http.client.HTTPResponse

In [12]:
data.status

200

In [13]:
line = data.readline()
line

b'Olvi Mangasarian provided the original database in a different format\n'

In [14]:
type(line)

bytes

In [15]:
line = data.readline().decode()
line

'than the one under breast-cancer-wisconsin.data, which contains only\n'

In [16]:
type(line)

str

In [17]:
def read_string(data):
    return data.readline().decode()

In [18]:
data = urlopen(URL)
line = read_string(data)
count = 0
while line:
    count += 1
    line = read_string(data)

In [19]:
count

744

In [20]:
my_string = 'Hello my name is'

In [21]:
"my" in my_string

True

In [22]:
header = True
data = urlopen(URL)
line = read_string(data)
count = 0
while line:
    line = read_string(data)
    if header == True:
        if '#####' in line:
            header = False
    else:
        count += 1
count 

736

In [23]:
data = urlopen(URL)
groups = []
tmp = []
line = read_string(data)
while line:

    if 'Group' in line:
        groups.append(tmp)
        tmp = []

    tmp.append(line)
    line = read_string(data)
groups.append(tmp)    

In [24]:
groups[0]

['Olvi Mangasarian provided the original database in a different format\n',
 'than the one under breast-cancer-wisconsin.data, which contains only\n',
 'the complete set of data that was available on 15 July 1992.  In\n',
 'particular, data was separated into groups, each preceeded by some\n',
 "documentation.  I've kept this information here in case you needed\n",
 'more details.\n',
 '\n',
 '--------------------------------CUT--------------------------------------------\n']

In [25]:
groups[-1][:10]

['##### Group 8: 86 Points: 72B  14M  (November 1991)\n',
 '\n',
 '1016634,2,2,3,1,1,2,1,2,1,1\n',
 '1031608,2,2,1,1,1,1,1,2,1,1\n',
 '1041043,2,4,1,3,1,2,1,2,1,1\n',
 '1042252,2,3,1,1,1,2,1,2,1,1\n',
 '1057067,2,1,1,1,1,1,0,1,1,1\n',
 '1061990,2,4,1,1,1,2,1,2,1,1\n',
 '1073836,2,5,1,1,1,2,1,2,1,1\n',
 '1083817,2,3,1,1,1,2,1,2,1,1\n']

In [26]:
groups[0].remove('\n')

In [27]:
groups[0]

['Olvi Mangasarian provided the original database in a different format\n',
 'than the one under breast-cancer-wisconsin.data, which contains only\n',
 'the complete set of data that was available on 15 July 1992.  In\n',
 'particular, data was separated into groups, each preceeded by some\n',
 "documentation.  I've kept this information here in case you needed\n",
 'more details.\n',
 '--------------------------------CUT--------------------------------------------\n']

In [28]:
header = True
count = 0
data = urlopen(URL)
groups = []
tmp = []
line = read_string(data)
while line:
    if 'Group' in line:
        groups.append(tmp)
        tmp = []
    if line != '\n':
        tmp.append(line)
    line = read_string(data)
groups.append(tmp)     

### Local Processing

In [29]:
for group in groups:
    for line in group:
        if '#####' in line: 
            print(line)

#####  Group 1 : 367 points: 200B 167M (January 1989)

#####  Revised Jan 10, 1991: Replaced zero bare nuclei in 1080185 & 1187805 

#####  Revised Nov 22,1991: Removed 765878,4,5,9,7,10,10,10,3,8,1 no record

#####                     : Removed 484201,2,7,8,8,4,3,10,3,4,1 zero epithelial 

#####			  : Changed 0 to 1 in field 6 of sample 1219406

#####			  : Changed 0 to 1 in field 8 of following sample:

#####			  : 1182404,2,3,1,1,1,2,0,1,1,1

##### Group 2 : 70 points: 57B 13M (October 1989)

##### Group 3 : 31 points: 22B 9M (February 1990)

##### Group 4 : 17 points: 14B 3M (April 1990)

##### Group 5 : 48 points: 36B 12M (August 1990)

##### Group 6: 49 Points: 40B  9M (December 7, 1990; Updated Jan 8, 1991

##### by dropping 2 unsatisfactorily   measured points 803531 &

##### 1268952,4,10,10,7,2,7,1,4,3,3) <-----Reappears in Group 8 as:

##### 1268952,4,10,10,7,8,7,1,10,10,3 

##### Group 7: 31 Points: 16B  15M (June 1991)

##### Group 8: 86 Points: 72B  14M  (November 1991)

#

In [30]:
groups = [
    [line for line in group if '#####' not in line]
    for group in groups
]

In [31]:
groups[1][:10]

['1000025,2,5,1,1,1,2,1,3,1,1\n',
 '1002945,2,5,4,4,5,7,10,3,2,1\n',
 '1015425,2,3,1,1,1,2,2,3,1,1\n',
 '1016277,2,6,8,8,1,3,4,3,7,1\n',
 '1017023,2,4,1,1,3,2,1,3,1,1\n',
 '1017122,4,8,10,10,8,7,10,9,7,1\n',
 '1018099,2,1,1,1,1,2,10,3,1,1\n',
 '1018561,2,2,1,2,1,2,1,3,1,1\n',
 '1033078,2,2,1,1,1,2,1,1,1,5\n',
 '1033078,2,4,2,1,1,2,1,2,1,1\n']

In [32]:
'1033078,2,4,2,1,1,2,1,2,1,1\n'.split(',')

['1033078', '2', '4', '2', '1', '1', '2', '1', '2', '1', '1\n']

In [33]:
group_1 = groups[1]

In [34]:
group_1[:5]

['1000025,2,5,1,1,1,2,1,3,1,1\n',
 '1002945,2,5,4,4,5,7,10,3,2,1\n',
 '1015425,2,3,1,1,1,2,2,3,1,1\n',
 '1016277,2,6,8,8,1,3,4,3,7,1\n',
 '1017023,2,4,1,1,3,2,1,3,1,1\n']

In [35]:
group_1 = [line.split(',') for line in group_1]

In [36]:
group_1[:5]

[['1000025', '2', '5', '1', '1', '1', '2', '1', '3', '1', '1\n'],
 ['1002945', '2', '5', '4', '4', '5', '7', '10', '3', '2', '1\n'],
 ['1015425', '2', '3', '1', '1', '1', '2', '2', '3', '1', '1\n'],
 ['1016277', '2', '6', '8', '8', '1', '3', '4', '3', '7', '1\n'],
 ['1017023', '2', '4', '1', '1', '3', '2', '1', '3', '1', '1\n']]

In [37]:
group_1 = groups[1]

In [38]:
group_1 = [(line.replace('\n','').split(',')) for line in group_1]

In [39]:
group_1 = [line.remove('\n','') for line in group_1]
group_1 = [line.split(',') for line in group_1]

AttributeError: 'list' object has no attribute 'replace'

In [41]:
group_1 = groups[1]
temp = []
for line in group_1:
    temp.append(line.replace('\n','').split(','))
group_1 = temp

In [42]:
group_1 = groups[1]
temp = []
for line in group_1:
    line = line.replace('\n','')
    temp.append(line.split(','))
group_1 = temp

In [43]:
group_1[:5]

[['1000025', '2', '5', '1', '1', '1', '2', '1', '3', '1', '1'],
 ['1002945', '2', '5', '4', '4', '5', '7', '10', '3', '2', '1'],
 ['1015425', '2', '3', '1', '1', '1', '2', '2', '3', '1', '1'],
 ['1016277', '2', '6', '8', '8', '1', '3', '4', '3', '7', '1'],
 ['1017023', '2', '4', '1', '1', '3', '2', '1', '3', '1', '1']]

In [44]:
group_1 = groups[1]
group_1 = [(line.replace('\n','')
                .split(',')) for line in group_1]
group_1 = [[int(value) for value in line] for line in group_1]

In [45]:
group_1[:5]

[[1000025, 2, 5, 1, 1, 1, 2, 1, 3, 1, 1],
 [1002945, 2, 5, 4, 4, 5, 7, 10, 3, 2, 1],
 [1015425, 2, 3, 1, 1, 1, 2, 2, 3, 1, 1],
 [1016277, 2, 6, 8, 8, 1, 3, 4, 3, 7, 1],
 [1017023, 2, 4, 1, 1, 3, 2, 1, 3, 1, 1]]

In [46]:
group_1 = groups[1]
group_1 = [(line.replace('\n','')
                .split(',')) for line in group_1]
group_1 = np.array(group_1, dtype=int)

In [47]:
group_1[:5]

array([[1000025,       2,       5,       1,       1,       1,       2,
              1,       3,       1,       1],
       [1002945,       2,       5,       4,       4,       5,       7,
             10,       3,       2,       1],
       [1015425,       2,       3,       1,       1,       1,       2,
              2,       3,       1,       1],
       [1016277,       2,       6,       8,       8,       1,       3,
              4,       3,       7,       1],
       [1017023,       2,       4,       1,       1,       3,       2,
              1,       3,       1,       1]])

In [48]:
group_1 = groups[1]
group_1 = [(line.replace('\n','')
                .split(',')) for line in group_1]

group_1 = np.array(group_1, dtype=int)
group_1_df = pd.DataFrame(group_1)

In [49]:
group_1_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,2,5,1,1,1,2,1,3,1,1
1,1002945,2,5,4,4,5,7,10,3,2,1
2,1015425,2,3,1,1,1,2,2,3,1,1
3,1016277,2,6,8,8,1,3,4,3,7,1
4,1017023,2,4,1,1,3,2,1,3,1,1


```
#####  Group 1 : 367 points: 200B 167M (January 1989)
```

In [50]:
group_1_df.shape

(367, 11)

In [51]:
def group_to_df(group):
    group = [(line.replace('\n','')
                    .split(',')) for line in group]
    group = np.array(group, dtype=int)
    group_df = pd.DataFrame(group)
    return group_df

```
#####  Group 1 : 367 points: 200B 167M (January 1989)

#####  Revised Jan 10, 1991: Replaced zero bare nuclei in 1080185 & 1187805 

#####  Revised Nov 22,1991: Removed 765878,4,5,9,7,10,10,10,3,8,1 no record

#####                     : Removed 484201,2,7,8,8,4,3,10,3,4,1 zero epithelial 

#####			  : Changed 0 to 1 in field 6 of sample 1219406

#####			  : Changed 0 to 1 in field 8 of following sample:

#####			  : 1182404,2,3,1,1,1,2,0,1,1,1

##### Group 2 : 70 points: 57B 13M (October 1989)

##### Group 3 : 31 points: 22B 9M (February 1990)

##### Group 4 : 17 points: 14B 3M (April 1990)

##### Group 5 : 48 points: 36B 12M (August 1990)

##### Group 6: 49 Points: 40B  9M (December 7, 1990; Updated Jan 8, 1991

##### by dropping 2 unsatisfactorily   measured points 803531 &

##### 1268952,4,10,10,7,2,7,1,4,3,3) <-----Reappears in Group 8 as:

##### 1268952,4,10,10,7,8,7,1,10,10,3 

##### Group 7: 31 Points: 16B  15M (June 1991)

##### Group 8: 86 Points: 72B  14M  (November 1991)

##### Current Total 698 points

##### End
```

In [52]:
[print(len(group)) for group in groups]

7
367
70
31
17
48
0
49
31
86


[None, None, None, None, None, None, None, None, None, None]

In [53]:
group_1_df = group_to_df(groups[1])
group_2_df = group_to_df(groups[2])
group_3_df = group_to_df(groups[3])
group_4_df = group_to_df(groups[4])
group_5_df = group_to_df(groups[5])
group_6_df = group_to_df(groups[7])
group_7_df = group_to_df(groups[8])
group_8_df = group_to_df(groups[9])

In [None]:
from IPython.display import display

In [57]:
print(group_1_df.shape)
print(group_2_df.shape)
print(group_3_df.shape)
print(group_4_df.shape)
print(group_5_df.shape)
print(group_6_df.shape)
print(group_7_df.shape)
print(group_8_df.shape)

(367, 11)
(70, 11)
(31, 11)
(17, 11)
(48, 11)
(49, 11)
(31, 11)
(86, 11)


In [58]:
group_1_df['group'] = 'group_1'
group_2_df['group'] = 'group_2'
group_3_df['group'] = 'group_3'
group_4_df['group'] = 'group_4'
group_5_df['group'] = 'group_5'
group_6_df['group'] = 'group_6'
group_7_df['group'] = 'group_7'
group_8_df['group'] = 'group_8'

In [59]:
breast_cancer_df = pd.concat([group_1_df, group_2_df, group_3_df, group_4_df, 
                              group_5_df, group_6_df, group_7_df, group_8_df])

In [60]:
breast_cancer_df.shape

(699, 12)

```
##### Current Total 698 points
```

![](https://www.evernote.com/l/AAHnO3y39xlDerYJwOvrQdUoTxXpGB2dlAcB/image.png)

### Dataframes


In [61]:
breast_cancer_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,group
0,1000025,2,5,1,1,1,2,1,3,1,1,group_1
1,1002945,2,5,4,4,5,7,10,3,2,1,group_1
2,1015425,2,3,1,1,1,2,2,3,1,1,group_1
3,1016277,2,6,8,8,1,3,4,3,7,1,group_1
4,1017023,2,4,1,1,3,2,1,3,1,1,group_1


In [62]:
type(breast_cancer_df)

pandas.core.frame.DataFrame

In [63]:
bc_columns = [
    'Sample_code_number',
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
    'Diagnosis',
    'group'
]

In [64]:
breast_cancer_df.columns = bc_columns

In [65]:
breast_cancer_df.head()

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Diagnosis,group
0,1000025,2,5,1,1,1,2,1,3,1,1,group_1
1,1002945,2,5,4,4,5,7,10,3,2,1,group_1
2,1015425,2,3,1,1,1,2,2,3,1,1,group_1
3,1016277,2,6,8,8,1,3,4,3,7,1,group_1
4,1017023,2,4,1,1,3,2,1,3,1,1,group_1


### Set Patient ID as Index

In [66]:
breast_cancer_df.set_index('Sample_code_number', verify_integrity=True)

ValueError: Index has duplicate keys: [320675, 385103, 411453, 466906, 493452, 560680, 654546, 695091, 704097, 733639, 734111, 769612, 798429, 822829, 897471, 1017023, 1033078, 1061990, 1070935, 1100524, 1105524, 1114570, 1115293, 1116116, 1116192, 1143978, 1158247, 1168736, 1171710, 1173347, 1174057, 1182404, 1198641, 1212422, 1218860, 1238777, 1240603, 1276091, 1277792, 1293439, 1299596, 1299924, 1320077, 1321942, 1339781, 1354840]

In [None]:
bc_sample_code_and_diag = breast_cancer_df[['Sample_code_number', 'Diagnosis']]
bc_sample_code_and_diag.head()

In [None]:
group_by_sample_code = bc_sample_code_and_diag.groupby('Sample_code_number')
group_by_sample_code.aggregate('count').Diagnosis.unique()

In [None]:
duplicate_mask = breast_cancer_df.duplicated('Sample_code_number')
duplicate_mask.head(10)

In [None]:
breast_cancer_df[duplicate_mask].shape

In [None]:
duplicate_sample_ids = breast_cancer_df[duplicate_mask].Sample_code_number.unique()

In [None]:
duplicate_sample_ids

In [None]:
duplicate_mask_2 = breast_cancer_df.Sample_code_number.isin(duplicate_sample_ids)

In [None]:
breast_cancer_df[duplicate_mask_2].shape

In [None]:
breast_cancer_df.drop_duplicates().shape

In [None]:
breast_cancer_df.drop_duplicates(inplace=True)

In [None]:
bc_sample_code_and_diag = breast_cancer_df[['Sample_code_number', 'Diagnosis']]
group_by_sample_code = bc_sample_code_and_diag.groupby('Sample_code_number')

In [None]:
sample_count = group_by_sample_code.aggregate('count').Diagnosis

In [None]:
sample_count.unique()

In [None]:
(sample_count > 1).head()

In [None]:
sample_count[sample_count > 1]

In [None]:
repeated_sample_ids = sample_count[sample_count > 1].index

In [None]:
repeated_sample_ids

In [None]:
repeated_mask = breast_cancer_df.Sample_code_number.isin(repeated_sample_ids)

In [None]:
repeated_samples = breast_cancer_df[repeated_mask].sort_values('Sample_code_number')

In [None]:
repeated_samples.head(10)

In [None]:
repeated_samples.groupby(['Sample_code_number', 'group'])[['Diagnosis']].count().head(10)

In [None]:
group_sample_count = repeated_samples.groupby(['Sample_code_number', 'group']).count()

In [None]:
group_sample_count[group_sample_count['Diagnosis'] > 1]

In [None]:
breast_cancer_df[breast_cancer_df.Sample_code_number == 493452]

In [None]:
breast_cancer_df['id'] = breast_cancer_df.Sample_code_number.astype(str) + '_' + breast_cancer_df.group

In [None]:
breast_cancer_df.head()

In [None]:
breast_cancer_df.groupby('id').agg('count')['Diagnosis'].unique()

In [None]:
id_count = breast_cancer_df.groupby('id').agg('count')['Diagnosis']

In [None]:
id_count[id_count > 1]

In [None]:
repeated_ids = id_count[id_count > 1].index

In [None]:
breast_cancer_df[breast_cancer_df.id.isin(repeated_ids)].sort_values('id').head(10)

In [None]:
breast_cancer_df[breast_cancer_df.id.isin(repeated_ids)].sort_values('id').id

In [None]:
test_ids = ['1','2','2','3','3','3','4','5']

In [None]:
counts = {}
new_test_ids = []
for ID in test_ids:
    if ID not in counts.keys():
        counts[ID] = 1
        new_test_ids.append(ID)
    else:
        counts[ID] += 1
        new_test_ids.append(ID+'_'+str(counts[ID]))
new_test_ids

In [None]:
counts = {}
new_ids = []
for ID in breast_cancer_df.id:
    if ID not in counts.keys():
        counts[ID] = 1
        new_ids.append(ID)
    else:
        counts[ID] += 1
        new_ids.append(ID+'_'+str(counts[ID]))

In [None]:
breast_cancer_df.id = new_ids

In [None]:
breast_cancer_df.head(20)

In [None]:
breast_cancer_df.set_index('id', inplace=True)

In [None]:
breast_cancer_df.head()

In [None]:
breast_cancer_df.drop(['Sample_code_number', 'group'], axis=1, inplace=True)

### Export to CSV

Ultimately, we will export a CSV of the dataframe to disk. This will make it easy to access the same data from both Python and R.


In [None]:
%ls

In [None]:
%mkdir -p data

In [None]:
%ls

In [None]:
breast_cancer_df.to_csv('data/breast_cancer.csv', index=False)