# Ingest Data

In [None]:
import matplotlib.pyplot as plt
from patsy import dmatrices
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
from sklearn.linear_model import LinearRegression

![](https://www.evernote.com/l/AAHnO3y39xlDerYJwOvrQdUoTxXpGB2dlAcB/image.png)

       #  Attribute                     Domain
       -- -----------------------------------------
       1. Sample code number            id number
       2. Clump Thickness               1 - 10
       3. Uniformity of Cell Size       1 - 10
       4. Uniformity of Cell Shape      1 - 10
       5. Marginal Adhesion             1 - 10
       6. Single Epithelial Cell Size   1 - 10
       7. Bare Nuclei                   1 - 10
       8. Bland Chromatin               1 - 10
       9. Normal Nucleoli               1 - 10
      10. Mitoses                       1 - 10
      11. Class:                        (2 for benign, 4 for malignant)

## Unformatted Data

```
Olvi Mangasarian provided the original database in a different format
than the one under breast-cancer-wisconsin.data, which contains only
the complete set of data that was available on 15 July 1992.  In
particular, data was separated into groups, each preceeded by some
documentation.  I've kept this information here in case you needed
more details.

--------------------------------CUT--------------------------------------------
#####  Group 1 : 367 points: 200B 167M (January 1989)
#####  Revised Jan 10, 1991: Replaced zero bare nuclei in 1080185 & 1187805 
#####  Revised Nov 22,1991: Removed 765878,4,5,9,7,10,10,10,3,8,1 no record
#####                     : Removed 484201,2,7,8,8,4,3,10,3,4,1 zero epithelial 
#####			  : Changed 0 to 1 in field 6 of sample 1219406
#####			  : Changed 0 to 1 in field 8 of following sample:
#####			  : 1182404,2,3,1,1,1,2,0,1,1,1

1000025,2,5,1,1,1,2,1,3,1,1
1002945,2,5,4,4,5,7,10,3,2,1
1015425,2,3,1,1,1,2,2,3,1,1
1016277,2,6,8,8,1,3,4,3,7,1
1017023,2,4,1,1,3,2,1,3,1,1
1017122,4,8,10,10,8,7,10,9,7,1
...

##### Group 2 : 70 points: 57B 13M (October 1989)

160296,4,5,8,8,10,5,10,8,10,3
342245,2,1,1,3,1,2,1,1,1,1
428598,2,1,1,3,1,1,1,2,1,1
492561,2,4,3,2,1,3,1,2,1,1
493452,2,1,1,3,1,2,1,1,1,1
493452,2,4,1,2,1,2,1,2,1,1
521441,2,5,1,1,2,2,1,2,1,1
...

##### Current Total 698 points
##### End
```

In [None]:
# Address of the "unformatted-data" file in the UCI archive's
# breast-cancer-wisconsin directory (the grouped dump quoted above).
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/unformatted-data'

In [None]:
# Open the remote file as a stream that is read line by line in the cells
# below.
# NOTE(review): the response is not closed anywhere in the visible cells.
from urllib.request import urlopen
data = urlopen(URL)

In [None]:
line = data.readline()
line

In [None]:
type(line)

In [None]:
line = data.readline().decode()
line

In [None]:
type(line)

In [None]:
def read_string(data):
    """Return the next line of a byte stream as text.

    `data` is any object whose `readline()` returns bytes. The bytes are
    decoded with the default codec. An exhausted stream yields an empty
    string.
    """
    raw_line = data.readline()
    return raw_line.decode()

In [None]:
# Count every line in the remote file.
data = urlopen(URL)
count = 0
while True:
    line = read_string(data)
    if not line:
        # An empty string means the stream is exhausted.
        break
    count += 1

In [None]:
count

In [None]:
# Count the lines that come after the first '#####' marker line.
header = True
count = 0
data = urlopen(URL)
line = read_string(data)
while line:
    if header:
        # Still in the free-text preamble; the first marker line ends it and
        # is not itself counted.
        if '#####' in line:
            header = False
    else:
        count += 1
    # Read at the end of the body so the first line is examined and the empty
    # string returned at end-of-stream is never counted.
    line = read_string(data)
count

In [None]:
# Split the file into buckets: a new bucket starts at every line that
# contains the text 'Group'. Blank lines are kept in this first version.
header = True
count = 0
data = urlopen(URL)
groups = []
tmp = []
while True:
    line = read_string(data)
    if not line:
        break
    if 'Group' in line:
        # Close the bucket collected so far and start a fresh one.
        groups.append(tmp)
        tmp = []
    tmp.append(line)
# The last bucket has no following 'Group' line to close it.
groups.append(tmp)

In [None]:
groups[0]

In [None]:
groups[-1]

In [None]:
# list.remove deletes only the first '\n' entry (and raises ValueError if
# there is none), so any further blank lines in this bucket remain.
groups[0].remove('\n')

In [None]:
groups[0]

In [None]:
# Same bucketing as before, but blank lines are discarded while reading.
# Any line containing 'Group' opens a new bucket, whether or not it is a
# real group header.
header = True
count = 0
data = urlopen(URL)
groups = []
tmp = []
while True:
    line = read_string(data)
    if not line:
        break
    if 'Group' in line:
        groups.append(tmp)
        tmp = []
    if line == '\n':
        continue
    tmp.append(line)
# Flush the bucket that was still open when the stream ended.
groups.append(tmp)

In [None]:
# Print every '#####' annotation line found in any bucket.
for group in groups:
    for line in group:
        if '#####' not in line:
            continue
        print(line)

In [None]:
# Strip the '#####' annotation lines out of every bucket, keeping the rest.
groups = [
    [record for record in bucket if '#####' not in record]
    for bucket in groups
]

In [None]:
# First attempt: split each record of bucket 1 on commas. The line ending is
# still attached to the last field at this stage.
group_1 = [record.split(',') for record in groups[1]]

In [None]:
group_1[:5]

In [None]:
# Second attempt: drop the newline before splitting into fields.
group_1 = [record.replace('\n', '').split(',') for record in groups[1]]

In [None]:
group_1[:5]

In [None]:
# Third attempt: also convert every field to an int.
group_1 = [
    [int(field) for field in record.replace('\n', '').split(',')]
    for record in groups[1]
]

In [None]:
group_1[:5]

In [None]:
# Fourth attempt: let NumPy do the int conversion while building a 2-D array.
group_1 = np.array(
    [record.replace('\n', '').split(',') for record in groups[1]],
    dtype=int,
)

In [None]:
group_1[:5]

In [None]:
# Final form: parse bucket 1 into an int array, then wrap it in a DataFrame.
group_1 = np.array(
    [record.replace('\n', '').split(',') for record in groups[1]],
    dtype=int,
)
group_1_df = pd.DataFrame(group_1)

In [None]:
group_1_df.head()

```
#####  Group 1 : 367 points: 200B 167M (January 1989)
```

In [None]:
group_1_df.shape

In [None]:
def group_to_df(group):
    """Turn a list of comma-separated record strings into an int DataFrame.

    Each string has its newline characters removed and is split on commas.
    The resulting fields are converted to integers by NumPy, and the array
    is returned as a DataFrame with default row and column labels.
    """
    rows = []
    for line in group:
        rows.append(line.replace('\n', '').split(','))
    return pd.DataFrame(np.array(rows, dtype=int))

```
#####  Group 1 : 367 points: 200B 167M (January 1989)

#####  Revised Jan 10, 1991: Replaced zero bare nuclei in 1080185 & 1187805 

#####  Revised Nov 22,1991: Removed 765878,4,5,9,7,10,10,10,3,8,1 no record

#####                     : Removed 484201,2,7,8,8,4,3,10,3,4,1 zero epithelial 

#####			  : Changed 0 to 1 in field 6 of sample 1219406

#####			  : Changed 0 to 1 in field 8 of following sample:

#####			  : 1182404,2,3,1,1,1,2,0,1,1,1

##### Group 2 : 70 points: 57B 13M (October 1989)

##### Group 3 : 31 points: 22B 9M (February 1990)

##### Group 4 : 17 points: 14B 3M (April 1990)

##### Group 5 : 48 points: 36B 12M (August 1990)

##### Group 6: 49 Points: 40B  9M (December 7, 1990; Updated Jan 8, 1991

##### by dropping 2 unsatisfactorily   measured points 803531 &

##### 1268952,4,10,10,7,2,7,1,4,3,3) <-----Reappears in Group 8 as:

##### 1268952,4,10,10,7,8,7,1,10,10,3 

##### Group 7: 31 Points: 16B  15M (June 1991)

##### Group 8: 86 Points: 72B  14M  (November 1991)

##### Current Total 698 points

##### End
```

In [None]:
# Show how many records each bucket holds. A plain loop is used because the
# goal is the printing side effect, not a list of None values.
for group in groups:
    print(len(group))

In [None]:
# Build one DataFrame per real group. Bucket 0 is whatever preceded the first
# 'Group' line, and index 6 is skipped: judging by the header text quoted in
# this notebook, the Group 6 header includes a line mentioning "Group 8",
# which the 'Group' substring test treats as one more boundary. Group 6's
# records therefore land in bucket 7, and groups 7 and 8 in buckets 8 and 9.
(group_1_df, group_2_df, group_3_df, group_4_df,
 group_5_df, group_6_df, group_7_df, group_8_df) = (
    group_to_df(groups[index]) for index in (1, 2, 3, 4, 5, 7, 8, 9)
)


In [None]:
# Print the (rows, columns) shape of each group frame, one per line, for
# comparison with the point counts quoted in the file's group headers.
print(
    *(frame.shape for frame in (
        group_1_df, group_2_df, group_3_df, group_4_df,
        group_5_df, group_6_df, group_7_df, group_8_df,
    )),
    sep='\n'
)

In [None]:
# Tag every row with the name of the group it came from ('group_1' through
# 'group_8') so the origin survives the concatenation.
for number, frame in enumerate(
    (group_1_df, group_2_df, group_3_df, group_4_df,
     group_5_df, group_6_df, group_7_df, group_8_df),
    start=1,
):
    frame['group'] = 'group_{}'.format(number)

In [None]:
# Stack the eight group frames into one. ignore_index is left at its default,
# so each frame keeps its own row labels and labels repeat across groups.
breast_cancer_df = pd.concat([
    group_1_df,
    group_2_df,
    group_3_df,
    group_4_df,
    group_5_df,
    group_6_df,
    group_7_df,
    group_8_df,
])

In [None]:
breast_cancer_df.shape

```
##### Current Total 698 points
```

![](https://www.evernote.com/l/AAHnO3y39xlDerYJwOvrQdUoTxXpGB2dlAcB/image.png)

### Dataframes


In [None]:
breast_cancer_df.head()

In [None]:
type(breast_cancer_df)

In [None]:
# Column names in positional order. The first eleven follow the attribute
# table at the top of the notebook, with "Class" named 'Diagnosis'; the
# twelfth is the 'group' label column added before concatenation.
bc_columns = [
    'Sample_code_number',
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
    'Diagnosis',
    'group'
]

In [None]:
breast_cancer_df.columns = bc_columns

In [None]:
breast_cancer_df.head()

### Set Patient ID as Index

In [None]:
# The result is not assigned and inplace is not set, so breast_cancer_df is
# left unchanged. With verify_integrity=True pandas raises if the new index
# would contain duplicates.
# NOTE(review): presumably run only to test whether Sample_code_number is
# unique — confirm.
breast_cancer_df.set_index('Sample_code_number', verify_integrity=True)

In [None]:
# Keep only the sample code and the diagnosis for the duplicate analysis.
bc_sample_code_and_diag = breast_cancer_df[['Sample_code_number', 'Diagnosis']]
bc_sample_code_and_diag.head()

In [None]:
# Count the rows per sample code and list the distinct counts; any value
# above 1 means some sample code occurs more than once.
group_by_sample_code = bc_sample_code_and_diag.groupby('Sample_code_number')
group_by_sample_code.aggregate('count').Diagnosis.unique()

In [None]:
# Boolean mask over rows whose Sample_code_number already appeared earlier.
# With pandas' default keep='first', the first occurrence is not flagged.
duplicate_mask = breast_cancer_df.duplicated('Sample_code_number')
duplicate_mask.head(10)

In [None]:
breast_cancer_df[duplicate_mask].shape

In [None]:
duplicate_sample_ids = breast_cancer_df[duplicate_mask].Sample_code_number.unique()

In [None]:
duplicate_sample_ids

In [None]:
duplicate_mask_2 = breast_cancer_df.Sample_code_number.isin(duplicate_sample_ids)

In [None]:
breast_cancer_df[duplicate_mask_2].shape

In [None]:
breast_cancer_df.drop_duplicates().shape

In [None]:
# Drop, in place, rows that repeat an earlier row in every column (no subset
# is given, so 'group' is compared too). Rows that share a sample code but
# differ in any column are kept.
breast_cancer_df.drop_duplicates(inplace=True)

In [None]:
# Rebuild the two-column view and its grouping now that exact duplicate rows
# have been removed.
bc_sample_code_and_diag = breast_cancer_df[['Sample_code_number', 'Diagnosis']]
group_by_sample_code = bc_sample_code_and_diag.groupby('Sample_code_number')

In [None]:
sample_count = group_by_sample_code.aggregate('count').Diagnosis

In [None]:
sample_count.unique()

In [None]:
(sample_count > 1).head()

In [None]:
sample_count[sample_count > 1]

In [None]:
repeated_sample_ids = sample_count[sample_count > 1].index

In [None]:
repeated_sample_ids

In [None]:
repeated_mask = breast_cancer_df.Sample_code_number.isin(repeated_sample_ids)

In [None]:
# All rows selected by repeated_mask, sorted by sample code so rows sharing a
# code sit next to each other.
repeated_samples = breast_cancer_df[repeated_mask].sort_values('Sample_code_number')

In [None]:
repeated_samples.head(10)

In [None]:
repeated_samples.groupby(['Sample_code_number', 'group'])[['Diagnosis']].count().head(10)

In [None]:
# Row counts per (sample code, group) pair; a count above 1 means the same
# code is repeated inside a single group.
group_sample_count = repeated_samples.groupby(['Sample_code_number', 'group']).count()

In [None]:
group_sample_count[group_sample_count['Diagnosis'] > 1]

In [None]:
breast_cancer_df[breast_cancer_df.Sample_code_number == 493452]

In [None]:
# Candidate identifier: "<sample code>_<group label>", e.g. a code seen in
# group_2 becomes "<code>_group_2".
breast_cancer_df['id'] = breast_cancer_df.Sample_code_number.astype(str) + '_' + breast_cancer_df.group

In [None]:
breast_cancer_df.head()

In [None]:
breast_cancer_df.groupby('id').agg('count')['Diagnosis'].unique()

In [None]:
id_count = breast_cancer_df.groupby('id').agg('count')['Diagnosis']

In [None]:
id_count[id_count > 1]

In [None]:
repeated_ids = id_count[id_count > 1].index

In [None]:
breast_cancer_df[breast_cancer_df.id.isin(repeated_ids)].sort_values('id').head(10)

In [None]:
breast_cancer_df[breast_cancer_df.id.isin(repeated_ids)].sort_values('id').id

In [None]:
test_ids = ['1','2','2','3','3','3','4','5']

In [None]:
# Dry run of the renaming scheme on the toy list: the first occurrence of an
# id is kept as is, later ones get "_<occurrence number>" appended.
counts = {}
new_test_ids = []
for ID in test_ids:
    counts[ID] = counts.get(ID, 0) + 1
    if counts[ID] == 1:
        new_test_ids.append(ID)
    else:
        new_test_ids.append(ID+'_'+str(counts[ID]))
new_test_ids

In [None]:
# Apply the same scheme to the real ids, in row order: a repeated id gets
# "_2", "_3", ... appended from its second occurrence onwards.
counts = {}
new_ids = []
for ID in breast_cancer_df.id:
    counts[ID] = counts.get(ID, 0) + 1
    if counts[ID] == 1:
        new_ids.append(ID)
    else:
        new_ids.append(ID+'_'+str(counts[ID]))

In [None]:
breast_cancer_df.id = new_ids

In [None]:
breast_cancer_df.head(20)

In [None]:
# Make 'id' the row index, modifying the frame in place.
breast_cancer_df.set_index('id', inplace=True)

In [None]:
breast_cancer_df.head()

In [None]:
# Remove, in place, the two columns that were combined into 'id'.
breast_cancer_df.drop(['Sample_code_number', 'group'], axis=1, inplace=True)

### Export to CSV

Ultimately, we will export a CSV of the dataframe to disk. This will make it easy to access the same data from both Python and R.


In [None]:
%ls

In [None]:
%mkdir -p data

In [None]:
%ls

In [None]:
# Write the cleaned frame to data/breast_cancer.csv.
# NOTE(review): index=False leaves the row index out of the file. The index
# is the 'id' built earlier and its source columns were dropped, so the CSV
# carries no sample identifier — confirm this is intended.
breast_cancer_df.to_csv('data/breast_cancer.csv', index=False)