## Set Index of `DataFrame`

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Execute the ingestion script inside this notebook's namespace. The cells
# below use `breast_cancer_df`, which is presumably created by that script
# (it is not defined anywhere in this notebook) -- confirm.
%run lib/ingest_data.py

### Dataframes


In [None]:
# Preview the first rows of the ingested dataframe.
breast_cancer_df.head()

In [None]:
# (n_rows, n_columns) before any de-duplication is applied.
breast_cancer_df.shape

In [None]:
# Show the concrete type of the ingested object.
type(breast_cancer_df)

### Set Patient ID as Index

In [None]:
# Try to use the patient identifier as the index. The result is discarded on
# purpose: verify_integrity=True makes pandas raise ValueError when the new
# index would contain duplicates, and that message is what we want to see.
try:
    breast_cancer_df.set_index('Sample_code_number', verify_integrity=True)
except ValueError as err:
    print("ValueError: " + str(err))

#### For a Duplicate Index, What Are the Counts?

In [None]:
# Bucket the rows by patient identifier.
group_by_sample_code = breast_cancer_df.groupby('Sample_code_number')
# Distinct non-null Diagnosis counts per identifier; any value other than 1
# means some identifier appears on more than one row.
diagnosis_counts = group_by_sample_code.agg('count')['Diagnosis']
diagnosis_counts.unique()

#### `pd.DataFrame.drop_duplicates()`

This will drop exact duplicate rows.

In [None]:
# Shape the frame would have without exact duplicate rows (non-mutating preview).
breast_cancer_df.drop_duplicates().shape

In [None]:
# Remove exact duplicate rows from the dataframe itself (mutates in place).
breast_cancer_df.drop_duplicates(inplace=True)

But this is insufficient to fix the problem: `Sample_code_number` still contains repeated values.

In [None]:
# Retry indexing by patient identifier after removing exact duplicate rows.
# verify_integrity=True raises ValueError if identifiers still repeat; the
# returned frame is intentionally not kept.
try:
    breast_cancer_df.set_index('Sample_code_number', verify_integrity=True)
except ValueError as err:
    print("ValueError: " + str(err))

In [None]:
# Flag rows whose Sample_code_number has already appeared on an earlier row
# (the first occurrence of each code is left unflagged).
duplicate_mask = breast_cancer_df.duplicated(subset='Sample_code_number')
duplicate_mask.head(n=10)

In [None]:
# Shape of the rows flagged as repeats of an earlier Sample_code_number.
breast_cancer_df[duplicate_mask].shape

In [None]:
# Distinct sample codes taken from the rows flagged as repeats.
duplicate_sample_ids = breast_cancer_df.loc[duplicate_mask, 'Sample_code_number'].unique()

In [None]:
# Number of distinct sample codes that occur more than once.
duplicate_sample_ids.shape

In [None]:
# The repeated sample codes themselves.
duplicate_sample_ids

In [None]:
# Mark every row (first occurrences included) whose sample code is repeated.
duplicate_mask_2 = breast_cancer_df['Sample_code_number'].isin(duplicate_sample_ids)

In [None]:
# Shape of all rows carrying a repeated sample code, first occurrences included.
breast_cancer_df[duplicate_mask_2].shape

In [None]:
# Regroup the current frame rather than reusing `group_by_sample_code`: that
# object was built before drop_duplicates(inplace=True) mutated the frame, so
# its grouping no longer matches the rows that remain.
breast_cancer_df.groupby('Sample_code_number').aggregate('count').Diagnosis.head()

In [None]:
# Rebuild the grouping on the de-duplicated frame and aggregate once.
group_by_sample_code = breast_cancer_df.groupby('Sample_code_number')
# Non-null Diagnosis count per sample code, indexed by Sample_code_number.
sample_code_counts = group_by_sample_code.aggregate('count').Diagnosis
# Display the distinct counts. Previously this expression sat mid-cell, where
# its value was computed and then silently discarded.
sample_code_counts.unique()

In [None]:
# Boolean view: True where a sample code has more than one Diagnosis entry.
(sample_code_counts > 1).head()

In [None]:
# Sample codes whose count is greater than one.
sample_code_counts[sample_code_counts > 1].index

In [None]:
# How many sample codes have a count greater than one.
sample_code_counts[sample_code_counts > 1].shape

In [None]:
# Keep only the sample codes that were counted more than once.
manual_duplicate_indices = sample_code_counts.loc[sample_code_counts.gt(1)].index

In [None]:
# Row mask selecting every record whose sample code is in the repeated set.
manual_duplicate_mask = breast_cancer_df['Sample_code_number'].isin(manual_duplicate_indices)

In [None]:
# Shape of the rows selected by the count-based mask.
breast_cancer_df[manual_duplicate_mask].shape

In [None]:
# Collect the rows with repeated sample codes, ordered so that rows sharing a
# code sit next to each other.
repeated_samples = breast_cancer_df.loc[manual_duplicate_mask].sort_values(by='Sample_code_number')

In [None]:
# First ten repeated-sample rows, sorted by sample code.
repeated_samples.head(10)

In [None]:
# Non-null Diagnosis count per (Sample_code_number, group) pair; first ten pairs.
repeated_samples.groupby(['Sample_code_number', 'group'])[['Diagnosis']].count().head(10)

In [None]:
# For each (Sample_code_number, group) pair, count the non-null values in
# every remaining column.
group_sample_count = repeated_samples.groupby(['Sample_code_number', 'group']).count()

In [None]:
# Pairs with more than one Diagnosis entry: for these, sample code plus group
# is still not a unique key.
group_sample_count[group_sample_count['Diagnosis'] > 1]

In [None]:
# Inspect every row recorded for sample code 493452.
breast_cancer_df[breast_cancer_df.Sample_code_number == 493452]

In [None]:
# Build a candidate key '<Sample_code_number>_<group>' for every row.
sample_code_text = breast_cancer_df['Sample_code_number'].astype(str)
breast_cancer_df['id'] = sample_code_text + '_' + breast_cancer_df['group']

In [None]:
# Preview the frame with the new 'id' column.
breast_cancer_df.head()

In [None]:
# Distinct per-id Diagnosis counts; any value other than 1 means 'id' is not
# yet unique.
breast_cancer_df.groupby('id').agg('count')['Diagnosis'].unique()

In [None]:
# Non-null Diagnosis count for each candidate id.
id_count = breast_cancer_df.groupby('id')['Diagnosis'].count()

In [None]:
# Candidate ids that are counted more than once.
id_count[id_count > 1]

In [None]:
# Labels of the candidate ids that were counted more than once.
repeated_ids = id_count.loc[id_count.gt(1)].index

In [None]:
# First ten rows whose id is repeated, sorted so equal ids are adjacent.
breast_cancer_df[breast_cancer_df.id.isin(repeated_ids)].sort_values('id').head(10)

In [None]:
# Only the 'id' column of the rows whose id is repeated, sorted by id.
breast_cancer_df[breast_cancer_df.id.isin(repeated_ids)].sort_values('id').id

In [None]:
# Toy list with ids repeated once, twice and three times, used to try out the
# suffixing scheme before applying it to the dataframe.
test_ids = ['1','2','2','3','3','3','4','5']

In [None]:
# Make repeated ids unique: the first occurrence is kept unchanged and the
# k-th occurrence (k >= 2) gets the suffix '_k'.
counts = {}
new_test_ids = []
for ID in test_ids:
    # Running number of times this id has been seen, including this one.
    counts[ID] = counts.get(ID, 0) + 1
    new_test_ids.append(ID if counts[ID] == 1 else ID + '_' + str(counts[ID]))
new_test_ids

In [None]:
# Apply the same suffixing scheme to the dataframe's ids, in row order: the
# first occurrence keeps its id, the k-th occurrence (k >= 2) becomes 'id_k'.
counts = {}
new_ids = []
for ID in breast_cancer_df.id:
    # Running number of times this id has been seen, including this one.
    counts[ID] = counts.get(ID, 0) + 1
    new_ids.append(ID if counts[ID] == 1 else ID + '_' + str(counts[ID]))

In [None]:
# Overwrite the 'id' column with the de-duplicated ids (same row order).
breast_cancer_df['id'] = new_ids

In [None]:
# Preview the first twenty rows with the rewritten ids.
breast_cancer_df.head(20)

In [None]:
# Promote 'id' to the index, in place; 'id' stops being a regular column.
breast_cancer_df.set_index('id', inplace=True)

In [None]:
# Preview the frame now that it is indexed by 'id'.
breast_cancer_df.head()

In [None]:
# Drop the two columns the 'id' index was built from (axis=1 selects columns);
# mutates the frame in place.
breast_cancer_df.drop(['Sample_code_number', 'group'], axis=1, inplace=True)

### Export to CSV

Ultimately, we will export a CSV of the dataframe to disk. This will make it easy to access the same data from both Python and R.


In [None]:
# List the working directory before creating the output folder.
%ls

In [None]:
# Create the output directory used by the CSV export below; -p makes this a
# no-op when the directory already exists.
%mkdir -p data

In [None]:
# List the working directory again to confirm data/ is present.
%ls

In [None]:
# Write the frame together with its index. After set_index('id') and the drop
# of 'Sample_code_number' and 'group', the unique ids exist only in the index,
# so index=False would silently leave them out of the exported file.
breast_cancer_df.to_csv('data/breast_cancer.csv')