In [160]:
import pandas as pd
import csv

file_path = 'createdDataset.csv'
df = pd.read_csv(file_path, encoding='iso-8859-1')
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df.columns = df.columns.str.strip()
df.head()

Unnamed: 0,rec_id,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,blocking_number
0,rec-52-dup-0,Yoland,Mencjs,7,Hydw St,Kaumakani,90290,hi,19311124,24,206 9589621,,1
1,rec-915-org,Kona,Elliott,42,Lincoln Ave,Pearl City,89133,az,19320408,35,423 9346799,7657030.0,7
2,rec-199-org,Kamea,Kennedy,12,Wailele St,Volcano,85258,az,19760705,35,213 0356038,,4
3,rec-996-org,Dominique,Fleming,110,Hauiki St,Hawaiian Ocean View,85030,az,19431130,32,808 1076447,,0
4,rec-195-org,Ciera,Pua,27,Puowaina Dr,Wahiawa,90290,hi,19470119,37,808 0257265,4354584.0,1


In [161]:
original_df = df[df['rec_id'].str.contains('-org', na=False)]
original_df.head()

Unnamed: 0,rec_id,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,blocking_number
1,rec-915-org,Kona,Elliott,42,Lincoln Ave,Pearl City,89133,az,19320408,35,423 9346799,7657030.0,7
2,rec-199-org,Kamea,Kennedy,12,Wailele St,Volcano,85258,az,19760705,35,213 0356038,,4
3,rec-996-org,Dominique,Fleming,110,Hauiki St,Hawaiian Ocean View,85030,az,19431130,32,808 1076447,,0
4,rec-195-org,Ciera,Pua,27,Puowaina Dr,Wahiawa,90290,hi,19470119,37,808 0257265,4354584.0,1
5,rec-524-org,Adelaide,Padilla,11,Kekona Pl,Olinda,89137,wa,20190608,29,253 7227405,,0


In [162]:
original_count = original_df.shape[0]
original_count

1000

In [163]:
duplicate_df = df[df['rec_id'].str.contains('-dup', na=False)]
duplicate_df.head()

Unnamed: 0,rec_id,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,blocking_number
0,rec-52-dup-0,Yoland,Mencjs,7,Hydw St,Kaumakani,90290,hi,19311124,24.0,206 9589621,,1
14,rec-120-dup-0,Lennox,Tamajo,24,Hanauma Bay Rd,8913c,Princeville,hi,19280408,30.0,310 6255922,,6
15,rec-891-dup-0,Aala,,1708,Puakauhi Ct,Mokuleia,85208,nv,19720409,,206 7456375,3773764.0,2
18,rec-191-dup-0,Isabella,Tucker,56,St Ohialoke,'Puiapani',85011,wa,19921101,62.0,253 9215064,1722381.0,3
21,rec-828-dup-0,Anitga,Muramto,91,Peter Lee Rd,Volcanro,96748,nv,19220308,30.0,808 5347407,4417732.0,7


In [164]:
duplicate_count = duplicate_df.shape[0]
duplicate_count

300

In [165]:
random_sample = original_df.sample(n=200, random_state=42)
dataset_2 = pd.concat([duplicate_df, random_sample])
dataset_1 = original_df.drop(random_sample.index)

In [166]:
dataset_1_count = dataset_1.shape[0]
dataset_1_count

800

In [167]:
dataset_2_count = dataset_2.shape[0]
dataset_2_count

500

In [168]:
dataset_1.to_csv('dataset1.csv', index=False)
dataset_2.to_csv('dataset2.csv', index=False)

In [169]:
clean_df = df.copy()
clean_df['rec_id'] = clean_df['rec_id'].str.replace(r'-org|-dup-\d+', '', regex=True)

# Display the new dataframe with cleaned rec_id
clean_df.head()

Unnamed: 0,rec_id,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,blocking_number
0,rec-52,Yoland,Mencjs,7,Hydw St,Kaumakani,90290,hi,19311124,24,206 9589621,,1
1,rec-915,Kona,Elliott,42,Lincoln Ave,Pearl City,89133,az,19320408,35,423 9346799,7657030.0,7
2,rec-199,Kamea,Kennedy,12,Wailele St,Volcano,85258,az,19760705,35,213 0356038,,4
3,rec-996,Dominique,Fleming,110,Hauiki St,Hawaiian Ocean View,85030,az,19431130,32,808 1076447,,0
4,rec-195,Ciera,Pua,27,Puowaina Dr,Wahiawa,90290,hi,19470119,37,808 0257265,4354584.0,1


In [170]:
clean_df.columns = clean_df.columns.str.strip()
matching_records = clean_df[clean_df.duplicated(subset=['rec_id', 'date_of_birth'], keep=False)]
num_matching_records = matching_records.shape[0]
print(f"Number of records with matching 'rec_id' and 'date_of_birth': {num_matching_records}")
matching_records.head()


Number of records with matching 'rec_id' and 'date_of_birth': 580


Unnamed: 0,rec_id,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,blocking_number
0,rec-52,Yoland,Mencjs,7,Hydw St,Kaumakani,90290,hi,19311124,24,206 9589621,,1
2,rec-199,Kamea,Kennedy,12,Wailele St,Volcano,85258,az,19760705,35,213 0356038,,4
3,rec-996,Dominique,Fleming,110,Hauiki St,Hawaiian Ocean View,85030,az,19431130,32,808 1076447,,0
4,rec-195,Ciera,Pua,27,Puowaina Dr,Wahiawa,90290,hi,19470119,37,808 0257265,4354584.0,1
9,rec-66,Lawrence,Felipe,4,Kapahu St,Volcano,85079,hi,19950422,29,602 6471003,,5


In [171]:
df = df.replace(r'^\s*$', pd.NA, regex=True)
clean_df.head()


Unnamed: 0,rec_id,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,blocking_number
0,rec-52,Yoland,Mencjs,7,Hydw St,Kaumakani,90290,hi,19311124,24,206 9589621,,1
1,rec-915,Kona,Elliott,42,Lincoln Ave,Pearl City,89133,az,19320408,35,423 9346799,7657030.0,7
2,rec-199,Kamea,Kennedy,12,Wailele St,Volcano,85258,az,19760705,35,213 0356038,,4
3,rec-996,Dominique,Fleming,110,Hauiki St,Hawaiian Ocean View,85030,az,19431130,32,808 1076447,,0
4,rec-195,Ciera,Pua,27,Puowaina Dr,Wahiawa,90290,hi,19470119,37,808 0257265,4354584.0,1


In [172]:
num_missing_dob = df['date_of_birth'].isna().sum()
num_missing_dob

np.int64(107)

In [173]:
clean_df.columns = clean_df.columns.str.strip()
matching_records_soc = clean_df[clean_df.duplicated(subset=['rec_id', 'soc_sec_id'], keep=False)]
num_matching_records_soc = matching_records_soc.shape[0]
print(f"Number of records with matching 'rec_id' and 'social security id': {num_matching_records_soc}")
matching_records_soc.head()

Number of records with matching 'rec_id' and 'social security id': 460


Unnamed: 0,rec_id,given_name,surname,street_number,address_1,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,blocking_number
0,rec-52,Yoland,Mencjs,7,Hydw St,Kaumakani,90290,hi,19311124,24,206 9589621,,1
2,rec-199,Kamea,Kennedy,12,Wailele St,Volcano,85258,az,19760705,35,213 0356038,,4
3,rec-996,Dominique,Fleming,110,Hauiki St,Hawaiian Ocean View,85030,az,19431130,32,808 1076447,,0
4,rec-195,Ciera,Pua,27,Puowaina Dr,Wahiawa,90290,hi,19470119,37,808 0257265,4354584.0,1
9,rec-66,Lawrence,Felipe,4,Kapahu St,Volcano,85079,hi,19950422,29,602 6471003,,5


In [174]:
num_missing_soc = df['soc_sec_id'].isna().sum()
num_missing_soc

np.int64(688)