In [27]:
## Importing Libraries
import pandas as pd
import numpy as np

# Sklearn libraries
from sklearn.impute import KNNImputer

# To view all columns
pd.set_option('display.max_columns', None)

In [2]:
## Reading the donar file
donar_df = pd.read_excel(r'..\Data\original_data\SampleDonorData2.xlsx')

### Understanding the data and file structure

In [3]:
donar_df.head()

Unnamed: 0,TARGET_B,TARGET_D6,TARGET_D12,TARGET_D18,TARGET_D24,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,NUMBER_OF_RESP,DONOR_AGE,IN_HOUSE,URBANICITY,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,INCOME_GROUP,PUBLISHED_PHONE,WEALTH_RATING,MEDIAN_HOME_VALUE,MEDIAN_HOUSEHOLD_INCOME,PCT_OWNER_OCCUPIED,PEP_STAR,RECENT_STAR_STATUS,RECENCY_FREQ_STATUS,RECENT_CARD_RESPONSE_PROP,MONTHS_SINCE_LAST_PROM_RESP,LAST_GIFT_AMT,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT
0,0,0.0,0.0,0.0,0.0,6875,137,15,42.0,1,S,11.0,H,F,6.0,1,9.0,1556,421,45,1,0,A4,0.4,17.0,189.0,15,17,128
1,0,0.0,0.0,0.0,0.0,8384,65,33,42.0,1,R,53.0,U,M,,0,1.0,310,163,79,1,0,A1,0.0,,0.0,33,7,57
2,0,0.0,0.0,0.0,0.0,8863,53,16,42.0,1,U,7.0,H,M,5.0,0,6.0,1307,310,55,0,0,A1,0.167,20.0,0.0,16,20,57
3,0,0.0,0.0,0.0,0.0,10001,53,33,42.0,0,C,22.0,U,M,,0,2.0,675,278,18,1,1,S3,0.125,18.0,0.0,33,18,52
4,0,0.0,0.0,0.0,0.0,14064,17,28,42.0,1,C,24.0,H,M,6.0,0,,919,519,85,0,0,F1,0.0,17.0,50.0,28,20,20


In [4]:
donar_df.shape

(19971, 29)

In [5]:
## Checking if multiple donations/campaigns are present for a single donor
# donar_df['CONTROL_NUMBER'].value_counts() 
donar_df['CONTROL_NUMBER'].value_counts().value_counts()


1    18834
2      483
3       50
4        4
5        1
Name: CONTROL_NUMBER, dtype: int64

#### 1. Duplicate values

In [6]:
## checking for number of duplicates
donar_df.duplicated().sum()

38

In [7]:
## Exploring and Understanding the duplicates
donar_df[donar_df.duplicated(keep = False)].sort_values(by='CONTROL_NUMBER')

Unnamed: 0,TARGET_B,TARGET_D6,TARGET_D12,TARGET_D18,TARGET_D24,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,NUMBER_OF_RESP,DONOR_AGE,IN_HOUSE,URBANICITY,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,INCOME_GROUP,PUBLISHED_PHONE,WEALTH_RATING,MEDIAN_HOME_VALUE,MEDIAN_HOUSEHOLD_INCOME,PCT_OWNER_OCCUPIED,PEP_STAR,RECENT_STAR_STATUS,RECENCY_FREQ_STATUS,RECENT_CARD_RESPONSE_PROP,MONTHS_SINCE_LAST_PROM_RESP,LAST_GIFT_AMT,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT
18491,0,0.0,0.0,0.0,0.0,2798,21,13,,0,?,,U,F,,0,,1073,160,39,0,0,N3,1.000,21.0,,13,15,21
18490,0,0.0,0.0,0.0,0.0,2798,21,13,,0,?,,U,F,,0,,1073,160,39,0,0,N3,1.000,21.0,,13,15,21
18500,0,0.0,0.0,0.0,0.0,3054,29,27,,1,?,,U,U,,0,,611,202,67,0,0,A2,0.364,29.0,35.00,27,5,44
18501,0,0.0,0.0,0.0,0.0,3054,29,27,,1,?,,U,U,,0,,611,202,67,0,0,A2,0.364,29.0,35.00,27,5,44
18572,0,0.0,0.0,0.0,0.0,6286,125,15,,1,U,1.0,U,F,2.0,0,5.0,4050,611,87,0,0,A2,0.100,125.0,,15,19,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8210,0,0.0,0.0,0.0,0.0,179869,344,12,55.0,0,S,12.0,H,M,6.0,1,6.0,1267,328,64,1,0,L1,0.100,15.0,0.00,12,15,123
4346,0,0.0,0.0,0.0,0.0,189438,53,12,56.0,0,U,4.0,U,M,2.0,1,,1563,263,28,0,0,A3,0.333,53.0,103.86,12,20,52
4345,0,0.0,0.0,0.0,0.0,189438,53,12,56.0,0,U,4.0,U,M,2.0,1,,1563,263,28,0,0,A3,0.333,53.0,103.86,12,20,52
9566,0,0.0,0.0,0.0,0.0,190408,11,4,66.0,0,C,28.0,U,U,,0,3.0,713,255,90,1,0,E2,0.500,11.0,0.00,4,19,110


About 38 records are duplicate and will be dropping them from the data. 

In [8]:
## Dropping the duplicates
df_cleaned = donar_df.drop_duplicates(keep = 'last')
df_cleaned.shape

(19933, 29)

#### 2. Missing Data overview

Exploring the data further to see if any columns are missing values and how many are missing and come up with a plan to impute as we have only 20k records

In [9]:
null_columns = df_cleaned.isnull().sum()
null_columns[null_columns > 0]

DONOR_AGE                      4965
CLUSTER_CODE                    465
INCOME_GROUP                   4515
WEALTH_RATING                  9067
MONTHS_SINCE_LAST_PROM_RESP     223
LAST_GIFT_AMT                  1847
dtype: int64

Since cluster_code represents socioeconomic status, urbanicity, ethnicity, and other demographic characteristics, I will first impute cluster code based on KNN imputations (Assuming Similar data fall under each cluster) and impute rest of the features based on the cluster code. 

##### 2.a Cluster Code Imputation
Method used: KNN imputer(Imputed with a value considering other similar data points)

For similarity, I will be looking at socioeconomic status, urbanicity, ethnicity, and demographic characteristic colums from the dataframe as cluster code is based on those columns

In [15]:
# Columns to consider
columns_ = ['CLUSTER_CODE', 'URBANICITY', 'DONOR_AGE', 'INCOME_GROUP', 'MEDIAN_HOME_VALUE', 'MEDIAN_HOUSEHOLD_INCOME', 'PCT_OWNER_OCCUPIED']

# Getting a copy
df_impute = df_cleaned[columns_].copy()
column_to_convert = df_impute.select_dtypes(include=['object']).columns

# Categorical to Numerical
for col in column_to_convert:
    df_impute[col] = df_impute[col].astype('category').cat.codes

# Imputing using KNN
imputer = KNNImputer(n_neighbors=5)
imputed_df = pd.DataFrame(imputer.fit_transform(df_impute), columns = df_impute.columns)


In [16]:
imputed_df.head()

Unnamed: 0,CLUSTER_CODE,URBANICITY,DONOR_AGE,INCOME_GROUP,MEDIAN_HOME_VALUE,MEDIAN_HOUSEHOLD_INCOME,PCT_OWNER_OCCUPIED
0,11.0,3.0,42.0,6.0,1556.0,421.0,45.0
1,53.0,2.0,42.0,3.0,310.0,163.0,79.0
2,7.0,5.0,42.0,5.0,1307.0,310.0,55.0
3,22.0,1.0,42.0,2.8,675.0,278.0,18.0
4,24.0,1.0,42.0,6.0,919.0,519.0,85.0


In [17]:
imputed_df.isnull().sum()

CLUSTER_CODE               0
URBANICITY                 0
DONOR_AGE                  0
INCOME_GROUP               0
MEDIAN_HOME_VALUE          0
MEDIAN_HOUSEHOLD_INCOME    0
PCT_OWNER_OCCUPIED         0
dtype: int64

In [18]:
df_cleaned.head(20)

Unnamed: 0,TARGET_B,TARGET_D6,TARGET_D12,TARGET_D18,TARGET_D24,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,NUMBER_OF_RESP,DONOR_AGE,IN_HOUSE,URBANICITY,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,INCOME_GROUP,PUBLISHED_PHONE,WEALTH_RATING,MEDIAN_HOME_VALUE,MEDIAN_HOUSEHOLD_INCOME,PCT_OWNER_OCCUPIED,PEP_STAR,RECENT_STAR_STATUS,RECENCY_FREQ_STATUS,RECENT_CARD_RESPONSE_PROP,MONTHS_SINCE_LAST_PROM_RESP,LAST_GIFT_AMT,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT
0,0,0.0,0.0,0.0,0.0,6875,137,15,42.0,1,S,11.0,H,F,6.0,1,9.0,1556,421,45,1,0,A4,0.4,17.0,189.0,15,17,128
1,0,0.0,0.0,0.0,0.0,8384,65,33,42.0,1,R,53.0,U,M,,0,1.0,310,163,79,1,0,A1,0.0,,0.0,33,7,57
2,0,0.0,0.0,0.0,0.0,8863,53,16,42.0,1,U,7.0,H,M,5.0,0,6.0,1307,310,55,0,0,A1,0.167,20.0,0.0,16,20,57
3,0,0.0,0.0,0.0,0.0,10001,53,33,42.0,0,C,22.0,U,M,,0,2.0,675,278,18,1,1,S3,0.125,18.0,0.0,33,18,52
4,0,0.0,0.0,0.0,0.0,14064,17,28,42.0,1,C,24.0,H,M,6.0,0,,919,519,85,0,0,F1,0.0,17.0,50.0,28,20,20
5,0,0.0,0.0,0.0,0.0,14474,113,29,42.0,1,S,14.0,H,M,7.0,0,9.0,3453,1103,84,1,0,A1,0.0,26.0,30.0,29,4,113
6,0,0.0,0.0,0.0,0.0,14487,46,31,42.0,1,C,28.0,H,M,6.0,0,8.0,584,276,80,0,0,A1,0.091,19.0,41.71,31,6,67
7,0,0.0,0.0,0.0,0.0,14547,29,17,42.0,1,S,12.0,H,F,7.0,1,,1884,518,85,0,0,N3,0.4,16.0,0.0,17,16,52
8,0,0.0,0.0,0.0,0.0,15445,113,13,42.0,0,C,25.0,H,M,3.0,1,6.0,782,223,14,1,0,A1,0.0,19.0,100.0,13,19,103
9,0,0.0,0.0,0.0,0.0,15651,137,12,42.0,0,C,24.0,H,F,7.0,0,9.0,1497,581,93,0,0,A2,0.222,23.0,0.0,12,23,127


In [41]:
print(imputed_df.shape)
print(df_cleaned.shape)

(19933, 7)
(19933, 29)


In [38]:
## Replacing cluser code with imputed values
df_cleaned['CLUSTER_CODE'] = imputed_df['CLUSTER_CODE']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['CLUSTER_CODE'] = imputed_df['CLUSTER_CODE']


In [40]:
df_cleaned[df_cleaned['CLUSTER_CODE'].isnull()]

Unnamed: 0,TARGET_B,TARGET_D6,TARGET_D12,TARGET_D18,TARGET_D24,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,NUMBER_OF_RESP,DONOR_AGE,IN_HOUSE,URBANICITY,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,INCOME_GROUP,PUBLISHED_PHONE,WEALTH_RATING,MEDIAN_HOME_VALUE,MEDIAN_HOUSEHOLD_INCOME,PCT_OWNER_OCCUPIED,PEP_STAR,RECENT_STAR_STATUS,RECENCY_FREQ_STATUS,RECENT_CARD_RESPONSE_PROP,MONTHS_SINCE_LAST_PROM_RESP,LAST_GIFT_AMT,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT
19933,0,0.0,0.0,0.0,0.0,175462,7,12,,0,U,,U,U,,0,,715,451,93,1,0,A1,0.333,7.0,0.0,12,16,115
19934,1,1079.38,1694.11,2573.08,2830.08,175471,29,12,,0,U,,U,F,,0,,813,239,13,0,0,A4,0.333,21.0,102.0,12,18,32
19935,0,0.0,0.0,0.0,0.0,175540,65,10,,0,U,,U,U,,0,2.0,541,132,29,1,0,A1,0.0,26.0,20.0,10,26,60
19936,0,0.0,0.0,0.0,0.0,175575,29,12,,0,S,,U,M,,0,,528,240,89,0,0,A1,0.286,16.0,,12,16,31
19937,0,0.0,0.0,0.0,0.0,175589,236,13,,0,S,,U,F,,0,4.0,0,65,3,0,0,A1,0.3,22.0,0.0,13,22,68
19938,0,0.0,0.0,0.0,0.0,175589,362,13,,0,S,,U,F,,0,4.0,0,65,3,0,0,A1,0.3,22.0,0.0,13,22,68
19939,0,0.0,0.0,0.0,0.0,175759,41,13,,0,S,,U,F,,0,,840,342,55,0,0,A3,0.2,19.0,0.0,13,19,45
19940,1,1700.0,3350.0,5300.0,7100.0,175782,17,8,,0,S,,U,F,6.0,1,,953,319,34,0,0,F1,0.333,17.0,,8,21,21
19941,1,35.0,72.0,464.0,856.0,175902,89,13,,0,C,,H,F,1.0,1,7.0,807,351,82,1,0,A3,0.2,15.0,0.0,13,15,80
19942,0,0.0,0.0,0.0,0.0,175906,113,14,,0,S,,H,F,1.0,1,5.0,871,343,62,1,1,S4,0.5,15.0,200.0,14,15,104


In [20]:
df_cleaned.head(20)

Unnamed: 0,TARGET_B,TARGET_D6,TARGET_D12,TARGET_D18,TARGET_D24,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,NUMBER_OF_RESP,DONOR_AGE,IN_HOUSE,URBANICITY,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,INCOME_GROUP,PUBLISHED_PHONE,WEALTH_RATING,MEDIAN_HOME_VALUE,MEDIAN_HOUSEHOLD_INCOME,PCT_OWNER_OCCUPIED,PEP_STAR,RECENT_STAR_STATUS,RECENCY_FREQ_STATUS,RECENT_CARD_RESPONSE_PROP,MONTHS_SINCE_LAST_PROM_RESP,LAST_GIFT_AMT,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT
0,0,0.0,0.0,0.0,0.0,6875,137,15,42.0,1,S,11.0,H,F,6.0,1,9.0,1556,421,45,1,0,A4,0.4,17.0,189.0,15,17,128
1,0,0.0,0.0,0.0,0.0,8384,65,33,42.0,1,R,53.0,U,M,,0,1.0,310,163,79,1,0,A1,0.0,,0.0,33,7,57
2,0,0.0,0.0,0.0,0.0,8863,53,16,42.0,1,U,7.0,H,M,5.0,0,6.0,1307,310,55,0,0,A1,0.167,20.0,0.0,16,20,57
3,0,0.0,0.0,0.0,0.0,10001,53,33,42.0,0,C,22.0,U,M,,0,2.0,675,278,18,1,1,S3,0.125,18.0,0.0,33,18,52
4,0,0.0,0.0,0.0,0.0,14064,17,28,42.0,1,C,24.0,H,M,6.0,0,,919,519,85,0,0,F1,0.0,17.0,50.0,28,20,20
5,0,0.0,0.0,0.0,0.0,14474,113,29,42.0,1,S,14.0,H,M,7.0,0,9.0,3453,1103,84,1,0,A1,0.0,26.0,30.0,29,4,113
6,0,0.0,0.0,0.0,0.0,14487,46,31,42.0,1,C,28.0,H,M,6.0,0,8.0,584,276,80,0,0,A1,0.091,19.0,41.71,31,6,67
7,0,0.0,0.0,0.0,0.0,14547,29,17,42.0,1,S,12.0,H,F,7.0,1,,1884,518,85,0,0,N3,0.4,16.0,0.0,17,16,52
8,0,0.0,0.0,0.0,0.0,15445,113,13,42.0,0,C,25.0,H,M,3.0,1,6.0,782,223,14,1,0,A1,0.0,19.0,100.0,13,19,103
9,0,0.0,0.0,0.0,0.0,15651,137,12,42.0,0,C,24.0,H,F,7.0,0,9.0,1497,581,93,0,0,A2,0.222,23.0,0.0,12,23,127


#### 2b. Impute DONOR_AGE, 'MONTHS_SINCE_LAST_PROM_RESP', 'WEALTH_RATING', 'INCOME_GROUP'

The cluster code is based on various socioeconomic factor, In general replacing null values with the respective groups might lead to more accurate rep of age

1. DONOR_AGE, MONTHS_SINCE_LAST_PROM_RESP -> Replacing with the median age from the respective cluster_code. 

2. 'WEALTH_RATING', 'INCOME_GROUP' -> Replacing these two columns with most frequently occuring value within their respective cluster group. 

In [33]:
## Creating a dictionary to hold the respective values for the above mentioned columns on each cluster. 

cluster_dict = {}

cluster_grouped = df_cleaned.groupby('CLUSTER_CODE')

for cluster, group in cluster_grouped:
    cluster_dict[cluster] = {
        'median_donor_age': group['DONOR_AGE'].median(),
        'median_month_since_last_resp': group['MONTHS_SINCE_LAST_PROM_RESP'].median(),
        'mode_wealth_rating': group['WEALTH_RATING'].mode()[0] if not group['WEALTH_RATING'].mode().empty else np.nan,
        'mode_income_group': group['INCOME_GROUP'].mode()[0] if not group['INCOME_GROUP'].mode().empty else np.nan
    }

In [34]:
cluster_dict

{1.0: {'median_donor_age': 59.5,
  'median_month_since_last_resp': 18.0,
  'mode_wealth_rating': 7.0,
  'mode_income_group': 2.0},
 1.8: {'median_donor_age': 33.0,
  'median_month_since_last_resp': 17.0,
  'mode_wealth_rating': nan,
  'mode_income_group': 2.0},
 2.0: {'median_donor_age': 59.5,
  'median_month_since_last_resp': 18.0,
  'mode_wealth_rating': 9.0,
  'mode_income_group': 5.0},
 3.0: {'median_donor_age': 58.5,
  'median_month_since_last_resp': 18.0,
  'mode_wealth_rating': 8.0,
  'mode_income_group': 5.0},
 4.0: {'median_donor_age': 59.0,
  'median_month_since_last_resp': 18.0,
  'mode_wealth_rating': 2.0,
  'mode_income_group': 4.0},
 4.6: {'median_donor_age': 72.0,
  'median_month_since_last_resp': 27.0,
  'mode_wealth_rating': nan,
  'mode_income_group': 5.0},
 5.0: {'median_donor_age': 60.0,
  'median_month_since_last_resp': 18.0,
  'mode_wealth_rating': 8.0,
  'mode_income_group': 5.0},
 5.6: {'median_donor_age': nan,
  'median_month_since_last_resp': 20.0,
  'mode_wea

In [35]:
## Verifying why certain clusters still have null values even after summary statistics
df_cleaned[df_cleaned['CLUSTER_CODE'] == 10.2]

Unnamed: 0,TARGET_B,TARGET_D6,TARGET_D12,TARGET_D18,TARGET_D24,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,NUMBER_OF_RESP,DONOR_AGE,IN_HOUSE,URBANICITY,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,INCOME_GROUP,PUBLISHED_PHONE,WEALTH_RATING,MEDIAN_HOME_VALUE,MEDIAN_HOUSEHOLD_INCOME,PCT_OWNER_OCCUPIED,PEP_STAR,RECENT_STAR_STATUS,RECENCY_FREQ_STATUS,RECENT_CARD_RESPONSE_PROP,MONTHS_SINCE_LAST_PROM_RESP,LAST_GIFT_AMT,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT
19278,0,0.0,0.0,0.0,0.0,167978,17,10,,0,S,10.2,U,M,,0,,2558,440,64,0,0,F1,0.333,17.0,340.0,10,21,21


In [37]:
null_columns = df_cleaned.isnull().sum()
null_columns[null_columns > 0]

DONOR_AGE                      4965
CLUSTER_CODE                     38
INCOME_GROUP                   4515
WEALTH_RATING                  9067
MONTHS_SINCE_LAST_PROM_RESP     223
LAST_GIFT_AMT                  1847
dtype: int64

In [44]:
## Dropping the null cluster_code values
df_cleaned = df_cleaned.dropna(subset=['CLUSTER_CODE'])
null_columns = df_cleaned.isnull().sum()
null_columns[null_columns > 0]

DONOR_AGE                      4936
INCOME_GROUP                   4496
WEALTH_RATING                  9045
MONTHS_SINCE_LAST_PROM_RESP     223
LAST_GIFT_AMT                  1840
dtype: int64

In [45]:
## Impute with the above cluster dict

def impute_with_cluster_stats(row, cluster_dict):
    
    if pd.isnull(row['DONOR_AGE']):
        row['DONOR_AGE'] = cluster_dict[row['CLUSTER_CODE']]['median_donor_age']
    if pd.isnull(row['MONTHS_SINCE_LAST_PROM_RESP']):
        row['MONTHS_SINCE_LAST_PROM_RESP'] = cluster_dict[row['CLUSTER_CODE']]['median_month_since_last_resp']
    if pd.isnull(row['WEALTH_RATING']):
        row['WEALTH_RATING'] = cluster_dict[row['CLUSTER_CODE']]['mode_wealth_rating']
    if pd.isnull(row['INCOME_GROUP']):
        row['INCOME_GROUP'] = cluster_dict[row['CLUSTER_CODE']]['mode_income_group']
    
    return row

df_cleaned = df_cleaned.apply(lambda x: impute_with_cluster_stats(x, cluster_dict), axis=1)

In [46]:
## Verifying if the imputation was successful
null_columns = df_cleaned.isnull().sum()
null_columns[null_columns > 0]

DONOR_AGE          42
INCOME_GROUP       35
WEALTH_RATING      57
LAST_GIFT_AMT    1840
dtype: int64

#### 2c. Imputing Rest of the data

LAST_GIFT_AMOUNT - Impute with 0 <br>
DONOR_AGE - Impute with Median <br>
INCOME_GROUP - Most Frequent <br>
WEALTH_RATING - Most Frequent <br>


In [47]:
df_cleaned['LAST_GIFT_AMT'].fillna(0, inplace = True)
df_cleaned['DONOR_AGE'].fillna(df_cleaned['DONOR_AGE'].median(), inplace = True)
df_cleaned['INCOME_GROUP'].fillna(df_cleaned['INCOME_GROUP'].mode()[0], inplace = True)
df_cleaned['WEALTH_RATING'].fillna(df_cleaned['WEALTH_RATING'].mode()[0], inplace = True)

In [48]:
## Verifying if the imputation was successful
null_columns = df_cleaned.isnull().sum()
null_columns[null_columns > 0]

Series([], dtype: int64)

In [49]:
df_cleaned.shape

(19895, 29)

### Saving the Cleaned File

In [51]:
## Saving the cleaned data
df_cleaned.to_csv(r'..\Data\cleaned_data\donar_data_cleaned.csv', index=False)