In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
# Load the donor dataset from a path relative to this notebook's directory
df = pd.read_csv('../data/donor_data.csv')

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Normalise the column labels to lower case so later cells can refer to
# them consistently during EDA.
df.columns = df.columns.map(str.lower)

In [None]:
# Show the (lower-cased) column labels as a plain Python list
df.columns.tolist()

# Data Cleaning and Exploratory Data Analysis

In [None]:
# Summary of column dtypes and non-null counts
df.info()

In [None]:
# Descriptive statistics, transposed so each column becomes a row
df.describe().transpose()

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Frequency of each distinct value in target_d (NaN is excluded by default)
df.loc[:, 'target_d'].value_counts()

#### Looking at the missing values in the target_d column, the NaN values belong to people who didn't donate. Rather than keeping NaN, I will substitute 0, since those who didn't donate effectively gave $0.

In [None]:
# Missing target_d entries are recorded as 0
df['target_d'] = df['target_d'].fillna(0)

In [None]:
# Dimensions of the frame as (rows, columns)
df.shape

In [None]:
# Consistency check: for every column, print its name, how many distinct
# values it holds, and the distinct values themselves, followed by blank
# lines as a separator.
for column in df.columns:
    print(column, df[column].nunique(), df[column].unique(), '\n', sep='\n')

### Observations
* Age has a value of zero (0), which isn't right
* The Overlay column is meant to take 3 values: M=Metromail; P=Polk; B=Both. But it takes 4 values, the fourth being "n". I am going to assume this was a mistake and replace "n" with an appropriate value
* Cluster code has a value of ".". This will be replaced using the most appropriate method
* Donor Gender takes 4 values: Male, Female, Unknown and A. I am assuming the A value is a mistake, as it appears only once (1). It will be replaced appropriately
* "?" represents unknown wherever found


In [None]:
# Review the dtype and non-null count of every column
df.info()

print("\n----------\nCategorical variables values:")
# Print the distinct values of every column whose dtype is neither float64
# nor int64. DataFrame.items() is used because DataFrame.iteritems() was
# removed in pandas 2.0; items() yields the same (label, Series) pairs.
for name, values in df.items():
    if values.dtype != np.float64 and values.dtype != np.int64:
        print('{name}: {value}\n'.format(name=name, value=values.unique()))

In [None]:
# Give the two target columns descriptive names:
# target_b -> donated, target_d -> amount_donated
df = df.rename(columns={'target_b': 'donated', 'target_d': 'amount_donated'})

In [None]:
# Count of non-donors vs. donors in the dataset.
# Observation: non-donors outnumber actual donors at almost 3 to 1.
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='donated')

# Raw counts behind the plot
df['donated'].value_counts()

In [None]:
# Donor / non-donor counts split by gender.
# Observation: a significant portion of both males and females are non-donors.
sns.set(style="darkgrid")
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='donor_gender', hue='donated', palette='Set1')
plt.show()

# Raw counts per gender value
df['donor_gender'].value_counts()

In [None]:
# Donor / non-donor counts split by home ownership.
# Observation: home ownership is a strong indicator that someone will donate.
sns.set(style="darkgrid")
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='home_owner', hue='donated', palette='Set1')
plt.show()

# Raw counts per home_owner value
df['home_owner'].value_counts()

In [None]:
# Age distribution of the records in the dataset.
# Observation: most of the donors fall between the ages of 30 and 80,
# which is valuable information on who to target based on age.
df['donor_age'].plot(kind='hist', figsize=(10, 5))

In [None]:
# Number of records in each income group, split by gender.
# Original observation: across all income groups, females out-earn the men,
# making them a group to target.
# NOTE(review): this is a count plot, so it shows how many records of each
# gender fall in each income group rather than earnings - confirm the claim.
sns.set(style="darkgrid")
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='income_group', hue='donor_gender', palette='Set1')
plt.show()

In [None]:
# Donor / non-donor counts within each income group
sns.set(style="darkgrid")
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='income_group', hue='donated', palette='Set1')
plt.show()

# Raw counts per income group
df['income_group'].value_counts()

#### It is interesting to see that, in the income group column, users in income group 5 donate more than any other income group. At the same time, income group 5 also has more non-donors than any other income group. This could be a result of most users falling into income group 5.

In [None]:
# Home-ownership counts split by gender.
# Observation: on average, females own more homes than men.
sns.set(style="darkgrid")
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='home_owner', hue='donor_gender', palette='Set1')
plt.show()

# Raw counts per home_owner value
df['home_owner'].value_counts()

In [None]:
# Donor / non-donor counts split by urbanicity.
# Observation: the users most likely to donate live in suburban areas.
# This makes sense, as most of this demographic falls in the older age
# brackets seen earlier when the age of donors was checked.
sns.set(style="darkgrid")
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='urbanicity', hue='donated', palette='Set1')
plt.show()

# Raw counts per urbanicity value
df['urbanicity'].value_counts()

In [None]:
# Donor / non-donor counts split by socio-economic status (ses).
# Observation: most of the people likely to donate fall into the second
# socio-economic status bracket.
sns.set(style="darkgrid")
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='ses', hue='donated', palette='Set1')
plt.show()

# Raw counts per ses value
df['ses'].value_counts()

In [None]:
# Distribution of the average gift amount per donor, in 5 bins.
# Observation: from the graph, this amount is close to 100.
df['lifetime_avg_gift_amt'].plot(kind='hist', bins=5, figsize=(10, 5))

In [None]:
# Distribution of the overall (lifetime) gift amount per donor, in 5 bins.
# Original observation: from the graph, this amount is close to 100.
# NOTE(review): this wording matches the average-gift histogram comment;
# confirm it also holds for the lifetime total.
df['lifetime_gift_amount'].plot(kind='hist', bins=5, figsize=(10, 5))

## Dealing with Missing Data and Miscellaneous Data

In [None]:
# Missing values per column before cleaning
df.isna().sum()

### The columns with missing data are
* Donor Age
* Income Group
* Wealth Rating
* Months since last prom Resp

In [None]:
# Check the datatype of each column.
# Before filling missing values, the column dtypes should first be changed
# to their appropriate datatype.
df.dtypes

In [None]:
# (rows, columns) before the donor_age clean-up
df.shape

In [None]:
# Mark donor_age values of 0 as missing (NaN)
df['donor_age'] = df['donor_age'].replace(to_replace=0, value=np.nan)

In [None]:
# Keep only the rows whose donor_age is not equal to 0.
# NOTE(review): if the zero ages were already replaced with NaN before this
# cell runs, the filter drops nothing, because NaN != 0 evaluates to True -
# confirm whether these rows should be dropped or kept for imputation.
df = df.loc[df['donor_age'].ne(0)]

In [None]:
# Re-run the consistency check: for each column print its name, the number
# of distinct values, the distinct values, and a blank separator.
for column in df:
    print(column)
    print(df[column].nunique())
    print(df[column].unique())
    print('\n')

In [None]:
# (rows, columns) after the donor_age clean-up
df.shape

In [None]:
# Missing values per column after the donor_age clean-up
df.isna().sum()

In [None]:
# donor_age distribution before the missing values are filled
df['donor_age'].plot(kind='hist', figsize=(10, 5))

- The data is left-skewed.
- Use the median to fill in the missing data.

In [None]:
# Median donor_age, used as the fill value for missing ages
median = df.loc[:, 'donor_age'].median()

In [None]:
# Fill missing donor_age values with the previously computed median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['donor_age']: that is chained assignment,
# which emits a FutureWarning in pandas 2.2 and leaves df unchanged under
# Copy-on-Write.
df['donor_age'] = df['donor_age'].fillna(median)

In [None]:
# donor_age distribution after the median fill
df['donor_age'].plot(kind='hist', figsize=(10, 5))

In [None]:
# income_group distribution, inspected before handling its missing values
df['income_group'].plot(kind='hist', figsize=(10, 5))