In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the donor data as the training set and the prospective data as the test set.
train_data = pd.read_csv('../data/donor_data.csv')
test_data = pd.read_csv('../data/prospective_data.csv')

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# Convert the upper-case column names to lower case to make the EDA easier.

train_data.columns = train_data.columns.str.lower()
test_data.columns = test_data.columns.str.lower()

In [None]:
# Confirm the lower-cased training column names.
train_data.columns.to_list()


In [None]:
# Confirm the lower-cased test column names.
test_data.columns.to_list()

# Data Cleaning and Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts for the training data.
train_data.info()

In [None]:
# Column dtypes and non-null counts for the test data.
test_data.info()

In [None]:
# Summary statistics of the training data, transposed so each column is a row.
train_data.describe().T

In [None]:
# Summary statistics of the test data, transposed so each column is a row.
test_data.describe().T

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Number of missing values in each test column.
test_data.isnull().sum()

In [None]:
# How often each value occurs in target_d (missing values are not counted).
train_data['target_d'].value_counts()

#### Looking at the missing values in the target_d column, the NaN values belong to people who didn't donate. Rather than keeping NaN, I will replace it with 0, since those who didn't donate effectively gave $0.

In [None]:
# A missing amount belongs to someone who did not donate, so record it as a 0 gift.
train_data['target_d'] = train_data['target_d'].fillna(0.00)

In [None]:
# (rows, columns) of the training data.
train_data.shape

In [None]:
# (rows, columns) of the test data.
test_data.shape

In [None]:
# Checking for the number of unique values in each column
# This is to check for consistency, uniformity and accuracy
# A for loop has been used to print unique values for each column
#
for column in train_data.columns:
  print(column)
  print(train_data[column].nunique())
  print(train_data[column].unique())
  print('\n')

In [None]:
# Checking for the number of unique values in each column
# This is to check for consistency, uniformity and accuracy
# A for loop has been used to print unique values for each column
#
for column in test_data.columns:
  print(column)
  print(test_data[column].nunique())
  print(test_data[column].unique())
  print('\n')

### Observations
* Age has a value with zero(0) which isn't right
* The Overlay column is meant to take 3 values: M=Metromail; P=Polk; B=Both. But it takes 4 values, the fourth being "N". I am going to assume this was a mistake and replace N with the value M
* Cluster code has a value of ".". This will be replaced using the most appropriate method
* Donor Gender takes 4 values. Male, Female, Unknown and A. I am assuming the A value is a mistake as it takes one (1) value. It will be replaced appropriately
* "?" represents unknown wherever found


#### It is interesting to see that, in the income group column, users in income group 5 donate more than any other income group. At the same time, income group 5 also accounts for the most non-donors. Both are likely a result of most users falling into income group 5

In [None]:
# Give both targets descriptive names: target_b -> donated, target_d -> amount_donated.
target_names = {'target_b':'donated', 'target_d': 'amount_donated'}
train_data = train_data.rename(columns=target_names)

## Dealing with Missing Data and Miscellaneous Data

In [None]:
# Missing values per training column, to decide which columns need imputing.
train_data.isnull().sum()

In [None]:
# Missing values per test column.
test_data.isnull().sum()

### The columns with missing data are
* Donor Age
* Income Group
* Wealth Rating
* Months since last prom Resp

In [None]:
# Checking the datatype of each training column.
# Before filling missing values, the column dtypes should first be changed to their appropriate datatype.
#
train_data.dtypes

In [None]:
# Checking the datatype of each test column.
# Before filling missing values, the column dtypes should first be changed to their appropriate datatype.
#
test_data.dtypes

In [None]:
# Summarise donor age in the training data: distinct count, minimum, maximum and mean.
train_ages = train_data['donor_age']

print('For the Train dataset')
print(f'The number of unique age is : {train_ages.nunique()}')
print(f'The minimum age is : {train_ages.min()}')
print(f'The maximum age is : {train_ages.max()}')
print(f'The mean age is : {train_ages.mean()}')

#### There is a minimum age of 0 which makes no sense as there can't be anyone aged 0.

In [None]:
# Summarise donor age in the test data: distinct count, minimum, maximum and mean.
test_ages = test_data['donor_age']

print('For the Test dataset')
print(f'The number of unique age is : {test_ages.nunique()}')
print(f'The minimum age is : {test_ages.min()}')
print(f'The maximum age is : {test_ages.max()}')
print(f'The mean age is : {test_ages.mean()}')

#### Replacing the age of 0 with NAN

In [None]:
train_data['donor_age'] = train_data['donor_age'].replace(0, np.nan)

In [None]:
train_data.isnull().sum()

#### Visualizing the donor age column

In [None]:
# Histogram of donor ages in the training data.
age_axes = train_data['donor_age'].plot.hist(figsize=(10,5))
age_axes.set_xlabel("Donor's Age")

- the data is left-skewed.
- employ median to fill in missing data

### Distribution Plot

* We can visualize the distribution of the donor age

In [None]:
fig = plt.figure(figsize=(10,10))
sns.kdeplot(x = train_data.donor_age,shade = True)

In [None]:
test_data['donor_age'].plot.hist(figsize=(10,5))

In [None]:
fig = plt.figure(figsize=(20,10))
sns.kdeplot(x = test_data.donor_age, shade = True)

In [None]:
# Median age of each data set; each set's missing ages are filled with its own median.
train_donor_median_age = train_data['donor_age'].median()
test_donor_median_age = test_data['donor_age'].median()

print(f'The train median age is :  {train_donor_median_age}')
print(f'The test median age is :  {test_donor_median_age}')

# Training gaps get the training median ...
train_data['donor_age'] = train_data['donor_age'].fillna(train_donor_median_age)

# ... and test gaps get the test median.
test_data['donor_age'] = test_data['donor_age'].fillna(test_donor_median_age)

## Working on the missing values in the income group column

In [None]:
# Bar chart of how many training rows fall in each income group, followed by the
# same counts as a table (the last expression is what the cell displays).
fig = plt.figure(figsize=(10,10))
sns.countplot(x = train_data.income_group)
train_data.income_group.value_counts()

In [None]:
# Share of each income group in the training data.
income_group_counts = train_data.income_group.value_counts()
fig = plt.figure(figsize=(20,10))
# `explode` needs exactly one offset per slice; size it from the data instead of
# hard-coding 7 so the cell keeps working if the number of groups changes.
income_group_counts.plot(kind = 'pie', explode = [0.05]*len(income_group_counts), cmap = 'Accent', autopct =  '%0.1f%%')
plt.xlabel("Income group pie chart")


* **filling the income_group values with the mode**

In [None]:
# Fill missing income groups in the training data with its most frequent group.
train_income_group_mode = train_data.income_group.mode()[0]
print('The mode for income_group  is : ',train_income_group_mode)
train_data.income_group = train_data.income_group.fillna(value = train_income_group_mode)

# filling the missing values in test set with that of the training data
# (the test mode is still computed and printed for comparison, but the fill value
# is the training mode so both sets are imputed with the same statistic)
test_income_group_mode = test_data.income_group.mode()[0]
print('The mode for income_group is : ',test_income_group_mode)
test_data.income_group = test_data.income_group.fillna(value = train_income_group_mode)

## Working on the missing values in the Wealth Rating column

In [None]:
# Bar chart of how many training rows fall in each wealth rating.
fig = plt.figure(figsize=(20,10))
sns.countplot(x = train_data.wealth_rating)
# NOTE(review): this table is not the cell's last expression, so it is computed but not displayed.
train_data.wealth_rating.value_counts()
plt.xlabel("Wealth Rating")

In [None]:
# Share of each wealth rating in the training data.
wealth_rating_counts = train_data.wealth_rating.value_counts()
fig = plt.figure(figsize=(20,10))
# One explode offset per slice, sized from the data rather than hard-coded to 10.
wealth_rating_counts.plot(kind = 'pie', explode = [0.035]*len(wealth_rating_counts), cmap = 'Accent', autopct =  '%0.1f%%')
plt.xlabel("Wealth Rating")

* **filling the wealth rating column values with the median as the data is skewed to the left**

In [None]:
# Fill missing wealth ratings in the training data with its median.
train_wealth_rating_median = train_data.wealth_rating.median()
print('The median for wealth_rating  is : ',train_wealth_rating_median)
train_data.wealth_rating = train_data.wealth_rating.fillna(value = train_wealth_rating_median)

# filling the missing values in test set with that of the training data
# (the test median is still computed and printed for comparison, but the fill value
# is the training median so both sets are imputed with the same statistic)
test_wealth_rating_median = test_data.wealth_rating.median()
print('The median for wealth_rating is : ',test_wealth_rating_median)
test_data.wealth_rating = test_data.wealth_rating.fillna(value = train_wealth_rating_median)

## Working on the missing values in the Month Since Last Prom Resp column

In [None]:
train_data['months_since_last_prom_resp'].plot.hist(figsize=(10,5))

In [None]:
fig = plt.figure(figsize=(10,8))
sns.kdeplot(x = train_data.months_since_last_prom_resp,shade = True)
plt.xlabel("Months since last response")

### It can be observed that the months since last prom response column is left-skewed. Its missing values will be filled using the median

In [None]:
# Fill missing values in the training data with the training median.
train_months_since_last_prom_resp_median = train_data.months_since_last_prom_resp.median()
print('For the train data, the median for months_since_last_prom_resp  is : ',train_months_since_last_prom_resp_median)
train_data.months_since_last_prom_resp = train_data.months_since_last_prom_resp.fillna(value = train_months_since_last_prom_resp_median)

# filling the missing values in test set with that of the training data
# (the test median is still computed and printed for comparison, but the fill value
# is the training median so both sets are imputed with the same statistic)
test_months_since_last_prom_resp_median = test_data.months_since_last_prom_resp.median()
print('For the test data, the median for months_since_last_prom_resp is : ',test_months_since_last_prom_resp_median)
test_data.months_since_last_prom_resp = test_data.months_since_last_prom_resp.fillna(value = train_months_since_last_prom_resp_median)

### Checking if there are still null values in the data set

In [None]:
# Missing values per training column after imputation.
train_data.isnull().sum()

In [None]:
# Missing values per test column after imputation.
test_data.isnull().sum()

## Dealing with miscellaneous data

#### Despite the fact that there are no "missing" values, there are some miscellaneous values in the data set like '?' and   ' .'
#### We will deal with them now

In [None]:
# Checking for the number of unique values in each column
# This is to check for consistency, uniformity and accuracy
# A for loop has been used to print unique values for each column
#
for column in train_data.columns:
  print(column)
  print(train_data[column].nunique())
  print(train_data[column].unique())
  print('\n')

In [None]:
# Checking for the number of unique values in each column
# This is to check for consistency, uniformity and accuracy
# A for loop has been used to print unique values for each column
#
for column in test_data.columns:
  print(column)
  print(test_data[column].nunique())
  print(test_data[column].unique())
  print('\n')

In [None]:
# Frequency of each urbanicity value in the training data, before cleaning.
train_data['urbanicity'].value_counts()

In [None]:
# Frequency of each urbanicity value in the test data, before cleaning.
test_data['urbanicity'].value_counts()

### I will be replacing "?" with "X" in the urbanicity column. X denotes unknown here

In [None]:
# Recode the "?" placeholder as "X" (unknown) in both data sets.
for frame in (train_data, test_data):
    frame["urbanicity"] = frame["urbanicity"].replace("?", "X")

In [None]:
# Urbanicity frequencies in the training data after the "?" -> "X" recode.
train_data['urbanicity'].value_counts()

In [None]:
# Urbanicity frequencies in the test data after the recode.
test_data['urbanicity'].value_counts()

In [None]:
# Frequency of each ses value in the training data, before cleaning.
train_data['ses'].value_counts()

In [None]:
# Frequency of each ses value in the test data, before cleaning.
test_data['ses'].value_counts()

### I will be replacing "?" with "5" in the ses column

In [None]:
# Recode the "?" placeholder as class 5, then cast the column to int. Without the
# cast the integer 5 would sit next to the string codes read from the CSV, leaving
# a mixed str/int column.
train_data['ses'] = train_data['ses'].replace("?", 5).astype(int)
test_data['ses'] = test_data['ses'].replace("?", 5).astype(int)

In [None]:
# ses frequencies in the training data after the "?" recode.
train_data['ses'].value_counts()


In [None]:
# ses frequencies in the test data after the recode.
test_data['ses'].value_counts()

In [None]:
# Frequency of each donor_gender value in the training data, before cleaning.
train_data['donor_gender'].value_counts()

In [None]:
# Frequency of each donor_gender value in the test data.
test_data['donor_gender'].value_counts()

#### I will be replacing the "A" value with the Unknown value "U", as donor gender should only take 3 values

In [None]:
# "A" is not a documented gender code; fold it into "U" (unknown). Applied to both
# data sets so they share the same set of categories (a no-op where "A" is absent).
train_data['donor_gender'] = train_data['donor_gender'].replace("A", "U")
test_data['donor_gender'] = test_data['donor_gender'].replace("A", "U")

In [None]:
# donor_gender frequencies in the training data after the "A" -> "U" recode.
train_data['donor_gender'].value_counts()

In [None]:
# Frequency of each overlay_source value in the training data, before cleaning.
train_data['overlay_source'].value_counts()

In [None]:
# Frequency of each overlay_source value in the test data, before cleaning.
test_data['overlay_source'].value_counts()

#### I will be replacing the "N" value with "M" (Metromail), as the overlay source column should only take 3 values and "N" isn't in the data dictionary. I am assuming that users mistakenly typed "N" instead of "M", as the two keys are next to each other on the keyboard

In [None]:
# Recode the undocumented "N" overlay source as "M" in both data sets.
for frame in (train_data, test_data):
    frame['overlay_source'] = frame['overlay_source'].replace("N", "M")

In [None]:
# overlay_source frequencies in the training data after the "N" -> "M" recode.
train_data['overlay_source'].value_counts()

In [None]:
# overlay_source frequencies in the test data after the recode.
test_data['overlay_source'].value_counts()

In [None]:
# cluster_code frequencies in the training data, most frequent first.
train_data['cluster_code'].value_counts(ascending = False)

In [None]:
# cluster_code frequencies in the test data, least frequent first.
test_data['cluster_code'].value_counts(ascending = True)

In [None]:
# Recode the ' .' placeholder as cluster 54, then cast the column to int. Without
# the cast the integer 54 would sit next to the string codes read from the CSV,
# leaving a mixed str/int column.
train_data['cluster_code'] = train_data.cluster_code.replace(' .', 54).astype(int)
test_data['cluster_code'] = test_data.cluster_code.replace(' .', 54).astype(int)

In [None]:
# Missing values per training column after the placeholder clean-up.
train_data.isnull().sum()

In [None]:
# Missing values per test column after the placeholder clean-up.
test_data.isnull().sum()

In [None]:
# Column dtypes of the training data after cleaning.
train_data.dtypes

In [None]:
# Column dtypes of the test data after cleaning.
test_data.dtypes

### Creating a function to handle dropped columns so I can call the function later on

 * **Donated is the target column for classification.**

 * **Amount Donated is the target column for regression**
 
 * **Amount Donated will be dropped for the classification task**
 * **Donated will be dropped for the regression task**

 

In [None]:
def drop_column(column_name, data):
    """Return a copy of ``data`` without the column ``column_name``.

    ``data`` itself is left unchanged; pandas raises ``KeyError`` if the
    column does not exist.
    """
    return data.drop(columns=[column_name])

In [None]:
# Regression frame: keeps amount_donated and drops the donated flag.
reg_train_data = drop_column('donated',train_data)

In [None]:
# Preview the regression frame.
reg_train_data.head()

In [None]:
# Classification frame: train_data keeps donated and loses amount_donated.
# From here on amount_donated is only available through reg_train_data.
train_data = drop_column('amount_donated',train_data)


In [None]:
# Preview the classification frame.
train_data.head()

## Splitting the dataset to numerical and categorical columns for better analysis and visualization

In [None]:
# Treat every column with at most 60 distinct values as categorical, except the
# classification target itself.
categorical_columns = [
    col for col in train_data.columns
    if train_data[col].nunique() <= 60 and col != 'donated'
]

In [None]:
print(categorical_columns)

In [None]:
numerical_columns = [x for x in train_data.columns if x not in categorical_columns ][1:]


In [None]:
print(numerical_columns)

In [None]:
print('There are %d categorical columns' %(len(categorical_columns)))
print('There are %d numerical columns' %(len(numerical_columns)))

In [None]:
cat_data = train_data[categorical_columns]
cat_data.head(10)

## Visualizations and Analysis

In [None]:
# Class balance of the target: how many rows did and did not donate.
plt.figure(figsize=(6,6))
sns.countplot(x='donated', data=train_data)
plt.xlabel("Donations")

# From the data, we can see that non-donors outnumber actual donors at almost 3 to 1.
# The counts are the cell's last expression so that they are actually displayed.
train_data['donated'].value_counts()

In [None]:
# Share of donors and non-donors in the training data.
donated_counts = train_data.donated.value_counts()
fig = plt.figure(figsize=(7,7))
ax = fig.subplots()
# Draw on the axes created above, with one explode offset per slice sized from the data.
pie_chart = donated_counts.plot(kind = 'pie', explode = [0.035]*len(donated_counts), ax = ax, cmap = 'icefire', autopct =  '%0.1f%%')
plt.xlabel("Donations")

### How does gender affect Donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(6,6))
sns.countplot(x='donor_gender',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()

train_data['donor_gender'].value_counts()
plt.xlabel("Gender vs Donations")
# This graph shows us that a significant portion of the male and females fall as non-donors

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].donor_gender.value_counts().plot(kind = 'pie',explode = [0.035]*3,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Gender vs Donations")

### It is observed that females donate more than other gender classes

## How does having a home affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(6,6))
sns.countplot(x='home_owner',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()

train_data['home_owner'].value_counts()
plt.xlabel("Home Ownership vs Donations")
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].home_owner.value_counts().plot(kind = 'pie',explode = [0.035]*2,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Home Ownership vs Donations")

### It is observed that home owners donate more than non home owners

### How does the in-house program affect people who donate?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(6,6))
sns.countplot(x='in_house',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()

train_data['in_house'].value_counts()
plt.xlabel("In-house Program vs Donations")
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].in_house.value_counts().plot(kind = 'pie',explode = [0.035]*2,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("In-house Program vs Donations")

### Users who did not participate in the in-house program donated more

## How does Urbanicity affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(6,6))
sns.countplot(x='urbanicity',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()

train_data['urbanicity'].value_counts()
plt.xlabel("Urbanicity vs Donations")
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].urbanicity.value_counts().plot(kind = 'pie',explode = [0.035]*6,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Urbanicity vs Donations")

### Users in suburban areas donate more than users in other classes

## How does Social Economic Status affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(6,6))
sns.countplot(x='ses',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Social economic status vs Donations")
train_data['ses'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].ses.value_counts().plot(kind = 'pie',explode = [0.035]*5,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Social economic status vs Donations")

### Users in the 2nd SES class donate more than other SES classes

## Does income group affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='income_group',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Income group vs Donations")

train_data['income_group'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].income_group.value_counts().plot(kind = 'pie',explode = [0.035]*7,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Income group vs Donations")

### Users who belong to the 5th income group donate more than users in other income groups

## Does publishing number affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='published_phone',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Published phone vs Donations")
train_data['published_phone'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].published_phone.value_counts().plot(kind = 'pie',explode = [0.035]*2,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Published phone vs Donations")

### There is little difference in how a published phone number affects donations

### Does Overlay source affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='overlay_source',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Overlay Source vs Donations")
train_data['overlay_source'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].overlay_source.value_counts().plot(kind = 'pie',explode = [0.035]*3,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Overlay Source vs Donations")

### Users who use both Metromail and Polk donate more than other users

## How does wealth rating affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='wealth_rating',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Wealth Rating vs Donations")
train_data['wealth_rating'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].wealth_rating.value_counts().plot(kind = 'pie',explode = [0.035]*10,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Wealth Rating vs Donations")

###

### Users who belong to the 5th Wealth rating class donate more than other wealth rating class

## How does Pep star affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='pep_star',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Pep star vs Donations")

train_data['pep_star'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].pep_star.value_counts().plot(kind = 'pie',explode = [0.035]*2,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Pep star vs Donations")

### Users who are part of the pep star class 1 donate more than the other pep star class

## How does the recency status 96nk affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='recency_status_96nk',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Recency status vs Donations")
train_data['recency_status_96nk'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].recency_status_96nk.value_counts().plot(kind = 'pie',explode = [0.035]*6,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Recency status vs Donations")

### Users who belong to the Class A of Recency Status 96k donate more than other users

## How does Frequency status 97nk affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='frequency_status_97nk',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Frequency status vs Donations")
train_data['frequency_status_97nk'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].frequency_status_97nk.value_counts().plot(kind = 'pie',explode = [0.035]*4,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Frequency status vs Donations")

### Users who belong to class 1 of frequency_status_97nk donate more than other classes 

## How does Recent card response count affect donations?

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='recent_card_response_count',
              hue='donated',
              data=train_data,
             palette='Set1')
plt.show()
plt.xlabel("Recent card response vs Donations")
train_data['recent_card_response_count'].value_counts()
# This graph shows us that home ownership is a strong indicator tthat someone will donate

In [None]:
fig = plt.figure(figsize=(20,10))
ax = fig.subplots()
pie_chart = train_data[train_data.donated == 1].recent_card_response_count.value_counts().plot(kind = 'pie',explode = [0.035]*10,ax = ax,cmap = 'rainbow',autopct =  '%0.1f%%')
plt.xlabel("Recent card response vs Donations")

### Users who belong to class 1 of recent card response count donate more than other classes

## How does Months since origin affect donations?

In [None]:
# Donor vs non-donor counts for each months_since_origin value.
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='months_since_origin',
              hue='donated',
              data=train_data,
             palette='Set1')
# Label the axis before plt.show(); set afterwards it would land on a new, empty figure.
plt.xlabel("Month since origin vs Donations")
plt.show()

train_data['months_since_origin'].value_counts()

## How does Cluster code affect donations?

In [None]:
# Donor vs non-donor counts for each cluster code.
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='cluster_code',
              hue='donated',
              data=train_data,
             palette='Set1')
# Label the axis before plt.show(); set afterwards it would land on a new, empty figure.
plt.xlabel("Cluster code vs Donations")
plt.show()

train_data['cluster_code'].value_counts()

In [None]:
# Histogram of cluster codes split by the donated flag; the codes are cast to int
# so they are binned as numbers.
fig = plt.figure(figsize=(20,10))
sns.histplot(x = train_data.cluster_code.astype(int), hue = train_data.donated, palette='rainbow')
plt.xlabel("Cluster code vs Donations")

## How does Recent star status affect donations?

In [None]:
# Donor vs non-donor counts for each recent_star_status value.
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))
sns.countplot(x='recent_star_status',
              hue='donated',
              data=train_data,
             palette='Set1')
# Label the axis before plt.show(); set afterwards it would land on a new, empty figure.
plt.xlabel("Recent star status vs Donations")
plt.show()

train_data['recent_star_status'].value_counts()

# Exploring Numerical variables

In [None]:
# Histogram of donor age (after imputation).
train_data['donor_age'].plot.hist(figsize=(10,5))
plt.xlabel("Donor Age")
# This shows us that most of the donors fall between the ages of 30 - 80
# This gives us valuable information on who to target based on age


In [None]:
# Histogram of the lifetime average gift amount, in 5 bins.
train_data['lifetime_avg_gift_amt'].plot.hist(bins=5, figsize=(10,5))
plt.xlabel("Lifetime average gift amount")
# This metric is to show us the average gift amount from donors. From the graph we can see that this amount is close to 100

In [None]:
# Histogram of the lifetime gift amount, in 5 bins.
train_data['lifetime_gift_amount'].plot.hist(bins=5, figsize=(10,5))
plt.xlabel("Lifetime gift amount")
# This metric is to show us the overall gift amount from donors. From the graph we can see that this amount is close to 100
# NOTE(review): the sentence above repeats the one written for the average gift amount - confirm it against this plot.

## Violin plots

In [None]:
# Violin plots of donor age per gender, split by the donated flag, with one
# panel per home_owner value.
  #
sns.catplot(x="donor_age", y="donor_gender",hue="donated",col = 'home_owner', data=train_data,orient="h", height=5, aspect=1, palette="rainbow", kind="violin", dodge=True, cut=0, bw=.2)
plt.show()

## Bar charts

*From the above chart we can make these observations;*


*   The dataset is imbalanced: it has more users who didn't donate than users who donated
*   More females donated than any other gender

## Distribution of age per gender - Histograms

In [None]:
# Plotting histograms to show the distribution of age per gender
#
male = train_data.loc[train_data.donor_gender=='M', 'donor_age']    # Selecting only male datapoints in the donor age column
female = train_data.loc[train_data.donor_gender=='F', 'donor_age']  # Selecting only female datapoints in the donor age column
unknown = train_data.loc[train_data.donor_gender=='U', 'donor_age']  # Selecting only unknown datapoints in the donor age column

# Shared styling: histogram opacity and KDE line width
#
kwargs = dict(hist_kws={'alpha':.6}, kde_kws={'linewidth':2})

# Plotting the three histograms on the same figure
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; histplot/displot are its replacements.
#
plt.figure(figsize=(12,7), dpi= 80)
sns.distplot(male, color="darkgreen", label="Male", **kwargs)
sns.distplot(female, color="fuchsia", label="Female", **kwargs)
sns.distplot(unknown, color="orange", label="Unknown", **kwargs)
plt.title('Distribution of age per gender', fontsize = 14, color = 'purple')
plt.xlabel('Age', fontsize = 13, color = 'purple')
plt.ylabel('Frequency', fontsize = 13, color = 'purple')
plt.legend()
plt.show()

*From the above histogram, it can be deduced that both the ages of male, female and unknown are skewed to the left. This means that the median and mode are greater than the mean*

## Distribution of amount donated per gender - Histograms

In [None]:
# Plotting histograms to show the distribution of the donated flag per gender
#
male = train_data.loc[train_data.donor_gender=='M', 'donated']    # Selecting only male datapoints in the donated column
female = train_data.loc[train_data.donor_gender=='F', 'donated']  # Selecting only female datapoints in the donated column
unknown = train_data.loc[train_data.donor_gender=='U', 'donated']  # Selecting only unknown datapoints in the donated column

# Shared styling: histogram opacity and KDE line width
#
kwargs = dict(hist_kws={'alpha':.6}, kde_kws={'linewidth':2})

# Plotting the three histograms on the same figure
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; histplot/displot are its replacements.
#
plt.figure(figsize=(12,7), dpi= 80)
sns.distplot(male, color="darkgreen", label="Male", **kwargs)
sns.distplot(female, color="red", label="Female", **kwargs)
sns.distplot(unknown, color="yellow", label="Unknown", **kwargs)
plt.title('Distribution of users who donated per gender', fontsize = 14, color = 'purple')
plt.xlabel('Donated', fontsize = 13, color = 'purple')
plt.ylabel('Frequency', fontsize = 13, color = 'purple')
plt.legend()
plt.show()

In [None]:
# Plotting histograms to show the distribution of the amount donated per gender.
# amount_donated was dropped from train_data for the classification task, so it is
# read from reg_train_data, the regression frame that still carries it.
male = reg_train_data.loc[reg_train_data.donor_gender=='M', 'amount_donated']    # Selecting only male datapoints in the amount donated column
female = reg_train_data.loc[reg_train_data.donor_gender=='F', 'amount_donated']  # Selecting only female datapoints in the amount donated column
unknown = reg_train_data.loc[reg_train_data.donor_gender=='U', 'amount_donated']  # Selecting only unknown datapoints in the amount donated column

# Shared styling: histogram opacity and KDE line width
#
kwargs = dict(hist_kws={'alpha':.6}, kde_kws={'linewidth':2})

# Plotting the three histograms on the same figure
#
plt.figure(figsize=(12,7), dpi= 80)
sns.distplot(male, color="darkgreen", label="Male", **kwargs)
sns.distplot(female, color="red", label="Female", **kwargs)
sns.distplot(unknown, color="yellow", label="Unknown", **kwargs)
plt.title('Distribution of users who amount donated per gender', fontsize = 14, color = 'purple')
plt.xlabel('amount donated', fontsize = 13, color = 'purple')
plt.ylabel('Frequency', fontsize = 13, color = 'purple')
plt.legend()
plt.show()

*From the above histogram, it can be deduced that the amount donated by male, female and unknown are skewed to the right. This means that the median and mode are less than the mean*

## Distribution of the months since origin per gender - Histograms

In [None]:
# Plotting histograms to show the distribution of months since origin per gender
#
male = train_data.loc[train_data.donor_gender=='M', 'months_since_origin']    # Selecting only male datapoints in the months since origin column
female = train_data.loc[train_data.donor_gender=='F', 'months_since_origin']  # Selecting only female datapoints in the months since origin column
unknown = train_data.loc[train_data.donor_gender=='U', 'months_since_origin']  # Selecting only unknown datapoints in the months since origin column

# Shared styling: histogram opacity and KDE line width
#
kwargs = dict(hist_kws={'alpha':.6}, kde_kws={'linewidth':2})

# Plotting the three histograms on the same figure
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; histplot/displot are its replacements.
#
plt.figure(figsize=(12,7), dpi= 80)
sns.distplot(male, color="darkgreen", label="Male", **kwargs)
sns.distplot(female, color="red", label="Female", **kwargs)
sns.distplot(unknown, color="yellow", label="Unknown", **kwargs)
plt.title('Distribution of months since origin per gender', fontsize = 14, color = 'purple')
plt.xlabel('Months since origin', fontsize = 13, color = 'purple')
plt.ylabel('Frequency', fontsize = 13, color = 'purple')
plt.legend()
plt.show()

## Distribution of the median home value per gender - Histograms

In [None]:
# Plotting overlaid histograms (with KDE curves) of median home value per gender
#
male = train_data.loc[train_data['donor_gender'] == 'M', 'median_home_value']     # Median home value for male donors only
female = train_data.loc[train_data['donor_gender'] == 'F', 'median_home_value']   # Median home value for female donors only
unknown = train_data.loc[train_data['donor_gender'] == 'U', 'median_home_value']  # Median home value for donors of unknown gender only

# Styling shared by all three series: histogram opacity and KDE line width
#
kwargs = dict(hist_kws={'alpha': .6}, kde_kws={'linewidth': 2})

# Plotting the histograms
# NOTE: sns.distplot is deprecated in newer seaborn releases (histplot/displot replace it);
# it is kept here so the cell keeps working with the seaborn version this notebook was written for.
#
plt.figure(figsize=(12, 7), dpi=80)
for gender_values, gender_color, gender_label in (
  (male, "darkgreen", "Male"),
  (female, "red", "Female"),
  (unknown, "yellow", "Unknown"),
):
  sns.distplot(gender_values, color=gender_color, label=gender_label, **kwargs)
plt.title('Distribution of Median Home Value per gender', fontsize=14, color='purple')
plt.xlabel('Median Home Value', fontsize=13, color='purple')
# distplot draws a KDE by default, which normalises the histogram, so the y-axis is a density, not a raw count
plt.ylabel('Density', fontsize=13, color='purple')
plt.legend()
plt.show()

*From the above histogram, it can be deduced that the distributions of the median home value for male, female and unknown-gender donors are all skewed to the right. This means that the median and mode are less than the mean.*

## Distribution of the lifetime gift amount per gender - Histograms

In [None]:
# Plotting overlaid histograms (with KDE curves) of lifetime gift amount per gender
#
male = train_data.loc[train_data['donor_gender'] == 'M', 'lifetime_gift_amount']     # Lifetime gift amount for male donors only
female = train_data.loc[train_data['donor_gender'] == 'F', 'lifetime_gift_amount']   # Lifetime gift amount for female donors only
unknown = train_data.loc[train_data['donor_gender'] == 'U', 'lifetime_gift_amount']  # Lifetime gift amount for donors of unknown gender only

# Styling shared by all three series: histogram opacity and KDE line width
#
kwargs = dict(hist_kws={'alpha': .6}, kde_kws={'linewidth': 2})

# Plotting the histograms
# NOTE: sns.distplot is deprecated in newer seaborn releases (histplot/displot replace it);
# it is kept here so the cell keeps working with the seaborn version this notebook was written for.
#
plt.figure(figsize=(12, 7), dpi=80)
for gender_values, gender_color, gender_label in (
  (male, "darkgreen", "Male"),
  (female, "red", "Female"),
  (unknown, "yellow", "Unknown"),
):
  sns.distplot(gender_values, color=gender_color, label=gender_label, **kwargs)
plt.title('Distribution of lifetime gift amount per gender', fontsize=14, color='purple')
plt.xlabel('lifetime gift amount', fontsize=13, color='purple')
# distplot draws a KDE by default, which normalises the histogram, so the y-axis is a density, not a raw count
plt.ylabel('Density', fontsize=13, color='purple')
plt.legend()
plt.show()

*From the above histogram, it can be deduced that the distributions of the lifetime gift amount for male, female and unknown-gender donors are all skewed to the right. This means that the median and mode are less than the mean.*

## Distribution of the last gift amount per gender - Histograms

In [None]:
# Plotting overlaid histograms (with KDE curves) of last gift amount per gender
#
male = train_data.loc[train_data['donor_gender'] == 'M', 'last_gift_amt']     # Last gift amount for male donors only
female = train_data.loc[train_data['donor_gender'] == 'F', 'last_gift_amt']   # Last gift amount for female donors only
unknown = train_data.loc[train_data['donor_gender'] == 'U', 'last_gift_amt']  # Last gift amount for donors of unknown gender only

# Styling shared by all three series: histogram opacity and KDE line width
#
kwargs = dict(hist_kws={'alpha': .6}, kde_kws={'linewidth': 2})

# Plotting the histograms
# NOTE: sns.distplot is deprecated in newer seaborn releases (histplot/displot replace it);
# it is kept here so the cell keeps working with the seaborn version this notebook was written for.
#
plt.figure(figsize=(12, 7), dpi=80)
for gender_values, gender_color, gender_label in (
  (male, "darkgreen", "Male"),
  (female, "red", "Female"),
  (unknown, "yellow", "Unknown"),
):
  sns.distplot(gender_values, color=gender_color, label=gender_label, **kwargs)
plt.title('Distribution of last gift amount per gender', fontsize=14, color='purple')
plt.xlabel('last gift amount', fontsize=13, color='purple')
# distplot draws a KDE by default, which normalises the histogram, so the y-axis is a density, not a raw count
plt.ylabel('Density', fontsize=13, color='purple')
plt.legend()
plt.show()

*From the above histogram, it can be deduced that the distributions of the last gift amount for male, female and unknown-gender donors are all skewed to the right. This means that the median and mode are less than the mean.*

In [None]:
# Correlation heatmap for the home value, income and gift amount features
#
corr_columns = [
  'median_home_value',
  'median_household_income',
  'per_capita_income',
  'recent_avg_gift_amt',
  'lifetime_gift_amount',
  'last_gift_amt',
]
corr = train_data[corr_columns].corr()  # Pairwise correlation matrix of the selected columns

# Diverging colour map centred on 0, with each coefficient written in its cell
#
plt.figure(figsize=(13, 8))
sns.heatmap(data=corr, annot=True, cmap='RdYlGn', center=0)
plt.title('Correlogram', color='darkgreen', fontsize=15)
plt.show()

In [None]:
# Previewing the first five rows of the training data
train_data.head(n=5)

In [None]:
# Previewing the first ten rows of the test data
test_data.head(n=10)

In [None]:
# Summary statistics of the training data, transposed so that each described column is a row
train_data.describe().transpose()

In [None]:
# Summary statistics of the test data, transposed so that each described column is a row
test_data.describe().transpose()

In [None]:
# Writing the three dataframes to CSV files in the current working directory.
# The same options are used for every file: UTF-8 encoding, with the index written as a column.
#
export_options = dict(encoding='utf-8', index=True)

train_data.to_csv('cleaned_train_data.csv', **export_options)
test_data.to_csv('cleaned_test_data.csv', **export_options)
reg_train_data.to_csv('cleaned_data_for_regression.csv', **export_options)