In [1]:
# Import dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import pickle

In [7]:
# Import dataset, check data
df_donor = pd.read_csv(os.path.join('Resources', 'Raw_Data_for_train_test.csv'))
df_donor.head()

Unnamed: 0,TARGET_B,TARGET_D,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,DONOR_AGE,IN_HOUSE,URBANICITY,SES,CLUSTER_CODE,HOME_OWNER,...,LIFETIME_GIFT_RANGE,LIFETIME_MAX_GIFT_AMT,LIFETIME_MIN_GIFT_AMT,LAST_GIFT_AMT,CARD_PROM_12,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT,FILE_AVG_GIFT,FILE_CARD_GIFT
0,0,,5,101,87.0,0,?,?,.,H,...,15.0,20.0,5.0,15.0,5,12,26,92,8.49,7
1,1,10.0,12,137,79.0,0,R,2,45,H,...,20.0,25.0,5.0,17.0,7,21,7,122,14.72,12
2,0,,37,113,75.0,0,S,1,11,H,...,23.0,28.0,5.0,19.0,11,32,6,105,16.75,16
3,0,,38,92,,0,U,2,4,H,...,14.0,17.0,3.0,15.0,11,33,6,92,11.76,12
4,0,,41,101,74.0,0,R,2,49,U,...,20.0,25.0,5.0,25.0,6,19,18,92,8.83,3


In [8]:
# Check if there are null variables
df_donor.columns[df_donor.isnull().any()]

Index(['TARGET_D', 'DONOR_AGE', 'INCOME_GROUP', 'WEALTH_RATING',
       'MONTHS_SINCE_LAST_PROM_RESP'],
      dtype='object')

In [10]:
# Fill numeric rows with the median
for label, content in df_donor.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            # Fill missing numeric values with median since it's more robust than the mean
            df_donor[label] = content.fillna(content.median())
            
df_donor.columns[df_donor.isnull().any()]

Index([], dtype='object')

In [12]:
# Check column info
df_donor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19372 entries, 0 to 19371
Data columns (total 50 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   TARGET_B                     19372 non-null  int64  
 1   TARGET_D                     19372 non-null  float64
 2   CONTROL_NUMBER               19372 non-null  int64  
 3   MONTHS_SINCE_ORIGIN          19372 non-null  int64  
 4   DONOR_AGE                    19372 non-null  float64
 5   IN_HOUSE                     19372 non-null  int64  
 6   URBANICITY                   19372 non-null  object 
 7   SES                          19372 non-null  object 
 8   CLUSTER_CODE                 19372 non-null  object 
 9   HOME_OWNER                   19372 non-null  object 
 10  DONOR_GENDER                 19372 non-null  object 
 11  INCOME_GROUP                 19372 non-null  float64
 12  PUBLISHED_PHONE              19372 non-null  int64  
 13  OVERLAY_SOURCE  

In [14]:
# Turn categorical variables into numbers
for label, content in df_donor.items():
    # Check columns which aren't numeric
    if not pd.api.types.is_numeric_dtype(content):
        # print the columns that are objectt type 
        print(label)
        df_donor[label] = pd.Categorical(content).codes+1

URBANICITY
SES
CLUSTER_CODE
HOME_OWNER
DONOR_GENDER
OVERLAY_SOURCE
RECENCY_STATUS_96NK


In [17]:
# Drop TARGET_D column as we are using TARGET_B as the target variable
df_donor = df_donor.drop('TARGET_D', axis=1)
df_donor.head()

Unnamed: 0,TARGET_B,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,DONOR_AGE,IN_HOUSE,URBANICITY,SES,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,...,LIFETIME_GIFT_RANGE,LIFETIME_MAX_GIFT_AMT,LIFETIME_MIN_GIFT_AMT,LAST_GIFT_AMT,CARD_PROM_12,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT,FILE_AVG_GIFT,FILE_CARD_GIFT
0,0,5,101,87.0,0,1,5,1,1,3,...,15.0,20.0,5.0,15.0,5,12,26,92,8.49,7
1,1,12,137,79.0,0,3,2,41,1,3,...,20.0,25.0,5.0,17.0,7,21,7,122,14.72,12
2,0,37,113,75.0,0,4,1,4,1,2,...,23.0,28.0,5.0,19.0,11,32,6,105,16.75,16
3,0,38,92,60.0,0,6,2,35,1,2,...,14.0,17.0,3.0,15.0,11,33,6,92,11.76,12
4,0,41,101,74.0,0,3,2,45,2,2,...,20.0,25.0,5.0,25.0,6,19,18,92,8.83,3


In [18]:
# Standardization

#Split data into x (input features) & y(target variable)
# input features
x = df_donor.drop('TARGET_B', axis=1)

# Target variable
y = df_donor['TARGET_B']

x.head()

Unnamed: 0,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,DONOR_AGE,IN_HOUSE,URBANICITY,SES,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,INCOME_GROUP,...,LIFETIME_GIFT_RANGE,LIFETIME_MAX_GIFT_AMT,LIFETIME_MIN_GIFT_AMT,LAST_GIFT_AMT,CARD_PROM_12,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT,FILE_AVG_GIFT,FILE_CARD_GIFT
0,5,101,87.0,0,1,5,1,1,3,2.0,...,15.0,20.0,5.0,15.0,5,12,26,92,8.49,7
1,12,137,79.0,0,3,2,41,1,3,7.0,...,20.0,25.0,5.0,17.0,7,21,7,122,14.72,12
2,37,113,75.0,0,4,1,4,1,2,5.0,...,23.0,28.0,5.0,19.0,11,32,6,105,16.75,16
3,38,92,60.0,0,6,2,35,1,2,6.0,...,14.0,17.0,3.0,15.0,11,33,6,92,11.76,12
4,41,101,74.0,0,3,2,45,2,2,2.0,...,20.0,25.0,5.0,25.0,6,19,18,92,8.83,3


In [19]:
y.head()

0    0
1    1
2    0
3    0
4    0
Name: TARGET_B, dtype: int64

In [20]:
#use Standard Scaler to to change data values of 'x' into one range.
# Import standard scaler
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# apply scaler
x = ss.fit_transform(x)

x

array([[-1.7292245 ,  0.66877603,  1.9225452 , ...,  0.59940354,
        -0.49710644,  0.37474069],
       [-1.72909912,  1.5414079 ,  1.36956114, ...,  1.39797251,
         0.21185264,  1.46005894],
       [-1.72865132,  0.95965332,  1.09306911, ...,  0.9454501 ,
         0.44286178,  2.32831353],
       ...,
       [ 1.70519674, -1.07648773,  0.05622399, ..., -1.2373051 ,
         0.24371597, -1.14470485],
       [ 1.70571618,  1.34748971,  1.30043813, ...,  1.58430527,
         0.622662  ,  1.24299529],
       [ 1.70578783, -1.07648773,  0.74745407, ..., -1.21068613,
         1.38169203, -0.9276412 ]])

In [42]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# define and configure the model
model = RandomForestClassifier()

# fit the model
model.fit(xtrain, ytrain)

# evaluate the model
preds = model.predict(xtest)
accuracy_score(ytest, preds)


0.7455483870967742

In [38]:
# Hyper parameter tuning
from sklearn.model_selection import RandomizedSearchCV

# different randomforestregressor hyperperameters
rf_grid = {'n_estimators' : np.arange(10, 100, 10),
           'max_depth': [None, 3, 5, 10],
           'min_samples_split' : np.arange(2, 20, 2),
           'min_samples_leaf': np.arange(1, 20, 2),
            'max_features' : [0.5, 1, 'sqrt', 'auto']}

# instentiate randomizedsearchcv model
rs_model= RandomizedSearchCV(RandomForestClassifier(n_jobs = -1, 
                                                  random_state=42),
                                                  param_distributions = rf_grid,
                                                  n_iter = 90,
                                                  cv=5,
                                                  verbose=True)

rs_model.fit(xtrain, ytrain)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                   n_iter=90,
                   param_distributions={'max_depth': [None, 3, 5, 10],
                                        'max_features': [0.5, 1, 'sqrt',
                                                         'auto'],
                                        'min_samples_leaf': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                                        'n_estimators': array([10, 20, 30, 40, 50, 60, 70, 80, 90])},
                   verbose=True)

In [39]:
rs_model.best_params_

{'n_estimators': 70,
 'min_samples_split': 6,
 'min_samples_leaf': 5,
 'max_features': 'sqrt',
 'max_depth': None}

In [43]:
# Create ideal model with params
ideal_model = RandomForestClassifier(n_estimators= 70,
                                     min_samples_split = 6,
                                     min_samples_leaf = 5,
                                     max_features = 'sqrt',
                                     max_depth = None)

# fit the model
ideal_model.fit(xtrain, ytrain)

# evaluate the model
preds = ideal_model.predict(xtest)
accuracy_score(ytest, preds)

0.7491612903225806

In [27]:
test_df = pd.read_csv(os.path.join('Resources', 'Predict_donor.csv'))
test_df.head()

Unnamed: 0,CONTROL_NUMBER,MONTHS_SINCE_ORIGIN,DONOR_AGE,IN_HOUSE,URBANICITY,SES,CLUSTER_CODE,HOME_OWNER,DONOR_GENDER,INCOME_GROUP,...,LIFETIME_GIFT_RANGE,LIFETIME_MAX_GIFT_AMT,LIFETIME_MIN_GIFT_AMT,LAST_GIFT_AMT,CARD_PROM_12,NUMBER_PROM_12,MONTHS_SINCE_LAST_GIFT,MONTHS_SINCE_FIRST_GIFT,FILE_AVG_GIFT,FILE_CARD_GIFT
0,139,101,,0,R,2,46,U,F,,...,16.0,21.0,5.0,21.0,4,10,26,90,14.71,2
1,142,137,,0,R,2,43,U,F,,...,9.0,11.0,2.0,10.0,9,22,22,129,6.0,14
2,282,17,30.0,0,T,1,35,H,M,6.0,...,15.0,20.0,5.0,20.0,5,14,20,87,8.44,1
3,368,137,75.0,0,U,1,2,H,M,6.0,...,5.0,10.0,5.0,10.0,10,24,19,129,6.5,22
4,387,5,,0,T,2,40,U,F,2.0,...,7.0,10.0,3.0,5.0,5,12,18,85,5.83,2


In [28]:
# Fill numeric rows with the median
for label, content in test_df.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            # Fill missing numeric values with median since it's more robust than the mean
            test_df[label] = content.fillna(content.median())

In [29]:
# Turn categorical variables into numbers
for label, content in test_df.items():
    # Check columns which aren't numeric
    if not pd.api.types.is_numeric_dtype(content):
        # print the columns that are object type 
        print(label)
        test_df[label] = pd.Categorical(content).codes+1

URBANICITY
SES
CLUSTER_CODE
HOME_OWNER
DONOR_GENDER
OVERLAY_SOURCE
RECENCY_STATUS_96NK


In [35]:
Target = ideal_model.predict(test_df)
Target

array([0, 1, 1, ..., 1, 0, 0], dtype=int64)

In [37]:
PREDICTED_df = pd.DataFrame()
PREDICTED_df['TARGET_B'] = Target
PREDICTED_df['CONTROL_NUMBER'] = test_df['CONTROL_NUMBER']
PREDICTED_df.head(50)

Unnamed: 0,TARGET_B,CONTROL_NUMBER
0,0,139
1,1,142
2,1,282
3,1,368
4,0,387
5,0,458
6,1,564
7,0,665
8,1,702
9,1,743


In [2]:
curl http://localhost:5000/predict 
        -d "CONTROL_NUMBER=3" -d "MONTHS_SINCE_ORIGIN=101" -d "DONOR_AGE=87.0" -d "IN_HOUSE=0" -X POST -v

Note: you may need to restart the kernel to use updated packages.
