In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report
from test_results import test_results

% matplotlib inline

UsageError: Line magic function `%` not found.


## Load data

In [2]:
train_df = pd.read_csv('data/training.csv')
print('Loaded training data: size {}'.format(train_df.shape))
test_df = pd.read_csv('data/test.csv')
print('Loaded testing data: size {}'.format(test_df.shape))
print('----------------')
print('train_df:')
train_df.head()

Loaded training data: size (84534, 10)
Loaded testing data: size (41650, 10)
----------------
train_df:


Unnamed: 0,ID,Promotion,purchase,V1,V2,V3,V4,V5,V6,V7
0,1,No,0,2,30.443518,-1.165083,1,1,3,2
1,3,No,0,3,32.15935,-0.645617,2,3,2,2
2,4,No,0,2,30.431659,0.133583,1,1,4,2
3,5,No,0,0,26.588914,-0.212728,2,1,4,2
4,8,Yes,0,3,28.044331,-0.385883,1,1,2,2


## Modeling

Build a model to select the best customers to target that maximizes the IRR and NIR

### Get training data

Only consider data from treatment group for model training. By filtering out treatment data in train_df and test_df, then combine these treatment data into one dataset.

In [3]:
treatment_train_df = train_df[train_df['Promotion']=='Yes']
treatment_test_df = test_df[test_df['Promotion']=='Yes']

treatment_train_df = pd.concat([treatment_train_df, treatment_test_df])

treatment_train_df.drop(['ID','Promotion'], axis=1, inplace=True)

print('treatment_train_df: size {}\n'.format(treatment_train_df.shape))
treatment_train_df.head()

treatment_train_df: size (63112, 8)



Unnamed: 0,purchase,V1,V2,V3,V4,V5,V6,V7
4,0,3,28.044331,-0.385883,1,1,2,2
8,0,2,31.930423,0.393317,2,3,1,2
10,0,1,32.770916,-1.511395,2,1,4,1
12,0,1,36.957009,0.133583,2,3,1,1
14,0,3,36.911714,-0.90535,2,2,4,1


### Class distribution

In [4]:
print('class disbution: \n{}'.format(treatment_train_df['purchase'].value_counts()))

class disbution: 
purchase
0    62052
1     1060
Name: count, dtype: int64


* We can find that the class distribution is extremely imbalanced. We will [leave this class distribution as it is for model training](#fitting-first), [let's see how it performs](#test-promotion-strategy-first).

* Then try another [strategy that deal with class imbalance](#deal-with-imbalance), and let's see if the result is improved.

### Data splitting

In [5]:
# Split training data into predictors and response
X = treatment_train_df.drop(['purchase'], axis=1)
y = treatment_train_df['purchase']

print('X: {}'.format(X.shape))
print('y: {}'.format(y.shape))

X: (63112, 7)
y: (63112,)


#### Feature scaling

In [6]:
min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)

#### Train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print('X_train: {}'.format(X_train.shape))
print('X_test: {}'.format(X_test.shape))
print('y_train: {}'.format(y_train.shape))
print('y_test: {}'.format(y_test.shape))

X_train: (42285, 7)
X_test: (20827, 7)
y_train: (42285,)
y_test: (20827,)


### Fitting data <a name="fitting-first"></a>

In [8]:
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)

## Prediction, accuracy, confusion matrix

### Prediction

In [9]:
y_pred = classifier.predict(X_test)

### Accuracy

In [10]:
accuracy = (y_pred == y_test).mean()
print('Accuracy: {0:.3f}'.format(accuracy))

Accuracy: 0.981


### Confusion matrix

In [11]:
print('Confusion matrix: \n')
print(confusion_matrix(y_test, y_pred))
print('\n')
print('Classification report: \n')
print(classification_report(y_test, y_pred))

Confusion matrix: 

[[20430    31]
 [  365     1]]


Classification report: 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     20461
           1       0.03      0.00      0.01       366

    accuracy                           0.98     20827
   macro avg       0.51      0.50      0.50     20827
weighted avg       0.97      0.98      0.97     20827



## Test promotion strategy <a name="test-promotion-strategy-first"></a>

In [12]:
def promotion_strategy(df):
    '''
    INPUT 
    df - a dataframe with *only* the columns V1 - V7 (same as train_data)

    OUTPUT
    promotion_df - np.array with the values
                   'Yes' or 'No' related to whether or not an 
                   individual should recieve a promotion 
                   should be the length of df.shape[0]
                
    Ex:
    INPUT: df
    
    V1	V2	  V3	V4	V5	V6	V7
    2	30	-1.1	1	1	3	2
    3	32	-0.6	2	3	2	2
    2	30	0.13	1	1	4	2
    
    OUTPUT: promotion
    
    array(['Yes', 'Yes', 'No'])
    indicating the first two users would recieve the promotion and 
    the last should not.
    '''
    
    df = min_max_scaler.fit_transform(df)

    y_pred = classifier.predict(df)

    # Convert 1/0 value into yes/no for outcome
    promotion_yes_no = []
    for value in y_pred:
        if value == 0:
            promotion_yes_no.append("No")
        if value == 1:
            promotion_yes_no.append("Yes")
            
    promotion = np.asarray(promotion_yes_no)
    
    
    return promotion

In [13]:
test_results(promotion_strategy)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0476.

Your nir with this strategy is 6.85.
We came up with a model with an irr of 0.0476 and an nir of 6.85 on the test set.


## Deal with imbalanced class and test promotion strategy again <a name="deal-with-imbalance"></a>

### Fitting data