In [51]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import warnings

In [95]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

In [3]:
# target['TARGET_B'].value_counts()

# 1. Task 1

Apply the Random Forests algorithm but this time only by upscaling the data to deal with the imbalance.

### 1.1 Split

In [4]:
X = pd.concat([numerical, categorical], axis = 1)
y = target['TARGET_B']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
X_train_cat = X_train.select_dtypes(include='object').reset_index(drop=True)

In [7]:
X_train_num = X_train.select_dtypes(include='number').reset_index(drop=True)

In [8]:
y_train = y_train.reset_index(drop=True)

In [9]:
X_test_cat = X_test.select_dtypes(include='object').reset_index(drop=True)

In [10]:
X_test_num = X_test.select_dtypes(include='number').reset_index(drop=True)

In [11]:
y_test = y_test.reset_index(drop=True)

### 1.2 Encode

In [12]:
encoder = OneHotEncoder(drop='first').fit(X_train_cat)

In [13]:
encoded_categorical_train = encoder.transform(X_train_cat).toarray()

In [14]:
encoded_categorical_train = pd.DataFrame(encoded_categorical_train, columns=encoder.get_feature_names_out())

In [15]:
encoded_categorical_test = encoder.transform(X_test_cat).toarray()

In [16]:
encoded_categorical_test = pd.DataFrame(encoded_categorical_test, columns=encoder.get_feature_names_out())

### 1.3 Scaling

> For the random forest we don´t have to do this. But I´m testing if this way I maximise the recall.
>
> It didn´t, so I won´t run this.

In [17]:
# scaler = StandardScaler().fit(X_train_num)

In [18]:
# scaled_numerical_train = scaler.transform(X_train_num)

In [19]:
# scaled_numerical_train = pd.DataFrame(scaled_numerical_train, columns = X_train_num.columns)

In [20]:
# scaled_numerical_test = scaler.transform(X_test_num)

In [21]:
# scaled_numerical_test = pd.DataFrame(scaled_numerical_test, columns = X_train_num.columns)

### 1.4 Upscaling

In [22]:
trainset = pd.concat([encoded_categorical_train, X_train_num, y_train], axis=1)

In [23]:
category_1 = trainset[trainset['TARGET_B'] == 1].sample(len(trainset[trainset['TARGET_B'] == 0]), replace=True)
print(category_1.shape)

category_0 = trainset[trainset['TARGET_B']== 0 ]
print(category_0.shape)

trainset_new = pd.concat([category_0, category_1], axis=0)

X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']

print(X_train.shape)

(72486, 355)
(72486, 355)
(144972, 354)


### 1.5 Random Forest

##### 1.5.1  Paramethers 1

In [24]:
X_test = pd.concat([encoded_categorical_test, X_test_num], axis=1)

In [25]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)

display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

1.0
0.9466540900277735


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[18064,    19],
       [  999,     1]], dtype=int64)

In [26]:
pd.DataFrame(y_pred).value_counts()

0    19063
1       20
dtype: int64

In [27]:
pd.DataFrame(y_test).value_counts()

TARGET_B
0           18083
1            1000
dtype: int64

##### 1.5.2  Paramethers 2

> I want to implement grid search to find the best params
>
> Don´t run it, it takes too long.

In [139]:
# grid = {'max_depth': [3,5,10],
#         'min_samples_split': [2,10,20],
#         'min_samples_leaf': [2,10,20],
#         'max_samples': [0.8]}

In [140]:
# model = RandomForestClassifier()
# grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = 5, n_jobs=-1)

In [143]:
# grid_search.fit(X_train_pca, y_train)

In [39]:
# grid_search.best_params_

> I'll use same params that we use in class.

In [44]:
%%time
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.8,)
clf.fit(X_train, y_train)

print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)

display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6225960875203488
0.5945082010166116


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10788,  7295],
       [  443,   557]], dtype=int64)

CPU times: total: 49.5 s
Wall time: 53.3 s


In [40]:
# pd.DataFrame(y_pred).value_counts()

In [41]:
# pd.DataFrame(y_test).value_counts()

# 2. Task 2

Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

> I´ll use PCa

### 2.1 Split

In [128]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

X = pd.concat([numerical, categorical], axis = 1)
y = target['TARGET_B']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train_cat = X_train.select_dtypes(include='object').reset_index(drop=True)
X_train_num = X_train.select_dtypes(include='number').reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

X_test_cat = X_test.select_dtypes(include='object').reset_index(drop=True)
X_test_num = X_test.select_dtypes(include='number').reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### 2.2 Encode

In [129]:
encoder = OneHotEncoder(drop='first').fit(X_train_cat)

encoded_categorical_train = encoder.transform(X_train_cat).toarray()
encoded_categorical_train = pd.DataFrame(encoded_categorical_train, columns=encoder.get_feature_names_out())

encoded_categorical_test = encoder.transform(X_test_cat).toarray()
encoded_categorical_test = pd.DataFrame(encoded_categorical_test, columns=encoder.get_feature_names_out())

### 2.3 Scale

In [130]:
scaler = StandardScaler().fit(X_train_num)

scaled_numerical_train = scaler.transform(X_train_num)
scaled_numerical_train = pd.DataFrame(scaled_numerical_train, columns = X_train_num.columns)

scaled_numerical_test = scaler.transform(X_test_num)
scaled_numerical_test = pd.DataFrame(scaled_numerical_test, columns = X_train_num.columns)

### 2.4 Concatenate

In [131]:
X_train = pd.concat([encoded_categorical_train, scaled_numerical_train], axis=1).reset_index(drop=True)
X_test = pd.concat([encoded_categorical_test, scaled_numerical_test], axis=1).reset_index(drop=True)

### 2.5 Fit and Transform

In [132]:
pca = PCA(0.90)
pca.fit(X_train)

> Would like to have enough components to capture at least 90% of the variation in the total data.

In [134]:
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

X_train_pca = pd.DataFrame(X_train_pca).reset_index(drop=True)
X_test_pca = pd.DataFrame(X_test_pca).reset_index(drop=True)

# 3. Task 3

Re-run the Random Forest algorithm to determine if the Feature Selection has improved the results.

### 3.1 Upscale

In [136]:
trainset = pd.concat([X_train_pca, y_train], axis=1)

In [137]:
category_1 = trainset[trainset['TARGET_B'] == 1].sample(len(trainset[trainset['TARGET_B'] == 0]), replace=True)
print(category_1.shape)

category_0 = trainset[trainset['TARGET_B']== 0 ]
print(category_0.shape)

trainset_new = pd.concat([category_0, category_1], axis=0)

X_train_pca = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']

print(X_train_pca.shape)

(72486, 123)
(72486, 123)
(144972, 122)


### 3.2 Random Forest

In [138]:
%%time
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.8,)
clf.fit(X_train_pca, y_train)

print(clf.score(X_train_pca, y_train))
print(clf.score(X_test_pca, y_test))

y_pred = clf.predict(X_test_pca)

display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6633142951742406
0.6643609495362365


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[12236,  5847],
       [  558,   442]], dtype=int64)

CPU times: total: 1min 43s
Wall time: 1min 45s


# 4. Task 4

Discuss the output and its impact in the business scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the business?

In the one hand:

    We have predicted that 5847+442 = 6289 people will make a donation.
    However, there are 1000 people that actually made a donation.

    Assuming the cost per mail is 0.65, we have lost (6289-1000)*0.65 = 3438

    Precision = 442/(442+5847) = 0.07
    Recall = 442/(442+558) = 0.44

    We´d like to minimise the false positives, that means increment precision.
    
    So we´d like to increment precision.
    
In the other hand:

    We didn´t send the mail to 558 potential donors. Assuming the average donation is 15, we could´ve earned 558*15 = 8370.
    
    We´d like to minimise the false negatives, that means increment recall.
    
The real benefit is 1000*(15-0.65) = 14350

We obtained 442*(15-0.65) = 6343

There is 8007 of difference.

If we had sent the mail to 558 extra people (maximise recall), we would´ve obtained: 1000*(15-0.65) - 5847*0.65 = 10550
With the prediction we earned: 442*15 - (5847+442)*0.65 = 6630 - 6289 = 341
If we tried to maximise precision, we would obtained 442*15 = 6630

##### So it would be better to send the mail to more people than we should than not send it to the right people.