In [28]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import warnings

In [2]:
# Read the three input CSVs (paths are relative to the notebook's working directory).
numerical, categorical, target = (
    pd.read_csv(csv_name)
    for csv_name in ('numerical.csv', 'categorical.csv', 'target.csv')
)

In [3]:
# target['TARGET_B'].value_counts()

# 1. Task 1

Apply the Random Forest algorithm, this time dealing with the class imbalance only by upsampling (oversampling) the minority class.

### 1.1 Split

In [4]:
# Feature matrix: numerical and categorical columns placed side by side.
X = pd.concat((numerical, categorical), axis='columns')
# Prediction target: the TARGET_B column of the target frame.
y = target.loc[:, 'TARGET_B']

In [5]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified on y. Given the class imbalance this notebook
# is about, consider stratify=y so train and test keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Object-dtype (categorical) training columns, re-indexed from 0 so they can later be
# concatenated column-wise with the encoded/numeric frames without index misalignment.
X_train_cat = (
    X_train
    .select_dtypes(include='object')
    .reset_index(drop=True)
)

In [7]:
# Numeric training columns, re-indexed from 0 to stay row-aligned with the other training pieces.
X_train_num = (
    X_train
    .select_dtypes(include='number')
    .reset_index(drop=True)
)

In [8]:
# Discard the shuffled original row labels so y_train lines up positionally with the
# re-indexed training feature frames.
y_train = (
    y_train
    .reset_index(drop=True)
)

In [9]:
# Object-dtype (categorical) test columns, re-indexed from 0 like their training counterpart.
X_test_cat = (
    X_test
    .select_dtypes(include='object')
    .reset_index(drop=True)
)

In [10]:
# Numeric test columns, re-indexed from 0 to stay row-aligned with the encoded test categoricals.
X_test_num = (
    X_test
    .select_dtypes(include='number')
    .reset_index(drop=True)
)

In [11]:
# Discard the shuffled original row labels so y_test lines up positionally with the
# re-indexed test feature frames.
y_test = (
    y_test
    .reset_index(drop=True)
)

### 1.2 Encode

In [12]:
# Fit the one-hot encoder on the training categoricals only; the test set is merely
# transformed with it later, so nothing from the test rows influences the encoding.
# drop='first' removes one dummy column per categorical feature.
# handle_unknown='ignore' keeps transform() from raising a ValueError when the test set
# (or a different random split) contains a category that was never seen during fit;
# such values are encoded as all zeros for that feature instead.
# Combining it with drop='first' needs scikit-learn >= 1.0, which this notebook already
# relies on through get_feature_names_out().
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(X_train_cat)

In [13]:
# Encode the training categoricals with the fitted encoder and convert the result
# to a dense array via .toarray().
encoded_categorical_train = (
    encoder
    .transform(X_train_cat)
    .toarray()
)

In [14]:
# Wrap the encoded array in a DataFrame whose columns carry the encoder's output feature names.
# Note: this rebinds encoded_categorical_train from an array to a DataFrame.
encoded_categorical_train = pd.DataFrame(
    data=encoded_categorical_train,
    columns=encoder.get_feature_names_out(),
)

In [15]:
# Encode the test categoricals with the encoder fitted on the training data
# (transform only, no refit) and convert the result to a dense array.
encoded_categorical_test = (
    encoder
    .transform(X_test_cat)
    .toarray()
)

In [16]:
# Wrap the encoded test array in a DataFrame with the same feature-name columns as the training frame.
# Note: this rebinds encoded_categorical_test from an array to a DataFrame.
encoded_categorical_test = pd.DataFrame(
    data=encoded_categorical_test,
    columns=encoder.get_feature_names_out(),
)

### 1.3 Scaling

> Random forests don't require feature scaling, but I tested whether scaling would improve the recall.
>
> It didn't, so the scaling cells below are left commented out.

In [17]:
# scaler = StandardScaler().fit(X_train_num)

In [18]:
# scaled_numerical_train = scaler.transform(X_train_num)

In [19]:
# scaled_numerical_train = pd.DataFrame(scaled_numerical_train, columns = X_train_num.columns)

In [20]:
# scaled_numerical_test = scaler.transform(X_test_num)

In [21]:
# scaled_numerical_test = pd.DataFrame(scaled_numerical_test, columns = X_train_num.columns)

### 1.4 Upsampling

In [22]:
# Reassemble a single training frame (encoded categoricals, numeric features, target)
# so that rows can be resampled together with their labels. All three pieces were
# re-indexed from 0, which is what lets the column-wise concat align them row for row.
trainset = pd.concat(
    (encoded_categorical_train, X_train_num, y_train),
    axis='columns',
)

In [23]:
# Balance the training set by oversampling the minority class (TARGET_B == 1) with
# replacement until it has as many rows as the majority class (TARGET_B == 0).
# Only the training data is resampled; the test set keeps its original distribution.
category_0 = trainset[trainset['TARGET_B'] == 0]
category_1 = trainset[trainset['TARGET_B'] == 1].sample(
    n=len(category_0),
    replace=True,
    random_state=0,  # fixed seed: without it every run draws a different resample
)
print(category_1.shape)
print(category_0.shape)

# Stack both classes back into one frame (all class-0 rows first, then the resampled class-1 rows).
trainset_new = pd.concat([category_0, category_1], axis=0)

# Note: from here on X_train / y_train refer to the upsampled training data,
# not to the raw output of train_test_split.
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']

print(X_train.shape)

(72486, 355)
(72486, 355)
(144972, 354)


### 1.5 Random Forest

##### 1.5.1  Parameters 1

In [24]:
# Build the final test matrix in the same column order as the training matrix
# (encoded categoricals first, then the numeric columns). The test set is not resampled.
# Note: this rebinds X_test from the raw split to the encoded frame.
X_test = pd.concat(
    (encoded_categorical_test, X_test_num),
    axis='columns',
)

In [25]:
# Baseline random forest with default hyperparameters, trained on the upsampled data.
# random_state pins the forest's internal randomness so the scores and the confusion
# matrix are repeatable from one run to the next.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

# score() is plain accuracy. On an imbalanced test set it is dominated by the majority
# class, so read it together with the confusion matrix below rather than on its own.
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)

# Class counts in the test set, then the confusion matrix
# (rows = true class, columns = predicted class).
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

1.0
0.9466540900277735


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[18064,    19],
       [  999,     1]], dtype=int64)

In [26]:
# How many test rows the model assigned to each class.
pd.DataFrame(data=y_pred).value_counts()

0    19063
1       20
dtype: int64

In [27]:
# True class counts in the test set, for comparison with the predicted counts.
y_test.to_frame().value_counts()

TARGET_B
0           18083
1            1000
dtype: int64

##### 1.5.2  Parameters 2

> Use grid search to find the best hyperparameters.

In [35]:
# Hyperparameter grid for the random forest: 3 x 3 x 3 = 27 combinations of tree depth
# and split/leaf size. max_samples and random_state each list a single value, so they
# act as fixed settings rather than searched ones.
grid = dict(
    max_depth=[3, 5, 10],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[2, 10, 20],
    max_samples=[0.8],
    random_state=[42],
)

In [36]:
# 5-fold grid search over `grid`. Candidates are ranked by recall on the positive class,
# the metric this notebook is trying to maximise, instead of GridSearchCV's default
# (the estimator's own score(), i.e. accuracy), which mostly rewards predicting the majority class.
# NOTE(review): X_train / y_train were upsampled with replacement before this search, so
# copies of the same minority row can land in both the training and validation folds;
# treat the cross-validated scores as optimistic and confirm on the untouched test set.
model = RandomForestClassifier()
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=5, scoring='recall')

In [None]:
# Run the search: one fit per parameter combination per fold on the training data.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score (available after fit).
grid_search.best_params_

In [None]:
# clf = RandomForestClassifier()
# clf.fit(X_train, y_train)

# print(clf.score(X_train, y_train))
# print(clf.score(X_test, y_test))

# y_pred = clf.predict(X_test)

# display(y_test.value_counts())
# display(confusion_matrix(y_test, y_pred))

In [26]:
# pd.DataFrame(y_pred).value_counts()

0    19063
1       20
dtype: int64

In [27]:
# pd.DataFrame(y_test).value_counts()

TARGET_B
0           18083
1            1000
dtype: int64

# 2. Task 2

Use the feature selection techniques you learned in class (Variance Threshold, RFE, PCA, etc.) to decide whether to use all of the features.