In [1]:
# Importing libraries

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt 
import matplotlib.pyplot as plt

from sklearn.feature_selection import RFE
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

from sklearn.tree import DecisionTreeClassifier

In [2]:
# Importing Data Sets

categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

In [3]:
# Checking Categorical Data Types

categorical.dtypes

STATE           object
CLUSTER          int64
HOMEOWNR        object
GENDER          object
DATASRCE         int64
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B         int64
ODATEW_YR        int64
ODATEW_MM        int64
DOB_YR           int64
DOB_MM           int64
MINRDATE_YR      int64
MINRDATE_MM      int64
MAXRDATE_YR      int64
MAXRDATE_MM      int64
LASTDATE_YR      int64
LASTDATE_MM      int64
FIRSTDATE_YR     int64
FIRSTDATE_MM     int64
dtype: object

In [4]:
# Checking numerical Data Types

numerical.dtypes

TCODE         int64
AGE         float64
INCOME        int64
WEALTH1       int64
HIT           int64
             ...   
AVGGIFT     float64
CONTROLN      int64
HPHONE_D      int64
RFA_2F        int64
CLUSTER2      int64
Length: 315, dtype: object

In [5]:
# Checking target Data Types

target.dtypes

TARGET_B      int64
TARGET_D    float64
dtype: object

In [6]:
# Rearanging the Categorical Data Set to a second numerical dataset

numerical_2 = categorical.select_dtypes(include=np.number)
numerical_2

Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,36,3,2,89,1,37,12,92,8,94,2,95,12,89,11
1,14,3,1,94,1,52,2,93,10,95,12,95,12,93,10
2,43,3,2,90,1,0,2,91,11,92,7,95,12,90,1
3,44,3,2,87,1,28,1,87,11,94,11,95,12,87,2
4,16,3,2,86,1,20,1,93,10,96,1,96,1,79,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,27,3,2,96,1,0,2,96,2,96,2,96,2,96,2
95408,24,3,1,96,1,50,1,96,3,96,3,96,3,96,3
95409,30,3,3,95,1,38,1,96,3,95,1,96,10,94,10
95410,24,2,1,86,1,40,5,90,11,96,8,97,1,86,12


In [7]:
# Rearanging the Categorical Data Set

categorical = categorical.select_dtypes(include = object)
categorical

Unnamed: 0,STATE,HOMEOWNR,GENDER,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A
0,IL,H,F,L,E,C,T
1,CA,H,M,L,G,A,S
2,NC,U,M,L,E,C,R
3,CA,U,F,L,E,C,R
4,FL,H,F,L,F,A,S
...,...,...,...,...,...,...,...
95407,other,H,M,L,G,C,C
95408,TX,H,M,L,F,A,C
95409,MI,H,M,L,E,B,C
95410,CA,H,F,L,F,A,C


In [8]:
# One hot enconding

dummies = pd.get_dummies(categorical, drop_first = True)
dummies

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
95408,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
95409,0,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
95410,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [9]:
# Creating a new ds all data and dropping TARGET_D
ds = pd.concat([numerical, numerical_2, dummies, target.drop("TARGET_D", axis = 1) ], axis = 1)

In [10]:
# Dropping null values

ds = ds.dropna(axis = 1)

In [11]:
# Dropping duplicate values

ds = ds.drop_duplicates()

In [13]:
# X and Y

X = ds.drop("TARGET_B",axis = 1)
y = ds["TARGET_B"]

# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

# Train dataframe

ds_train = pd.concat([X_train, y_train], axis = 1)
ds_train = ds_train.reset_index(drop = True)
ds_train

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,28,61.611649,5,9,0,0,29,36,21,7,...,1,0,0,0,0,0,0,0,1,0
1,0,64.000000,4,9,0,0,23,23,28,2,...,1,0,0,0,1,0,1,0,0,0
2,1,63.000000,7,6,2,0,25,27,33,3,...,0,0,0,0,1,0,0,1,0,0
3,2,72.000000,2,9,0,0,34,19,30,7,...,0,0,0,0,0,0,1,0,0,0
4,0,61.611649,1,9,0,0,37,46,38,6,...,1,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76324,0,61.611649,1,7,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
76325,1,62.000000,2,2,6,2,40,31,32,6,...,1,0,1,0,0,0,1,0,0,0
76326,0,44.000000,5,9,0,2,40,55,14,5,...,1,0,1,0,0,0,1,0,0,0
76327,0,61.611649,2,9,0,0,30,22,47,8,...,1,0,0,0,1,1,0,0,0,0


In [14]:
# Unique values - target variable
ds_train["TARGET_B"].value_counts()

0    72464
1     3865
Name: TARGET_B, dtype: int64

In [21]:
# Test dataframe
ds_test = pd.concat([X_test, y_test], axis = 1)
ds_test = ds_test.reset_index(drop = True)
ds_test

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,1,40.000000,6,9,11,0,28,33,19,3,...,0,1,0,1,0,0,1,0,0,0
1,0,75.000000,5,9,0,1,20,33,34,10,...,0,0,0,0,0,0,1,0,0,0
2,0,52.000000,7,9,8,0,39,19,8,13,...,0,0,0,0,0,0,0,0,1,0
3,0,54.000000,7,9,0,0,26,20,24,15,...,1,0,0,0,0,0,1,0,0,0
4,1,56.000000,5,6,2,0,52,8,76,9,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,2,61.611649,5,9,0,0,33,30,34,4,...,1,0,0,1,0,1,0,0,0,0
19079,1,66.000000,5,9,0,0,8,0,26,0,...,1,0,0,0,0,0,1,0,0,0
19080,0,51.000000,6,9,0,4,35,34,42,11,...,1,0,1,0,0,0,1,0,0,0
19081,0,39.000000,4,3,1,1,29,31,44,5,...,1,0,1,0,0,0,0,0,1,0


# SMOTE

In [16]:
smote = SMOTE()
x_sm, y_sm = smote.fit_resample(ds_train.drop("TARGET_B", axis = 1), ds_train["TARGET_B"])

# Creating a balanced dataframe
ds_train = pd.concat([x_sm, y_sm], axis = 1)
ds_train

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,28,61.611649,5,9,0,0,29,36,21,7,...,1,0,0,0,0,0,0,0,1,0
1,0,64.000000,4,9,0,0,23,23,28,2,...,1,0,0,0,1,0,1,0,0,0
2,1,63.000000,7,6,2,0,25,27,33,3,...,0,0,0,0,1,0,0,1,0,0
3,2,72.000000,2,9,0,0,34,19,30,7,...,0,0,0,0,0,0,1,0,0,0
4,0,61.611649,1,9,0,0,37,46,38,6,...,1,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144923,378,73.695727,4,9,0,0,10,11,10,4,...,0,0,0,0,1,1,0,0,0,1
144924,1,79.819369,3,7,1,0,13,19,10,5,...,0,0,0,0,0,0,0,0,0,1
144925,0,52.715854,6,9,0,0,25,12,20,6,...,0,0,1,0,0,1,0,0,0,1
144926,1,63.573877,4,9,0,0,38,33,22,4,...,0,0,1,0,0,0,0,1,0,1


In [17]:
# Unique values of target B after oversampling
ds_train["TARGET_B"].value_counts()

0    72464
1    72464
Name: TARGET_B, dtype: int64

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(ds_train.drop("TARGET_B", axis=1), ds_train["TARGET_B"])

train_predictions = rf_model.predict(ds_train.drop("TARGET_B", axis=1))
test_predictions = rf_model.predict(ds_test.drop("TARGET_B", axis=1))

train_accuracy = accuracy_score(ds_train["TARGET_B"], train_predictions)
test_accuracy = accuracy_score(ds_test["TARGET_B"], test_predictions)

print("Accuracy (Train data):", train_accuracy)
print("Accuracy (Test data):", test_accuracy)

Accuracy (Train data): 0.9091893905939501
Accuracy (Test data): 0.863805481318451
