In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.utils import resample

from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Cap how many columns/rows pandas renders in cell output (10 each).
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

In [None]:
# Load the raw learning set; the path is relative to the notebook's working directory.
data = pd.read_csv('learningSet.csv')

## Lesson 7.01 - Read Data

In [None]:
# Dimensions of the raw frame, followed by a preview of its first ten rows.
print(data.shape)
data.head(10)

In [None]:
# Column dtypes; the numerical/categorical split further down relies on these.
data.dtypes

In [None]:
# Value frequencies of the TARGET_B column.
data['TARGET_B'].value_counts()

In [None]:
# Value frequencies of the TARGET_D column.
data['TARGET_D'].value_counts()

## Lesson 7.01 - Review Data

In [None]:
# CHECK FOR NULL VALUES

In [None]:
data.isna().sum()/len(data)

In [None]:
nulls_percent_df = pd.DataFrame(data.isna().sum()/len(data)).reset_index()
nulls_percent_df
nulls_percent_df.columns = ['column_name', 'nulls_percentage']
nulls_percent_df

In [None]:
# Only the columns that contain at least one missing value.
nulls_percent_df[nulls_percent_df['nulls_percentage']!=0]

In [None]:
# First 60 of those columns.
nulls_percent_df[nulls_percent_df['nulls_percentage']!=0].head(60)

In [None]:
# Last 32 of those columns.
nulls_percent_df[nulls_percent_df['nulls_percentage']!=0].tail(32)

In [None]:
columns_above_threshold = nulls_percent_df[nulls_percent_df['nulls_percentage']>0.25]
columns_above_threshold['column_name']

In [None]:
drop_columns_list = list(columns_above_threshold['column_name'])
print(drop_columns_list)

In [None]:
# DROP COLUMNS OVER 25% THRESHOLD EXCEPT WEALTH1 WEALTH2

In [None]:
columns_to_omit = ['WEALTH1','WEALTH2']
drop_columns_list = [col for col in drop_columns_list if col not in columns_to_omit]
print(drop_columns_list)

In [None]:
data = data.drop(columns=drop_columns_list)
data.shape

In [None]:
# FIX MAILCODES

In [None]:
data['MAILCODE'].value_counts()
# Turn every blank character in MAILCODE into the letter "A" so the later
# blank -> NaN replacement does not treat these entries as missing.
data['MAILCODE'] = [code.replace(" ", "A") for code in data['MAILCODE']]

In [None]:
# REPLACE BLANKS WITH NAN VALUES

In [None]:
# Cells holding exactly one blank character are treated as missing values.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0, and
# DataFrame.replace does the whole frame at once instead of a per-column apply.
data = data.replace(" ", np.nan)
data.head(10)

In [None]:
# REPEAT DROP COLUMNS ABOVE 25% THRESHOLD

In [None]:
# Recompute the per-column missing-value share now that blanks count as NaN.
nulls_percent_df2 = (
    data.isna()
    .mean()
    .rename_axis('column_name')
    .reset_index(name='nulls_percentage')
)
nulls_percent_df2

In [None]:
nulls_percent_df2[nulls_percent_df2['nulls_percentage']!=0]

In [None]:
nulls_percent_df2[nulls_percent_df2['nulls_percentage']!=0].head(60)

In [None]:
nulls_percent_df[nulls_percent_df['nulls_percentage']!=0].tail(25)

In [None]:
columns_above_threshold2 = nulls_percent_df2[nulls_percent_df2['nulls_percentage']>0.25]
columns_above_threshold2['column_name']

In [None]:
drop_columns_list2 = list(columns_above_threshold2['column_name'])
print(drop_columns_list2)

In [None]:
columns_to_omit2 = ['WEALTH1','WEALTH2', 'SOLIH','VETERANS']
drop_columns_list2 = [col for col in drop_columns_list2 if col not in columns_to_omit2]
print(drop_columns_list2)

In [None]:
data = data.drop(columns=drop_columns_list2)
data.shape

## Lesson 7.01 - Separate Data Features

In [None]:
Y = data[['TARGET_B', 'TARGET_D']]
display(Y.head(30))
print(Y.isna().any().any())

In [None]:
numerical = data.select_dtypes(np.number)
numerical = numerical.drop(columns = ['TARGET_B', 'TARGET_D'])

display(numerical.head())
display(numerical.shape)

In [None]:
categorical = data.select_dtypes(object)
display(categorical.head())
display(categorical.shape)

In [None]:
def find_outliers_iqr(data, factor=1.5):
    """Flag values lying outside the IQR fences of their column.

    Parameters
    ----------
    data : DataFrame of numeric columns.
    factor : multiple of the interquartile range used for the fences
        (Q1 - factor * IQR and Q3 + factor * IQR).

    Returns
    -------
    (outliers, outlier_info)
        ``outliers`` is a boolean frame shaped like ``data`` (True marks an
        outlier); ``outlier_info`` has one row per column with the columns
        'Columns' and 'Outlier Count'.
    """
    first_quartile = data.quantile(0.25)
    third_quartile = data.quantile(0.75)
    spread = third_quartile - first_quartile

    # A value is an outlier when it falls strictly below the lower fence
    # or strictly above the upper fence of its own column.
    too_low = data < (first_quartile - factor * spread)
    too_high = data > (third_quartile + factor * spread)
    outliers = too_low | too_high

    per_column = outliers.sum()
    outlier_info = pd.DataFrame(
        {
            'Columns': per_column.index,
            'Outlier Count': per_column.values,
        }
    )
    return outliers, outlier_info
# Set the IQR factor for outlier detection (default is 1.5)
iqr_factor = 1.5
# Find outliers in the continuous_df DataFrame and get outlier counts
outliers, outlier_info = find_outliers_iqr(numerical, factor=iqr_factor)
# Display the DataFrame of outliers (True indicates an outlier)
print("Outliers:")
print(outliers)
# Display the DataFrame with outlier counts
print("\nOutlier Counts:")
outlier_info

In [None]:
filtered_rows = outlier_info[outlier_info['Outlier Count'] > 0]
display(filtered_rows)

In [None]:
# WORKING WITH CATEGORICAL

In [None]:
categorical.isna().sum()/len(categorical)

In [None]:
# CHECK FOR UNIQUE VALUES WITHIN CATEGORICAL DF

for column_name in categorical.columns:
    column_unique_counts = categorical[column_name].value_counts()
    print(f"Unique counts for column '{column_name}':\n{column_unique_counts}\n")

In [None]:
pd.DataFrame(categorical['STATE'].value_counts()).reset_index()

In [None]:
df = pd.DataFrame(categorical['STATE'].value_counts()).reset_index()

df.columns = ['state', 'count']
other_states = list(df[df['count']<2500]['state'])

def clean_state(x):
    if x in other_states:
        return 'other'
    else:
        return x
    
categorical['STATE'] = list(map(clean_state, categorical['STATE']))

In [None]:
# STATE frequencies after the rare states were pooled into 'other'.
categorical['STATE'].value_counts()

In [None]:
# Show the categorical frame (pandas truncates it to the configured max rows/columns).
categorical

# W7 Lab 1 - Revisiting Machine Learning

In [None]:
# DROP CATEGORICAL VALUES > 50% THRESHOLD

In [None]:
categorical.isna().sum()/len(data)

In [None]:
nulls_percent_df3 = pd.DataFrame(categorical.isna().sum()/len(categorical)).reset_index()
nulls_percent_df3
nulls_percent_df3.columns = ['column_name', 'nulls_percentage']
nulls_percent_df3

In [None]:
nulls_percent_df3[nulls_percent_df3['nulls_percentage']!=0]

In [None]:
columns_above_threshold3 = nulls_percent_df3[nulls_percent_df3['nulls_percentage']>0.50]
columns_above_threshold3['column_name']

In [None]:
# CLEAN GENDER VALUES

In [None]:
categorical['GENDER'].unique()
categorical['GENDER'].value_counts(dropna=False)

In [None]:
categorical['GENDER'].fillna('F', inplace=True)

In [None]:
values_to_replace = ['J','C','U','A',]
replacement_value = 'Oth'
categorical['GENDER'].replace(values_to_replace, replacement_value, inplace=True)

## Lesson 7.02 - Data Cleaning Part 2

In [None]:
# CLEAN DOMAIN VALUES

In [None]:
categorical['DOMAIN'].value_counts(dropna=False)

In [None]:
categorical['DOMAIN'] = categorical['DOMAIN'].fillna('R2')

In [None]:
categorical['DOMAIN_A'] = list(map(lambda x: x[0], categorical['DOMAIN']))
categorical['DOMAIN_B'] = list(map(lambda x: x[1], categorical['DOMAIN']))

In [None]:
categorical.DOMAIN_A.value_counts()

In [None]:
categorical = categorical.drop(columns=['DOMAIN'])

In [None]:
categorical.head()

In [None]:
print(categorical.MAILCODE.value_counts())
print(categorical.NOEXCH.value_counts())
print(categorical.MDMAUD.value_counts())

In [None]:
# CREATE DROP LIST

In [None]:
drop_list = categorical[['SOLIH','VETERANS','OSOURCE','ZIP','MAILCODE','MDMAUD','NOEXCH']].columns.tolist()

In [None]:
drop_list = drop_list + ['MDMAUD_R', 'MDMAUD_F','MDMAUD_A']

In [None]:
drop_list

In [None]:
# DROP OTHER NAN VALUES

In [None]:
# CLUSTER frequencies, including the NaN bucket.
categorical['CLUSTER'].value_counts(dropna=False)

In [None]:
# Distinct CLUSTER values in sorted order.
categorical['CLUSTER'].sort_values().unique()

In [None]:
# sorted(categorical['CLUSTER'].unique())

In [None]:
# CLUSTER frequencies without the NaN bucket.
categorical['CLUSTER'].value_counts()

In [None]:
# Sorted list of the CLUSTER labels that actually occur.
sorted(categorical['CLUSTER'].value_counts().index)

In [None]:
# Impute missing CLUSTER values with the label '40'.
categorical['CLUSTER'] = categorical['CLUSTER'].fillna('40') # 'other' would also be a valid choice

In [None]:
# HOMEOWNR frequencies, including the NaN bucket.
categorical['HOMEOWNR'].value_counts(dropna=False)

In [None]:
# Impute missing HOMEOWNR values with 'U'.
categorical['HOMEOWNR'] = categorical['HOMEOWNR'].fillna('U') # assumption: NAN also means 'we don't know'

In [None]:
# REPLACING NAN VALUES IN DATASRCE AND GEOCODE2 

In [None]:
categorical['DATASRCE'].value_counts(dropna=False)

In [None]:
categorical['DATASRCE'].fillna('1', inplace=True)

In [None]:
# REPLACING GEOCODE2 with Mode Values

mode_geocode2 = categorical['GEOCODE2'].mode().iloc[0]
categorical['GEOCODE2'].fillna(mode_geocode2, inplace=True)

In [None]:
categorical['GEOCODE2'].value_counts(dropna=False)

In [None]:
# REMOVE COLUMNS WITH 'ADATE_'

In [None]:
# Look the ADATE_ columns up on `data` itself. The cell used to filter `df`,
# which at this point is the small STATE count table, so no column matched
# and nothing was dropped.
adate_in_data = data.filter(like='ADATE_').columns
data = data.drop(columns=adate_in_data)
display(data)
print(data.shape)

In [None]:
# CONTINUING CLEANING CATEGORICAL DATA

In [None]:
categorical['RFA_6'].value_counts()

In [None]:
for col_name in categorical.columns:
    if 'RFA' in col_name:
        drop_list.append(col_name)

In [None]:
drop_list.remove('RFA_2R')
drop_list.remove('RFA_2A')
drop_list

In [None]:
categorical = categorical.drop(columns=drop_list)
categorical.head()

In [None]:
categorical.isna().sum()

In [None]:
data['OSOURCE'].value_counts(dropna=False).head(60)

In [None]:
data['OSOURCE'] = data['OSOURCE'].fillna('MBC')

In [None]:
# categorical = categorical.drop(columns=['OSOURCE', 'ZIP'])
# categorical.head()

## Lesson 7.02 - Numerical Columns

In [None]:
numerical.head()

In [None]:
numerical.shape

In [None]:
df = pd.DataFrame(numerical.isna().sum()).reset_index()
df.columns = ['column_name', 'nulls']
df[df['nulls']>0]

In [None]:
# CHECK FOR NAN VALUES

In [None]:
mode_wealth1 = numerical['WEALTH1'].mode().iloc[0]
numerical['WEALTH1'].fillna(mode_wealth1, inplace=True)

# numerical['WEALTH1'] = numerical.WEALTH1.interpolate(method='nearest', axis=0)
# numerical = numerical.dropna(subset=['WEALTH1'])

print(numerical['WEALTH1'].value_counts(dropna=False))
numerical['WEALTH1'].shape

In [None]:
mode_wealth2 = numerical['WEALTH2'].mode().iloc[0]
numerical['WEALTH2'].fillna(mode_wealth2, inplace=True)

# numerical['WEALTH1'] = numerical.WEALTH1.interpolate(method='nearest', axis=0)
# numerical = numerical.dropna(subset=['WEALTH1'])

print(numerical['WEALTH2'].value_counts(dropna=False))
numerical['WEALTH2'].shape

In [None]:
numerical['WEALTH2'].shape

In [None]:
numerical['TIMELAG'].value_counts(dropna=False)

mode_timelag = numerical['TIMELAG'].mode().iloc[0]
numerical['TIMELAG'].fillna(mode_timelag, inplace=True)

print(numerical['TIMELAG'].value_counts(dropna=False))


# numerical['TIMELAG'].unique().tolist()
# numerical['TIMELAG'] = numerical.TIMELAG.interpolate(method='nearest', axis=0)



In [None]:
# DROP ROWS > 30 (5 YEARS) AS CONSIDERD DORMANT

# numerical = numerical[numerical['TIMELAG'] <= 30]
# numerical['TIMELAG'].value_counts(dropna=False)

In [None]:
# FILLING NAN VALUES WITH PLOTS

In [None]:
numerical['AGE'].value_counts(dropna=False)

In [None]:
sns.distplot(numerical['AGE'])
plt.show()

In [None]:
numerical['AGE'] = numerical["AGE"].fillna(np.mean(numerical['AGE']))

In [None]:
sns.distplot(numerical['AGE'])
plt.show()

In [None]:
sns.distplot(numerical['INCOME']) 
plt.show()

In [None]:
print(numerical['INCOME'].value_counts(dropna=False))

In [None]:
numerical['INCOME'] = numerical['INCOME'].fillna(5.0)

In [None]:
sns.distplot(numerical[numerical['CLUSTER2'].isna()==False]['CLUSTER2']) 
plt.show()

In [None]:
numerical['CLUSTER2'].value_counts(dropna=False)

In [None]:
np.mean(numerical['CLUSTER2'])

In [None]:
numerical['CLUSTER2'] = numerical['CLUSTER2'].fillna(np.ceil(np.mean(numerical['CLUSTER2'])))

In [None]:
np.ceil(np.mean(numerical['CLUSTER2']))

In [None]:
sns.distplot(numerical['CLUSTER2']) 
plt.show()

# W7 Lab 2 - Feature Engineering

In [None]:
# CLEAN NUMERICAL DATA

In [None]:
numerical.isna().sum()/len(data)

In [None]:
# APPLY THE SAME NAN PERCENTAGE CHECK

nulls_percent_df4 = pd.DataFrame(numerical.isna().sum()/len(numerical)).reset_index()
nulls_percent_df4
nulls_percent_df4.columns = ['column_name', 'nulls_percentage']
nulls_percent_df4

In [None]:
nulls_percent_df4[nulls_percent_df4['nulls_percentage']!=0].head(60)

In [None]:
print(numerical['MSA'].value_counts(dropna=False))
numerical['MSA'] = numerical["MSA"].fillna(np.mean(numerical['MSA']))

In [None]:
print(numerical['MSA'].value_counts(dropna=False))

In [None]:
print(numerical['ADI'].value_counts(dropna=False))
numerical['ADI'] = numerical["ADI"].fillna(np.mean(numerical['ADI']))

In [None]:
print(numerical['ADI'].value_counts(dropna=False))

In [None]:
# REPLACE NAN WITH MODE

print(numerical['DMA'].value_counts(dropna=False))

mode_dma = numerical['DMA'].mode()[0]
numerical['DMA'] = numerical['DMA'].fillna(mode_dma)

In [None]:
# INTERPOLATE ADATE COLUMNS AND DISTRIBUTE WITH NEAREST VALUE

# All numerical columns whose name starts with "ADATE_".
adate_columns = [col for col in numerical.columns if col.startswith("ADATE_")]
# interpolate() is called without arguments here, i.e. with pandas' default
# method - not method='nearest' as the header comment of this cell suggests.
numerical[adate_columns] = numerical[adate_columns].interpolate()

In [None]:
# NEXTDATE: show the frequencies, then fill the gaps with the nearest valid value.
print(numerical['NEXTDATE'].value_counts(dropna=False))
numerical['NEXTDATE'] = numerical.NEXTDATE.interpolate(method='nearest', axis=0)

In [None]:
# Per column: does any missing value remain?
print(numerical.isna().any())

In [None]:
# CHECK ADATE_14 AND ADATE_18 COLUMNS

print(numerical['ADATE_14'].value_counts(dropna=False))
print(numerical['ADATE_18'].value_counts(dropna=False))

# Fill the NaNs that survived the interpolation instead of dropping their rows.
# Dropping rows from `numerical` alone leaves `categorical` and the targets in
# `data` with rows that no longer exist here, and the later column-wise
# pd.concat brings those rows back with NaN in every numerical column.
# NOTE(review): presumably the leftovers sit at the start of the column, where
# forward interpolation has no earlier value to work from - confirm.
leftover_adate_columns = ['ADATE_14', 'ADATE_18']
numerical[leftover_adate_columns] = numerical[leftover_adate_columns].bfill().ffill()

In [None]:
print(numerical.shape)
print(categorical.shape)

In [None]:
# CHECK TO SEE IF ANY NAN VALUES

print(numerical.isna().any().any())
print(categorical.isna().any().any())

In [None]:
# SAVE TO CSV

numerical.to_csv('numerical.csv', index=False)
categorical.to_csv('categorical.csv', index=False)

# W7 Lab 4 - Handling Data Imbalance

In [None]:
clean_df = pd.concat([numerical, categorical], axis=1)
clean_df.head(60)
numerical.isna().any().any()

In [None]:
clean_df.columns.tolist()

In [None]:
target_b = data['TARGET_B']
target_d = data['TARGET_D']
donor_df = pd.concat([target_b, target_d, clean_df], axis=1)

## Split, Scale & Encode (V1)

In [None]:
# SPLIT DATAFRAME X-y

In [None]:
# Features are every column of donor_df except the two targets; the label is TARGET_B.
# Leaving TARGET_B (the label itself) and TARGET_D (the donated amount) inside X
# lets the models read the answer straight from the features, which inflates
# every score computed on them.
X = donor_df.drop(columns=['TARGET_B', 'TARGET_D'])
y = donor_df['TARGET_B']


print(data['TARGET_B'].shape)
print(clean_df.shape)
print(clean_df.dtypes)

In [None]:
X.isna().any().any()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Seprate the training and testing data between numerical and categorical columns

X_train_cat = X_train.select_dtypes(object)
X_train_num = X_train.select_dtypes(np.number)

X_test_cat = X_test.select_dtypes(object)
X_test_num = X_test.select_dtypes(np.number)

In [None]:
# Standardise the numerical columns; the scaler is fitted on the training partition only

transformer = StandardScaler()

X_train_scaled = pd.DataFrame(transformer.fit_transform(X_train_num), columns=X_train_num.columns)
X_test_scaled = pd.DataFrame(transformer.transform(X_test_num), columns=X_test_num.columns)

display(X_train_scaled.head())
display(X_test_scaled.head())

display(X_train.shape)
display(X_test.shape)

In [None]:
# CLUSTER is left out of one-hot encoding because of its many distinct values

X_train_cat_onehot = X_train_cat.drop('CLUSTER', axis=1)
display(X_train_cat_onehot.head())
display(X_train_cat_onehot.shape)

X_test_cat_onehot = X_test_cat.drop('CLUSTER', axis=1)
display(X_test_cat_onehot.head())
display(X_test_cat_onehot.shape)

In [None]:
# OneHot encode the categorical columns (all except CLUSTER)

# The encoder keeps its default sparse output and the result is densified with
# .toarray(). The keyword that asked for dense output was renamed from `sparse`
# to `sparse_output` in scikit-learn 1.2 and `sparse` was removed in 1.4, so
# passing either name breaks on some versions.
encoder = OneHotEncoder(drop="first")

# Fit on the training data only
encoded_train = encoder.fit_transform(X_train_cat_onehot).toarray()
X_train_cat_onehot_encoded = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out())

# Transform the testing data using the same encoder and feature names
encoded_test = encoder.transform(X_test_cat_onehot).toarray()
X_test_cat_onehot_encoded = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out())

# Both frames are built from plain arrays, so they already carry a fresh 0..n-1 index.
display(X_train_cat_onehot_encoded.head())
display(X_test_cat_onehot_encoded.head())



In [None]:
# CLUSTER gets ordinal codes instead of one-hot columns because of its many distinct values

# Fitted on the training partition only, then applied to both partitions.
ordinal_encoder = OrdinalEncoder().fit(X_train_cat[['CLUSTER']])

for cat_partition in (X_train_cat, X_test_cat):
    cat_partition['CLUSTER'] = ordinal_encoder.transform(cat_partition[['CLUSTER']])

X_train_cat_ord, X_test_cat_ord = X_train_cat[['CLUSTER']], X_test_cat[['CLUSTER']]

display(X_train_cat_ord)

In [None]:
# Check the dimensions of every piece to catch data anomalies before concatenating

# Training Data
display(X_train_cat_onehot_encoded.shape)
display(X_train_cat_ord.shape)
display(X_train_num.shape)


# Testing Data (the first line used to repeat the training frame's shape)
display(X_test_cat_onehot_encoded.shape)
display(X_test_cat_ord.shape)
display(X_test_num.shape)

In [None]:
# Reset the index for training and testing data so the column-wise concat aligns rows by position

X_train_cat_onehot_encoded.reset_index(drop=True, inplace=True)
X_train_cat_ord.reset_index(drop=True, inplace=True)
X_train_num.reset_index(drop=True, inplace=True)

X_test_cat_onehot_encoded.reset_index(drop=True, inplace=True)
X_test_cat_ord.reset_index(drop=True, inplace=True)
X_test_num.reset_index(drop=True, inplace=True)

# Concat the training and testing dataframes.
# The standardised numerical frames (X_*_scaled) are used here: they were
# computed in the scaling step, but the unscaled X_*_num frames were being
# concatenated, so the scaling never reached the models. The scaled frames are
# built from arrays and therefore already have a 0..n-1 index.

X_train_processed = pd.concat([X_train_cat_onehot_encoded, X_train_cat_ord, X_train_scaled], axis=1)
X_test_processed = pd.concat([X_test_cat_onehot_encoded, X_test_cat_ord, X_test_scaled], axis=1)

display(X_train_processed.dtypes)
display(X_test_processed.dtypes)

In [None]:
# CLASSIFICATION MODELS

In [None]:
y_train.shape

In [None]:
# Model training and evaluation (Random Forest Classifier)
# No resampling is applied in this V1 run, so the printed headers say so
# (they used to read "with SMOTE", copied from the later SMOTE section).

rf_classifier1 = RandomForestClassifier(random_state=42)
rf_classifier1.fit(X_train_processed, y_train)

y_pred_rf_train1 = rf_classifier1.predict(X_train_processed)
accuracy_rf_train1 = accuracy_score(y_train, y_pred_rf_train1)

y_pred_rf_test1 = rf_classifier1.predict(X_test_processed)
accuracy_rf_test1 = accuracy_score(y_test, y_pred_rf_test1)

print("RandomForestClassifier without resampling:")
print(f"Training Accuracy -> {accuracy_rf_train1:.4f}")
print(f"Test Accuracy -> {accuracy_rf_test1:.4f}")



# Model training and evaluation (Logistic Regression)

logistic_regression1 = LogisticRegression(random_state=42)
logistic_regression1.fit(X_train_processed, y_train)

y_pred_lr_train1 = logistic_regression1.predict(X_train_processed)
accuracy_lr_train1 = accuracy_score(y_train, y_pred_lr_train1)

y_pred_lr_test1 = logistic_regression1.predict(X_test_processed)
accuracy_lr_test1 = accuracy_score(y_test, y_pred_lr_test1)

print("LogisticRegression without resampling:")
print(f"Training Accuracy -> {accuracy_lr_train1:.4f}")
print(f"Test Accuracy -> {accuracy_lr_test1:.4f}")

In [None]:
y_pred_test1 = rf_classifier1.predict(X_test_processed)

In [None]:
# CLASSIFICATION REPORT 1

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_test1))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test1))

# Display some of the predicted and actual values
results_df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test1})
print("\nActual vs Predicted:")
print(results_df1.head(10))

In [None]:
# Assuming y_true and y_pred are your true and predicted labels
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_test1, average='binary')

print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1 Score: {f1:.5f}")

In [None]:
# models = [RandomForestClassifier(random_state=42), LogisticRegression()]
# models_automation(models, X_train_processed, y_train, X_test_processed, y_test)

### First Conclusion 

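Despite the data set being quite imbalanced, both the Random Forest classifier (RFC) and the logistic regression (LR) reach a high accuracy. This is without dropping any feature columns; the data was only encoded and scaled. Unless there was a mistake in the preprocessing, this suggests that the available features carry enough information to predict 'TARGET_B'. Note, however, that on a heavily imbalanced target a high accuracy can also be reached by always predicting the majority class, so the precision, recall and F1 score of the minority class are the more informative metrics.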

# Restart Data PreProcessing (V2)

Based on the existing DataFrame without upsampling, we can see that the model isn't optimal. With this information I want to re-process and re-encode the data, split it into training and test sets, and then upsample the minority class in the training set.

In [None]:
X = clean_df
y = data['TARGET_B']

In [None]:
display(X.shape)
display(y.shape)

## Scale & Encode Data

In [None]:
# SEPARATE NUM & CAT FOR SCALE & ENCODING

X_cat = X.select_dtypes(object)
X_num = X.select_dtypes(np.number)

In [None]:
# SCALE NUMERICAL

transformer = StandardScaler().fit(X_num)
X_num_scaled = pd.DataFrame(transformer.transform(X_num), columns=X_num.columns)

display(X_num_scaled)

In [None]:
# Separate Categorical and drop 'CLUSTER' from OneHot Encoding due to length of unique values

X_cat_onehot = X_cat.drop(columns=['CLUSTER']) 
display(X_cat_onehot.head())
display(X_cat_onehot.shape)

In [None]:
# Sanity checks on the frame that is about to be one-hot encoded.
print(X_cat_onehot.isnull().sum())
print(X_cat_onehot.dtypes)
# `encoder` is still the V1 encoder at this point (it is re-created in the next
# cell), so these are the V1 feature names.
print(encoder.get_feature_names_out())

In [None]:
# ONEHOT ENCODE CATEGORICAL DATA

# The encoder keeps its default sparse output and the result is densified with
# .toarray(): the dense-output keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and `sparse` was removed in 1.4.
encoder = OneHotEncoder(drop="first")
encoded = encoder.fit_transform(X_cat_onehot).toarray()
X_cat_encoded = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

display(X_cat_encoded)

In [None]:
# ORDINAL ENCODE THE CLUSTER COLUMN

cluster_column = ['CLUSTER']
ordinal_encoder = OrdinalEncoder().fit(X_cat[cluster_column])

# Overwrite CLUSTER with its ordinal codes, then keep it as a one-column frame.
X_cat['CLUSTER'] = ordinal_encoder.transform(X_cat[cluster_column])

X_cat_ord = X_cat[cluster_column]

display(X_cat_ord)

In [None]:
# Give all three pieces a fresh 0..n-1 index so the column-wise concat aligns rows by position

for feature_part in (X_num_scaled, X_cat_ord, X_cat_encoded):
    feature_part.reset_index(drop=True, inplace=True)

# Scaled numerics, ordinal CLUSTER and one-hot columns side by side

X_processed = pd.concat([X_num_scaled, X_cat_ord, X_cat_encoded], axis='columns')

display(X_processed.dtypes)

## Upsampling Data (V2)

In [None]:
# SPLIT DATA BEFORE UPSAMPLING

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [None]:
# CONCAT PROCESSED DATA FOR UPSAMPLING

processed_df = pd.concat([X_train_2, y_train_2], axis=1)

df_majority = processed_df[processed_df['TARGET_B'] == 0]
df_minority = processed_df[processed_df['TARGET_B'] == 1]

# Upsample the training data
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)

df_upsampled = pd.concat([df_majority, df_minority_upsampled])

X_train_upsampled = df_upsampled.drop('TARGET_B', axis=1)
y_train_upsampled = df_upsampled['TARGET_B']

In [None]:
display(X_train_upsampled)
display(y_train_upsampled)

## Re-test Classification Model

In [None]:
# Random Forest on the upsampled training data, scored on train and on the untouched test split

rf_classifier2 = RandomForestClassifier(random_state=42).fit(X_train_upsampled, y_train_upsampled)

y_pred_rf_train2 = rf_classifier2.predict(X_train_upsampled)
y_pred_rf_test2 = rf_classifier2.predict(X_test_2)

accuracy_rf_train2 = accuracy_score(y_train_upsampled, y_pred_rf_train2)
accuracy_rf_test2 = accuracy_score(y_test_2, y_pred_rf_test2)

print(f"RandomForestClassifier with Resample:")
print(f"Training Accuracy -> {accuracy_rf_train2:.4f}")
print(f"Test Accuracy -> {accuracy_rf_test2:.4f}")



# Logistic Regression on the same data

logistic_regression2 = LogisticRegression(random_state=42).fit(X_train_upsampled, y_train_upsampled)

y_pred_lr_train2 = logistic_regression2.predict(X_train_upsampled)
y_pred_lr_test2 = logistic_regression2.predict(X_test_2)

accuracy_lr_train2 = accuracy_score(y_train_upsampled, y_pred_lr_train2)
accuracy_lr_test2 = accuracy_score(y_test_2, y_pred_lr_test2)

print(f"LogisticRegression with Resample:")
print(f"Training Accuracy -> {accuracy_lr_train2:.4f}")
print(f"Test Accuracy -> {accuracy_lr_test2:.4f}")

In [None]:
y_pred_test2 = rf_classifier2.predict(X_test_2)

In [None]:
# CLASSIFICATION REPORT 2

# Print classification report
print("Classification Report:")
print(classification_report(y_test_2, y_pred_test2))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_2, y_pred_test2))

# Display some of the predicted and actual values
results_df2 = pd.DataFrame({'Actual': y_test_2, 'Predicted': y_pred_test2})
print("\nActual vs Predicted:")
print(results_df2.head(10))

In [None]:
# PRECISION RECALL F1 COMPARISON

precision, recall, f1, _ = precision_recall_fscore_support(y_test_2, y_pred_test2, average='binary')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

### Second Conclusion

The first observation is that the RFC scores have improved to almost perfect on both the training and the testing data. This implies that upsampling has improved performance on both counts. For the LR, the score has decreased dramatically, which could suggest that it is struggling to fit the upsampled data.

The next step I'd like to try is to upsample the data using another method, in this case SMOTE.

## Resample using SMOTE

In [None]:
# Use the processed data prior to the upsample
# Same arguments (test_size and random_state) as the V2 split above.

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [None]:
# SMOTE is fitted and applied to the training partition only; the test partition stays untouched

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_3, y_train_3)


# Random Forest on the SMOTE-resampled training data

rf_classifier3 = RandomForestClassifier(random_state=42).fit(X_train_smote, y_train_smote)

y_pred_rf_train3 = rf_classifier3.predict(X_train_smote)
y_pred_rf_test3 = rf_classifier3.predict(X_test_3)

accuracy_rf_train3 = accuracy_score(y_train_smote, y_pred_rf_train3)
accuracy_rf_test3 = accuracy_score(y_test_3, y_pred_rf_test3)

print(f"RandomForestClassifier with SMOTE:")
print(f"Training Accuracy -> {accuracy_rf_train3:.4f}")
print(f"Test Accuracy -> {accuracy_rf_test3:.4f}")



# Logistic Regression on the same data

logistic_regression3 = LogisticRegression(random_state=42).fit(X_train_smote, y_train_smote)

y_pred_lr_train3 = logistic_regression3.predict(X_train_smote)
y_pred_lr_test3 = logistic_regression3.predict(X_test_3)

accuracy_lr_train3 = accuracy_score(y_train_smote, y_pred_lr_train3)
accuracy_lr_test3 = accuracy_score(y_test_3, y_pred_lr_test3)

print(f"LogisticRegression with SMOTE:")
print(f"Training Accuracy -> {accuracy_lr_train3:.4f}")
print(f"Test Accuracy -> {accuracy_lr_test3:.4f}")

In [None]:
y_pred_test3 = rf_classifier3.predict(X_test_3)

In [None]:
# CLASSFICATION REPORT 3

# Print classification report
print("Classification Report:")
print(classification_report(y_test_3, y_pred_test3))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_3, y_pred_test3))

# Display some of the predicted values and actual values
results_df3 = pd.DataFrame({'Actual': y_test_3, 'Predicted': y_pred_test3})
print("\nActual vs Predicted:")
print(results_df3.head(10))

In [None]:
# PRECISION RECALL F1 COMPARISON

precision3, recall3, f13, _ = precision_recall_fscore_support(y_test_3, y_pred_test3, average='binary')

print(f"Precision: {precision3:.4f}")
print(f"Recall: {recall3:.4f}")
print(f"F1 Score: {f13:.4f}")

# W7 - Lab 5 Random Forest (V4)

In [None]:
# In this part of the process, we do an RFE (Recursive Feature Elimination)
# This is to understand if droping features has an impact on the model and scores

In [None]:
# I use the processed X-dataframes from earlier (encoded and scaled)

X = pd.concat([X_train_processed, X_test_processed], axis=0)
y = data['TARGET_B']

In [None]:
display(X)
display(y)

In [None]:
# TRAIN TEST SPLIT
# Splits the X / y defined just above; the cell used to reference
# `x_processed`, a name that does not exist, and raised a NameError.

X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from imblearn.over_sampling import SMOTE
# from sklearn.feature_selection import SelectFromModel
# from sklearn.metrics import classification_report, accuracy_score

# # Assuming X and y are your original features and target variable
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Upsample only the training data using SMOTE
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# # Perform RFC feature selection
# rf_classifier = RandomForestClassifier(random_state=42)
# rf_classifier.fit(X_train_resampled, y_train_resampled)

# # Use feature importances for feature selection
# sfm = SelectFromModel(rf_classifier, threshold='median')
# sfm.fit(X_train_resampled, y_train_resampled)

# # Transform the data to include only selected features
# X_train_selected = sfm.transform(X_train_resampled)
# X_test_selected = sfm.transform(X_test)

# # Train RFC on the selected features
# rf_classifier_selected = RandomForestClassifier(random_state=42)
# rf_classifier_selected.fit(X_train_selected, y_train_resampled)

# # Make predictions
# y_pred = rf_classifier_selected.predict(X_test_selected)

# # Evaluate the model
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

# print("\nAccuracy:", accuracy_score(y_test, y_pred))


In [None]:
# Show the SMOTE-resampled training data produced in the SMOTE section.
# The names used here before (X_train_resampled / y_train_resampled) are only
# assigned inside the commented-out cell above, so this cell raised a NameError.
display(X_train_smote)
display(y_train_smote)

In [None]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X, y, test_size=0.2, random_state=42)

# Weight the positive class by the majority/minority ratio observed in the
# training labels, rather than by a hard-coded 90000 / 4000 estimate.
class_counts = y_train_4.value_counts()
ratio_class_0_to_1 = float(class_counts.loc[0] / class_counts.loc[1])
class_weights = {0: 1.0, 1: ratio_class_0_to_1}


rf_classifier4 = RandomForestClassifier(n_estimators=500, class_weight=class_weights, random_state=42)
rf_classifier4.fit(X_train_4, y_train_4)


feature_importances = rf_classifier4.feature_importances_

# Column positions of the k most important features, most important first.
k = 100
top_features_indices = feature_importances.argsort()[-k:][::-1]

X_selected_train = X_train_4.iloc[:, top_features_indices]
X_selected_test = X_test_4.iloc[:, top_features_indices]

In [None]:
# Step 5 + 6: refit the same forest on the selected features and predict the test partition
y_pred = rf_classifier4.fit(X_selected_train, y_train_4).predict(X_selected_test)

# Evaluate model performance
print("Classification Report:", classification_report(y_test_4, y_pred), sep="\n")

print("\nConfusion Matrix:", confusion_matrix(y_test_4, y_pred), sep="\n")


In [None]:
X_all = pd.concat([X_selected_train, X_selected_test], axis=0)
X_all.shape

In [None]:
# Assuming X_all is your entire DataFrame with features
predictions_all = rf_classifier4.predict(X_all)
X_all['Predicted_Target_B'] = predictions_all

In [None]:
X_all['Predicted_Target_B'].value_counts()

In [None]:
X_all.to_csv('pred_df.csv', index=False)

# Process 2

In [None]:
subset_data = donor_df[donor_df['TARGET_B'] == 1]
subset_data.shape

In [None]:
X = subset_data
y = subset_data['TARGET_D']

In [None]:
display(X.shape)
display(y.shape)

In [None]:
X_cat_p2 = X.select_dtypes(object)
X_num_p2 = X.select_dtypes(np.number)

In [None]:
# transformer = StandardScaler()
X_num_scaled = pd.DataFrame(transformer.transform(X_num_p2), columns=X_num.columns)

display(X_num_scaled)

In [None]:
# Take the categorical columns of the Process 2 subset (X_cat_p2). The cell
# used X_cat, which is the full-data frame from the V2 section.
X_cat_onehot = X_cat_p2.drop(columns=['CLUSTER'])
display(X_cat_onehot.head())
display(X_cat_onehot.shape)

In [None]:
X_subset_train, X_subset_test, y_subset_train, y_subset_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Assuming scaler is the scaler used in Process 1
X_subset_train_scaled = scaler.transform(X_subset_train)
X_subset_test_scaled = scaler.transform(X_subset_test)