In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv(r"../data/Bank_Personal_Loan_Modelling.csv")
data.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In this case, our products that we are targeting are
   - Mortgage
   - Personal Loan
   - Securities Account
   - CD account
   - Online
   - Credit Card

In [4]:
y_columns = [
    "Mortgage",
    "Personal Loan",
    "Securities Account",
    "CD Account",
    "Online",
    "Credit Card",
]

# Data Analytics

## Checking For Missing and duplicated Values

In [5]:
print("========== Missing Values ==========")
print(data.isnull().sum())
print("========== Duplicate Values ==========")
print(data.duplicated().sum())

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64
0


## Discover Hidden relation in Zip Code

Somethings I can think about Zip Code relation
   - Average GDP of the given Zip code
   - Population Density
   - Education Level

While all this values, if found, can somewhat be correlate to other rows (hopefully not), such as "Education", "CCAvg" etc.. It can give us hidden insights in to the users external factors, such as pressure by society and stuff. Hence, given a certain ZIP code maybe somewhat useless, we can replace it with the 3 mentioned values

In [6]:
import zipcodes
zipcode = data["ZIP Code"].unique()

def get_county(zipcode):
    finder = zipcodes.matching(zipcode)
    if len(finder) > 0:
        val = finder[0]["county"], finder[0]["lat"], finder[0]["long"]
    else:
        val = None, None, None
    return val

data["County"], data["Latitude"], data["Longitude"] = zip(*data["ZIP Code"].astype(str).apply(lambda x : get_county(x)))

In [7]:
col_to_drop = ['Rank within US (of 3142 counties)',
               'Rank within US (of 3143 counties)',
               'People (Education: Less Than 9th Grade)',
               "People (Education: At Least Bachelors Degree)",
               ' FIPS']

In [8]:
# Income (Median Family Income)
california_income = pd.read_csv(r"..\data\california_income.csv", delimiter=",")[2:].reset_index(drop=True).drop(columns=col_to_drop, errors = "ignore")
# Education (Percentage of people with atleast a bachelors)
california_education = pd.read_csv(r"..\data\california_education.csv", delimiter=",")[2:].reset_index(drop=True).drop(columns=col_to_drop, errors = "ignore")
california_education.rename(columns={"Value (Percent)": "County Education"}, inplace=True)
# Total Population
california_population = pd.read_csv(r"..\data\california_population.csv", delimiter=",")[2:].reset_index(drop=True).drop(columns=col_to_drop, errors = "ignore")
california_population["Total Population"] = california_population["People (Age 18-39)"]*100/california_population["Value (Percent)"]
california_population.drop(columns = ["People (Age 18-39)","Value (Percent)"], inplace = True, errors = "ignore")

In [9]:
data.shape

(5000, 17)

In [10]:
data_merge = data.merge(california_income, on="County", how="inner")
data_merge = data_merge.merge(california_education, on="County", how="inner")
data_merge = data_merge.merge(california_population, on="County", how="inner")

In [11]:
col_to_rename = {
    "Personal Loan": "Personal_Loan",
    "Securities Account": "Securities_Account",
    "CD Account": "CD_Account",
    "ZIP Code": "ZIP_Code",
    "Value (Dollars)": "Median_Income",
    "Country_Eudcation": "Country_Education",
    "Total Population": "Total_Population",
}

In [12]:
data_merge.rename(
    columns = col_to_rename, inplace = True, errors = "ignore"
)

In [13]:
data_merge  

Unnamed: 0,ID,Age,Experience,Income,ZIP_Code,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard,County,Latitude,Longitude,Median_Income,County Education,Total_Population
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0,Los Angeles County,34.1620,-118.0894,83411,34.6,9.949538e+06
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0,Los Angeles County,34.0218,-118.2883,83411,34.6,9.949538e+06
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0,Alameda County,37.8746,-122.2547,122488,50.9,1.663446e+06
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0,San Francisco County,37.7217,-122.4446,136689,59.8,8.517527e+05
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1,Los Angeles County,34.2429,-118.5273,83411,34.6,9.949538e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4961,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0,Orange County,33.6473,-117.8409,109361,42.8,3.175215e+06
4962,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0,San Diego County,32.8668,-117.2482,96974,41.0,3.286497e+06
4963,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0,Ventura County,34.5210,-119.2477,102141,34.7,8.416505e+05
4964,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0,Los Angeles County,34.0293,-118.3994,83411,34.6,9.949538e+06


In [14]:
data_a = pd.read_csv(r"../data/train.csv", delimiter=";")

In [15]:
data_merge.columns = data_merge.columns.str.replace(" ", "_").str.lower()

In [16]:
data_merge.rename(columns={'personal_loan': "loan"}, inplace=True)

In [17]:
data_merge.columns

Index(['id', 'age', 'experience', 'income', 'zip_code', 'family', 'ccavg',
       'education', 'mortgage', 'loan', 'securities_account', 'cd_account',
       'online', 'creditcard', 'county', 'latitude', 'longitude',
       'median_income', 'county_education', 'total_population'],
      dtype='object')

In [18]:
data_a.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

### Data_a already have "loan" so we can focus mainly on

- securities_account
- mortgage
- cd_account
- online
- creditcard

### Some special case to take in
- family >= 2 (married/divorced) else single
- Education primary = 1, secondary = 2, tertiary = 3

In [19]:
data_a

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [20]:
data_merge

Unnamed: 0,id,age,experience,income,zip_code,family,ccavg,education,mortgage,loan,securities_account,cd_account,online,creditcard,county,latitude,longitude,median_income,county_education,total_population
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0,Los Angeles County,34.1620,-118.0894,83411,34.6,9.949538e+06
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0,Los Angeles County,34.0218,-118.2883,83411,34.6,9.949538e+06
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0,Alameda County,37.8746,-122.2547,122488,50.9,1.663446e+06
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0,San Francisco County,37.7217,-122.4446,136689,59.8,8.517527e+05
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1,Los Angeles County,34.2429,-118.5273,83411,34.6,9.949538e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4961,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0,Orange County,33.6473,-117.8409,109361,42.8,3.175215e+06
4962,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0,San Diego County,32.8668,-117.2482,96974,41.0,3.286497e+06
4963,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0,Ventura County,34.5210,-119.2477,102141,34.7,8.416505e+05
4964,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0,Los Angeles County,34.0293,-118.3994,83411,34.6,9.949538e+06


In [21]:
data_a.groupby("education").education.count()

education
primary       6851
secondary    23202
tertiary     13301
unknown       1857
Name: education, dtype: int64

In [22]:
common_columns = list(data_a.columns.intersection(data_merge.columns))
other_columns = {
    "balance": "income",
    "marital": "family"
} # Formatted in col_a:col_b

y_columns = ["securities_account",
             "mortgage",
             "cd_account",
             "online",
             "creditcard"]

In [23]:
data_a[common_columns + list(other_columns.keys())]

Unnamed: 0,age,education,loan,balance,marital
0,58,tertiary,no,2143,married
1,44,secondary,no,29,single
2,33,secondary,yes,2,married
3,47,unknown,no,1506,married
4,33,unknown,no,1,single
...,...,...,...,...,...
45206,51,tertiary,no,825,married
45207,71,primary,no,1729,divorced
45208,72,secondary,no,5715,married
45209,57,secondary,no,668,married


In [24]:
data_merge[common_columns + list(other_columns.values())]

Unnamed: 0,age,education,loan,income,family
0,25,1,0,49,4
1,45,1,0,34,3
2,39,1,0,11,1
3,35,2,0,100,1
4,35,2,0,45,4
...,...,...,...,...,...
4961,29,3,0,40,1
4962,30,1,0,15,4
4963,63,3,0,24,2
4964,65,2,0,49,3


In [25]:
def data_processing_a(data):
    data_copy = data.copy()[common_columns + list(other_columns.keys())]
    data_copy = data_copy.replace("unknown", pd.NA).dropna()
    def encode_education(value):
        if value == "tertiary":
            return 3
        if value == "secondary":
            return 2
        if value == "primary":
            return 1
    def encode_loan(value):
        if value == "yes":
            return 1
        else:
            return 0
    def encode_marital(value):
        if value in ("married","divorced"):
            return 2
        else:
            return 1
    
    data_copy["education"] = data_copy["education"].apply(encode_education)
    data_copy["loan"] = data_copy["loan"].apply(encode_loan)
    data_copy["marital"] = data_copy["marital"].apply(encode_marital)
    data_copy.rename(columns = other_columns, inplace = True)
    return data_copy

def data_processing_b(data):
    data_copy = data.copy()[common_columns + list(other_columns.values())]
    y_vals = data.copy()[y_columns]
    data_copy["family"] = [2 if _val >= 2 else 1 for _val in data_copy["family"]]
    y_vals["mortgage"] = [1 if _val > 0 else 0 for _val in y_vals["mortgage"]]
    return data_copy, y_vals

In [26]:
data_a = data_a[data_a["education"] != "unknown"]

In [27]:
final_result = data_processing_a(data_a)
data_b, y_all = data_processing_b(data_merge)

In [28]:
final_result

Unnamed: 0,age,education,loan,income,family
0,58,3,0,2143,2
1,44,2,0,29,1
2,33,2,1,2,2
5,35,3,0,231,2
6,28,3,1,447,1
...,...,...,...,...,...
45206,51,3,0,825,2
45207,71,1,0,1729,2
45208,72,2,0,5715,2
45209,57,2,0,668,2


In [29]:
data_b

Unnamed: 0,age,education,loan,income,family
0,25,1,0,49,2
1,45,1,0,34,2
2,39,1,0,11,1
3,35,2,0,100,1
4,35,2,0,45,2
...,...,...,...,...,...
4961,29,3,0,40,1
4962,30,1,0,15,2
4963,63,3,0,24,2
4964,65,2,0,49,2


In [30]:
y_all

Unnamed: 0,securities_account,mortgage,cd_account,online,creditcard
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,1
...,...,...,...,...,...
4961,0,0,0,1,0
4962,0,1,0,1,0
4963,0,0,0,0,0
4964,0,0,0,1,0


## Model Creation

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.decomposition import PCA
import os
import pickle
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


def model_pipeline(X, y, param_grid, model, test_size=0.2, random_state=3101, scoring = 'accuracy'):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    smote = RandomOverSampler(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('pca', PCA(n_components = 0.95)),
        ('model', model)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring=scoring)
    
    grid_search.fit(X_resampled, y_resampled)
    
    best_pipeline = grid_search.best_estimator_

    y_pred = best_pipeline.predict(X_test)
    
    if scoring == "accuracy":
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "f1 Score": f1_score(y_test, y_pred, average="macro"),
            "recall": recall_score(y_test, y_pred, average="macro"),
            "precision": precision_score(y_test, y_pred, average="macro"),
        }
    else:
        metrics = {
        "mean_squared_error": mean_squared_error(y_test, y_pred),
        "mean_absolute_error": mean_absolute_error(y_test, y_pred),
        "r2_score": r2_score(y_test, y_pred)
    }

    return metrics, best_pipeline, grid_search.best_params_

def print_accuracy(dict):
    for _model, _result in dict.items():
        print(f"======== {_model} =======")
        print(pd.DataFrame.from_dict(_result[0], orient="index", columns=["Value"]).T)
        
def save_pipeline(result, save_path, model_name):
    pipeline = result[model_name][1]
    save_path += f'_{model_name.replace(" ", "_").lower()}'
    file_number = 0
    while os.path.exists(save_path):
        file_number += 1
    save_path += f"_{str(file_number)}.pkl"
    with open(save_path, 'wb') as f:
        pickle.dump(pipeline, f)

In [42]:
knn_grid = {
    'model__n_neighbors': [1,2, 3, 5, 7, 10],       # Hyperparameter tuning for KNN
    'model__weights': ['uniform', 'distance'],
    'model__metric': ['euclidean', 'manhattan']
}

dt_grid = {
    'model__max_depth': [None, 5, 10, 15],  # Hyperparameter tuning for Decision Tree
    'model__min_samples_split': [2, 5, 10],
    'model__criterion': ['gini', 'entropy']
}

lr_grid = {
    'model__C': [0.01, 0.01, 0.1, 1, 10],          # Hyperparameter tuning for Logistic Regression
    'model__penalty': ['l1', 'l2'],
    'model__solver': ['liblinear']
}

rf_grid = {
    'model__n_estimators': [25, 50, 100, 200, 400],   # Hyperparameter tuning for Random Forest
    'model__max_depth': [None, 10, 20, 30, 40, 50],
    'model__min_samples_split': [2, 5, 10]
}

gb_grid = {
    'model__n_estimators': [50, 100, 200],         # Number of boosting stages
    'model__learning_rate': [0.01, 0.1, 0.2],      # Step size
    'model__max_depth': [3, 4, 5],                 # Maximum depth of each tree
    'model__subsample': [0.8, 1.0],                # Fraction of samples used for fitting individual base learners
    'model__min_samples_split': [2, 5, 10]         # Minimum number of samples required to split a node
}
pipeline_directory = r"C:\Users\limti\PycharmProjects\DSA3101-Group-20\group_B\synthetic_data_model"

### Securities Account
Model to use: [KNN, logistic regression, decision tree, random forest]

In [38]:
securities_allresults = {
    "KNN": model_pipeline(data_b, y_all["securities_account"], knn_grid, KNeighborsClassifier(), test_size=0.2),
    "Decision Tree": model_pipeline(data_b, y_all["securities_account"], dt_grid,DecisionTreeClassifier(), test_size=0.2),
    "Logistic Regression": model_pipeline(data_b, y_all["securities_account"], lr_grid,LogisticRegression(), test_size=0.2),
    "Random Forest": model_pipeline(data_b, y_all["securities_account"],rf_grid ,RandomForestClassifier(), test_size=0.2),
}

In [51]:
print(print_accuracy(securities_allresults))

       accuracy  f1 Score    recall  precision
Value  0.816901  0.515486  0.515773   0.515251
       accuracy  f1 Score    recall  precision
Value  0.809859  0.518918  0.520431   0.518139
       accuracy  f1 Score    recall  precision
Value  0.412475  0.357813  0.487671   0.495209
       accuracy  f1 Score    recall  precision
Value  0.817907  0.511988  0.512041   0.511938
None


TypeError: sorted expected 1 argument, got 2

In [54]:
save_pipeline(securities_allresults, os.path.join(pipeline_directory, "securities"), "Random Forest")

In [59]:
#securities_allresults["Gradient Boost"] = model_pipeline(data_b, y_all["securities_account"], gb_grid, GradientBoostingClassifier(random_state=3101), test_size=0.2)

  _data = np.array(data, dtype=dtype, copy=copy,


### cd_account
Model to use: [KNN, logistic regression, decision tree, random forest]

In [60]:
cd_account_allresults = {
    "KNN": model_pipeline(data_b, y_all["cd_account"], knn_grid, KNeighborsClassifier(), test_size=0.2),
    "Decision Tree": model_pipeline(data_b, y_all["cd_account"], dt_grid,DecisionTreeClassifier(), test_size=0.2),
    "Logistic Regression": model_pipeline(data_b, y_all["cd_account"], lr_grid,LogisticRegression(), test_size=0.2),
    "Random Forest": model_pipeline(data_b, y_all["cd_account"],rf_grid ,RandomForestClassifier(), test_size=0.2),
}

In [61]:
print(print_accuracy(cd_account_allresults))

       accuracy  f1 Score    recall  precision
Value   0.89839  0.526164  0.526856   0.525549
       accuracy  f1 Score    recall  precision
Value   0.88833  0.533169  0.538648   0.530067
       accuracy  f1 Score    recall  precision
Value  0.911469  0.657376  0.696389   0.634137
       accuracy  f1 Score    recall  precision
Value  0.914487  0.556685  0.552493   0.562167
None


In [63]:
for _model_name, _result in cd_account_allresults.items():
    save_pipeline(cd_account_allresults, os.path.join(pipeline_directory, "cd_account"), _model_name)

### Online

In [64]:
online_account_allresults = {
    "KNN": model_pipeline(data_b, y_all["online"], knn_grid, KNeighborsClassifier(), test_size=0.2),
    "Decision Tree": model_pipeline(data_b, y_all["online"], dt_grid,DecisionTreeClassifier(), test_size=0.2),
    "Logistic Regression": model_pipeline(data_b, y_all["online"], lr_grid,LogisticRegression(), test_size=0.2),
    "Random Forest": model_pipeline(data_b, y_all["online"],rf_grid ,RandomForestClassifier(), test_size=0.2),
}

In [65]:
print(print_accuracy(online_account_allresults))

       accuracy  f1 Score   recall  precision
Value  0.504024     0.491  0.49241   0.492681
       accuracy  f1 Score    recall  precision
Value  0.481891  0.467574  0.468293   0.469373
       accuracy  f1 Score   recall  precision
Value  0.472837  0.464774  0.46811    0.46956
       accuracy  f1 Score    recall  precision
Value  0.517103  0.495431  0.495449   0.495481
None


In [66]:
for _model_name, _result in online_account_allresults.items():
    save_pipeline(online_account_allresults, os.path.join(pipeline_directory, "online"), _model_name)

### Credit Card

In [67]:
cc_account_allresults = {
    "KNN": model_pipeline(data_b, y_all["creditcard"], knn_grid, KNeighborsClassifier(), test_size=0.2),
    "Decision Tree": model_pipeline(data_b, y_all["creditcard"], dt_grid,DecisionTreeClassifier(), test_size=0.2),
    "Logistic Regression": model_pipeline(data_b, y_all["creditcard"], lr_grid,LogisticRegression(), test_size=0.2),
    "Random Forest": model_pipeline(data_b, y_all["creditcard"],rf_grid ,RandomForestClassifier(), test_size=0.2),
}

In [68]:
print(print_accuracy(cc_account_allresults))

       accuracy  f1 Score    recall  precision
Value  0.573441  0.495827  0.495825   0.495872
       accuracy  f1 Score    recall  precision
Value  0.574447  0.502945  0.503215   0.503107
       accuracy  f1 Score    recall  precision
Value   0.50503  0.491179  0.522177   0.518745
       accuracy  f1 Score    recall  precision
Value  0.582495  0.497402  0.497536   0.497471
None


In [69]:
for _model_name, _result in online_account_allresults.items():
    save_pipeline(cc_account_allresults, os.path.join(pipeline_directory, "creditcard"), _model_name)

### Mortgage

In [43]:
mortgage_allresults = {
    "KNN": model_pipeline(data_b, y_all["mortgage"], knn_grid, KNeighborsClassifier(), test_size=0.2),
    "Decision Tree": model_pipeline(data_b, y_all["mortgage"], dt_grid,DecisionTreeClassifier(), test_size=0.2),
    "Logistic Regression": model_pipeline(data_b, y_all["mortgage"], lr_grid,LogisticRegression(), test_size=0.2),
    "Random Forest": model_pipeline(data_b, y_all["mortgage"],rf_grid ,RandomForestClassifier(), test_size=0.2),
}

In [44]:
print(print_accuracy(mortgage_allresults))

       accuracy  f1 Score    recall  precision
Value  0.607646  0.539657  0.539466   0.539898
       accuracy  f1 Score    recall  precision
Value  0.583501  0.511329  0.511289   0.511413
       accuracy  f1 Score    recall  precision
Value  0.605634  0.498158  0.502478   0.503031
       accuracy  f1 Score    recall  precision
Value  0.579477  0.504801  0.504817   0.504906
None


In [46]:
for _model_name, _result in mortgage_allresults.items():
    save_pipeline(mortgage_allresults, os.path.join(pipeline_directory, "mortgage"), _model_name)

In [1]:
cd_account_model_path = r"C:\Users\limti\PycharmProjects\DSA3101-Group-20\group_B\synthetic_data_model\cd_account_random_forest_0.pkl"
securities_model_path = r"C:\Users\limti\PycharmProjects\DSA3101-Group-20\group_B\synthetic_data_model\securities_random_forest_0.pkl"

In [33]:
import pickle
cd_account_model = pickle.load(open(cd_account_model_path, "rb"))
securities_model = pickle.load(open(securities_model_path, "rb"))

In [38]:
data_a

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,cd_account,securities
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,0,0
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no,0,0
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes,0,0
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes,0,0
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes,0,0
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no,0,0


In [35]:
final_result

Unnamed: 0,age,education,loan,income,family
0,58,3,0,2143,2
1,44,2,0,29,1
2,33,2,1,2,2
5,35,3,0,231,2
6,28,3,1,447,1
...,...,...,...,...,...
45206,51,3,0,825,2
45207,71,1,0,1729,2
45208,72,2,0,5715,2
45209,57,2,0,668,2


In [36]:
data_a["cd_account"] = cd_account_model.predict(final_result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_a["cd_account"] = cd_account_model.predict(final_result)


In [37]:
data_a["securities"] = securities_model.predict(final_result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_a["securities"] = securities_model.predict(final_result)


In [39]:
(data_a["cd_account"] == data_a["securities"]).sum()

39786

In [40]:
data_path = r"C:\Users\limti\PycharmProjects\DSA3101-Group-20\data"
dataset_name = "clean_dataset.csv"

data_a.to_csv(os.path.join(data_path, dataset_name), index=False)