In [668]:
# pip install scikit-learn
# pip install pandas

In [669]:
import pandas as pd
import numpy as np

np.random.seed(0)

In [670]:
# dataset 1: https://www.kaggle.com/datasets/blastchar/telco-customer-churn/
csv_path = "datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [671]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [672]:
# drop customerID
df.drop('customerID', axis=1, inplace=True)

In [673]:
# https://community.ibm.com/community/user/businessanalytics/blogs/steven-macko/2019/07/11/telco-customer-churn-1113
# customerID -> unique customer id
# gender -> [Male, Female]
# SeniorCitizen -> [0, 1]
# Partner -> [Yes, No]
# Dependents -> [Yes, No]
# tenure -> Number of months the customer has stayed with the company
# PhoneService -> [Yes, No]
# MultipleLines -> [No phone service, No, Yes]
# InternetService -> [DSL, Fiber optic, No]
# OnlineSecurity -> [No, Yes, No internet service]
# OnlineBackup -> [No, Yes, No internet service]
# DeviceProtection -> [No, Yes, No internet service]
# TechSupport -> [No, Yes, No internet service]
# StreamingTV -> [No, Yes, No internet service]
# StreamingMovies -> [No, Yes, No internet service]
# Contract -> [Month-to-month, One year, Two year]    
# PaperlessBilling -> [Yes, No]
# PaymentMethod -> [Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)]
# MonthlyCharges -> Monthly charge
# TotalCharges -> Total charge
# Churn -> [Yes, No]

# find the unique values of gender, senior citizen, partner, dependents, phone service, multiple lines, 
# internet service, online security, online backup, device protection, tech support
# streaming tv, streaming movies, contract, paperless billing, payment method, churn
columns = df.columns
# drop customer id, tenure, monthly charges, total charges
columns = columns.drop(['tenure', 'MonthlyCharges', 'TotalCharges'])
for column in columns:
    print(column, df[column].unique())


gender ['Female' 'Male']
SeniorCitizen [0 1]
Partner ['Yes' 'No']
Dependents ['No' 'Yes']
PhoneService ['No' 'Yes']
MultipleLines ['No phone service' 'No' 'Yes']
InternetService ['DSL' 'Fiber optic' 'No']
OnlineSecurity ['No' 'Yes' 'No internet service']
OnlineBackup ['Yes' 'No' 'No internet service']
DeviceProtection ['No' 'Yes' 'No internet service']
TechSupport ['No' 'Yes' 'No internet service']
StreamingTV ['No' 'Yes' 'No internet service']
StreamingMovies ['No' 'Yes' 'No internet service']
Contract ['Month-to-month' 'One year' 'Two year']
PaperlessBilling ['Yes' 'No']
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn ['No' 'Yes']


In [674]:
# preprocess data
for column in columns:
    if df[column].dtype == 'object' and column != 'Churn':
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df.drop(column, axis=1, inplace=True)

# convert churn to 0 or 1
df['Churn'] = df['Churn'].astype('category').cat.codes

In [675]:
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,True,False,False,True,True,...,False,True,False,False,False,True,False,False,True,False
1,0,34,56.95,1889.5,0,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,False,True,True,False,True,...,False,True,False,False,False,True,False,False,False,True
3,0,45,42.3,1840.75,0,False,True,True,False,True,...,False,False,True,False,True,False,True,False,False,False
4,0,2,70.7,151.65,1,True,False,True,False,True,...,False,True,False,False,False,True,False,False,True,False


In [676]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 46 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   SeniorCitizen                            7043 non-null   int64  
 1   tenure                                   7043 non-null   int64  
 2   MonthlyCharges                           7043 non-null   float64
 3   TotalCharges                             7043 non-null   object 
 4   Churn                                    7043 non-null   int8   
 5   gender_Female                            7043 non-null   bool   
 6   gender_Male                              7043 non-null   bool   
 7   Partner_No                               7043 non-null   bool   
 8   Partner_Yes                              7043 non-null   bool   
 9   Dependents_No                            7043 non-null   bool   
 10  Dependents_Yes                           7043 no

In [677]:
# convert total charges to float
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 46 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   SeniorCitizen                            7043 non-null   int64  
 1   tenure                                   7043 non-null   int64  
 2   MonthlyCharges                           7043 non-null   float64
 3   TotalCharges                             7032 non-null   float64
 4   Churn                                    7043 non-null   int8   
 5   gender_Female                            7043 non-null   bool   
 6   gender_Male                              7043 non-null   bool   
 7   Partner_No                               7043 non-null   bool   
 8   Partner_Yes                              7043 non-null   bool   
 9   Dependents_No                            7043 non-null   bool   
 10  Dependents_Yes                           7043 no

In [678]:
# save as csv
# df.to_csv('datasets/telco_customer_churn_preprocessed.csv', index=False)

In [679]:
# drop rows with missing values
df.dropna(inplace=True)

In [680]:
# split the data into 80% training and 20% testing using sklearn
from sklearn.model_selection import train_test_split

# churn is the target
X = df.drop(['Churn'], axis=1).values
y = df['Churn'].values

# split the data into 80% training and 20% testing using sklearn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [681]:
# scale the data
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# fit to the training data
X_train = sc.fit_transform(X_train)
# transform the testing data
X_test = sc.transform(X_test)


In [682]:
print(X_train.shape)
print(X_test.shape)

(5625, 45)
(1407, 45)


In [683]:
# count the number of NaN in X_train
np.isnan(X_train).sum()

0

In [684]:
# replace Nan with average
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# # fit to the training data
# X_train = imputer.fit_transform(X_train)
# # transform the testing data
# X_test = imputer.transform(X_test)

In [685]:
# use sklearn logistic regression
from sklearn.linear_model import LogisticRegression

# create the model
model = LogisticRegression()

# train the model
model.fit(X_train, y_train)


In [686]:
# evaluate the model
from sklearn.metrics import classification_report, confusion_matrix

# print the predictions
predictions = model.predict(X_test)

In [687]:
# accuracy
print(f"accuracy: {model.score(X_test, y_test)}")
# precision, recall, f1-score, support
print(classification_report(y_test, predictions))


accuracy: 0.8073916133617626
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1038
           1       0.66      0.55      0.60       369

    accuracy                           0.81      1407
   macro avg       0.75      0.72      0.74      1407
weighted avg       0.80      0.81      0.80      1407



In [688]:
# confusion matrix
print(confusion_matrix(y_test, predictions))

[[933 105]
 [166 203]]


In [689]:
# create custom metrics functions
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

def precision(y_true, y_pred):
    return np.sum(y_true * y_pred) / np.sum(y_pred)

def recall(y_true, y_pred):
    return np.sum(y_true * y_pred) / np.sum(y_true)

def specificity(y_true, y_pred):
    return np.sum((1 - y_true) * (1 - y_pred)) / np.sum(1 - y_true)

def false_discovery_rate(y_true, y_pred):
    return 1 - precision(y_true, y_pred)

def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * p * r / (p + r)

In [690]:
# custom logistic regression class
class MyLogisticRegression:

    def __init__(self, n_features, 
                        lr=0.1, 
                        n_iters=1000, 
                        threshold=0,
                        show_loss=False):
        self.n_features = n_features
        self.lr = lr
        self.n_iters = n_iters
        self.weights = np.random.randn(n_features+1)
        # Early terminate Gradient Descent if error in the training set becomes less than threshold
        self.threshold = threshold 
        self.show_loss = show_loss

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _information_gain(self, X, y, feature):
        original_entropy = self._entropy(y)

        # Get the values and counts for the feature
        values, counts = np.unique(X[:, feature], return_counts=True)

        # Calculate the remainder
        remainder = 0
        for value, count in zip(values, counts):
            remainder += count / counts.sum() * self._entropy(y[X[:, feature] == value])

        # Calculate the information gain
        info_gain = original_entropy - remainder
        return info_gain

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def _cost(self, X, y):
        y_pred = self._sigmoid(X @ self.weights)
        cost = -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) / len(y)
        return cost

    def _gradient(self, X, y):
        y_pred = self._sigmoid(X @ self.weights)
        gradient = X.T @ (y_pred - y) / len(y)
        return gradient

    def fit(self, X, y):
        # check shape
        if X.shape[0] != y.shape[0]:
            raise ValueError("shape of X and y do not match")

        # check shape len
        if len(X.shape) != 2:
            raise ValueError("X must be 2 dimensional")

        # print(X.shape, y.shape)

        # Calculate information gain for each feature
        info_gains = [self._information_gain(X, y, feature) for feature in range(X.shape[1])]

        # Get the indices of the features sorted by information gain
        indices = np.argsort(info_gains)[::-1]

        # Select the top n features
        self.selected_features = indices[:self.n_features]

        # freate a new array with only the selected features
        X = X[:, self.selected_features]

        # print(X.shape, y.shape)

        # Add column for bias
        X = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)

        # apply gradient descent
        for i in range(self.n_iters):
            self.weights -= self.lr * self._gradient(X, y)
            loss = self._cost(X, y).mean()
            if self.show_loss:
                print(f"epoch {i+1}, loss: {loss}")
            # early terminate if mse is less than threshold
            if loss < self.threshold:
                break


    def predict(self, X):

        if self.selected_features is None:
            raise Exception("model must be trained before prediction")

        # select the features
        X = X[:, self.selected_features]
        
        # Add column for bias
        X = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)

        # predict
        y_pred = self._sigmoid(X @ self.weights)

        # convert probabilities to 0 or 1
        y_pred = np.round(y_pred).astype(int)
        return y_pred
        

In [691]:
# create the model
# model = MyLogisticRegression(n_features=X_train.shape[1])
n = int(X_train.shape[1] * 0.8)
model = MyLogisticRegression(n_features=n, show_loss=True, threshold=0)

# train the model
model.fit(X_train, y_train)

epoch 1, loss: 0.0005619391343231694
epoch 2, loss: 0.0005311144269225729
epoch 3, loss: 0.0005016827564854922
epoch 4, loss: 0.00047359555244356685
epoch 5, loss: 0.0004468293280058132
epoch 6, loss: 0.00042139898459689904
epoch 7, loss: 0.0003973644271853211
epoch 8, loss: 0.0003748181653698315
epoch 9, loss: 0.00035385168214891437
epoch 10, loss: 0.00033452541776208346
epoch 11, loss: 0.0003168677182324484
epoch 12, loss: 0.0003008894379924009
epoch 13, loss: 0.0002865858732208464
epoch 14, loss: 0.0002739234110270318
epoch 15, loss: 0.0002628270007346343
epoch 16, loss: 0.0002531781759730774
epoch 17, loss: 0.0002448227350278807
epoch 18, loss: 0.0002375844526977955
epoch 19, loss: 0.0002312813509281584
epoch 20, loss: 0.00022574067710747465
epoch 21, loss: 0.00022080936927656186
epoch 22, loss: 0.0002163589413123425
epoch 23, loss: 0.0002122857893611828
epoch 24, loss: 0.0002085087433965372
epoch 25, loss: 0.00020496546920614858
epoch 26, loss: 0.00020160874427141917
epoch 27, los

In [692]:
def report(y_true, y_pred):
    print(f"accuracy: {accuracy(y_true, y_pred):.4f}")
    print(f"precision: {precision(y_true, y_pred):.4f}")
    print(f"recall: {recall(y_true, y_pred):.4f}")
    print(f"specificity: {specificity(y_true, y_pred):.4f}")
    print(f"fdr: {false_discovery_rate(y_true, y_pred):.4f}")
    print(f"f1: {f1(y_true, y_pred):.4f}")

In [693]:
# predict train data
y_pred = model.predict(X_train)
print("train data")
report(y_train, y_pred)

train data
accuracy: 0.8039
precision: 0.6594
recall: 0.5473
f1: 0.5982


In [694]:
# predict test data
y_pred = model.predict(X_test)
print("test data")
report(y_test, y_pred)

test data
accuracy: 0.8003
precision: 0.6447
recall: 0.5312
f1: 0.5825


In [695]:
class AdaBoost:
    def __init__(self, num_classifiers):
        self.num_classifiers = num_classifiers
        self.alphas = None
        self.classifiers = None

    def resample(self, X, y, weights):
        indices = np.random.choice(len(X), len(X), p=weights)
        return X[indices], y[indices]

    def fit(self, X, y):
        n_samples, n_features = X.shape
        weights = np.ones(n_samples) / n_samples
        alphas = []
        classifiers = []

        for k in range(self.num_classifiers):
            X_resampled, y_resampled = self.resample(X, y, weights)
            n = int(X_train.shape[1] * 0.8)
            classifier = MyLogisticRegression(n_features=n, show_loss=False, threshold=0)
            classifier.fit(X_resampled, y_resampled)

            predictions = classifier.predict(X)
            error = np.sum(weights * (predictions != y))

            if error > 0.5:
                continue

            alpha = np.log((1 - error) / error)
            weights = weights * np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)

            alphas.append(alpha)
            classifiers.append(classifier)

        self.alphas = np.array(alphas)
        self.classifiers = classifiers

    def predict(self, X):
        predictions = np.zeros(len(X))

        for alpha, classifier in zip(self.alphas, self.classifiers):
            predictions += alpha * classifier.predict(X)

        return np.sign(predictions)

    def weighted_majority(self, X):
        return self.predict(X)


In [696]:
K = [5, 10, 15, 20]
for k in K:
    print(f"num_classifiers: {k}")
    model = AdaBoost(num_classifiers=k)
    model.fit(X_train, y_train)

    # predict train data
    y_pred = model.predict(X_train)
    print(f"train: k={k}, accuracy: {accuracy(y_train, y_pred):.4f}")

    # predict test data
    y_pred = model.predict(X_test)
    print(f"test: k={k}, accuracy: {accuracy(y_test, y_pred):.4f}\n")

    

num_classifiers: 5


train: k=5, accuracy: 0.8030
test: k=5, accuracy: 0.7982

num_classifiers: 10
train: k=10, accuracy: 0.8004
test: k=10, accuracy: 0.7960

num_classifiers: 15


KeyboardInterrupt: 

In [866]:
# import adult dataset from datasets/adult folder
# https://archive.ics.uci.edu/ml/datasets/adult

# preprocess the data
# convert categorical data to numerical data
# split the data into 80% training and 20% testing using sklearn
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
'hours-per-week', 'native-country', 'income']


In [867]:
train_df = pd.read_csv('datasets/adult/adult.data', names=columns)
train_df['train'] = 1

In [868]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
 15  train           32561 non-null  int64 
dtypes: int64(7), object(9)
memory usage: 4.0+ MB


In [869]:
test_df = pd.read_csv('datasets/adult/adult.test', names=columns, skiprows=1)
test_df['train'] = 0

In [870]:
print(len(train_df['native-country'].unique()))
print(len(test_df['native-country'].unique()))


42
41


In [871]:
# concatenate train and test data
df = pd.concat([train_df, test_df])

In [872]:
df['income'].unique()

array([' <=50K', ' >50K', ' <=50K.', ' >50K.'], dtype=object)

In [873]:
# replace icome with 0 or 1
df['income'] = df['income'].str.replace('.', '')
df['income'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [874]:
df['education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [875]:
df['education-num'].unique()

array([13,  9,  7, 14,  5, 10, 12, 11,  4, 16, 15,  3,  6,  2,  1,  8],
      dtype=int64)

In [876]:
# education and education-num are the same
# drop education
df.drop('education', axis=1, inplace=True)

In [877]:
df['income'] = df['income'].astype('category').cat.codes
df['income'].unique()

array([0, 1], dtype=int8)

In [878]:
# preprocess the data
# convert categorical data to numerical data
for column in columns:
    if column not in ['income', 'train', 'education']:
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df.drop(column, axis=1, inplace=True)
df.head()

Unnamed: 0,income,train,age_17,age_18,age_19,age_20,age_21,age_22,age_23,age_24,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,0,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,0,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,0,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,0,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [879]:
df['income'].info()

<class 'pandas.core.series.Series'>
Index: 48842 entries, 0 to 16280
Series name: income
Non-Null Count  Dtype
--------------  -----
48842 non-null  int8 
dtypes: int8(1)
memory usage: 429.3 KB


In [880]:
# train test split
train_df = df[df['train'] == 1]
test_df = df[df['train'] == 0]

In [881]:
X_train = train_df.drop(['income', 'train'], axis=1).values
y_train = train_df['income'].values

In [882]:
X_test = test_df.drop(['income', 'train'], axis=1).values
y_test = test_df['income'].values

In [883]:
# scale the data
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# fit to the training data
X_train = sc.fit_transform(X_train)
# transform the testing data
X_test = sc.transform(X_test)

In [884]:
# create the model
n = int(X_train.shape[1] * 0.8)
model = MyLogisticRegression(n_features=n, show_loss=True, threshold=0)

# train the model
model.fit(X_train, y_train)

MemoryError: Unable to allocate 5.63 GiB for an array with shape (32561, 23214) and data type float64

In [None]:
# predict train data
y_pred = model.predict(X_train)
print("train data")
report(y_train, y_pred)

# predict test data
y_pred = model.predict(X_test)
print("test data")

In [None]:
# adboost
K = [5, 10, 15, 20]

for k in K:
    print(f"num_classifiers: {k}")
    model = AdaBoost(num_classifiers=k)
    model.fit(X_train, y_train)

    # predict train data
    y_pred = model.predict(X_train)
    print(f"train: k={k}, accuracy: {accuracy(y_train, y_pred):.4f}")

    # predict test data
    y_pred = model.predict(X_test)
    print(f"test: k={k}, accuracy: {accuracy(y_test, y_pred):.4f}\n")

In [886]:
# read credit dataset from datasets/creditcard.csv
all_data_df = pd.read_csv('datasets/creditcard.csv')
all_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [887]:
# take all the rows with class 1
fraud_df = all_data_df[all_data_df['Class'] == 1]

# take 20000 rows with class 0
non_fraud_df = all_data_df[all_data_df['Class'] == 0].sample(20000)

df = pd.concat([fraud_df, non_fraud_df])

In [888]:

# fit to the training data
X = df.drop(['Class'], axis=1).values

# transform the testing data
X = sc.fit_transform(X)

y = df['Class'].values

# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# scale the data
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# fit to the training data
X_train = sc.fit_transform(X_train)
# transform the testing data
X_test = sc.transform(X_test)


In [889]:
# create the model
n = int(X_train.shape[1] * 1)
model = MyLogisticRegression(n_features=n, show_loss=False, threshold=0)

# train the model
model.fit(X_train, y_train)

epoch 1, loss: 0.0001639343811026945
epoch 2, loss: 0.0001605485840381044
epoch 3, loss: 0.00015720846074329661
epoch 4, loss: 0.0001539139205752468
epoch 5, loss: 0.00015066514011312162
epoch 6, loss: 0.0001474625296777333
epoch 7, loss: 0.0001443063714263109
epoch 8, loss: 0.00014119672553009424
epoch 9, loss: 0.00013813348598523608
epoch 10, loss: 0.00013511645109565696
epoch 11, loss: 0.00013214536311326563
epoch 12, loss: 0.00012921991815025634
epoch 13, loss: 0.0001263397551688419
epoch 14, loss: 0.00012350441692250477
epoch 15, loss: 0.00012071332723064485
epoch 16, loss: 0.00011796586148786303
epoch 17, loss: 0.00011526158385073092
epoch 18, loss: 0.00011260044372628473
epoch 19, loss: 0.00010998266993124978
epoch 20, loss: 0.00010740858842908933
epoch 21, loss: 0.00010487867687947991
epoch 22, loss: 0.00010239379641481934
epoch 23, loss: 9.995508354881736e-05
epoch 24, loss: 9.756304577855132e-05
epoch 25, loss: 9.521685541960172e-05
epoch 26, loss: 9.291487047353434e-05
epoch

  cost = -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) / len(y)
  cost = -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) / len(y)


epoch 89, loss: nan
epoch 90, loss: nan
epoch 91, loss: nan
epoch 92, loss: nan
epoch 93, loss: nan
epoch 94, loss: nan
epoch 95, loss: nan
epoch 96, loss: nan
epoch 97, loss: nan
epoch 98, loss: nan
epoch 99, loss: nan
epoch 100, loss: nan
epoch 101, loss: nan
epoch 102, loss: nan
epoch 103, loss: nan
epoch 104, loss: nan
epoch 105, loss: nan
epoch 106, loss: nan
epoch 107, loss: nan
epoch 108, loss: nan
epoch 109, loss: nan
epoch 110, loss: nan
epoch 111, loss: nan
epoch 112, loss: nan
epoch 113, loss: nan
epoch 114, loss: nan
epoch 115, loss: nan
epoch 116, loss: nan
epoch 117, loss: nan
epoch 118, loss: nan
epoch 119, loss: nan
epoch 120, loss: nan
epoch 121, loss: nan
epoch 122, loss: nan
epoch 123, loss: nan
epoch 124, loss: nan
epoch 125, loss: nan
epoch 126, loss: nan
epoch 127, loss: nan
epoch 128, loss: nan
epoch 129, loss: nan
epoch 130, loss: nan
epoch 131, loss: nan
epoch 132, loss: nan
epoch 133, loss: nan
epoch 134, loss: nan
epoch 135, loss: nan
epoch 136, loss: nan
epo

In [890]:
# predict train data
y_pred = model.predict(X_train)
print("train data")
report(y_train, y_pred)

# predict test data
y_pred = model.predict(X_test)
print("test data")
report(y_test, y_pred)

train data
accuracy: 0.9951
precision: 0.9731
recall: 0.8212
f1: 0.8907
test data


In [891]:
# adboost
K = [5, 10, 15, 20]

for k in K:
    print(f"num_classifiers: {k}")
    model = AdaBoost(num_classifiers=k)
    model.fit(X_train, y_train)

    # predict train data
    y_pred = model.predict(X_train)
    print(f"train: k={k}, accuracy: {accuracy(y_train, y_pred):.4f}")

    # predict test data
    y_pred = model.predict(X_test)
    print(f"test: k={k}, accuracy: {accuracy(y_test, y_pred):.4f}\n")

num_classifiers: 5


  cost = -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) / len(y)


train: k=5, accuracy: 0.9954
test: k=5, accuracy: 0.9934

num_classifiers: 10


  cost = -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) / len(y)


train: k=10, accuracy: 0.9949
test: k=10, accuracy: 0.9927

num_classifiers: 15
