In [5]:
import pandas as pd
from datetime import datetime

# read in the data files that were downloaded and save as df
test_df = pd.read_csv('../csv/fraudTest.csv')
train_df = pd.read_csv('../csv/fraudTrain.csv')

# remove columns to clean df
column_keep = ['merch_lat', 'merch_long', 'unix_time', 'category', 'amt', 'gender', 'lat', 'long', 'dob', 'is_fraud']

# concat dfs bc we need to use skillet train_test_split
df = pd.concat([train_df, test_df])

# clean merchant column
df['merchant'] = df['merchant'].str.split('_').str[1]

# one end code gender
df['gender'] = df['gender'].replace({'M': 0, 'F': 1})

# Create a new DataFrame with only the selected columns
df = df[column_keep]

# Encode 'category' section in df to be represented numerically (frequency based encoding)
freq_encoding = df['category'].value_counts(normalize=True)
df['category'] = df['category'].map(freq_encoding)

# convert dob to 'age'
current_year = datetime.now().year
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = current_year - df['dob'].dt.year
df.drop('dob', axis=1, inplace=True)

# display the df
df

  df['gender'] = df['gender'].replace({'M': 0, 'F': 1})


Unnamed: 0,merch_lat,merch_long,unix_time,category,amt,gender,lat,long,is_fraud,age
0,36.011293,-82.048315,1325376018,0.048939,4.97,1,36.0788,-81.1781,0,36
1,49.159047,-118.186462,1325376044,0.095115,107.23,1,48.8878,-118.2105,0,46
2,43.150704,-112.154481,1325376051,0.072403,220.11,0,42.1808,-112.2620,0,62
3,47.034331,-112.561071,1325376076,0.101506,45.00,0,46.2306,-112.1138,0,57
4,38.674999,-78.632459,1325376186,0.061666,41.96,0,38.4207,-79.4629,0,38
...,...,...,...,...,...,...,...,...,...,...
555714,39.946837,-91.333331,1388534347,0.066159,43.77,0,40.4931,-91.8912,0,58
555715,29.661049,-96.186633,1388534349,0.087307,111.84,0,29.0393,-95.4401,0,25
555716,46.658340,-119.715054,1388534355,0.087307,86.88,1,46.1966,-118.9017,0,43
555717,44.470525,-117.080888,1388534364,0.031287,7.99,0,44.6255,-116.4493,0,59


In [6]:
# Scale features down

features_to_scale = ['merch_long', 'merch_lat', 'unix_time', 'lat', 'long', 'amt', 'age']

def standardize(col):
    mean = col.mean()
    std = col.std()
    return (col - mean) / std

for feature in features_to_scale:
    df[feature] = standardize(df[feature])

df

Unnamed: 0,merch_lat,merch_long,unix_time,category,amt,gender,lat,long,is_fraud,age
0,-0.495080,0.594463,-1.830066,0.048939,-0.408741,1,-0.485167,0.658263,0,-0.845898
1,2.080081,-2.031915,-1.830065,0.095115,0.233378,1,2.040530,-2.035415,0,-0.270874
2,0.903268,-1.593534,-1.830064,0.072403,0.942183,0,0.718034,-1.602730,0,0.649165
3,1.663928,-1.623084,-1.830063,0.101506,-0.157381,0,1.516580,-1.591950,0,0.361653
4,0.026642,0.842714,-1.830057,0.061666,-0.176470,0,-0.023388,0.783024,0,-0.730893
...,...,...,...,...,...,...,...,...,...,...
555714,0.275748,-0.080335,1.641110,0.066159,-0.165105,0,0.385251,-0.120991,0,0.419155
555715,-1.738859,-0.433054,1.641110,0.087307,0.262326,0,-1.873226,-0.379132,0,-1.478425
555716,1.590285,-2.143007,1.641110,0.087307,0.105595,1,1.509876,-2.085692,0,-0.443381
555717,1.161772,-1.951566,1.641111,0.031287,-0.389777,0,1.200084,-1.907308,0,0.476658


In [9]:
from sklearn.model_selection import train_test_split

y = df['is_fraud']
X = df.drop(columns='is_fraud')

# Split data into training set and testing set
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3)

# Convert to numpy arrays
Xtrain = Xtrain.values
Xtest = Xtest.values
ytrain = ytrain.values
ytest = ytest.values

In [38]:
from sklearn.metrics import r2_score
import numpy as np

# Train the model

def add_bias_column(X):
    """
    Args:
        X (array): can be either 1-d or 2-d
    
    Returns:
        Xnew (array): the same array, but 2-d with a column of 1's in the first spot
    """
    
    # If the array is 1-d
    if len(X.shape) == 1:
        Xnew = np.column_stack([np.ones(X.shape[0]), X])
    
    # If the array is 2-d
    elif len(X.shape) == 2:
        bias_col = np.ones((X.shape[0], 1))
        Xnew = np.hstack([bias_col, X])
        
    else:
        raise ValueError("Input array must be either 1-d or 2-d")

    return Xnew
# TAKEN FROM MOHIT'S METHOD ON HW4 ^^^

In [40]:
def linreg_predict(Xnew, ynew, m, threshold=0.5):
    """
    Args:
        Xnew: Features (input x for all weights)
        ynew: Expected output for feature set
        m: Weights vector
        threshold: Classification threshold (0.5 default)

    Returns:
        A dictionary containing:
            'ypreds': The predicted values from applying m to Xnew
            'classified_preds': The classified prediction from the model
            'resids': The residuals, and the differences between ynew and ypreds
            'mse': The mean squared error
            'r2': The coefficient of determination (R^2)
            'accuracy': Accuracy of model against test set
    """
    # Add bias column
    Xnew_bias = add_bias_column(Xnew)
    
    # Make prediction
    ypreds = np.dot(Xnew_bias, m)
    
    # Classify prediction
    classified_preds = (ypreds >= threshold).astype(int)
    
    # Calculate residuals
    resids = ynew - ypreds
    
    # Calculate MSE
    mse = np.mean(resids**2)
    
    # Calculate accuracy against test set
    accuracy = np.mean(classified_preds == ynew)
    
    return {
        'ypreds': ypreds,
        'classified_preds': classified_preds,
        'resids': resids,
        'mse': mse,
        'accuracy': accuracy
    }
# TAKEN FROM LIAMS HW4
# SLIGHTLY ALTERED

In [37]:
def line_of_best_fit(X, y):
    """
    Args:
        X: A 1-d or 2-d array which includes all the predictor values, not including bias term
        y: a 1-d array which includes all corresponding response values to X

    Returns:
        The vector containing the coeffecients for the line of best fit, including an intercept.
    """
    X_bias = add_bias_column(X)
    
    vector = np.linalg.inv(np.matmul(X_bias.T, X_bias))
    vector = np.matmul(vector, np.matmul(X_bias.T, y))
    
    return vector
# TAKEN FROM LIAMS HW4
# SLIGHTLY ALTERED

In [48]:
def gradient_descent(X, y, weights, alpha=0.0275, max_iter=10000):
    """
    Train the model (converge on a weight vector to minimize MSE via gradient descent)

    Args:
        X: Input vector
        y: expected output vector
        weights: The default initiated weight vector (just random starting values for each weight)
        alpha: learning rate (default 0.0275, a good, assessed value for fast convergence)
        max_iter: max iterations for training

    Returns:
        The updated weight vector containing the weights and bias of the model
    """
    X_bias = add_bias_column(X)  # Add bias term
    
    for epoch in range(max_iter):
        # Make prediction with current weights
        predicted_y = np.matmul(X_bias, weights)

        # Calculate residuals (We dont use linreg predict for this because its much faster to do manually)
        resids = y - predicted_y

        # Compute gradient using residuals
        gradient = (2 / len(y)) * np.dot(X_bias.T, resids)
        
        # Update weights
        weights += alpha * gradient
        
        # Print progress every 1000 epochs
        if epoch % 1000 == 0:
            mse = np.mean(resids**2)
            print(f"Epoch {epoch}, MSE: {mse:.6f}")
    
    return weights


In [52]:
# Initialize weights with random values
weights = np.random.rand(Xtrain.shape[1] + 1)  # + 1 to include bias term

# Converge on weights using Gradient Descent
weights = gradient_descent(Xtrain, ytrain, weights)

Epoch 0, MSE: 5.3960
Epoch 1000, MSE: 0.0054
Epoch 2000, MSE: 0.0052
Epoch 3000, MSE: 0.0051
Epoch 4000, MSE: 0.0051
Epoch 5000, MSE: 0.0051
Epoch 6000, MSE: 0.0051
Epoch 7000, MSE: 0.0050
Epoch 8000, MSE: 0.0050
Epoch 9000, MSE: 0.0050


In [None]:
# Evaluate the model
test_metrics = linreg_predict(Xtest, ytest, weights)

# Print evaluation results
print("-----")
print(f"Test MSE: {test_metrics['mse']:.4f}")
print(f"Test Accuracy: {test_metrics['accuracy']:.2%}")
print(f"Actual Classified Predictions: {test_metrics['classified_preds']}")
print(f"Expected Classified Values: {ytest}")