In [5]:
!pip install  hmmlearn


Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp312-cp312-win_amd64.whl.metadata (3.1 kB)
Downloading hmmlearn-0.3.3-cp312-cp312-win_amd64.whl (127 kB)
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/127.3 kB ? eta -

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from hmmlearn import hmm
import warnings
warnings.filterwarnings("ignore")


In [17]:
# Load dataset (ensure creditcard.csv is in the same directory)
data = pd.read_csv("creditcard.csv")

print("Shape:", data.shape)
data.head()


Shape: (284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [19]:
# Normalize 'Amount' for consistency
data['Amount'] = (data['Amount'] - data['Amount'].mean()) / data['Amount'].std()

# Group transactions into categories (for Markov Chain states)
# We'll create 4 bins representing spending categories
data['Category'] = pd.qcut(data['Amount'], q=4, labels=[0, 1, 2, 3])

# Extract features
X = data[['Time', 'Amount']].values
y = data['Class'].values  # 0 = normal, 1 = fraud

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# Build transition probabilities for normal transactions
normal_data = data[data['Class'] == 0]['Category'].astype(int).values

transition_matrix = np.zeros((4, 4))
for i in range(len(normal_data)-1):
    transition_matrix[normal_data[i], normal_data[i+1]] += 1

# Normalize
transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)
transition_matrix = np.nan_to_num(transition_matrix)

print("Transition Matrix (Markov Chain):")
print(np.round(transition_matrix, 3))


Transition Matrix (Markov Chain):
[[0.278 0.243 0.238 0.241]
 [0.24  0.27  0.25  0.24 ]
 [0.24  0.248 0.259 0.253]
 [0.241 0.242 0.252 0.265]]


In [23]:
# Train Gaussian HMM on normal (non-fraud) data
model = hmm.GaussianHMM(n_components=4, covariance_type="diag", n_iter=100, random_state=42)
model.fit(X_train[y_train == 0])

print("HMM trained successfully on normal transactions.")


HMM trained successfully on normal transactions.


In [25]:
# Compute log-likelihood scores
scores = [model.score([x]) for x in X_test]

# Threshold (bottom 5% of likelihoods = fraud)
threshold = np.percentile(scores, 5)
preds = np.array([1 if s < threshold else 0 for s in scores])


In [27]:
# Run Viterbi on a few sample sequences
sample_seq = X_test[:10]
log_prob, hidden_states = model.decode(sample_seq, algorithm="viterbi")

print("Log Probability of Sequence:", log_prob)
print("Most likely hidden state sequence:", hidden_states)


Log Probability of Sequence: -403.08565163264086
Most likely hidden state sequence: [3 2 0 1 1 0 1 1 1 0]


In [29]:
print("Accuracy:", round(accuracy_score(y_test, preds)*100, 2), "%")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, preds))
print("\nClassification Report:\n", classification_report(y_test, preds))


Accuracy: 94.88 %

Confusion Matrix:
 [[54029  2835]
 [   84    14]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97     56864
           1       0.00      0.14      0.01        98

    accuracy                           0.95     56962
   macro avg       0.50      0.55      0.49     56962
weighted avg       1.00      0.95      0.97     56962

