## Unsupervised Learning  - Similarity

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np

# ------------------------------------------------------------------
# Building a dataset of documents
# ------------------------------------------------------------------

# Training docs: three small documents for each of three word groups
# (hello/world, foo/bar, lottery/prize/winner).
docs_train = [
    "hello hello world ",
    "hello world world",
    "hello world hello world",
    "foo foo bar ",
    "foo bar bar",
    "foo bar foo bar",
    "lottery prize winner",
    "lottery prize",
    "lottery lottery lottery prize winner",
]
# Keep the individual names available for direct access to each document.
doc0, doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8 = docs_train

# Test docs: the queries whose nearest training documents are looked up later.
docs_test = ["foo bar", "lottery prize hello"]
doct0, doct1 = docs_test


# Converting documents to feature vectors (bag-of-words counts).
# The vectorizer is fitted on the training documents only; the test
# documents are then transformed with that same fitted vectorizer.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(docs_train).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it to a plain list to keep the printed output a list of strings.
features = vectorizer.get_feature_names_out().tolist()
print(f'Features: {features}')
print(f'X_train:\n====\n{X_train}\n====\n')

X_test = vectorizer.transform(docs_test).toarray()
print(f'X_test:\n====\n{X_test}\n====\n')

# Choose and train the model in a single expression: a brute-force
# nearest-neighbours search over the training vectors, configured to
# return 3 neighbours per query. fit() returns the estimator itself.
model = NearestNeighbors(algorithm='brute', n_neighbors=3).fit(X_train)

# Test the model: for each test document, get the k=3 most similar
# training documents (as row indices into X_train) and their distances.
distances, indices = model.kneighbors(X_test, return_distance=True)

print(f'Indices of nearest documents:\n====\n{indices}\n====\n')
print(f'Distances of nearest documents:\n====\n{distances}\n====\n')



Features: ['bar', 'foo', 'hello', 'lottery', 'prize', 'winner', 'world']
X_train:
====
[[0 0 2 0 0 0 1]
 [0 0 1 0 0 0 2]
 [0 0 2 0 0 0 2]
 [1 2 0 0 0 0 0]
 [2 1 0 0 0 0 0]
 [2 2 0 0 0 0 0]
 [0 0 0 1 1 1 0]
 [0 0 0 1 1 0 0]
 [0 0 0 3 1 1 0]]
====

X_test:
====
[[1 1 0 0 0 0 0]
 [0 0 1 1 1 0 0]]
====

Indices of nearest documents:
====
[[3 4 5]
 [7 6 0]]
====

Distances of nearest documents:
====
[[1.         1.         1.41421356]
 [1.         1.41421356 2.        ]]
====



## Unsupervised Learning  - Anomaly detection

In [4]:
import random

'''
features = ["amount_category", "merchant_web_mobile", "dow",  "hour" ]
feature description
    amount_category       0 if amount < £100
                          1 if  £100 <= amount < £500
                          2 if  £500 <= amount

    merchant_web_mobile   0 if  transaction done in shop
                          1 if  transaction done through web
                          2 if  transaction done through mobile    
    
    dow                   transaction day of week (1-7)
    hour                  transaction hour (1-24)
'''
def normal_txn_hour():
    """Return a random hour for a normal transaction, from 9 to 18 inclusive."""
    first_hour, last_hour = 9, 18
    # randrange excludes its upper bound, hence the +1 to keep 18 reachable.
    return random.randrange(first_hour, last_hour + 1)
def normal_txn_dow():
    """Return a random day of week for a normal transaction, from 1 to 5 inclusive."""
    first_day, last_day = 1, 5
    # randrange excludes its upper bound, hence the +1 to keep 5 reachable.
    return random.randrange(first_day, last_day + 1)

# Building a hypothetical dataset of normal transactions.
# Seed the generator first so that Restart & Run All draws the same
# training rows every time.
SEED = 42
random.seed(SEED)

N = 1000
# Each row is [amount_category, merchant_web_mobile, dow, hour]; every normal
# transaction uses amount_category 0 and merchant_web_mobile 0.
X_train = [[0, 0, normal_txn_dow(), normal_txn_hour()] for _ in range(N)]
# NOTE(review): the first two features are constant in X_train, so the fitted
# covariance is presumably rank-deficient and those features may contribute
# nothing to the outlier score - confirm against the scikit-learn docs/warnings.

# Building a hypothetical test dataset: some fraud transactions and some normal.
X_test = [
    [1, 2, 6, 23],
    [1, 2, 6, 23],
    [0, 0, 2, 11],
]

# Choosing a model. random_state pins the estimator's internal randomness so
# the fit is repeatable together with the seeded training data.
from sklearn.covariance import EllipticEnvelope
model = EllipticEnvelope(random_state=SEED)

# Train the model so that it understands what a normal transaction looks like.
model.fit(X_train)

# Use the trained model to predict whether the given transactions are
# fraudulent or not.
predictions = model.predict(X_test)

print(f'Fraud Transaction Prediction (-1 means anomaly/outlier i.e fraud):\n====\n{predictions}\n====\n')




Fraud Transaction Prediction (-1 means anomaly/outlier i.e fraud):
====
[-1 -1  1]
====

