<a href="https://colab.research.google.com/github/harnalashok/hadoop/blob/main/credit_card_transactions_network_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Call libraries

In [None]:
import pandas as pd
import os

In [None]:
pd.__version__  # the notebook was last run with pandas 1.3.5

'1.3.5'

In [None]:
# Display the value of every bare expression in a cell, not only the last one.
# Several cells below rely on this (e.g. head(), tail() and shape in one cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Mount Google Drive

In [None]:
# Mount Google Drive at /gdrive so the dataset folder can be read like a local path.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Shell listing of the dataset folder on the mounted drive.
!ls /gdrive/MyDrive/credit_card_transactions/

card_transaction_v1.csv  credit_card_edges.csv	   creditCard_users.csv
card_transaction_v1.zip  creditCard_merchants.csv  sample_2002.csv


In [None]:
# Work from the dataset folder: every later read_csv/to_csv call uses a bare
# file name and therefore depends on this working directory.
os.chdir("/gdrive/MyDrive/credit_card_transactions/")
os.listdir()

['card_transaction_v1.csv',
 'card_transaction_v1.zip',
 '.ipynb_checkpoints',
 'credit_card_edges.csv',
 'creditCard_users.csv',
 'creditCard_merchants.csv',
 'sample_2002.csv']

Column names in the full CSV (only a subset of them is loaded below):  
User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?   
Sample data:  
0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No



### Read data and explore

In [None]:
# Takes around 50secs
%%time
df = pd.read_csv("card_transaction_v1.csv",
                 usecols = ["User", "Year",  "Merchant Name","Merchant City", "Amount", "Is Fraud?"])

CPU times: user 27.3 s, sys: 5.91 s, total: 33.2 s
Wall time: 36.3 s


In [None]:
# First rows, last rows and overall size of the loaded frame.
# All three are displayed because ast_node_interactivity was set to "all" above.
df.head()
df.tail()
df.shape

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
0,0,2002,$134.09,3527213246127876953,La Verne,No
1,0,2002,$38.48,-727612092139916043,Monterey Park,No
2,0,2002,$120.34,-727612092139916043,Monterey Park,No
3,0,2002,$128.95,3414527459579106770,Monterey Park,No
4,0,2002,$104.71,5817218446178736267,La Verne,No


Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
24386895,1999,2020,$-54.00,-5162038175624867091,Merrimack,No
24386896,1999,2020,$54.00,-5162038175624867091,Merrimack,No
24386897,1999,2020,$59.15,2500998799892805156,Merrimack,No
24386898,1999,2020,$43.12,2500998799892805156,Merrimack,No
24386899,1999,2020,$45.13,4751695835751691036,Merrimack,No


(24386900, 6)

In [None]:
# Inferred column types; Amount is still text (object) at this point.
df.dtypes

User              int64
Year              int64
Amount           object
Merchant Name     int64
Merchant City    object
Is Fraud?        object
dtype: object

In [None]:
# Class balance of the fraud flag over the whole dataset.
df['Is Fraud?'].value_counts()

No     24357143
Yes       29757
Name: Is Fraud?, dtype: int64

In [None]:
# Remove the dollar sign so Amount can be parsed as a number in the next cell.
# regex=False treats "$" as a literal character; as a regular expression "$" is
# the end-of-string anchor, and leaving `regex` implicit raises a FutureWarning
# on pandas 1.3.
df['Amount'] = df['Amount'].str.replace("$", "", regex=False)

  """Entry point for launching an IPython kernel.


In [None]:
# Convert the cleaned Amount strings to numbers (negative amounts keep their sign).
df['Amount'] = pd.to_numeric(df['Amount'])

In [None]:
df.shape   # expected (24386900, 6): the cleanup must not change the row count

(24386900, 6)

### Take a sample of data

In [None]:
# Keep only the transactions made in 2002 as the working sample.
dfsample = df[df['Year'].eq(2002)]

In [None]:
# Column types and size of the 2002 sample.
dfsample.dtypes
dfsample.shape   # (350732, 6) when last run by the author

In [None]:
# Class balance of the fraud flag within the 2002 sample.
dfsample['Is Fraud?'].value_counts()

In [None]:
# Persist the 2002 sample (comma separated, without the index column) in the
# current working directory.
dfsample.to_csv("sample_2002.csv", index = False)

In [None]:
# Work on an independent copy: the cells below overwrite the User and
# Merchant Name columns of df_sample, while dfsample keeps the original ids.
df_sample = dfsample.copy()

In [None]:
# Amount is numeric now; User and Merchant Name still hold the raw ids.
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
0,0,2002,134.09,3527213246127876953,La Verne,No
1,0,2002,38.48,-727612092139916043,Monterey Park,No
2,0,2002,120.34,-727612092139916043,Monterey Park,No
3,0,2002,128.95,3414527459579106770,Monterey Park,No
4,0,2002,104.71,5817218446178736267,La Verne,No


### Change user IDs

In [None]:
# Distinct user ids present in the sample (NumPy array, in order of first appearance).
u_user = df_sample['User'].unique()

In [None]:
# Sort in place so the compact labels assigned below follow ascending user id.
u_user.sort()

In [None]:
# Display the sorted ids.
u_user

In [None]:
# Lookup table: original user id -> compact label "u<position>", where position
# is the index of the id in the sorted u_user array.
# Built directly from the labels, without a temporary helper column and without
# assigning to `_` (which would stop IPython from tracking the last output).
dx = pd.DataFrame({
    'alt_name': ["u" + str(position) for position in range(len(u_user))],
    'u_user': u_user,
})
dx.head()

Unnamed: 0,alt_name,u_user
0,u0,0
1,u1,2
2,u2,4
3,u3,5
4,u4,15


In [None]:
# https://stackoverflow.com/a/20250947/3282777
# Mapping: original user id -> "u<n>" label.
map_dict = dict(zip(dx['u_user'], dx['alt_name']))
# Show the size and a short preview instead of echoing every entry.
len(map_dict)
dict(list(map_dict.items())[:5])

{0: 'u0',
 2: 'u1',
 4: 'u2',
 5: 'u3',
 15: 'u4',
 19: 'u5',
 22: 'u6',
 23: 'u7',
 34: 'u8',
 40: 'u9',
 42: 'u10',
 47: 'u11',
 48: 'u12',
 49: 'u13',
 53: 'u14',
 55: 'u15',
 66: 'u16',
 74: 'u17',
 75: 'u18',
 77: 'u19',
 81: 'u20',
 83: 'u21',
 87: 'u22',
 89: 'u23',
 98: 'u24',
 100: 'u25',
 102: 'u26',
 109: 'u27',
 112: 'u28',
 122: 'u29',
 123: 'u30',
 134: 'u31',
 136: 'u32',
 137: 'u33',
 139: 'u34',
 148: 'u35',
 149: 'u36',
 150: 'u37',
 151: 'u38',
 156: 'u39',
 162: 'u40',
 177: 'u41',
 178: 'u42',
 181: 'u43',
 184: 'u44',
 185: 'u45',
 187: 'u46',
 188: 'u47',
 192: 'u48',
 201: 'u49',
 205: 'u50',
 215: 'u51',
 220: 'u52',
 234: 'u53',
 239: 'u54',
 242: 'u55',
 243: 'u56',
 252: 'u57',
 255: 'u58',
 257: 'u59',
 261: 'u60',
 266: 'u61',
 275: 'u62',
 285: 'u63',
 287: 'u64',
 292: 'u65',
 304: 'u66',
 309: 'u67',
 319: 'u68',
 329: 'u69',
 332: 'u70',
 343: 'u71',
 344: 'u72',
 348: 'u73',
 359: 'u74',
 361: 'u75',
 370: 'u76',
 376: 'u77',
 377: 'u78',
 385: 'u79',

In [None]:
# Relabel users with their "u<n>" names.
# map() does one hash lookup per row, whereas replace() with a dict scans the
# column once per key. fillna() restores any value that has no entry in
# map_dict, so the result matches replace() and re-running the cell is harmless.
user_ids = df_sample['User']
df_sample['User'] = user_ids.map(map_dict).fillna(user_ids)

In [None]:
# User now holds the "u<n>" labels; Merchant Name is still the raw id.
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
0,u0,2002,134.09,3527213246127876953,La Verne,No
1,u0,2002,38.48,-727612092139916043,Monterey Park,No
2,u0,2002,120.34,-727612092139916043,Monterey Park,No
3,u0,2002,128.95,3414527459579106770,Monterey Park,No
4,u0,2002,104.71,5817218446178736267,La Verne,No


### Change merchant IDs

In [None]:
# Distinct merchant ids present in the sample (NumPy array).
mn = df_sample['Merchant Name'].unique()

In [None]:
# Sort in place so the compact labels assigned below follow ascending merchant id.
mn.sort()

In [None]:
# Lookup table: original merchant id -> compact label "m<position>", where
# position is the index of the id in the sorted mn array.
# Built directly from the labels, without a temporary helper column and without
# assigning to `_` (which would stop IPython from tracking the last output).
# Note: this rebinds dx, which previously held the user lookup table.
dx = pd.DataFrame({
    'alt_name': ["m" + str(position) for position in range(len(mn))],
    'mt': mn,
})
dx.head()

Unnamed: 0,alt_name,mt
0,m0,-9216029123349204090
1,m1,-9215609737857753742
2,m2,-9214558223928539670
3,m3,-9211415240299816455
4,m4,-9210704852233124461


In [None]:
# https://stackoverflow.com/a/20250947/3282777
# Mapping: original merchant id -> "m<n>" label.
# Note: this rebinds map_dict, which previously held the user mapping.
map_dict = dict(zip(dx['mt'], dx['alt_name']))
# Show the size and a short preview instead of echoing thousands of entries.
len(map_dict)
dict(list(map_dict.items())[:5])

{-9216029123349204090: 'm0',
 -9215609737857753742: 'm1',
 -9214558223928539670: 'm2',
 -9211415240299816455: 'm3',
 -9210704852233124461: 'm4',
 -9209649324591830672: 'm5',
 -9209549549743376514: 'm6',
 -9207992727245390875: 'm7',
 -9207938803334159501: 'm8',
 -9207683994561949453: 'm9',
 -9206222775446380981: 'm10',
 -9205603564578360879: 'm11',
 -9204246302766843482: 'm12',
 -9202801494022182981: 'm13',
 -9201451928704539517: 'm14',
 -9198490720191101975: 'm15',
 -9197533986719322394: 'm16',
 -9197413686674620725: 'm17',
 -9186624218311173146: 'm18',
 -9186372257785409588: 'm19',
 -9179793182211330025: 'm20',
 -9178794978343839265: 'm21',
 -9177770777448816478: 'm22',
 -9176686377883494741: 'm23',
 -9176365890837322967: 'm24',
 -9174235397148875092: 'm25',
 -9174064663949728849: 'm26',
 -9173570216444516790: 'm27',
 -9173566774685655352: 'm28',
 -9172290850563650285: 'm29',
 -9170103675056273681: 'm30',
 -9170099261064141402: 'm31',
 -9169612133827640783: 'm32',
 -916954314412077075

In [None]:
# Relabel merchants with their "m<n>" names.
# The slowness came from replace() scanning the column once per dict key
# (there are roughly 10k merchants); map() does one hash lookup per row instead.
# fillna() restores any value that has no entry in map_dict, so the result
# matches replace() and re-running the cell is harmless.
merchant_ids = df_sample['Merchant Name']
df_sample['Merchant Name'] = merchant_ids.map(map_dict).fillna(merchant_ids)

In [None]:
# Both User and Merchant Name now hold compact labels.
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
0,u0,2002,134.09,m7466,La Verne,No
1,u0,2002,38.48,m4998,Monterey Park,No
2,u0,2002,120.34,m4998,Monterey Park,No
3,u0,2002,128.95,m7393,Monterey Park,No
4,u0,2002,104.71,m8834,La Verne,No


### User Nodes

In [None]:
# One row per user: mean transaction amount, rounded to two decimals.
grpd_user = df_sample.groupby(['User'])
user_nodes = (
    grpd_user['Amount']
    .mean()
    .round(2)
    .reset_index()
)
user_nodes.head()

Unnamed: 0,User,Amount
0,u0,84.78
1,u1,41.82
2,u10,44.98
3,u100,100.45
4,u101,40.78


In [None]:
# Disabled on purpose: the User column already carries the "u" prefix, because
# the ids were relabelled earlier in the notebook. Kept for reference only.
#user_nodes['User'] = user_nodes['User'].apply(str)
#user_nodes['a'] = "u"
#user_nodes['User'] = user_nodes['a'].str.cat(user_nodes['User'])
#_= user_nodes.pop('a')

In [None]:
# Rows are ordered by label as text, hence u0, u1, u10, u100, ...
user_nodes.head()

Unnamed: 0,User,Amount
0,u0,84.78
1,u1,41.82
2,u10,44.98
3,u100,100.45
4,u101,40.78


In [None]:
# Users with at least one transaction flagged "Yes" in the sample.
# NOTE(review): the flag is per transaction; whether the user committed the
# fraud or was its victim cannot be told from this column alone.
# The filter is evaluated once and the result is reused for both displays.
users_suspect = df_sample.loc[df_sample["Is Fraud?"] == "Yes", 'User' ].unique()
users_suspect
len(users_suspect)

array(['u12', 'u16', 'u43', 'u103', 'u117', 'u131', 'u171', 'u183',
       'u196', 'u198', 'u210', 'u212', 'u228', 'u235', 'u244', 'u264',
       'u315', 'u335', 'u349', 'u361', 'u373', 'u381', 'u384', 'u388',
       'u413'], dtype=object)

25

In [None]:
# Default node attributes: nobody is a suspect until flagged in the next cell,
# and every user node belongs to the 'Member' category.
user_nodes['suspect'] = "No" 
user_nodes['Cat'] = 'Member'

In [None]:
# Flag every user listed in users_suspect in a single vectorised assignment,
# instead of building one boolean mask per suspect in a Python loop.
user_nodes.loc[user_nodes['User'].isin(users_suspect), 'suspect'] = "Yes"

In [None]:
user_nodes['suspect'].value_counts()  # expected: No 401, Yes 25

No     401
Yes     25
Name: suspect, dtype: int64

In [None]:
# Check the new suspect and Cat columns.
user_nodes.head()

Unnamed: 0,User,Amount,suspect,Cat
0,u0,84.78,No,Member
1,u1,41.82,No,Member
2,u10,44.98,No,Member
3,u100,100.45,No,Member
4,u101,40.78,No,Member


In [None]:
# Node table layout: 'Id' is the node key and 'Label' repeats it as display text.
user_nodes = (
    user_nodes
    .assign(Label=user_nodes['User'])
    .rename(columns={'User': 'Id'})
)

In [None]:
# Check the renamed Id column and the new Label column.
user_nodes.head()

Unnamed: 0,Id,Amount,suspect,Cat,Label
0,u0,84.78,No,Member,u0
1,u1,41.82,No,Member,u1
2,u10,44.98,No,Member,u10
3,u100,100.45,No,Member,u100
4,u101,40.78,No,Member,u101


In [None]:
# Numeric copy of the suspect flag: "Yes" -> 1, "No" -> 0.
suspect_codes = {"Yes":1 , "No" : 0}
user_nodes['suspect_n'] = user_nodes['suspect'].map(suspect_codes)

In [None]:
# Write the user node table: semicolon separated, without the index column.
user_nodes.to_csv("creditCard_users.csv", index = False, sep = ";")

### Merchant nodes

In [None]:
# One row per merchant: mean transaction amount, rounded to two decimals.
grpd_merchant = df_sample.groupby(['Merchant Name'])
merchant_nodes = (
    grpd_merchant['Amount']
    .mean()
    .round(2)
    .reset_index()
)
merchant_nodes.head()

Unnamed: 0,Merchant Name,Amount
0,m0,104.4
1,m1,18.14
2,m10,186.43
3,m100,18.96
4,m1000,5.0


In [None]:
# Merchants with at least one transaction flagged "Yes" in the sample.
# The filter is evaluated once and the result is reused for both displays.
merchant_suspect = df_sample.loc[df_sample["Is Fraud?"] == "Yes", 'Merchant Name' ].unique()
merchant_suspect
len(merchant_suspect)

array(['m8978', 'm2242', 'm8070', 'm2241', 'm7725', 'm9211', 'm5086',
       'm6523', 'm4461', 'm5917', 'm9139', 'm814', 'm9344', 'm1118',
       'm3261', 'm5437', 'm3758', 'm5091', 'm3777', 'm1270', 'm10751',
       'm6848', 'm2174', 'm5283', 'm1856', 'm2364', 'm8199', 'm5548',
       'm9850', 'm10682', 'm7528', 'm10183', 'm8246', 'm9405', 'm8985',
       'm397', 'm4637', 'm1471', 'm4333', 'm2234', 'm3470', 'm2655',
       'm6654', 'm91', 'm2052', 'm5110', 'm4233', 'm1016', 'm9658',
       'm9321', 'm1485', 'm7274', 'm6370', 'm2015', 'm6985', 'm7393',
       'm9329', 'm5613', 'm9045', 'm8473', 'm3570', 'm5705', 'm4693',
       'm5934', 'm5247', 'm7082', 'm5466', 'm874', 'm10498', 'm4976',
       'm8834', 'm2115', 'm1721', 'm4250', 'm3655', 'm2286'], dtype=object)

76

In [None]:
# Default node attributes: no merchant is a suspect until flagged in the next
# cell, and every merchant node belongs to the 'Institution' category.
merchant_nodes['suspect'] = "No" 
merchant_nodes['Cat'] = 'Institution'

In [None]:
# Flag every merchant listed in merchant_suspect in a single vectorised
# assignment, instead of building one boolean mask per suspect in a Python loop.
merchant_nodes.loc[merchant_nodes['Merchant Name'].isin(merchant_suspect), 'suspect'] = "Yes"
  

In [None]:
merchant_nodes['suspect'].value_counts()  # expected: No 10777, Yes 76

No     10777
Yes       76
Name: suspect, dtype: int64

In [None]:
# Node table layout: 'Id' is the node key and 'Label' repeats it as display text.
merchant_nodes = (
    merchant_nodes
    .assign(Label=merchant_nodes['Merchant Name'])
    .rename(columns={'Merchant Name': 'Id'})
)
merchant_nodes.head()

Unnamed: 0,Id,Amount,suspect,Cat,Label
0,m0,104.4,No,Institution,m0
1,m1,18.14,No,Institution,m1
2,m10,186.43,No,Institution,m10
3,m100,18.96,No,Institution,m100
4,m1000,5.0,No,Institution,m1000


In [None]:
# Numeric copy of the suspect flag: "Yes" -> 1, "No" -> 0.
suspect_codes = {"Yes":1 , "No" : 0}
merchant_nodes['suspect_n'] = merchant_nodes['suspect'].map(suspect_codes)

In [None]:
# Check the final merchant node table, including suspect_n.
merchant_nodes.head()

Unnamed: 0,Id,Amount,suspect,Cat,Label,suspect_n
0,m0,104.4,No,Institution,m0,0
1,m1,18.14,No,Institution,m1,0
2,m10,186.43,No,Institution,m10,0
3,m100,18.96,No,Institution,m100,0
4,m1000,5.0,No,Institution,m1000,0


In [None]:
# Write the merchant node table: semicolon separated, without the index column.
merchant_nodes.to_csv("creditCard_merchants.csv", index = False, sep = ";")

### Edges

In [None]:
# Group by User and Merchant Name
# Number of transactions per (User, Merchant Name) pair, as a Series.
# The next cell rebuilds `edges` as a flat DataFrame and overwrites this value.
edges = df_sample.groupby([df_sample['User'], df_sample['Merchant Name']]).size()

In [None]:
# Edge list: one row per (User, Merchant Name) pair, with the number of
# transactions between them stored in 'weight'.
edges = (
    df_sample
    .groupby(['User', 'Merchant Name'])
    .size()
    .reset_index(name='weight')
)
edges.head()


Unnamed: 0,User,Merchant Name,weight
0,u0,m10039,3
1,u0,m10101,5
2,u0,m10183,1
3,u0,m10800,2
4,u0,m1118,5


In [None]:
# Edge endpoints: the user becomes 'Source' and the merchant becomes 'Target'.
edges = edges.rename(columns = {'User': 'Source', 'Merchant Name': 'Target'})

In [None]:
# Every edge is marked as undirected.
edges['Type']= 'Undirected'

In [None]:
# Check the final edge table: Source, Target, weight, Type.
edges.head()

Unnamed: 0,Source,Target,weight,Type
0,u0,m10039,3,Undirected
1,u0,m10101,5,Undirected
2,u0,m10183,1,Undirected
3,u0,m10800,2,Undirected
4,u0,m1118,5,Undirected


In [None]:
# Write the edge list: semicolon separated, without the index column.
edges.to_csv("credit_card_edges.csv", index = False, sep = ";")

In [None]:
########### Done ###########