<a href="https://colab.research.google.com/github/harnalashok/hadoop/blob/main/credit_card_transactions_network_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 18th March, 2022
#               Holi
# Data source: https://ibm.ent.box.com/v/tabformer-data/folder/130747715605
# Data source simulator: https://fraud-detection-handbook.github.io/fraud-detection-handbook/Foreword.html
# Objective: Discovering community or collaboration among
#            credit-card fraudsters.

## Generate user/merchant nodes

### Call libraries

In [281]:
# 1.0
import pandas as pd
import numpy as np
import os

In [282]:
# 1.1 Record the pandas version this notebook was last run with
pd.__version__  # 1.3.5

'1.3.5'

In [283]:
# 1.2 Display cell outputs from multiple commands:
#     with "all", every top-level expression in a cell is shown,
#     not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Mount google drive

In [284]:
# 2.0 Mount to ccma@fsm.ac.in
#     Google Drive becomes visible under /gdrive (Colab only).
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [285]:
# 2.1 Check credit card data files
#     (lists the project folder on the mounted drive)
!ls /gdrive/MyDrive/credit_card_transactions/

card_transaction_v1.csv  card_transaction_v1.zip  sample_2002.csv


In [286]:
# 2.2 Change current directory to the data folder, so that
#     later cells can refer to the data files by bare name,
#     and list its contents:
path = "/gdrive/MyDrive/credit_card_transactions/"
os.chdir(path)
os.listdir()

['.ipynb_checkpoints',
 'sample_2002.csv',
 'card_transaction_v1.csv',
 'card_transaction_v1.zip']

column names:  
User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?   
Sample data:  
0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No



### Read full data and store a sample

In [287]:
# 3.0 Read data. File size is > 2gb
#     Takes around 
#     User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
#     MCC : Merchant Category code:  A four-digit number that describes a merchant's primary business activities. 
#     Feel free to include more fields (city, time of transaction etc) and analyse:
#
%%time
df = pd.read_csv("card_transaction_v1.csv",
                 usecols = ["User", "Year", "Merchant Name", "Merchant City", "Amount", "MCC", "Is Fraud?"])

CPU times: user 24.1 s, sys: 8.83 s, total: 32.9 s
Wall time: 35 s


In [288]:
# 3.1 Our data: first rows, last rows and overall shape
df.head()
df.tail()
df.shape   # (24386900, 7)

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,MCC,Is Fraud?
0,0,2002,$134.09,3527213246127876953,La Verne,5300,No
1,0,2002,$38.48,-727612092139916043,Monterey Park,5411,No
2,0,2002,$120.34,-727612092139916043,Monterey Park,5411,No
3,0,2002,$128.95,3414527459579106770,Monterey Park,5651,No
4,0,2002,$104.71,5817218446178736267,La Verne,5912,No


Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,MCC,Is Fraud?
24386895,1999,2020,$-54.00,-5162038175624867091,Merrimack,5541,No
24386896,1999,2020,$54.00,-5162038175624867091,Merrimack,5541,No
24386897,1999,2020,$59.15,2500998799892805156,Merrimack,4121,No
24386898,1999,2020,$43.12,2500998799892805156,Merrimack,4121,No
24386899,1999,2020,$45.13,4751695835751691036,Merrimack,5814,No


(24386900, 7)

In [289]:
# 3.2 Attempt should be made to
#     reduce data size; start by looking at the column dtypes:

df.dtypes

User              int64
Year              int64
Amount           object
Merchant Name     int64
Merchant City    object
MCC               int64
Is Fraud?        object
dtype: object

In [290]:
# 3.3 Is data balanced?
#     Absolute counts first, then proportions.

df['Is Fraud?'].value_counts()
print("\n")
df['Is Fraud?'].value_counts(normalize = True)  # 99.878% vs 0.122%

No     24357143
Yes       29757
Name: Is Fraud?, dtype: int64





No     0.99878
Yes    0.00122
Name: Is Fraud?, dtype: float64

In [291]:
# 3.4 Remove '$' sign from 'Amount':
#     Takes time 15 secs:
%%time
df['Amount']=df['Amount'].str.replace("$", "")

  """Entry point for launching an IPython kernel.


CPU times: user 11.7 s, sys: 860 ms, total: 12.5 s
Wall time: 12.6 s


In [292]:
# 3.5 Convert 'Amount' to numeric
#     (it is still a string column after the '$' has been stripped):
df['Amount'] = pd.to_numeric(df['Amount'])

In [293]:
# 3.6 Data shape
df.shape   # rows: 24,386,900; cols: 7

(24386900, 7)

#### Store a sample of data

In [294]:
# 4.0 RAM is limited, so the rest of the notebook works
#     only with the transactions of the year 2002:

dfsample = df[df['Year'].eq(2002)]

In [295]:
# 4.1 Sample size: 
dfsample.shape   # (350732, 7)

(350732, 7)

In [296]:
# 4.2 How many incidents of fraud in the 2002 sample?
dfsample['Is Fraud?'].value_counts()   # Yes: 139, No: 350593

No     350593
Yes       139
Name: Is Fraud?, dtype: int64

In [297]:
# 4.3 Should you like to save this sample
#     for later quick reading?
#     (index = False keeps the row index out of the file)

dfsample.to_csv("sample_2002.csv", index = False)

### Read sample of data

In [298]:
%%time
df = pd.read_csv(path+"sample_2002.csv")                 

CPU times: user 232 ms, sys: 20.7 ms, total: 253 ms
Wall time: 273 ms


### Perform processing on data

In [299]:
# 4.4 We work with a copy of data
#     If, we make mistakes, we come back here.
#     The copy is taken from `df`, i.e. the sample just read from
#     sample_2002.csv. The notebook can therefore be started from
#     "Read sample of data" without re-reading the full 2gb file.

df_sample = df.copy()

In [300]:
# 4.5 And our data (first rows of the working copy):
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,MCC,Is Fraud?
0,0,2002,134.09,3527213246127876953,La Verne,5300,No
1,0,2002,38.48,-727612092139916043,Monterey Park,5411,No
2,0,2002,120.34,-727612092139916043,Monterey Park,5411,No
3,0,2002,128.95,3414527459579106770,Monterey Park,5651,No
4,0,2002,104.71,5817218446178736267,La Verne,5912,No


### Change user IDs
> 1.0 Know unique user names and transform them to short names  
> 2.0 Prepare a dictionary of user-ids and proposed Ids  
> 3.0 Make changes to our dataset using the dictionary  

In [301]:
# 5.0 To distinguish userids on network graph,
#     we will prefix userids with 'u' and also assign
#     them a short name so that they fit within nodes
#     when displayed on graph:

#    First get unique user names
u_user = df_sample['User'].unique()

In [302]:
# 5.1 Sort the unique user ids in place, show them
#     and count them:
u_user.sort()
u_user
print("\n")  
len(u_user)  # 426

array([   0,    2,    4,    5,   15,   19,   22,   23,   34,   40,   42,
         47,   48,   49,   53,   55,   66,   74,   75,   77,   81,   83,
         87,   89,   98,  100,  102,  109,  112,  122,  123,  134,  136,
        137,  139,  148,  149,  150,  151,  156,  162,  177,  178,  181,
        184,  185,  187,  188,  192,  201,  205,  215,  220,  234,  239,
        242,  243,  252,  255,  257,  261,  266,  275,  285,  287,  292,
        304,  309,  319,  329,  332,  343,  344,  348,  359,  361,  370,
        376,  377,  385,  390,  393,  396,  398,  405,  413,  417,  426,
        440,  445,  446,  449,  456,  464,  466,  473,  474,  480,  481,
        486,  487,  488,  489,  490,  497,  500,  502,  504,  510,  511,
        513,  519,  525,  531,  540,  545,  548,  549,  552,  556,  562,
        564,  569,  574,  576,  579,  580,  583,  598,  614,  615,  619,
        625,  628,  635,  667,  669,  672,  673,  680,  683,  688,  690,
        693,  697,  705,  713,  720,  726,  731,  7





426

In [303]:
# 5.2 Get alternate names by prefixing userids with 'u'
#     Every user is numbered by its position in the sorted array
#     u_user (0, 1, 2, ...) and that number is prefixed with 'u',
#     so the alternate names are: u0, u1, u2, ...
#     Columns of the lookup table: 'alt_name' (new) and 'u_user' (old).
dx = pd.DataFrame({
    "alt_name": ["u" + str(pos) for pos in range(len(u_user))],
    "u_user": u_user,
})

# 5.2.1 Check:
dx.head()

Unnamed: 0,alt_name,u_user
0,u0,0
1,u1,2
2,u2,4
3,u3,5
4,u4,15


In [304]:
# 5.3 Dictionary used for the transformation:
#     key   -> old user id   (column 'u_user')
#     value -> new short name (column 'alt_name')

map_dict = {old_id: short_name
            for old_id, short_name in zip(dx['u_user'], dx['alt_name'])}
map_dict

{0: 'u0',
 2: 'u1',
 4: 'u2',
 5: 'u3',
 15: 'u4',
 19: 'u5',
 22: 'u6',
 23: 'u7',
 34: 'u8',
 40: 'u9',
 42: 'u10',
 47: 'u11',
 48: 'u12',
 49: 'u13',
 53: 'u14',
 55: 'u15',
 66: 'u16',
 74: 'u17',
 75: 'u18',
 77: 'u19',
 81: 'u20',
 83: 'u21',
 87: 'u22',
 89: 'u23',
 98: 'u24',
 100: 'u25',
 102: 'u26',
 109: 'u27',
 112: 'u28',
 122: 'u29',
 123: 'u30',
 134: 'u31',
 136: 'u32',
 137: 'u33',
 139: 'u34',
 148: 'u35',
 149: 'u36',
 150: 'u37',
 151: 'u38',
 156: 'u39',
 162: 'u40',
 177: 'u41',
 178: 'u42',
 181: 'u43',
 184: 'u44',
 185: 'u45',
 187: 'u46',
 188: 'u47',
 192: 'u48',
 201: 'u49',
 205: 'u50',
 215: 'u51',
 220: 'u52',
 234: 'u53',
 239: 'u54',
 242: 'u55',
 243: 'u56',
 252: 'u57',
 255: 'u58',
 257: 'u59',
 261: 'u60',
 266: 'u61',
 275: 'u62',
 285: 'u63',
 287: 'u64',
 292: 'u65',
 304: 'u66',
 309: 'u67',
 319: 'u68',
 329: 'u69',
 332: 'u70',
 343: 'u71',
 344: 'u72',
 348: 'u73',
 359: 'u74',
 361: 'u75',
 370: 'u76',
 376: 'u77',
 377: 'u78',
 385: 'u79',

In [305]:
# 5.4 Replace values in column 'User' through a dictionary lookup.
#     `map` does one lookup per row and is much faster than `replace`
#     with a large dictionary. Values that are not keys of map_dict
#     come out of `map` as NaN; `fillna` puts the original value back,
#     so re-running this cell leaves already-renamed users untouched.

df_sample['User'] = df_sample['User'].map(map_dict).fillna(df_sample['User'])

In [306]:
# 5.5 Check: 'User' should now hold the short names
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,MCC,Is Fraud?
0,u0,2002,134.09,3527213246127876953,La Verne,5300,No
1,u0,2002,38.48,-727612092139916043,Monterey Park,5411,No
2,u0,2002,120.34,-727612092139916043,Monterey Park,5411,No
3,u0,2002,128.95,3414527459579106770,Monterey Park,5651,No
4,u0,2002,104.71,5817218446178736267,La Verne,5912,No


### Change merchant IDs
We follow the same steps to replace the long merchant IDs with short IDs. These short IDs are prefixed with 'm'.

In [307]:
# 6.0 Unique merchant ids (as stored in column 'Merchant Name')
mn = df_sample['Merchant Name'].unique()

In [308]:
# 6.1 Sort them in place (the array `mn` itself is modified)
mn.sort()

In [309]:
# 6.2 Prepare a dataframe of existing and alternate names:
#     a merchant's alternate name is 'm' followed by its position
#     in the sorted array mn: m0, m1, m2, ...
#     Columns: 'alt_name' (new) and 'mt' (old).
dx = pd.DataFrame({
    "alt_name": ["m" + str(pos) for pos in range(len(mn))],
    "mt": mn,
})
dx.head()

Unnamed: 0,alt_name,mt
0,m0,-9216029123349204090
1,m1,-9215609737857753742
2,m2,-9214558223928539670
3,m3,-9211415240299816455
4,m4,-9210704852233124461


In [310]:
# 6.3 Dictionary for the transformation:
#     key -> old merchant id ('mt'), value -> new short name ('alt_name')
map_dict = {old_id: short_name
            for old_id, short_name in zip(dx['mt'], dx['alt_name'])}
map_dict

{-9216029123349204090: 'm0',
 -9215609737857753742: 'm1',
 -9214558223928539670: 'm2',
 -9211415240299816455: 'm3',
 -9210704852233124461: 'm4',
 -9209649324591830672: 'm5',
 -9209549549743376514: 'm6',
 -9207992727245390875: 'm7',
 -9207938803334159501: 'm8',
 -9207683994561949453: 'm9',
 -9206222775446380981: 'm10',
 -9205603564578360879: 'm11',
 -9204246302766843482: 'm12',
 -9202801494022182981: 'm13',
 -9201451928704539517: 'm14',
 -9198490720191101975: 'm15',
 -9197533986719322394: 'm16',
 -9197413686674620725: 'm17',
 -9186624218311173146: 'm18',
 -9186372257785409588: 'm19',
 -9179793182211330025: 'm20',
 -9178794978343839265: 'm21',
 -9177770777448816478: 'm22',
 -9176686377883494741: 'm23',
 -9176365890837322967: 'm24',
 -9174235397148875092: 'm25',
 -9174064663949728849: 'm26',
 -9173570216444516790: 'm27',
 -9173566774685655352: 'm28',
 -9172290850563650285: 'm29',
 -9170103675056273681: 'm30',
 -9170099261064141402: 'm31',
 -9169612133827640783: 'm32',
 -916954314412077075

In [311]:
# 6.4 Transform values in 'Merchant Name' using map_dict:
#     Takes time 4 minutes
%%time
df_sample['Merchant Name'] = df_sample['Merchant Name'].replace(map_dict, inplace= False)

CPU times: user 12 s, sys: 1min 16s, total: 1min 28s
Wall time: 1min 28s


In [312]:
# 6.5 And check: 'Merchant Name' should now hold the short names
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,MCC,Is Fraud?
0,u0,2002,134.09,m7466,La Verne,5300,No
1,u0,2002,38.48,m4998,Monterey Park,5411,No
2,u0,2002,120.34,m4998,Monterey Park,5411,No
3,u0,2002,128.95,m7393,Monterey Park,5651,No
4,u0,2002,104.71,m8834,La Verne,5912,No


### User Nodes   


#### Fields

> A node file must have two mandatory fields: *Id* and *Label*. Besides these two, to distinguish Users from Merchants, we also have a 'Cat' field. The 'Cat' field has two values: *Member* for Users and *Institution* for Merchants.  

> Besides these three common fields, other fields may also be present and may carry any names.


> If a user has committed fraud even once, we mark that user as having a propensity to commit fraud. A column 'suspect' is added to record this propensity.  

In [313]:
## 7.0 Group by user to get user node characteristics:
#      min, max, mean and standard deviation of 'Amount' per user.
#      Each (name, function) pair gives the output column its name.
#      'std' is given as a string, as in the merchant aggregation, so
#      that pandas' own sample standard deviation is always the one used.
# StackOverflow:  https://stackoverflow.com/a/68726106/3282777
grpd_user = df_sample.groupby(['User'])
user_nodes = grpd_user.agg({'Amount' : [('u_min','min'),('u_max','max'),('u_mean','mean'),('u_std','std')] }).reset_index()
user_nodes = user_nodes.round(decimals = 2)
user_nodes.head()

Unnamed: 0_level_0,User,Amount,Amount,Amount,Amount
Unnamed: 0_level_1,Unnamed: 1_level_1,u_min,u_max,u_mean,u_std
0,u0,-255.0,1049.82,84.78,90.16
1,u1,-498.0,498.0,41.82,54.28
2,u10,-492.0,1067.21,44.98,89.07
3,u100,-289.0,766.36,100.45,122.21
4,u101,-302.0,302.0,40.78,47.84


In [314]:
# 7.1 Which of the users have committed fraud
#     (distinct users with at least one transaction marked "Yes")
users_suspect = df_sample.loc[df_sample["Is Fraud?"] == "Yes", 'User' ].unique()
len(users_suspect)  # 25

25

In [315]:
# 7.2 Add a column 'suspect' with default value of 'No'
#     (suspect users are switched to 'Yes' in a later cell)
user_nodes['suspect'] = "No" 

# 7.3 To distinguish user nodes from merchant nodes on the graph
#     we add a 'Cat' column; every user node gets the value 'Member'.

user_nodes['Cat'] = 'Member'

In [316]:
# 7.4 Even if a user committed fraud once, we set the value
#     of 'suspect' to 'Yes' for that user's node.
#     All suspect users are marked in one vectorised step:

user_nodes.loc[user_nodes['User'].isin(users_suspect), 'suspect'] = "Yes"

In [317]:
# 7.5 So how many of them? Counts per value of 'suspect',
#     followed by a look at the node table:

user_nodes['suspect'].value_counts()  # No: 401, 'Yes: 25
user_nodes.head()

No     401
Yes     25
Name: suspect, dtype: int64

Unnamed: 0_level_0,User,Amount,Amount,Amount,Amount,suspect,Cat
Unnamed: 0_level_1,Unnamed: 1_level_1,u_min,u_max,u_mean,u_std,Unnamed: 6_level_1,Unnamed: 7_level_1
0,u0,-255.0,1049.82,84.78,90.16,No,Member
1,u1,-498.0,498.0,41.82,54.28,No,Member
2,u10,-492.0,1067.21,44.98,89.07,No,Member
3,u100,-289.0,766.36,100.45,122.21,No,Member
4,u101,-302.0,302.0,40.78,47.84,No,Member


In [318]:
# 7.6 Create a 'Label' column
#     that simply records userids.
#     Or, rather a copy of 'User' feature
#     'Label' column is a MUST in a graph:

user_nodes['Label'] = user_nodes['User']

# 7.7 Also a graph should have an 'Id' column
#     We rename 'User' field as 'Id'
#     (so 'Id' and 'Label' carry the same values)

user_nodes = user_nodes.rename(columns = {'User': "Id"})

In [319]:
# 7.8 Our user nodes data, now with 'Id' and 'Label':
user_nodes.head()

Unnamed: 0_level_0,Id,Amount,Amount,Amount,Amount,suspect,Cat,Label
Unnamed: 0_level_1,Unnamed: 1_level_1,u_min,u_max,u_mean,u_std,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,u0,-255.0,1049.82,84.78,90.16,No,Member,u0
1,u1,-498.0,498.0,41.82,54.28,No,Member,u1
2,u10,-492.0,1067.21,44.98,89.07,No,Member,u10
3,u100,-289.0,766.36,100.45,122.21,No,Member,u100
4,u101,-302.0,302.0,40.78,47.84,No,Member,u101


In [320]:
# 7.9 'suspect_n' is the numeric twin of 'suspect':
#     1 where 'suspect' is 'Yes', 0 where it is 'No'.
#     It is prepared here because data manipulation
#     capabilities within gephi are limited:

yes_no_to_int = {"Yes": 1, "No": 0}
user_nodes['suspect_n'] = user_nodes['suspect'].map(yes_no_to_int)

In [321]:
# 7.11 User nodes with the numeric 'suspect_n' column added:
user_nodes.head()

Unnamed: 0_level_0,Id,Amount,Amount,Amount,Amount,suspect,Cat,Label,suspect_n
Unnamed: 0_level_1,Unnamed: 1_level_1,u_min,u_max,u_mean,u_std,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,u0,-255.0,1049.82,84.78,90.16,No,Member,u0,0
1,u1,-498.0,498.0,41.82,54.28,No,Member,u1,0
2,u10,-492.0,1067.21,44.98,89.07,No,Member,u10,0
3,u100,-289.0,766.36,100.45,122.21,No,Member,u100,0
4,u101,-302.0,302.0,40.78,47.84,No,Member,u101,0


In [322]:
! rm  /gdrive/MyDrive/credit_card_transactions/creditCard_users.csv

rm: cannot remove '/gdrive/MyDrive/credit_card_transactions/creditCard_users.csv': No such file or directory


In [323]:
# 7.12 Save usernodes to gdrive with semicolon separator.
#     user_nodes has two-level column names, e.g. ('Amount', 'u_min')
#     and ('Id', ''). Written as they are, they produce a two-line header.
#     They are therefore flattened on a copy first, keeping the second
#     level where it is not empty and the first level otherwise.
#     The file then starts with the single header line:
#       Id;u_min;u_max;u_mean;u_std;suspect;Cat;Label;suspect_n

users_out = user_nodes.copy()
if isinstance(users_out.columns, pd.MultiIndex):
    users_out.columns = [sub if sub != "" else top
                         for top, sub in users_out.columns]

users_out.to_csv("creditCard_users.csv",
                 index = False,
                 sep = ";"
                 )

### Merchant nodes

In [324]:
# A look at the transaction data again before building merchant nodes
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,MCC,Is Fraud?
0,u0,2002,134.09,m7466,La Verne,5300,No
1,u0,2002,38.48,m4998,Monterey Park,5411,No
2,u0,2002,120.34,m4998,Monterey Park,5411,No
3,u0,2002,128.95,m7393,Monterey Park,5651,No
4,u0,2002,104.71,m8834,La Verne,5912,No


In [325]:
## 8.0 Group by merchant and get merchant characteristics:
#      min, max, mean and standard deviation of 'Amount' per merchant.
#      m_std comes out empty (NaN) for some merchants in the output below.
grpd_merchant = df_sample.groupby(['Merchant Name'])
merchant_nodes = grpd_merchant.agg({'Amount' : [('m_min','min'),('m_max','max'),('m_mean','mean'),('m_std', 'std')]}).reset_index()
merchant_nodes = merchant_nodes.round(2)
merchant_nodes.head()
merchant_nodes.shape   # (10853, 5)

Unnamed: 0_level_0,Merchant Name,Amount,Amount,Amount,Amount
Unnamed: 0_level_1,Unnamed: 1_level_1,m_min,m_max,m_mean,m_std
0,m0,58.53,150.28,104.4,64.88
1,m1,18.14,18.14,18.14,
2,m10,186.43,186.43,186.43,
3,m100,18.96,18.96,18.96,
4,m1000,4.82,5.26,5.0,0.19


(10853, 5)

In [326]:
## 8.01 Mean amount for every (merchant, city) pair, rounded to 2 decimals:
grpd_merchant1 = df_sample.groupby(['Merchant Name','Merchant City'])
merchant_nodes1 = (
    grpd_merchant1['Amount']
    .mean()
    .reset_index()
    .round(2)
)
merchant_nodes1.head()
merchant_nodes1.shape    # (25984, 3). It appears a Merchant operates in more than one City

Unnamed: 0,Merchant Name,Merchant City,Amount
0,m0,Beaverton,104.4
1,m1,Stanwood,18.14
2,m10,Islandton,186.43
3,m100,Covington,18.96
4,m1000,Memphis,5.0


(25984, 3)

In [327]:
## 8.02 Mean amount for every (merchant, MCC) pair, rounded to 2 decimals:
grpd_merchant2 = df_sample.groupby(['Merchant Name','MCC'])
merchant_nodes2 = (
    grpd_merchant2['Amount']
    .mean()
    .reset_index()
    .round(2)
)
merchant_nodes2.head()
merchant_nodes2.shape    # (10875, 3). It appears a Merchant has more than one line of business
                         #  or two merchants have the same name. 

Unnamed: 0,Merchant Name,MCC,Amount
0,m0,8021,104.4
1,m1,5310,18.14
2,m10,8011,186.43
3,m100,5912,18.96
4,m1000,7832,5.0


(10875, 3)

In [328]:
# 8.1 Merchants with whom at least one fraudulent transaction
#     took place: compute the list once, show it, count it.
fraud_rows = df_sample["Is Fraud?"] == "Yes"
merchant_suspect = df_sample.loc[fraud_rows, 'Merchant Name'].unique()
merchant_suspect
len(merchant_suspect)

array(['m8978', 'm2242', 'm8070', 'm2241', 'm7725', 'm9211', 'm5086',
       'm6523', 'm4461', 'm5917', 'm9139', 'm814', 'm9344', 'm1118',
       'm3261', 'm5437', 'm3758', 'm5091', 'm3777', 'm1270', 'm10751',
       'm6848', 'm2174', 'm5283', 'm1856', 'm2364', 'm8199', 'm5548',
       'm9850', 'm10682', 'm7528', 'm10183', 'm8246', 'm9405', 'm8985',
       'm397', 'm4637', 'm1471', 'm4333', 'm2234', 'm3470', 'm2655',
       'm6654', 'm91', 'm2052', 'm5110', 'm4233', 'm1016', 'm9658',
       'm9321', 'm1485', 'm7274', 'm6370', 'm2015', 'm6985', 'm7393',
       'm9329', 'm5613', 'm9045', 'm8473', 'm3570', 'm5705', 'm4693',
       'm5934', 'm5247', 'm7082', 'm5466', 'm874', 'm10498', 'm4976',
       'm8834', 'm2115', 'm1721', 'm4250', 'm3655', 'm2286'], dtype=object)

76

In [329]:
# 8.2 We record this also in 'victim'; default value is 'No'
#     (victim merchants are switched to 'Yes' in a later cell):
merchant_nodes['victim'] = "No" 

# 8.3 Merchant 'Cat' we designate as 'Institution'
merchant_nodes['Cat'] = 'Institution'

In [330]:
# 8.3.1 Record which merchants have a propensity to become victims:
#       every merchant listed in merchant_suspect gets 'victim' = 'Yes'.
#       All of them are marked in one vectorised step.

merchant_nodes.loc[merchant_nodes['Merchant Name'].isin(merchant_suspect), 'victim'] = "Yes"
  

In [331]:
# 8.4 How many merchants are marked as victims?
merchant_nodes['victim'].value_counts()  # No: 10777, Yes: 76

No     10777
Yes       76
Name: victim, dtype: int64

In [332]:
# 8.5 Label and Id columns of Merchant nodes:
#     'Label' is a copy of 'Merchant Name', which is then renamed to 'Id'.

merchant_nodes['Label'] = merchant_nodes['Merchant Name']
merchant_nodes = merchant_nodes.rename(columns = {'Merchant Name': "Id"})
merchant_nodes.head()
merchant_nodes.shape

Unnamed: 0_level_0,Id,Amount,Amount,Amount,Amount,victim,Cat,Label
Unnamed: 0_level_1,Unnamed: 1_level_1,m_min,m_max,m_mean,m_std,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,m0,58.53,150.28,104.4,64.88,No,Institution,m0
1,m1,18.14,18.14,18.14,,No,Institution,m1
2,m10,186.43,186.43,186.43,,No,Institution,m10
3,m100,18.96,18.96,18.96,,No,Institution,m100
4,m1000,4.82,5.26,5.0,0.19,No,Institution,m1000


(10853, 8)

In [333]:
# 8.5.0 'victim_n' is the numeric twin of 'victim':
#       1 where 'victim' is 'Yes', 0 where it is 'No'.

yes_no_to_int = {"Yes": 1, "No": 0}
merchant_nodes['victim_n'] = merchant_nodes['victim'].map(yes_no_to_int)

In [334]:
# 8.5.1 Merchant nodes with the numeric 'victim_n' column added:
merchant_nodes.head()
merchant_nodes.shape   # (10853, 9)

Unnamed: 0_level_0,Id,Amount,Amount,Amount,Amount,victim,Cat,Label,victim_n
Unnamed: 0_level_1,Unnamed: 1_level_1,m_min,m_max,m_mean,m_std,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,m0,58.53,150.28,104.4,64.88,No,Institution,m0,0
1,m1,18.14,18.14,18.14,,No,Institution,m1,0
2,m10,186.43,186.43,186.43,,No,Institution,m10,0
3,m100,18.96,18.96,18.96,,No,Institution,m100,0
4,m1000,4.82,5.26,5.0,0.19,No,Institution,m1000,0


(10853, 9)

In [335]:
! rm  /gdrive/MyDrive/credit_card_transactions/creditCard_merchants.csv

rm: cannot remove '/gdrive/MyDrive/credit_card_transactions/creditCard_merchants.csv': No such file or directory


In [336]:
# 8.6 Save merchant_nodes information to a file (semicolon separated).
#     merchant_nodes has two-level column names, e.g. ('Amount', 'm_min')
#     and ('Id', ''). Written as they are, they produce a two-line header.
#     They are therefore flattened on a copy first, keeping the second
#     level where it is not empty and the first level otherwise:
#       Id;m_min;m_max;m_mean;m_std;victim;Cat;Label;victim_n

merchants_out = merchant_nodes.copy()
if isinstance(merchants_out.columns, pd.MultiIndex):
    merchants_out.columns = [sub if sub != "" else top
                             for top, sub in merchants_out.columns]

merchants_out.to_csv("creditCard_merchants.csv",
                     index = False,
                     sep = ";")

### Edges

Edges must have three fields: *Source* , *Target* and *Type*. *Type* field records if an edge is directed or undirected. Besides these three, it may have other fields also.

In [337]:
# 9.0 Count transactions for every (User, Merchant Name) pair.
#     The number of interactions between a customer and a merchant
#     will serve as the edge weight:

edges = df_sample.groupby(['User', 'Merchant Name']).size()

In [338]:
# 9.1 Turn the counts into a dataframe; the count column
#     is named 'weight' while the index is being reset:

edges = (
    df_sample
    .groupby(['User', 'Merchant Name'])
    .size()
    .reset_index(name = "weight")
)
edges.head()

Unnamed: 0,User,Merchant Name,weight
0,u0,m10039,3
1,u0,m10101,5
2,u0,m10183,1
3,u0,m10800,2
4,u0,m1118,5


In [339]:
# 9.2 Rename two other columns appropriately:
#     users become the 'Source' and merchants the 'Target' of an edge.

edges = edges.rename(columns = {'User': 'Source', 'Merchant Name': 'Target'})

In [340]:
# 9.3 Our edges are Undirected; every edge gets the same 'Type' value:

edges['Type']= 'Undirected'
edges.head()

Unnamed: 0,Source,Target,weight,Type
0,u0,m10039,3,Undirected
1,u0,m10101,5,Undirected
2,u0,m10183,1,Undirected
3,u0,m10800,2,Undirected
4,u0,m1118,5,Undirected


In [341]:
! rm  /gdrive/MyDrive/credit_card_transactions/creditCard_edges.csv

rm: cannot remove '/gdrive/MyDrive/credit_card_transactions/creditCard_edges.csv': No such file or directory


In [342]:
# 9.4 Finally save edges information to a file
#     (semicolon separated, without the row index):

edges.to_csv("creditCard_edges.csv",
             index = False,
             sep = ";"
             )

### Check files


In [343]:
! ls -la !ls /gdrive/MyDrive/credit_card_transactions/

ls: cannot access '!ls': No such file or directory
/gdrive/MyDrive/credit_card_transactions/:
total 2583813
-rw------- 1 root root 2354626737 Mar  9 08:19 card_transaction_v1.csv
-rw------- 1 root root  271088780 Mar 11 00:16 card_transaction_v1.zip
-rw------- 1 root root     802088 Mar 18 07:24 creditCard_edges.csv
-rw------- 1 root root     553021 Mar 18 07:24 creditCard_merchants.csv
-rw------- 1 root root      20195 Mar 18 07:24 creditCard_users.csv
drwx------ 2 root root       4096 Mar  7 07:08 .ipynb_checkpoints
-rw------- 1 root root   18727485 Mar 18 07:23 sample_2002.csv
