<a href="https://colab.research.google.com/github/harnalashok/hadoop/blob/main/credit_card_transactions_network_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Call libraries

In [None]:
# 1.0
import pandas as pd
import os

In [None]:
# 1.1 Show the installed pandas version (the notebook was last run on 1.3.5)
pd.__version__  # 1.3.5

'1.3.5'

In [None]:
# 1.2 Display cell outputs from multiple commands:
#     with "all", the value of every expression statement in a cell is
#     echoed, not only the last one. Later cells that list several
#     expressions (e.g. head/tail/shape) rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Mount google drive

In [None]:
# 2.0 Mount to ccma@fsm.ac.in
#     Google Drive is mounted at /gdrive; the data folder used below
#     lives under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# 2.1 Check files: list the contents of the project folder on the mounted drive
!ls /gdrive/MyDrive/credit_card_transactions/

card_transaction_v1.csv  credit_card_edges.csv	   creditCard_users.csv
card_transaction_v1.zip  creditCard_merchants.csv


In [None]:
# 2.2 Change current directory so that the relative file names used
#     by read_csv/to_csv below resolve against the project folder,
#     then list its contents as a check:
os.chdir("/gdrive/MyDrive/credit_card_transactions/")
os.listdir()

['card_transaction_v1.csv',
 'card_transaction_v1.zip',
 '.ipynb_checkpoints',
 'creditCard_users.csv',
 'creditCard_merchants.csv',
 'credit_card_edges.csv']

column names:  
User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?   
Sample data:  
0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No



### Read data and explore

In [None]:
# 3.0 Read data. File size is > 2gb
#     Takes around 
#     Feel free to include more fields (city, time of transaction etc) and analyse:
%%time
df = pd.read_csv("card_transaction_v1.csv",
                 usecols = ["User", "Year",  "Merchant Name","Merchant City", "Amount", "Is Fraud?"])

CPU times: user 30.8 s, sys: 5.82 s, total: 36.7 s
Wall time: 55.3 s


In [None]:
# 3.1 First rows, last rows and overall shape of the data.
#     All three results are displayed because of the setting made in 1.2.
df.head()
df.tail()
df.shape

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
0,0,2002,$134.09,3527213246127876953,La Verne,No
1,0,2002,$38.48,-727612092139916043,Monterey Park,No
2,0,2002,$120.34,-727612092139916043,Monterey Park,No
3,0,2002,$128.95,3414527459579106770,Monterey Park,No
4,0,2002,$104.71,5817218446178736267,La Verne,No


Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
24386895,1999,2020,$-54.00,-5162038175624867091,Merrimack,No
24386896,1999,2020,$54.00,-5162038175624867091,Merrimack,No
24386897,1999,2020,$59.15,2500998799892805156,Merrimack,No
24386898,1999,2020,$43.12,2500998799892805156,Merrimack,No
24386899,1999,2020,$45.13,4751695835751691036,Merrimack,No


(24386900, 6)

In [None]:
# 3.2 Attempt should be made to
#     reduce data size. Start by checking the column dtypes;
#     note that 'Amount' is read as object (text) because of the '$' prefix:
df.dtypes

User              int64
Year              int64
Amount           object
Merchant Name     int64
Merchant City    object
Is Fraud?        object
dtype: object

In [None]:
# 3.3 Is data balanced? Count transactions per 'Is Fraud?' label:
df['Is Fraud?'].value_counts()

No     24357143
Yes       29757
Name: Is Fraud?, dtype: int64

In [None]:
# 3.4 Remove '$' sign from 'Amount':
#     Takes time:
%%time
df['Amount']=df['Amount'].str.replace("$", "")

  


In [None]:
# 3.5 Convert 'Amount' to numeric (the '$' prefix was stripped in 3.4,
#     so the remaining text parses as a number; negative amounts keep their sign):
df['Amount'] = pd.to_numeric(df['Amount'])

In [None]:
# 3.6 Data shape
df.shape   # rows: 24,386,900; cols: 6

(24386900, 6)

### Take a sample of data

In [None]:
# 4.0 Memory is limited, so the rest of the analysis uses only
#     the transactions of the year 2002:
dfsample = df[df['Year'].eq(2002)]

In [None]:
# 4.1 Sample size: column dtypes and (rows, cols) of the 2002 subset
dfsample.dtypes
dfsample.shape   # (350732, 6)

In [None]:
# 4.2 How many incidents of fraud are there in the 2002 sample?
dfsample['Is Fraud?'].value_counts()   # Yes: 139, No: 350593

No     350593
Yes       139
Name: Is Fraud?, dtype: int64

In [None]:
# 4.3 Should you like to save this sample
#     for later quick reading?
#     index=False keeps the row index out of the CSV file.

dfsample.to_csv("sample_2002.csv", index = False)

In [None]:
# 4.4 We work with a copy of the data.
#     If we make a mistake further down, we can come back here and
#     recreate df_sample from the untouched dfsample:

df_sample = dfsample.copy()

In [None]:
# 4.5 And our data (first rows of the working copy):
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
0,0,2002,134.09,3527213246127876953,La Verne,No
1,0,2002,38.48,-727612092139916043,Monterey Park,No
2,0,2002,120.34,-727612092139916043,Monterey Park,No
3,0,2002,128.95,3414527459579106770,Monterey Park,No
4,0,2002,104.71,5817218446178736267,La Verne,No


### Change user IDs
> 1.0 Know unique user names and transform them to short names  
> 2.0 Prepare a dictionary of user-ids and proposed Ids  
> 3.0 Make changes to our dataset using the dictionary  

In [None]:
# 5.0 To distinguish userids on network graph,
#     we will prefix them with 'u' and also assign
#     a short name.

#    So first get the unique user ids as an array:
u_user = df_sample['User'].unique()

In [None]:
# 5.1 Sort the unique user ids in place, then display them.
#     The position of an id in this sorted array becomes its short name in 5.2.
u_user.sort()
u_user

In [None]:
# 5.2 Get alternate names by prefixing with 'u'
#     Two-column lookup table:
#       alt_name : 'u' followed by the position of the id in the sorted array
#       u_user   : the original user id
dx = pd.DataFrame({
    "alt_name": [f"u{pos}" for pos in range(len(u_user))],
    "u_user": u_user,
})

# 5.2.1 Check:
dx.head()

Unnamed: 0,alt_name,u_user
0,u0,0
1,u1,2
2,u2,4
3,u3,5
4,u4,15


In [None]:
# 5.3 Build the lookup used for the transformation:
#     key   -> old user id
#     value -> new short name

# https://stackoverflow.com/a/20250947/3282777
map_dict = dx.set_index('u_user')['alt_name'].to_dict()
map_dict

In [None]:
# 5.4 Replace old user ids with their short names.
#     A dictionary lookup through `map` is much faster than `replace`
#     with a large dict. Values that have no entry in map_dict come back
#     as NaN from `map`, so `fillna` restores the original value for them;
#     this keeps the result identical to `replace` and makes the cell
#     harmless to re-run.
df_sample['User'] = df_sample['User'].map(map_dict).fillna(df_sample['User'])

In [None]:
# 5.5 Check: 'User' should now hold the short 'u…' names
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
0,u0,2002,134.09,3527213246127876953,La Verne,No
1,u0,2002,38.48,-727612092139916043,Monterey Park,No
2,u0,2002,120.34,-727612092139916043,Monterey Park,No
3,u0,2002,128.95,3414527459579106770,Monterey Park,No
4,u0,2002,104.71,5817218446178736267,La Verne,No


### Change merchant IDs
We follow the same steps to replace the long merchant IDs with short IDs. These short IDs are prefixed with 'm'.

In [None]:
# 6.0 Unique merchant ids, as an array
mn = df_sample['Merchant Name'].unique()

In [None]:
# 6.1 Sort them in place; the sorted position becomes the short name in 6.2
mn.sort()

In [None]:
# 6.2 Lookup table of existing and alternate merchant names:
#       alt_name : 'm' followed by the position of the id in the sorted array
#       mt       : the original merchant id
dx = pd.DataFrame({
    "alt_name": [f"m{pos}" for pos in range(len(mn))],
    "mt": mn,
})
dx.head()

Unnamed: 0,alt_name,mt
0,m0,-9216029123349204090
1,m1,-9215609737857753742
2,m2,-9214558223928539670
3,m3,-9211415240299816455
4,m4,-9210704852233124461


In [None]:
# 6.3 Create a dictionary for transformation:
#     key -> old merchant id, value -> new short name.
# https://stackoverflow.com/a/20250947/3282777
map_dict = dict(zip(dx['mt'], dx['alt_name']))

# There is one entry per unique merchant, so echoing the whole dict would
# flood the cell output. Show its size and the first few entries instead.
len(map_dict)
list(map_dict.items())[:5]

In [None]:
# 6.4 Transform: Takes time
%%time
df_sample['Merchant Name'] = df_sample['Merchant Name'].replace(map_dict, inplace= False)

CPU times: user 14.2 s, sys: 1min 8s, total: 1min 22s
Wall time: 1min 28s


In [None]:
# 6.5 And check: 'Merchant Name' should now hold the short 'm…' names
df_sample.head()

Unnamed: 0,User,Year,Amount,Merchant Name,Merchant City,Is Fraud?
0,u0,2002,134.09,m7466,La Verne,No
1,u0,2002,38.48,m4998,Monterey Park,No
2,u0,2002,120.34,m4998,Monterey Park,No
3,u0,2002,128.95,m7393,Monterey Park,No
4,u0,2002,104.71,m8834,La Verne,No


### User Nodes
If a user has committed fraud even once, we mark that user as having a propensity to commit fraud. A column 'suspect' is added that records this propensity.

In [None]:
## 7.0 User node characteristics:
##     mean transaction amount per user, rounded to 2 decimals
grpd_user = df_sample.groupby('User')
user_nodes = grpd_user['Amount'].mean().round(2).reset_index()
user_nodes.head()

Unnamed: 0,User,Amount
0,u0,84.78
1,u1,41.82
2,u10,44.98
3,u100,100.45
4,u101,40.78


In [None]:
#user_nodes['User'] = user_nodes['User'].apply(str)
#user_nodes['a'] = "u"
#user_nodes['User'] = user_nodes['a'].str.cat(user_nodes['User'])
#_= user_nodes.pop('a')

In [None]:
# user_nodes.head()

Unnamed: 0,User,Amount
0,u0,84.78
1,u1,41.82
2,u10,44.98
3,u100,100.45
4,u101,40.78


In [None]:
# 7.1 Users that have at least one transaction flagged as fraud:
#     show their ids, then how many there are
users_suspect = df_sample.loc[df_sample["Is Fraud?"].eq("Yes"), 'User'].unique()
users_suspect
len(users_suspect)

array(['u12', 'u16', 'u43', 'u103', 'u117', 'u131', 'u171', 'u183',
       'u196', 'u198', 'u210', 'u212', 'u228', 'u235', 'u244', 'u264',
       'u315', 'u335', 'u349', 'u361', 'u373', 'u381', 'u384', 'u388',
       'u413'], dtype=object)

25

In [None]:
# 7.2 'suspect' starts out as 'No' for every user
# 7.3 'Cat' marks these rows as user nodes ('Member') so that they can be
#     told apart from merchant nodes on the graph
user_nodes = user_nodes.assign(suspect="No", Cat='Member')

In [None]:
# 7.4 A user who committed fraud even once gets 'suspect' = 'Yes'.
#     A single boolean mask built with `isin` marks all of them in one
#     vectorised assignment instead of scanning the frame once per suspect.
user_nodes.loc[user_nodes['User'].isin(users_suspect), 'suspect'] = "Yes"

In [None]:
# 7.5 So how many users are flagged as suspect?
user_nodes['suspect'].value_counts()  # No: 401, Yes: 25

No     401
Yes     25
Name: suspect, dtype: int64

In [None]:
# 7.6 Our user nodes data so far (User, Amount, suspect, Cat)
user_nodes.head()

Unnamed: 0,User,Amount,suspect,Cat
0,u0,84.78,No,Member
1,u1,41.82,No,Member
2,u10,44.98,No,Member
3,u100,100.45,No,Member
4,u101,40.78,No,Member


In [None]:
# 7.7 Add a 'Label' column holding the user ids
#     ('Label' column is a MUST)
# 7.8 and rename 'User' to 'Id', as the data should have an Id column:
user_nodes = (
    user_nodes
    .assign(Label=user_nodes['User'])
    .rename(columns={'User': "Id"})
)

In [None]:
# 7.8.1 Our user nodes data, now with 'Id' and 'Label' columns:
user_nodes.head()

Unnamed: 0,Id,Amount,suspect,Cat,Label
0,u0,84.78,No,Member,u0
1,u1,41.82,No,Member,u1
2,u10,44.98,No,Member,u10
3,u100,100.45,No,Member,u100
4,u101,40.78,No,Member,u101


In [None]:
# 7.9 Numeric twin of 'suspect': 1 where fraud is 'Yes', 0 where it is 'No'.
#     Added here because data manipulation capabilities are limited in gephi:
yes_no_to_int = {"Yes": 1, "No": 0}
user_nodes['suspect_n'] = user_nodes['suspect'].map(yes_no_to_int)

In [None]:
# 7.10 Save user nodes as a semicolon-separated file, without the row index:
user_nodes.to_csv("creditCard_users.csv", index = False, sep = ";")

### Merchant nodes

In [None]:
## 8.0 Merchant node characteristics:
##     mean transaction amount per merchant, rounded to 2 decimals
grpd_merchant = df_sample.groupby('Merchant Name')
merchant_nodes = grpd_merchant['Amount'].mean().round(2).reset_index()
merchant_nodes.head()

Unnamed: 0,Merchant Name,Amount
0,m0,104.4
1,m1,18.14
2,m10,186.43
3,m100,18.96
4,m1000,5.0


In [None]:
# 8.1 Merchants with at least one transaction flagged as fraud:
#     show their ids, then how many there are
merchant_suspect = df_sample.loc[df_sample["Is Fraud?"].eq("Yes"), 'Merchant Name'].unique()
merchant_suspect
len(merchant_suspect)

In [None]:
# 8.2 'suspect' starts out as 'No' for every merchant;
#     'Cat' is set to 'Institution' for merchant nodes
merchant_nodes = merchant_nodes.assign(suspect="No", Cat='Institution')

In [None]:
# 8.3 Merchants involved in at least one fraudulent transaction get
#     'suspect' = 'Yes'. One `isin` mask marks them all in a single
#     vectorised assignment instead of one frame scan per merchant.
merchant_nodes.loc[merchant_nodes['Merchant Name'].isin(merchant_suspect), 'suspect'] = "Yes"
  

In [None]:
# How many merchants are flagged as suspect?
merchant_nodes['suspect'].value_counts()  # No: 10777, Yes: 76

No     10777
Yes       76
Name: suspect, dtype: int64

In [None]:
# Add a 'Label' column holding the merchant ids and rename
# 'Merchant Name' to 'Id', mirroring the user nodes table:
merchant_nodes = (
    merchant_nodes
    .assign(Label=merchant_nodes['Merchant Name'])
    .rename(columns={'Merchant Name': "Id"})
)
merchant_nodes.head()

Unnamed: 0,Id,Amount,suspect,Cat,Label
0,m0,104.4,No,Institution,m0
1,m1,18.14,No,Institution,m1
2,m10,186.43,No,Institution,m10
3,m100,18.96,No,Institution,m100
4,m1000,5.0,No,Institution,m1000


In [None]:
# Numeric twin of 'suspect': 1 for 'Yes', 0 for 'No'
merchant_nodes['suspect_n'] = merchant_nodes['suspect'].map({"Yes": 1, "No": 0})

In [None]:
# Merchant nodes data, now including 'suspect_n':
merchant_nodes.head()

Unnamed: 0,Id,Amount,suspect,Cat,Label,suspect_n
0,m0,104.4,No,Institution,m0,0
1,m1,18.14,No,Institution,m1,0
2,m10,186.43,No,Institution,m10,0
3,m100,18.96,No,Institution,m100,0
4,m1000,5.0,No,Institution,m1000,0


In [None]:
# Save merchant nodes as a semicolon-separated file, without the row index:
merchant_nodes.to_csv("creditCard_merchants.csv", index = False, sep = ";")

### Edges

In [None]:
# Number of transactions for every (User, Merchant Name) pair,
# as a Series indexed by that pair
edges = df_sample.groupby(['User', 'Merchant Name']).size()

In [None]:
# One row per (User, Merchant Name) pair; 'weight' is the number of
# transactions between that user and that merchant
edges = (
    df_sample
    .groupby(['User', 'Merchant Name'])
    .size()
    .reset_index(name="weight")
)
edges.head()


Unnamed: 0,User,Merchant Name,weight
0,u0,m10039,3
1,u0,m10101,5
2,u0,m10183,1
3,u0,m10800,2
4,u0,m1118,5


In [None]:
# Rename the endpoint columns to 'Source' (user) and 'Target' (merchant):
edges = edges.rename(columns = {'User': 'Source', 'Merchant Name': 'Target'})

In [None]:
# Every edge is marked as undirected:
edges['Type']= 'Undirected'

In [None]:
# Edges data: Source, Target, weight, Type
edges.head()

Unnamed: 0,Source,Target,weight,Type
0,u0,m10039,3,Undirected
1,u0,m10101,5,Undirected
2,u0,m10183,1,Undirected
3,u0,m10800,2,Undirected
4,u0,m1118,5,Undirected


In [None]:
# Save edges as a semicolon-separated file, without the row index:
edges.to_csv("credit_card_edges.csv", index = False, sep = ";")

In [None]:
########### Done ###########