### Imports

In [1]:
# Import pandas in the line below
import os
import pandas as pd

### Load The Credit Card Data

In [2]:
credit_data = pd.read_csv('credit.csv')

In [3]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37097 entries, 0 to 37096
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   AcountNumber               37097 non-null  int64  
 1   CVV                        37097 non-null  int64  
 2   CustomerAge                28246 non-null  float64
 3   Gender                     37097 non-null  object 
 4   Marital Status             37097 non-null  object 
 5   CardColour                 37097 non-null  object 
 6   CardType                   37097 non-null  object 
 7   Domain                     37097 non-null  object 
 8   Amount                     37097 non-null  int64  
 9   AverageIncomeExpendicture  37097 non-null  int64  
 10  Outcome                    37097 non-null  int64  
 11  Customer_City_Address      37097 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 3.4+ MB


In [4]:
credit_data.head()

Unnamed: 0,AcountNumber,CVV,CustomerAge,Gender,Marital Status,CardColour,CardType,Domain,Amount,AverageIncomeExpendicture,Outcome,Customer_City_Address
0,1275734409,364,28.0,Male,Married,Gold,Verve,Local,129282,170919,0,Enugu
1,1271246193,401,25.0,Male,Single,Gold,Verve,International,574384,329353,1,Enugu
2,1242290165,266,21.0,Female,Married,White,Visa,International,190766,292922,0,Enugu
3,1245478185,402,26.0,Male,Unknown,White,Visa,Local,130395,145444,0,Ibadan
4,1258212072,334,28.0,Female,Married,Gold,Verve,International,685145,295990,1,Port Harcourt


### Number of missing values by column

In [5]:
missing = pd.concat([credit_data.isnull().sum(), 100 * credit_data.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count', ascending=False)

Unnamed: 0,count,%
CustomerAge,8851,23.859072
AcountNumber,0,0.0
CVV,0,0.0
Gender,0,0.0
Marital Status,0,0.0
CardColour,0,0.0
CardType,0,0.0
Domain,0,0.0
Amount,0,0.0
AverageIncomeExpendicture,0,0.0


### Categorical Features

In [6]:
credit_data.select_dtypes('object')

Unnamed: 0,Gender,Marital Status,CardColour,CardType,Domain,Customer_City_Address
0,Male,Married,Gold,Verve,Local,Enugu
1,Male,Single,Gold,Verve,International,Enugu
2,Female,Married,White,Visa,International,Enugu
3,Male,Unknown,White,Visa,Local,Ibadan
4,Female,Married,Gold,Verve,International,Port Harcourt
...,...,...,...,...,...,...
37092,Female,Married,Gold,Verve,Local,Lagos
37093,Male,Divorced,Gold,Verve,Local,Enugu
37094,Male,Married,White,Visa,International,Abuja
37095,Female,Unknown,Gold,Verve,International,Port Harcourt


Renaming the column Martial Status to MaritalStatus

In [7]:
credit_data.rename(columns={'Marital Status': 'MaritalStatus'}, inplace=True)

Finding out how many duplicates where both Gender and Martial Status are combined together to see who is processing the most transactions? Maybe there is a pattern in who commits the most fraudulent transactions.

In [8]:
(credit_data['Gender'] + ', ' + credit_data['MaritalStatus']).value_counts().head()

Male, Married      10776
Male, Single        9002
Female, Married     6481
Female, Single      5391
Male, Divorced      1724
dtype: int64

Finding out how many duplicates where both Martial Status and Domain are combined together. Seems to me that people who are married commit the most transactions.

In [9]:
(credit_data['MaritalStatus'] + ', ' + credit_data['Domain']).value_counts().head()

Married, International     12329
Single, International      10263
Married, Local              4928
Single, Local               4130
Divorced, International     1980
dtype: int64

Finding out if there is any relationship between the card color and type. Since, the card color white has two different card types then we can't possibly remove one of these columns.

In [10]:
(credit_data['CardColour'] + ', ' + credit_data['CardType']).value_counts().head()

Gold, Verve          18550
White, Visa          11289
White, MasterCard     7258
dtype: int64

### Numerical Features

Checking for duplicates in account number to see if there is a pattern.

In [11]:
(credit_data['AcountNumber']).value_counts().head(20)

1269094635    2
1276061861    2
1259815378    2
1285336445    2
1257782942    2
1284493739    2
1278174058    2
1252922519    2
1242817103    2
1248724577    2
1287116131    2
1251077435    2
1268851746    2
1260235156    2
1269308901    2
1265305760    2
1282575323    1
1265418423    1
1276557568    1
1243030869    1
Name: AcountNumber, dtype: int64

Filling all null customer age with the mean average.

In [12]:
credit_data['CustomerAge'] = credit_data['CustomerAge'].fillna(credit_data['CustomerAge'].mean())

In [13]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37097 entries, 0 to 37096
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   AcountNumber               37097 non-null  int64  
 1   CVV                        37097 non-null  int64  
 2   CustomerAge                37097 non-null  float64
 3   Gender                     37097 non-null  object 
 4   MaritalStatus              37097 non-null  object 
 5   CardColour                 37097 non-null  object 
 6   CardType                   37097 non-null  object 
 7   Domain                     37097 non-null  object 
 8   Amount                     37097 non-null  int64  
 9   AverageIncomeExpendicture  37097 non-null  int64  
 10  Outcome                    37097 non-null  int64  
 11  Customer_City_Address      37097 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 3.4+ MB


### Mean averages and Percentages

Finding the mean average for Amount and AverageIncomeExpendicture.

In [14]:
gender_means = credit_data.groupby('Gender')[['Amount', 'AverageIncomeExpendicture']].mean()
gender_means.head()

Unnamed: 0_level_0,Amount,AverageIncomeExpendicture
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,547912.898066,227907.611387
Male,551669.796774,227074.577029


In [15]:
customer_city_means = credit_data.groupby('Customer_City_Address')[['Amount', 'AverageIncomeExpendicture']].mean()
customer_city_means.head()

Unnamed: 0_level_0,Amount,AverageIncomeExpendicture
Customer_City_Address,Unnamed: 1_level_1,Unnamed: 2_level_1
Abuja,546960.543427,227515.907199
Enugu,551723.983219,228508.967728
Ibadan,553072.223532,228135.190099
Kano,551989.34875,228464.383271
Lagos,551016.711818,225813.33874


In [16]:
outcome_means = credit_data.groupby('Outcome')[['Amount', 'AverageIncomeExpendicture']].mean()
outcome_means.head()

Unnamed: 0_level_0,Amount,AverageIncomeExpendicture
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
0,277735.534286,209412.748227
1,647113.580234,233774.79441


In [17]:
acount_number_means = credit_data.groupby('AcountNumber')[['Amount', 'AverageIncomeExpendicture']].mean()
acount_number_means.head()

Unnamed: 0_level_0,Amount,AverageIncomeExpendicture
AcountNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
1239297369,416060.0,219725.0
1239297757,721586.0,190844.0
1239298985,322428.0,258338.0
1239299086,611134.0,194377.0
1239300503,398998.0,179275.0


This will be an imbalenced data set. For every transaction there is roughly a 74% chance the transaction will be fradulent.

In [18]:
credit_data['Outcome'].value_counts()/len(credit_data['Outcome']) * 100

1    73.779551
0    26.220449
Name: Outcome, dtype: float64

Checking the percentage of fraudlent transactions.

In [21]:
total_transactions = len(credit_data)
print(total_transactions)

37097


In [24]:
normal_transactions = len(credit_data[credit_data['Outcome'] == 0])
print(normal_transactions)

9727


In [25]:
fradulent_transactions = len(credit_data[credit_data['Outcome'] == 1])
print(fradulent_transactions)

27370


In [30]:
fraudulent_percentage = round(fradulent_transactions / normal_transactions * 100, 2)
print(fraudulent_percentage)

281.38


According to the above transactions there is a 281% chance the tranasction is fradulent.

### Removing duplicates if there are any

In [32]:
credit_data.shape

(37097, 12)

In [33]:
credit_data.drop_duplicates(inplace=True)

In [34]:
credit_data.shape

(37097, 12)