In [1]:
import pandas as pd
from janitor import clean_names

### Question 1: Read in all 4 credit card transaction datasets and clean up the column names.

In [2]:
# Load the card table, then tidy its column names with janitor's clean_names.
card_details = (
    pd.read_csv('data/CardBase.csv')
    .clean_names()
)

In [4]:
# Load the customer table, then tidy its column names with janitor's clean_names.
customer_details = (
    pd.read_csv('data/CustomerBase.csv')
    .clean_names()
)

In [5]:
# Load the fraud table, then tidy its column names with janitor's clean_names.
fraud_details = (
    pd.read_csv('data/FraudBase.csv')
    .clean_names()
)

In [12]:
# Load the transaction table, then tidy its column names with janitor's clean_names.
transaction_details = (
    pd.read_csv('data/TransactionBase.csv')
    .clean_names()
)

### Question 2: Join the data containing card details and customer details by customer id, so that all records of card details and any matching records in customer details are kept.

In [9]:
# Left join: keep every card row and attach customer columns where cust_id matches.
card_details.merge(
    right=customer_details,
    on='cust_id',
    how='left',
)

Unnamed: 0,card_number,card_family,credit_limit,cust_id,age,customer_segment,customer_vintage_group
0,8638-5407-3631-8196,Premium,530000,CC67088,27,Diamond,VG1
1,7106-4239-7093-1515,Gold,18000,CC12076,48,Gold,VG3
2,6492-5655-8241-3530,Premium,596000,CC97173,22,Diamond,VG1
3,2868-5606-5152-5706,Gold,27000,CC55858,30,Diamond,VG1
4,1438-6906-2509-8219,Platinum,142000,CC90518,50,Platinum,VG2
...,...,...,...,...,...,...,...
495,4708-4407-9601-6022,Premium,691000,CC64993,34,Diamond,VG1
496,6042-2856-7280-2925,Gold,33000,CC26787,46,Platinum,VG2
497,8706-3809-5167-3899,Premium,144000,CC32532,28,Platinum,VG2
498,9540-8558-5897-5046,Premium,830000,CC90246,35,Gold,VG3


### Question 3: Join the data containing fraud details with transaction details so all rows of both tables are kept. What does the resulting row number tell you?

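Before joining, fraud details had 109 rows and transaction details had 10000 rows.

After the outer join there were still 10000 rows, so the join added no unmatched rows: every one of the 109 fraud records matched an existing transaction. In other words, of the 10000 transactions, 109 were flagged as fraudulent.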

In [19]:
# Outer join: keep all rows from both tables, matched on transaction_id.
# Transactions with no fraud record get a missing fraud_flag.
fraud_details.merge(
    right=transaction_details,
    on='transaction_id',
    how='outer',
)

Unnamed: 0,transaction_id,fraud_flag,transaction_date,credit_card_id,transaction_value,transaction_segment
0,CTID50558449,1.0,6-May-16,4246-1369-3659-8804,6984,SEG23
1,CTID55936882,1.0,29-Nov-16,1336-9200-1264-2551,34367,SEG21
2,CTID63762180,1.0,5-Dec-16,8528-6154-7390-5081,44550,SEG15
3,CTID76723439,1.0,15-Sep-16,7908-2695-7391-7499,48275,SEG16
4,CTID21246201,1.0,29-Feb-16,2524-4184-5908-6750,35751,SEG25
...,...,...,...,...,...,...
9995,CTID25037573,,10-Jun-16,9157-2802-8374-1145,44280,SEG18
9996,CTID43832298,,23-May-16,7416-4529-6690-5703,27163,SEG12
9997,CTID56532072,,6-Aug-16,6699-2639-4522-6219,36424,SEG13
9998,CTID88101446,,10-Oct-16,8341-5263-4582-7396,8464,SEG14


### Question 4. [Harder] Join the data containing card details with transaction details so rows from the first which have matching ones in the second are returned.

In [30]:
# Inner join of cards to transactions; the key has a different name in each
# table, so the two sides are named separately.
card_transactions = card_details.merge(
    right=transaction_details,
    left_on='card_number',
    right_on='credit_card_id',
    how='inner',
)

In [31]:
# Semi-join: keep only the card rows whose card_number appears in the joined
# card_transactions table, without adding any transaction columns.
card_details[card_details['card_number'].isin(card_transactions['card_number'])]

Unnamed: 0,card_number,card_family,credit_limit,cust_id
0,8638-5407-3631-8196,Premium,530000,CC67088
1,7106-4239-7093-1515,Gold,18000,CC12076
2,6492-5655-8241-3530,Premium,596000,CC97173
3,2868-5606-5152-5706,Gold,27000,CC55858
4,1438-6906-2509-8219,Platinum,142000,CC90518
...,...,...,...,...
495,4708-4407-9601-6022,Premium,691000,CC64993
496,6042-2856-7280-2925,Gold,33000,CC26787
497,8706-3809-5167-3899,Premium,144000,CC32532
498,9540-8558-5897-5046,Premium,830000,CC90246


### Question 5. Read in hat_observations and split observation into two columns, hat_colour and hat_type.

In [37]:
# Load the hat observations, then tidy the column names with janitor's clean_names.
hat_observations = (
    pd.read_csv('data/hat_observations.csv')
    .clean_names()
)

In [38]:
# Preview the first five rows to see how the observation column is laid out.
hat_observations.head(n=5)

Unnamed: 0,day,month,year,observation,observation_count
0,25,10,2019,"black,beret",7
1,11,9,2019,"blue,top hat",10
2,4,12,2018,"green,santa hat",9
3,18,6,2018,"white,beret",9
4,26,9,2018,"black,santa hat",1


In [41]:
# Split each "colour,type" observation into hat_colour and hat_type.
# n=1 splits on the first comma only, so the result has at most two columns:
# an observation containing a further comma keeps the remainder in hat_type
# instead of producing a third column that would make this two-column
# assignment raise.
hat_observations[['hat_colour', 'hat_type']] = (
    hat_observations['observation']
    .str.split(pat=',', n=1, expand=True)
)

In [42]:
# Display the frame to confirm the hat_colour and hat_type columns are present.
hat_observations

Unnamed: 0,day,month,year,observation,observation_count,hat_colour,hat_type
0,25,10,2019,"black,beret",7,black,beret
1,11,9,2019,"blue,top hat",10,blue,top hat
2,4,12,2018,"green,santa hat",9,green,santa hat
3,18,6,2018,"white,beret",9,white,beret
4,26,9,2018,"black,santa hat",1,black,santa hat
...,...,...,...,...,...,...,...
95,15,8,2018,"purple,bucket hat",8,purple,bucket hat
96,26,1,2018,"green,bowler hat",5,green,bowler hat
97,15,11,2018,"black,top hat",7,black,top hat
98,14,9,2019,"green,helmet",4,green,helmet


### Question 6: Unite day, month, and year columns into a column called date using a suitable separator. Then find the date where the most berets were observed.

In [46]:
# Unite day, month and year into one text column, joined with '-' in
# day-month-year order (values are cast to str as-is, so no zero padding).
hat_observations['date'] = (
    hat_observations['day']
    .astype(str)
    .str.cat(
        [
            hat_observations['month'].astype(str),
            hat_observations['year'].astype(str),
        ],
        sep='-',
    )
)

In [64]:
# Total beret observations per date, largest first; the top row is the date
# on which the most berets were observed.
(
    hat_observations
    .loc[lambda obs: obs['hat_type'] == 'beret']
    .groupby(['date', 'hat_type'], as_index=False)
    .agg(hat_count=('observation_count', 'sum'))
    .sort_values(by='hat_count', ascending=False)
)

Unnamed: 0,date,hat_type,hat_count
2,18-6-2018,beret,11
4,25-10-2019,beret,7
9,8-12-2019,beret,7
5,26-5-2018,beret,6
7,4-1-2018,beret,6
1,16-11-2018,beret,5
8,7-12-2019,beret,5
3,20-2-2019,beret,2
6,26-9-2019,beret,2
0,14-1-2018,beret,1
