cluster columns before deleting currency

In [53]:
# Import necessary libraries and modules
import pandas as pd
import numpy as np

# Read the raw CSV (low_memory=False parses the file in a single pass, avoiding
# chunk-wise mixed-type inference) and lower-case every column name.
df = pd.read_csv("anon_data.csv", low_memory=False).rename(columns=str.lower)


In [54]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1707 entries, 0 to 1706
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   account           1707 non-null   object
 1   amount            1707 non-null   object
 2   currency          1707 non-null   object
 3   direction         1707 non-null   object
 4   cash in/cash out  1707 non-null   object
 5   effective_date    1707 non-null   object
 6   description       1707 non-null   object
 7   label             1707 non-null   object
dtypes: object(8)
memory usage: 106.8+ KB


In [55]:
# Display the schema, the first few rows of the dataset, and a basic statistical summary.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())
print(df.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1707 entries, 0 to 1706
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   account           1707 non-null   object
 1   amount            1707 non-null   object
 2   currency          1707 non-null   object
 3   direction         1707 non-null   object
 4   cash in/cash out  1707 non-null   object
 5   effective_date    1707 non-null   object
 6   description       1707 non-null   object
 7   label             1707 non-null   object
dtypes: object(8)
memory usage: 106.8+ KB
None
  account    amount currency direction cash in/cash out effective_date  \
0     a 1  1,814.39      USD    credit          Cash in     2023-02-28   
1     a 1     16.00      USD     debit         Cash out     2023-02-28   
2     a 1     16.00      USD     debit         Cash out     2023-02-28   
3     a 1    246.75      USD     debit         Cash out     2023-02-28   
4     a 1      4.

In [56]:
# Frequency of each value in the 'label' column, most common first.
df['label'].value_counts()

label
f    706
d    415
b    404
a     84
c     76
e     22
Name: count, dtype: int64

In [57]:
# Strip thousands separators (and any dollar signs) from 'amount', then cast to float.
# The pattern is a raw string so the backslash reaches the regex engine unchanged
# instead of being treated as an invalid Python string escape (which triggers a
# DeprecationWarning / SyntaxWarning on recent Python versions).
df['amount'] = df['amount'].replace(r'[\$,]', '', regex=True).astype(float)

# Display the first few rows to verify the transformation
df_head_transformed_amount = df.head()
df_head_transformed_amount



Unnamed: 0,account,amount,currency,direction,cash in/cash out,effective_date,description,label
0,a 1,1814.39,USD,credit,Cash in,2023-02-28,cbl hp bul adg,a
1,a 1,16.0,USD,debit,Cash out,2023-02-28,amb asg cbe cbr avd cpf,b
2,a 1,16.0,USD,debit,Cash out,2023-02-28,amb asg cbe cbr avd bxa,b
3,a 1,246.75,USD,debit,Cash out,2023-02-28,amb asg ub dp avd cgb,b
4,a 1,4.8,USD,debit,Cash out,2023-02-28,amb asg ip bip avd xl,b


In [58]:
# Parse 'effective_date' once, store it back as datetime, and derive the
# weekday name, month number and year from the parsed values.
parsed_dates = pd.to_datetime(df['effective_date'])
df['effective_date'] = parsed_dates
df['day_of_week'] = parsed_dates.dt.day_name()
df['month'] = parsed_dates.dt.month
df['year'] = parsed_dates.dt.year

# Preview the first rows to check the new columns.
df_head_transformed_date = df.head()
df_head_transformed_date



Unnamed: 0,account,amount,currency,direction,cash in/cash out,effective_date,description,label,day_of_week,month,year
0,a 1,1814.39,USD,credit,Cash in,2023-02-28,cbl hp bul adg,a,Tuesday,2,2023
1,a 1,16.0,USD,debit,Cash out,2023-02-28,amb asg cbe cbr avd cpf,b,Tuesday,2,2023
2,a 1,16.0,USD,debit,Cash out,2023-02-28,amb asg cbe cbr avd bxa,b,Tuesday,2,2023
3,a 1,246.75,USD,debit,Cash out,2023-02-28,amb asg ub dp avd cgb,b,Tuesday,2,2023
4,a 1,4.8,USD,debit,Cash out,2023-02-28,amb asg ip bip avd xl,b,Tuesday,2,2023


In [59]:
# Count the distinct values in each column.
df.nunique()


account                8
amount              1377
currency               1
direction              2
cash in/cash out       2
effective_date        19
description         1495
label                  6
day_of_week            5
month                  1
year                   1
dtype: int64

In [60]:
# Display the frame, including the date-derived columns.
df

Unnamed: 0,account,amount,currency,direction,cash in/cash out,effective_date,description,label,day_of_week,month,year
0,a 1,1814.39,USD,credit,Cash in,2023-02-28,cbl hp bul adg,a,Tuesday,2,2023
1,a 1,16.00,USD,debit,Cash out,2023-02-28,amb asg cbe cbr avd cpf,b,Tuesday,2,2023
2,a 1,16.00,USD,debit,Cash out,2023-02-28,amb asg cbe cbr avd bxa,b,Tuesday,2,2023
3,a 1,246.75,USD,debit,Cash out,2023-02-28,amb asg ub dp avd cgb,b,Tuesday,2,2023
4,a 1,4.80,USD,debit,Cash out,2023-02-28,amb asg ip bip avd xl,b,Tuesday,2,2023
...,...,...,...,...,...,...,...,...,...,...,...
1702,a 3,1685.50,USD,debit,Cash out,2023-02-01,amb asg avh nq cih cne,b,Wednesday,2,2023
1703,a 3,952.58,USD,debit,Cash out,2023-02-01,yg bcc aqh,c,Wednesday,2,2023
1704,a 3,343.00,USD,debit,Cash out,2023-02-01,yg kr bel,c,Wednesday,2,2023
1705,a 3,7577.70,USD,credit,Cash in,2023-02-01,nj cdc ha cih alk,d,Wednesday,2,2023


In [61]:
# Remove the columns that are not carried forward into the processed output.
# NOTE(review): the drop happens in place, so re-running this cell raises a
# KeyError because the columns are already gone.
df.drop(
    columns=['month', 'year', 'cash in/cash out', 'effective_date', 'currency'],
    inplace=True,
)

In [62]:
# Derive 'bank' as the part of 'account' before the first space (e.g. "a 1" -> "a").
# The vectorised .str accessor replaces a per-row Python lambda and propagates
# missing values as NaN instead of raising AttributeError on non-string entries.
df['bank'] = df['account'].str.split(" ", n=1).str[0]


In [63]:
# Display the frame to check the new 'bank' column.
df

Unnamed: 0,account,amount,direction,description,label,day_of_week,bank
0,a 1,1814.39,credit,cbl hp bul adg,a,Tuesday,a
1,a 1,16.00,debit,amb asg cbe cbr avd cpf,b,Tuesday,a
2,a 1,16.00,debit,amb asg cbe cbr avd bxa,b,Tuesday,a
3,a 1,246.75,debit,amb asg ub dp avd cgb,b,Tuesday,a
4,a 1,4.80,debit,amb asg ip bip avd xl,b,Tuesday,a
...,...,...,...,...,...,...,...
1702,a 3,1685.50,debit,amb asg avh nq cih cne,b,Wednesday,a
1703,a 3,952.58,debit,yg bcc aqh,c,Wednesday,a
1704,a 3,343.00,debit,yg kr bel,c,Wednesday,a
1705,a 3,7577.70,credit,nj cdc ha cih alk,d,Wednesday,a


In [64]:
# Replace 'amount' with its natural logarithm.
# NOTE(review): the column is overwritten, so running this cell a second time
# applies the log again. np.log yields -inf for 0 and NaN for negative inputs;
# this assumes every amount is strictly positive - TODO confirm.
df['amount'] = np.log(df['amount'])

In [65]:
# Inspect the log-transformed 'amount' values.
df.amount

0       7.503505
1       2.772589
2       2.772589
3       5.508376
4       1.568616
          ...   
1702    7.429818
1703    6.859174
1704    5.837730
1705    8.932965
1706    6.849925
Name: amount, Length: 1707, dtype: float64

In [66]:
# Display the frame with the log-scaled 'amount' column.
df

Unnamed: 0,account,amount,direction,description,label,day_of_week,bank
0,a 1,7.503505,credit,cbl hp bul adg,a,Tuesday,a
1,a 1,2.772589,debit,amb asg cbe cbr avd cpf,b,Tuesday,a
2,a 1,2.772589,debit,amb asg cbe cbr avd bxa,b,Tuesday,a
3,a 1,5.508376,debit,amb asg ub dp avd cgb,b,Tuesday,a
4,a 1,1.568616,debit,amb asg ip bip avd xl,b,Tuesday,a
...,...,...,...,...,...,...,...
1702,a 3,7.429818,debit,amb asg avh nq cih cne,b,Wednesday,a
1703,a 3,6.859174,debit,yg bcc aqh,c,Wednesday,a
1704,a 3,5.837730,debit,yg kr bel,c,Wednesday,a
1705,a 3,8.932965,credit,nj cdc ha cih alk,d,Wednesday,a


In [67]:
# Persist the processed frame to CSV; index=False leaves the row index out of the file.
df.to_csv(
    "processed_data.csv",
    index=False,
)


In [68]:
# Display the frame as it was written to 'processed_data.csv'.
df

Unnamed: 0,account,amount,direction,description,label,day_of_week,bank
0,a 1,7.503505,credit,cbl hp bul adg,a,Tuesday,a
1,a 1,2.772589,debit,amb asg cbe cbr avd cpf,b,Tuesday,a
2,a 1,2.772589,debit,amb asg cbe cbr avd bxa,b,Tuesday,a
3,a 1,5.508376,debit,amb asg ub dp avd cgb,b,Tuesday,a
4,a 1,1.568616,debit,amb asg ip bip avd xl,b,Tuesday,a
...,...,...,...,...,...,...,...
1702,a 3,7.429818,debit,amb asg avh nq cih cne,b,Wednesday,a
1703,a 3,6.859174,debit,yg bcc aqh,c,Wednesday,a
1704,a 3,5.837730,debit,yg kr bel,c,Wednesday,a
1705,a 3,8.932965,credit,nj cdc ha cih alk,d,Wednesday,a
