Step 1. Ensure that you have the dataset file named `transactions.csv` in the current directory.

The dataset is a subset of https://www.kaggle.com/ealaxi/paysim1/version/2 which was originally generated as part of the following research:

E. A. Lopez-Rojas, A. Elmir, and S. Axelsson. "PaySim: A financial mobile money simulator for fraud detection". In: The 28th European Modeling and Simulation Symposium-EMSS, Larnaca, Cyprus. 2016

Step 2. Complete the following exercises.

0. Read the dataset (`transactions.csv`) as a Pandas dataframe. Note that the first row of the CSV contains the column names.

0. Return the column names as a list from the dataframe.

0. Return the first k rows from the dataframe.

0. Return a random sample of k rows from the dataframe.

0. Return a list of the unique transaction types.

0. Return a Pandas series of the top 10 transaction destinations with frequencies.

0. Return all the rows from the dataframe for which fraud was detected.

0. Bonus. Return a dataframe that contains the number of distinct destinations that each source has interacted with, sorted in descending order. You will find [groupby](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html) and [agg](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html) useful. The predefined aggregate functions are under `pandas.core.groupby.GroupBy.*`. See the [left hand column](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.DataFrameGroupBy.nunique.html).

Use the empty cell to test the exercises. If you modify the original `df`, you can rerun the cell containing `exercise_0`.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

def exercise_0(file):
    """Load the CSV at ``file`` into a dataframe.

    The first row of the CSV is used as the column names (the
    ``pd.read_csv`` default).
    """
    return pd.read_csv(file)

def exercise_1(df):
    """Return the dataframe's column names as a plain Python list."""
    return df.columns.to_list()

def exercise_2(df, k):
    """Return the first ``k`` rows of ``df``."""
    return df.head(n=k)

def exercise_3(df, k):
    """Return ``k`` rows drawn at random from ``df``.

    No random seed is fixed, so repeated calls generally return
    different rows.
    """
    return df.sample(k)

def exercise_4(df):
    """Return the distinct values of the ``type`` column as a list.

    Values appear in order of first occurrence in the dataframe.
    """
    type_column = df['type']
    return type_column.unique().tolist()

def exercise_5(df):
    """Return the 10 most frequent transaction destinations.

    The result is a Series indexed by destination name (``nameDest``)
    whose values are the number of transactions sent to that destination,
    ordered from most to least frequent.
    """
    # Count destinations, not amounts: the exercise asks for the top
    # transaction *destinations* with their frequencies.
    top_10_destinations = df['nameDest'].value_counts().head(10)
    return top_10_destinations

def exercise_6(df):
    """Return the rows of ``df`` whose ``isFraud`` value equals ``True``."""
    is_fraud = df['isFraud'].eq(True)
    return df[is_fraud]

def exercise_7(df):
    """Count the distinct destinations each source account sent to.

    Returns a dataframe with columns ``nameOrig`` and ``nameUnique`` (the
    number of distinct ``nameDest`` values for that source), sorted by
    ``nameUnique`` in descending order.
    """
    per_source = df.groupby('nameOrig')['nameDest'].nunique()
    counts = per_source.reset_index(name='nameUnique')
    return counts.sort_values(by='nameUnique', ascending=False)


def visual_1(df):
    """Draw two bar charts of transaction-type counts and describe them.

    The first chart shows the number of rows per ``type``; the second shows
    the number of rows per (``type``, ``isFraud``) pair. Each bar is
    annotated with its height, the figure is shown, and a short
    description string is returned.
    """
    type_counts = df['type'].value_counts()
    type_fraud_counts = df.groupby(by=['type', 'isFraud']).size()

    fig, axs = plt.subplots(2, figsize=(6, 10))
    panels = (
        (axs[0], type_counts,
         'Transaction Types Frequencies', 'Transaction Type'),
        (axs[1], type_fraud_counts,
         'Transaction Types Frequencies, Split by Fraud', 'Transaction Type, Fraud'),
    )
    for ax, counts, title, xlabel in panels:
        counts.plot(ax=ax, kind='bar')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')

    fig.suptitle('Transaction Types')
    fig.tight_layout(rect=[0, 0.03, 1, 0.95], h_pad=3.0)

    # Label every bar with its count, anchored at the bar's top-left corner.
    for ax in axs:
        for bar in ax.patches:
            height = bar.get_height()
            ax.annotate(height, (bar.get_x(), height))
    plt.show()
    return 'The bar charts provide a visual representation of the distribution of transaction types and how they are split between fraudulent and non-fraudulent transactions.'


def visual_2(df):
    """Scatter-plot origin vs. destination balance deltas for cash-outs.

    For every ``CASH_OUT`` transaction the origin delta
    (``oldbalanceOrg - newbalanceOrig``) is plotted against the destination
    delta (``oldbalanceDest - newbalanceDest``), with both axes limited to
    the range [-1000, 1000]. The caller's dataframe is left unmodified.

    Returns a short description of the chart.
    """
    def query(df):
        # Filter first, then add the delta columns with assign() on the
        # filtered result so no columns are written into the caller's
        # dataframe as a side effect.
        cash_out = df[df['type'] == 'CASH_OUT']
        return cash_out.assign(**{
            'Origin Delta': cash_out['oldbalanceOrg'] - cash_out['newbalanceOrig'],
            'Destination Delta': cash_out['oldbalanceDest'] - cash_out['newbalanceDest'],
        })
    plot = query(df).plot.scatter(x='Origin Delta', y='Destination Delta')
    plot.set_title('Source & Destination Account Balance Delta for Cash Out Transactions')
    plot.set_xlim(left=-1e3, right=1e3)
    plot.set_ylim(bottom=-1e3, top=1e3)
    return 'The scatter plot shows the relationship between the changes in origin and destination account balances for Cash Out transactions. This visualization helps identify patterns or anomalies in the balance changes, which can be useful for detecting fraudulent activities or understanding transaction behaviors.'
    
def exercise_custom(df):
    """Return the mean transaction amount per type, split by fraud status.

    The result is a dataframe indexed by transaction type (index name
    ``'Transaction Type'``) with exactly two columns, ``'Not Fraud'`` and
    ``'Fraud'``. Combinations with no transactions are reported as 0.
    """
    averages = df.groupby(['type', 'isFraud'])['amount'].mean().unstack()
    # Label columns by their value rather than by position, then reindex so
    # both columns exist even when the data contains only one fraud status
    # (assigning a fixed two-item list would raise a length-mismatch error).
    averages = averages.rename(columns=lambda flag: 'Fraud' if flag else 'Not Fraud')
    averages = averages.reindex(columns=['Not Fraud', 'Fraud']).fillna(0)
    # Drop the leftover 'isFraud' axis label so the columns are unnamed.
    averages.columns.name = None
    averages.index.name = 'Transaction Type'
    return averages
    
def visual_custom(df):
    """Plot the table from ``exercise_custom`` as a grouped bar chart.

    Draws one group of bars per transaction type on a 10x6 figure, sets the
    title and axis labels, rotates the x tick labels by 45 degrees, and
    returns a short description of the chart.
    """
    averages = exercise_custom(df)
    _, ax = plt.subplots(figsize=(10, 6))
    averages.plot(kind='bar', ax=ax, colormap='viridis')
    ax.set(
        title='Average Transaction Amount by Type and Fraud Status',
        xlabel='Transaction Type',
        ylabel='Average Amount',
    )
    ax.tick_params(axis='x', rotation=45)

    return 'The bar chart shows the average transaction amount for each transaction type, split by fraud status. This visualization helps identify any significant differences in transaction amounts between fraudulent and non-fraudulent transactions.'

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the transactions dataset; rerun this cell to restore `df` if it has been modified.
df = exercise_0('transactions.csv')

In [None]:
# Test exercises here
# Reload the data so this cell always starts from an unmodified dataframe.
df = exercise_0('transactions.csv')
column_names = exercise_1(df)
k_rows = exercise_2(df, 10)
k_randoms = exercise_3(df, 2)
k_unique = exercise_4(df)
top10_destinations = exercise_5(df)
fraud = exercise_6(df)
sorted_distinct_destinations = exercise_7(df)
average_amount = exercise_custom(df)

# Print every exercise result, in exercise order.
for result in (
    column_names,
    k_rows,
    k_randoms,
    k_unique,
    top10_destinations,
    fraud,
    sorted_distinct_destinations,
    average_amount,
):
    print(result)

# Draw each chart and print the description it returns.
for draw in (visual_1, visual_2, visual_custom):
    print(draw(df))
plt.show()


Create graphs for the following. 
1. Transaction types bar chart, Transaction types split by fraud bar chart
1. Origin account balance delta v. Destination account balance delta scatter plot for Cash Out transactions

Ensure that the graphs have the following:
 - Title
 - Labeled Axes
 
Each function should plot the graph and then return a string containing a short description explaining the relevance of the chart.

In [None]:
def visual_1(df):
    """Draw two bar charts of transaction-type counts and describe them.

    The first chart shows the number of rows per ``type``; the second shows
    the number of rows per (``type``, ``isFraud``) pair. Each bar is
    annotated with its height, the figure is shown, and a short
    description string is returned.
    """
    type_counts = df['type'].value_counts()
    type_fraud_counts = df.groupby(by=['type', 'isFraud']).size()

    fig, axs = plt.subplots(2, figsize=(6, 10))
    panels = (
        (axs[0], type_counts,
         'Transaction Types Frequencies', 'Transaction Type'),
        (axs[1], type_fraud_counts,
         'Transaction Types Frequencies, Split by Fraud', 'Transaction Type, Fraud'),
    )
    for ax, counts, title, xlabel in panels:
        counts.plot(ax=ax, kind='bar')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')

    fig.suptitle('Transaction Types')
    fig.tight_layout(rect=[0, 0.03, 1, 0.95], h_pad=3.0)

    # Label every bar with its count, anchored at the bar's top-left corner.
    for ax in axs:
        for bar in ax.patches:
            height = bar.get_height()
            ax.annotate(height, (bar.get_x(), height))
    plt.show()
    return 'The bar charts provide a visual representation of the distribution of transaction types and how they are split between fraudulent and non-fraudulent transactions.'


In [None]:
def visual_2(df):
    """Scatter-plot origin vs. destination balance deltas for cash-outs.

    For every ``CASH_OUT`` transaction the origin delta
    (``oldbalanceOrg - newbalanceOrig``) is plotted against the destination
    delta (``oldbalanceDest - newbalanceDest``), with both axes limited to
    the range [-1000, 1000]. The caller's dataframe is left unmodified.

    Returns a short description of the chart.
    """
    def query(df):
        # Filter first, then add the delta columns with assign() on the
        # filtered result so no columns are written into the caller's
        # dataframe as a side effect.
        cash_out = df[df['type'] == 'CASH_OUT']
        return cash_out.assign(**{
            'Origin Delta': cash_out['oldbalanceOrg'] - cash_out['newbalanceOrig'],
            'Destination Delta': cash_out['oldbalanceDest'] - cash_out['newbalanceDest'],
        })
    plot = query(df).plot.scatter(x='Origin Delta', y='Destination Delta')
    plot.set_title('Source & Destination Account Balance Delta for Cash Out Transactions')
    plot.set_xlim(left=-1e3, right=1e3)
    plot.set_ylim(bottom=-1e3, top=1e3)
    return 'The scatter plot shows the relationship between the changes in origin and destination account balances for Cash Out transactions. This visualization helps identify patterns or anomalies in the balance changes, which can be useful for detecting fraudulent activities or understanding transaction behaviors.'

Use your newly-gained Pandas skills to find an insight from the dataset. You have full flexibility to go in whichever direction interests you. Please create a visual as above for this query. `visual_custom` should call `exercise_custom`.

In [None]:
def exercise_custom(df):
    """Return the mean transaction amount per type, split by fraud status.

    The result is a dataframe indexed by transaction type (index name
    ``'Transaction Type'``) with exactly two columns, ``'Not Fraud'`` and
    ``'Fraud'``. Combinations with no transactions are reported as 0.
    """
    averages = df.groupby(['type', 'isFraud'])['amount'].mean().unstack()
    # Label columns by their value rather than by position, then reindex so
    # both columns exist even when the data contains only one fraud status
    # (assigning a fixed two-item list would raise a length-mismatch error).
    averages = averages.rename(columns=lambda flag: 'Fraud' if flag else 'Not Fraud')
    averages = averages.reindex(columns=['Not Fraud', 'Fraud']).fillna(0)
    # Drop the leftover 'isFraud' axis label so the columns are unnamed.
    averages.columns.name = None
    averages.index.name = 'Transaction Type'
    return averages
    
def visual_custom(df):
    """Plot the table from ``exercise_custom`` as a grouped bar chart.

    Draws one group of bars per transaction type on a 10x6 figure, sets the
    title and axis labels, rotates the x tick labels by 45 degrees, and
    returns a short description of the chart.
    """
    averages = exercise_custom(df)
    _, ax = plt.subplots(figsize=(10, 6))
    averages.plot(kind='bar', ax=ax, colormap='viridis')
    ax.set(
        title='Average Transaction Amount by Type and Fraud Status',
        xlabel='Transaction Type',
        ylabel='Average Amount',
    )
    ax.tick_params(axis='x', rotation=45)

    return 'The bar chart shows the average transaction amount for each transaction type, split by fraud status. This visualization helps identify any significant differences in transaction amounts between fraudulent and non-fraudulent transactions.'

Submission

1. Copy the exercises into `task1.py`.
2. Upload `task1.py` to Forage.

All done!

Your work will be instrumental for our team's continued success.