In [1]:
!pip install joblib



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

# **Load datasets**

In [9]:
def load_data(data_dir='/content/drive/MyDrive/Data'):
    """
    Read the three raw datasets from a single directory.

    Args:
        data_dir (str or os.PathLike): Directory containing
            ``Fraud_Data.csv``, ``IpAddress_to_Country.csv`` and
            ``creditcard.csv.zip``. Defaults to the folder the notebook
            originally hard-coded, so ``load_data()`` behaves as before.

    Returns:
        tuple: ``(fraud_data, ip_address, creditcard_data)`` as three
        ``pd.DataFrame`` objects, in that order.

    Raises:
        FileNotFoundError: If any of the three files is missing from
            ``data_dir`` (raised by ``pd.read_csv``).
    """
    # Local import keeps this cell self-contained without editing the import cell.
    from pathlib import Path

    data_dir = Path(data_dir)

    fraud_data = pd.read_csv(data_dir / 'Fraud_Data.csv')
    ip_address = pd.read_csv(data_dir / 'IpAddress_to_Country.csv')
    # pandas infers zip compression from the file extension and reads the
    # archived CSV directly.
    creditcard_data = pd.read_csv(data_dir / 'creditcard.csv.zip')
    return fraud_data, ip_address, creditcard_data

# Read the three files once and keep each one as its own DataFrame.
fraud_data, ip_address, creditcard_data = load_data()

In [10]:
# Preview the first rows of the fraud dataset to sanity-check the load.
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


# **Data overview**

In [11]:
def data_overview(df):
    """
    Print a short structural overview of a DataFrame.

    Three lines are written to stdout: the row count, the column count,
    and the dtype of every column.

    Args:
        df (pd.DataFrame): The DataFrame to describe.

    Returns:
        None
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    column_types = df.dtypes

    # Build the whole report first, then emit it one line at a time.
    report = (
        f"Number of rows:{n_rows}",
        f"Number of columns:{n_cols}",
        f"Data types of each column:\n{column_types}",
    )
    for line in report:
        print(line)

# Print the row/column counts and per-column dtypes of the fraud dataset.
data_overview(fraud_data)

Number of rows:151112
Number of columns:11
Data types of each column:
user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
dtype: object


# **Handling missing values**

In [12]:
def check_missing(df):
    """
    Count the missing values in each column of a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to inspect.

    Returns:
        pd.Series: Number of null entries per column, indexed by column name.
    """
    null_mask = df.isna()
    return null_mask.sum()

# Count the missing values per column in the fraud dataset.
check_missing(fraud_data)

Unnamed: 0,0
user_id,0
signup_time,0
purchase_time,0
purchase_value,0
device_id,0
source,0
browser,0
sex,0
age,0
ip_address,0


# **Data cleaning**

In [15]:
def data_cleaning(df):
    """
    Deduplicate rows and parse the timestamp columns of ``df`` in place.

    The caller's DataFrame is modified directly: duplicate rows are dropped
    and the ``signup_time`` and ``purchase_time`` columns are converted with
    ``pd.to_datetime``. A progress message is printed after each step.

    Args:
        df (pd.DataFrame): Frame containing ``signup_time`` and
            ``purchase_time`` columns.

    Returns:
        None
    """
    timestamp_columns = ('signup_time', 'purchase_time')

    # Step 1: drop duplicate rows on the caller's frame itself.
    df.drop_duplicates(inplace=True)
    print("Duplicates are removed from fraud data!")

    # Step 2: parse each timestamp column into datetime values.
    for column in timestamp_columns:
        df[column] = pd.to_datetime(df[column])
    print("Timestamps are converted!")
# Clean fraud_data in place: drop duplicate rows and parse the two timestamp columns.
data_cleaning(fraud_data)

Duplicates are removed from fraud data!
Timestamps are converted!


In [23]:
def summarize_dataset(df, numerical_columns):
    """
    Calculates summary statistics for numerical columns in a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to summarize.
        numerical_columns (list or str): A list (or other iterable) of
            numerical column names, or a single column name.

    Returns:
        pd.DataFrame: One row per requested column (indexed by column name)
        with the columns ``Mean``, ``Median``, ``Mode``,
        ``Standard Deviation``, ``Variance``, ``Range``, ``IQR``,
        ``Skewness`` and ``Kurtosis``. For a column with no non-null values
        ``Mode`` is NaN.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    # A bare string is one column name, not an iterable of characters.
    if isinstance(numerical_columns, str):
        numerical_columns = [numerical_columns]
    else:
        # Materialise once: the names are iterated below and then reused as
        # the result index, which a one-shot iterable could not support.
        numerical_columns = list(numerical_columns)

    summary_list = []

    for col in numerical_columns:
        values = df[col]

        # mode() returns an empty Series when the column has no non-null
        # values; indexing it with .iloc[0] would raise IndexError.
        modes = values.mode()
        first_mode = modes.iloc[0] if not modes.empty else np.nan

        summary_list.append({
            'Mean': values.mean(),
            'Median': values.median(),
            'Mode': first_mode,  # first mode when there are several
            'Standard Deviation': values.std(),
            'Variance': values.var(),
            'Range': values.max() - values.min(),
            'IQR': values.quantile(0.75) - values.quantile(0.25),
            'Skewness': values.skew(),
            'Kurtosis': values.kurtosis()
        })

    # One row per column, labelled with the column name.
    return pd.DataFrame(summary_list, index=numerical_columns)

# A single column name can be passed as a plain string; summarize_dataset
# wraps it in a list before computing the statistics.
numerical_columns = 'purchase_value'
summarize_dataset(fraud_data, numerical_columns)

Unnamed: 0,Mean,Median,Mode,Standard Deviation,Variance,Range,IQR,Skewness,Kurtosis
purchase_value,36.935372,35.0,28,18.322762,335.723613,145,27.0,0.670485,0.138292
