# **Hypotesis Testing**

In [None]:
# Load data from Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


The `Chi-square` test of independence tests if there is a **significant relationship** between **two categorical variables**.

In [None]:
# Load libraries
import pandas as pd
from scipy.stats import chi2_contingency

In [None]:
# Read data
path = "the-path-in-gdrive"
data = pd.read_csv(path)

In [None]:
# due created_at actually has timestamp value convert and sort dataframe according timestamp distribution from past to current
# Convert 'created_date' to datetime and extract the date
data['created_at_date'] = pd.to_datetime(data['created_at']).dt.date
data = data.sort_values(by = "created_at").reset_index(drop = True)

In [None]:
# Overview dataset
data

Unnamed: 0,transaction_id,user_id,amount,payment_type,created_at,updated_at,deleted_at,category_vendor_name,vendor_name,jajan_item_name,country,sub_country,day,hour,created_at_date
0,277040,7884,17143,balance,2023-07-03 07:00:14,2023-07-03 07:31:51,,makanan,Sambel Sejahtera,Ayam Goreng Kriuk,Jakarta Pusat,Kemayoran,Monday,7,2023-07-03
1,131303,5098,25283,cash,2023-07-03 07:00:19,2023-07-03 08:13:49,,jasa,Creative Solutions Agency,Desain Grafis Kustom,Jakarta Selatan,Pancoran,Monday,7,2023-07-03
2,299267,6089,44640,cash,2023-07-03 07:00:21,2023-07-03 07:25:54,,alat rumah tangga,Modern Living Appliances,Penghisap Debu Robot,Jakarta Selatan,Kebayoran Lama,Monday,7,2023-07-03
3,75138,7125,30692,balance,2023-07-03 07:01:52,2023-07-03 07:18:44,,elektronik,Inovasi Gadget,Headphone Wireless Premium,Jakarta Barat,Kebon Jeruk,Monday,7,2023-07-03
4,333300,5130,41472,balance,2023-07-03 07:02:29,2023-07-03 08:15:01,,pakaian,Trendy Footwear Boutique,Sepatu Sneakers Urban,Jakarta Pusat,Kemayoran,Monday,7,2023-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437995,39130,6438,34176,balance,2024-07-03 06:56:09,2024-07-03 06:58:50,,alat rumah tangga,Home Essentials Emporium,Vakum Cleaner Kuat,Jakarta Pusat,Senen,Wednesday,6,2024-07-03
437996,240500,7472,14521,cash,2024-07-03 06:57:00,2024-07-03 08:45:29,,pakaian,Tren Mode Terkini,Celana Denim Stylish,Jakarta Barat,Palmerah,Wednesday,6,2024-07-03
437997,73832,7360,39802,balance,2024-07-03 06:57:32,2024-07-03 06:59:11,,pakaian,Koleksi Fashion Elegan,Sepatu Boots Kulit,Jakarta Barat,Palmerah,Wednesday,6,2024-07-03
437998,99775,5651,14367,balance,2024-07-03 06:58:53,2024-07-03 07:34:10,,makanan,Sambel Sejahtera,Ayam Goreng Kriuk,Jakarta Pusat,Senen,Wednesday,6,2024-07-03


In [None]:
# Group by 'created_at_date' and 'day', and count the transactions for each day
daily_transactions = data.groupby(['created_at_date', 'day']).size().reset_index(name='total_transactions')

In [None]:
# Overview new dataframe
daily_transactions

Unnamed: 0,created_at_date,day,total_transactions
0,2023-07-03,Monday,891
1,2023-07-04,Tuesday,1145
2,2023-07-05,Wednesday,1162
3,2023-07-06,Thursday,1208
4,2023-07-07,Friday,1255
...,...,...,...
362,2024-06-29,Saturday,1178
363,2024-06-30,Sunday,1214
364,2024-07-01,Monday,1243
365,2024-07-02,Tuesday,1190


In [None]:
# Create a pivot table with 'day' as columns, 'created_at_date' as index, and 'total_transactions' as values
pivot_table = daily_transactions.pivot(index='created_at_date', columns='day', values='total_transactions')

In [None]:
# Overview pivot_table
pivot_table

day,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
created_at_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-07-03,,891.0,,,,,
2023-07-04,,,,,,1145.0,
2023-07-05,,,,,,,1162.0
2023-07-06,,,,,1208.0,,
2023-07-07,1255.0,,,,,,
...,...,...,...,...,...,...,...
2024-06-29,,,1178.0,,,,
2024-06-30,,,,1214.0,,,
2024-07-01,,1243.0,,,,,
2024-07-02,,,,,,1190.0,


Null Hypothesis (H0): **There is no relationship between the day of the week and the total daily transactions.**

Alternative Hypothesis (H1): **There is a relationship between the day of the week and the total daily transactions.**

The `p-value` is the **probability** that you would have found the current result if the **null hypothesis were true**. If the p-value is less than 0.05, we reject the null hypothesis.

In [None]:
# Perform Chi-Square test of independence
chi2, p, dof, expected = chi2_contingency(pivot_table.fillna(0))

In [None]:
print(f"Chi-square: {chi2}")
print(f"P-value: {p}")

Chi-square: 2628000.0
P-value: 0.0


 In this case, the Chi-square value is quite large, which suggests a **significant difference**.

p-value of 0.0 leads us to reject the null hypothesis. This means **there is a statistically significant relationship between the day of the week and the total daily transactions**.

In other words, the total daily transactions are dependent on what day of the week it is.