# category 카이제곱 검정

### **<구하고 싶은 것>**
**2024-01-11 이전 유저와 이후 유저의 TOP3 카테고리, 브랜드 구매 비율이 얼마나 동질적인지**  

#### 데이터 가져오기

In [13]:
import pandas as pd
import numpy as np

from scipy.stats import chi2_contingency

In [14]:
df = pd.read_csv('./looker.csv')

  df = pd.read_csv('./looker.csv')


In [15]:
pd.set_option('display.max_columns', None)
df.head(1)

Unnamed: 0,user_id,age,gender,account_state,account_city,account_country,latitude,longitude,account_traffic_source,account_created_at,order_id,order_items_id,status,sale_price,order_created_at,session_id,sequence_number,login_user_id,sess_created_at,sess_traffic_source,uri,event_type,ip_address,sess_state,sess_city,inventory_item_id,product_id,category,brand,product_name,inventory_created_at,inventory_sold_at,cost
0,73890.0,40.0,F,California,San Francisco,United States,37.795407,-122.422234,Search,2023-05-09,92631.0,134592.0,Complete,54.950001,2023-09-13,,,,,,,,,,,363366.0,5672.0,Pants & Capris,Eddie Bauer,Eddie Bauer Curvy Blakely Legend Wash Pants,2023-02-03,,29.28835


#### 형 변환

In [16]:
# 형변환
def convert_type(df):
    for col in df.columns:
        if '_at' in col:  # 시간 데이터
            df = df.assign(**{col:pd.to_datetime(df[col])})
            print('[+] {0:<30} >>> datetime'.format(col))
        elif '_id' in col:  # id 컬럼 -> obj형으로
            df = df.assign(**{col:df[col].astype('object')})
            print('[+] {0:<30} >>> object'.format(col))
    return df

In [17]:
df = convert_type(df)

[+] user_id                        >>> object
[+] account_created_at             >>> datetime
[+] order_id                       >>> object
[+] order_items_id                 >>> object
[+] order_created_at               >>> datetime
[+] session_id                     >>> object
[+] login_user_id                  >>> object
[+] sess_created_at                >>> datetime
[+] inventory_item_id              >>> object
[+] product_id                     >>> object
[+] inventory_created_at           >>> datetime
[+] inventory_sold_at              >>> datetime


#### 실험군(A), 대조군(B) 나누기
실험군 : 2024-01-11 이후 유저  
대조군 : 2024-01-11 이전 유저

In [18]:
# 실험군: 2024 신규 유입 유저
treatment = df[df['account_created_at'] >= '2024-01-11'].copy()
display(treatment.head(1))

# 대조군: 기존 유저
control = df[df['account_created_at'] < '2024-01-11'].copy()
display(control.head(1))

Unnamed: 0,user_id,age,gender,account_state,account_city,account_country,latitude,longitude,account_traffic_source,account_created_at,order_id,order_items_id,status,sale_price,order_created_at,session_id,sequence_number,login_user_id,sess_created_at,sess_traffic_source,uri,event_type,ip_address,sess_state,sess_city,inventory_item_id,product_id,category,brand,product_name,inventory_created_at,inventory_sold_at,cost
1711,65932.0,35.0,F,Ningxia Hui Autonomous Region,Rizhao,China,37.990152,106.165456,Organic,2024-01-15,82634.0,119972.0,Complete,49.990002,2024-01-16,43f18013-17af-4eeb-934a-7dce5eef8398,10.0,65932.0,2024-01-16,Adwords,/purchase,purchase,79.86.248.35,Ningxia Hui Autonomous Region,Rizhao,323796.0,1637.0,Fashion Hoodies & Sweatshirts,Ed Hardy,Ed Hardy Womens Rope & Tiger Hoodie - Mustard,2021-11-04,NaT,25.044991


Unnamed: 0,user_id,age,gender,account_state,account_city,account_country,latitude,longitude,account_traffic_source,account_created_at,order_id,order_items_id,status,sale_price,order_created_at,session_id,sequence_number,login_user_id,sess_created_at,sess_traffic_source,uri,event_type,ip_address,sess_state,sess_city,inventory_item_id,product_id,category,brand,product_name,inventory_created_at,inventory_sold_at,cost
0,73890.0,40.0,F,California,San Francisco,United States,37.795407,-122.422234,Search,2023-05-09,92631.0,134592.0,Complete,54.950001,2023-09-13,,,,NaT,,,,,,,363366.0,5672.0,Pants & Capris,Eddie Bauer,Eddie Bauer Curvy Blakely Legend Wash Pants,2023-02-03,NaT,29.28835


#### 전체 유저 수

In [19]:
print(
    f'전체 유저 수 : {df.user_id.nunique(dropna=True)}명\n\
실험군 (treatment) : {treatment.user_id.nunique()}명\n\
대조군 (control) : {control.user_id.nunique()}명'
    )

전체 유저 수 : 75608명
실험군 (treatment) : 2108명
대조군 (control) : 73500명


## 1. Category

### Category TOP3 (user)

In [26]:
# 2024-01-11 이후 주문량 TOP3
#df[df.order_created_at >= '2024-01-11'].category.value_counts()[:3].index
category_top3 = treatment.groupby('category')['order_items_id'].nunique().sort_values(ascending=False)[:3].index.to_list()
category_top3

['Intimates', 'Fashion Hoodies & Sweatshirts', 'Jeans']

##### 카이 제곱 검정

In [27]:
print('[Chi-square Analysis Result Report]\n------------------------------------')
for i,category_name in enumerate(category_top3, start=1):

    test = treatment[treatment.category==category_name].user_id.nunique()
    ctrl = control[control.category==category_name].user_id.nunique()

    ## Part1. Chi-square Test Report
    purchase_category = [test, ctrl] # A와 B의 특정 category를 구매한 유저 수
    unpurchase_category = [treatment.user_id.nunique() - test, 
                          control.user_id.nunique() - ctrl]  # A와 B의 특정 category를 구매하지 않은 유저 수
    
    ## Part2. Contingency Table & Chi-squaure Model
    ## 기초 테이블 형성
    cont_table = pd.DataFrame([purchase_category, unpurchase_category], columns=['treatment', 'control'], index=[f'purchase_{category_name}', f'unpurchase_{category_name}'])
    
    ## Part3. 카이제곱 독립성 검정 모델 선언
    chi2, p_val, _, _= chi2_contingency([purchase_category, unpurchase_category])
    expected = chi2_contingency([purchase_category, unpurchase_category])[3]
    
    ## 기대값 표 형성
    ex = pd.DataFrame(expected, columns = ['treatment', 'control'], index = [f'purchase_{category_name}', f'unpurchase_{category_name}'])

    print(
      'Category {}: {}'.format(i, category_name)
    , 'Chi-square: {}'.format(round(chi2, 2))
    , 'P-value: {}'.format(round(p_val, 2))
    , '--------------------------'
    , 'Expected Values'
    , ex
    , '--------------------------'
    , 'Observed Values'
    , cont_table
    ,  '=========================='
    , sep = '\n'
    )

[Chi-square Analysis Result Report]
------------------------------------
Category 1: Intimates
Chi-square: 1.5
P-value: 0.22
--------------------------
Expected Values
                        treatment       control
purchase_Intimates     159.839752   5573.160248
unpurchase_Intimates  1948.160248  67926.839752
--------------------------
Observed Values
                      treatment  control
purchase_Intimates          175     5558
unpurchase_Intimates       1933    67942
Category 2: Fashion Hoodies & Sweatshirts
Chi-square: 2.54
P-value: 0.11
--------------------------
Expected Values
                                            treatment       control
purchase_Fashion Hoodies & Sweatshirts     146.178235   5096.821765
unpurchase_Fashion Hoodies & Sweatshirts  1961.821765  68403.178235
--------------------------
Observed Values
                                          treatment  control
purchase_Fashion Hoodies & Sweatshirts          165     5078
unpurchase_Fashion Hoodies & Sweatshi

### ( X ) Category TOP3 (order_items_id)

In [28]:
# 2024-01-11 이후 주문량 TOP3
category_top3 = df[df.order_created_at >= '2024-01-11'].category.value_counts()[:3].index
category_top3

Index(['Intimates', 'Fashion Hoodies & Sweatshirts', 'Jeans'], dtype='object', name='category')

In [29]:
# 전체 category 주문량 TOP3
category_order_top3 = treatment.groupby('category')['order_items_id'].nunique().sort_values(ascending=False)[:3].index.to_list()
category_order_top3

['Intimates', 'Fashion Hoodies & Sweatshirts', 'Jeans']

In [30]:
print('[Chi-square Analysis Result Report]\n------------------------------------')
for i,category_name in enumerate(category_order_top3, start=1):

    test = treatment[treatment.category==category_name].order_items_id.nunique()
    ctrl = control[control.category==category_name].order_items_id.nunique()

    ## Part1. Chi-square Test Report
    purchase_category1 = [test, ctrl] # A와 B의 특정 category를 구매한 주문 수
    unpurchase_category1 = [treatment.order_items_id.nunique() - test, 
                          control.order_items_id.nunique() - ctrl]  # A와 B의 특정 category를 구매하지 않은 주문 수
    
    ## Part2. Contingency Table & Chi-squaure Model
    ## 기초 테이블 형성
    cont_table = pd.DataFrame([purchase_category1, unpurchase_category1], columns=['treatment', 'control'], index=[f'purchase_{category_name}', f'unpurchase_{category_name}'])
    
    ## Part3. 카이제곱 독립성 검정 모델 선언
    chi2, p_val, _, _= chi2_contingency([purchase_category1, unpurchase_category1])
    expected = chi2_contingency([purchase_category1, unpurchase_category1])[3]
    
    ## 기대값 표 형성
    ex = pd.DataFrame(expected, columns = ['treatment', 'control'], index = [f'purchase_{category_name}', f'unpurchase_{category_name}'])

    print(
      'Category {}: {}'.format(i, category_name)
    , 'Chi-square: {}'.format(round(chi2, 2))
    , 'P-value: {}'.format(round(p_val, 2))
    , '--------------------------'
    , 'Expected Values'
    , ex
    , '--------------------------'
    , 'Observed Values'
    , cont_table
    ,  '=========================='
    , sep = '\n'
    )

[Chi-square Analysis Result Report]
------------------------------------
Category 1: Intimates
Chi-square: 0.8
P-value: 0.37
--------------------------
Expected Values
                       treatment      control
purchase_Intimates     174.33432   5962.66568
unpurchase_Intimates  2166.66568  74105.33432
--------------------------
Observed Values
                      treatment  control
purchase_Intimates          186     5951
unpurchase_Intimates       2155    74117
Category 2: Fashion Hoodies & Sweatshirts
Chi-square: 1.28
P-value: 0.26
--------------------------
Expected Values
                                            treatment       control
purchase_Fashion Hoodies & Sweatshirts     154.136878   5271.863122
unpurchase_Fashion Hoodies & Sweatshirts  2186.863122  74796.136878
--------------------------
Observed Values
                                          treatment  control
purchase_Fashion Hoodies & Sweatshirts          168     5258
unpurchase_Fashion Hoodies & Sweatshirts   

## 2. Brand

In [None]:
# 전체 brand 주문량 TOP5
order_total_brand = df.groupby('brand').agg({'order_items_id':'nunique', 'category':'unique'}).reset_index().sort_values(by='order_items_id', ascending=False)[:5]
order_total_brand

Unnamed: 0,brand,order_items_id,category
96,Allegra K,2896,"[Fashion Hoodies & Sweatshirts, Shorts, Outerw..."
447,Calvin Klein,1477,"[Dresses, Sweaters, Underwear, Intimates, Oute..."
465,Carhartt,1166,"[Outerwear & Coats, Fashion Hoodies & Sweatshi..."
2583,Volcom,854,"[Accessories, Shorts, Fashion Hoodies & Sweats..."
1721,Nautica,822,"[Shorts, Jeans, Sleep & Lounge, Sweaters, Pant..."


In [None]:
# 2024-01-11 이전 주문량 TOP5
order_control_brand = control.groupby('brand').agg({'order_items_id':'nunique', 'category':'unique'}).reset_index().sort_values(by='order_items_id', ascending=False)[:5]
order_control_brand

Unnamed: 0,brand,order_items_id,category
94,Allegra K,2821,"[Fashion Hoodies & Sweatshirts, Shorts, Outerw..."
439,Calvin Klein,1433,"[Dresses, Sweaters, Underwear, Outerwear & Coa..."
455,Carhartt,1125,"[Outerwear & Coats, Fashion Hoodies & Sweatshi..."
2525,Volcom,830,"[Accessories, Shorts, Fashion Hoodies & Sweats..."
1059,Hanes,798,"[Underwear, Socks, Socks & Hosiery, Active, In..."


In [None]:
# 2024-01-11 이후 주문량 TOP5
order_test_brand = treatment.groupby('brand').agg({'order_items_id':'count', 'category':'unique'}).reset_index().sort_values(by='order_items_id', ascending=False)[:5]
order_test_brand

Unnamed: 0,brand,order_items_id,category
33,Allegra K,131,"[Dresses, Tops & Tees, Blazers & Jackets, Pant..."
142,Calvin Klein,81,"[Intimates, Tops & Tees, Pants, Fashion Hoodie..."
151,Carhartt,70,"[Jeans, Fashion Hoodies & Sweatshirts, Outerwe..."
551,Nautica,63,"[Sleep & Lounge, Socks, Suits & Sport Coats, U..."
749,Tommy Hilfiger,54,"[Outerwear & Coats, Accessories, Shorts, Sweat..."


### Brand TOP3 (user_id)

In [33]:
# 2024-01-11 이후 주문량 TOP3
# df[df.order_created_at >= '2024-01-11'].brand.value_counts()[:3].index
Brand_top3 = treatment.groupby('brand')['order_items_id'].nunique().sort_values(ascending=False)[:3].index.to_list()
Brand_top3

['Allegra K', 'Calvin Klein', 'Carhartt']

##### 카이 제곱 검정

In [34]:
print('[Chi-square Analysis Result Report]\n------------------------------------')
for i,brand_name in enumerate(Brand_top3, start=1):

    test = treatment[treatment.brand==brand_name].user_id.nunique()
    ctrl = control[control.brand==brand_name].user_id.nunique()

    ## Part1. Chi-square Test Report
    purchase_brand = [test, ctrl] # A와 B의 특정 brand를 구매한 유저 수
    unpurchase_brand = [treatment.user_id.nunique() - test, 
                        control.user_id.nunique() - ctrl]  # A와 B의 특정 brand를 구매하지 않은 유저 수
    
    ## Part2. Contingency Table & Chi-squaure Model
    ## 기초 테이블 형성
    cont_table = pd.DataFrame([purchase_brand, unpurchase_brand], columns=['treatment', 'control'], index=[f'purchase_{brand_name}', f'unpurchase_{brand_name}'])
    
    ## Part3. 카이제곱 독립성 검정 모델 선언
    chi2, p_val, _, _= chi2_contingency([purchase_brand, unpurchase_brand])
    expected = chi2_contingency([purchase_brand, unpurchase_brand])[3]
    
    ## 기대값 표 형성
    ex = pd.DataFrame(expected, columns = ['treatment', 'control'], index = [f'purchase_{brand_name}', f'unpurchase_{brand_name}'])

    print(
      'Brand {}: {}'.format(i, brand_name)
    , 'Chi-square: {}'.format(round(chi2, 2))
    , 'P-value: {}'.format(round(p_val, 2))
    , '--------------------------'
    , 'Expected Values'
    , ex
    , '--------------------------'
    , 'Observed Values'
    , cont_table
    ,  '=========================='
    , sep = '\n'
    )

[Chi-square Analysis Result Report]
------------------------------------
Brand 1: Allegra K
Chi-square: 0.2
P-value: 0.66
--------------------------
Expected Values
                        treatment       control
purchase_Allegra K      79.348323   2766.651677
unpurchase_Allegra K  2028.651677  70733.348323
--------------------------
Observed Values
                      treatment  control
purchase_Allegra K           75     2771
unpurchase_Allegra K       2033    70729
Brand 2: Calvin Klein
Chi-square: 0.08
P-value: 0.77
--------------------------
Expected Values
                           treatment       control
purchase_Calvin Klein      40.705745   1419.294255
unpurchase_Calvin Klein  2067.294255  72080.705745
--------------------------
Observed Values
                         treatment  control
purchase_Calvin Klein           43     1417
unpurchase_Calvin Klein       2065    72083
Brand 3: Carhartt
Chi-square: 1.77
P-value: 0.18
--------------------------
Expected Values
         

### ( X ) Brand TOP3 (order_items_id)

In [35]:
# 2024-01-11 이후 주문량 TOP5
Brand_top3 = df[df.order_created_at >= '2024-01-11'].brand.value_counts()[:3].index
Brand_top3

Index(['Allegra K', 'Calvin Klein', 'Carhartt'], dtype='object', name='brand')

In [36]:
# 전체 category 주문량 TOP3
brand_order_top3 = treatment.groupby('brand')['order_items_id'].nunique().sort_values(ascending=False)[:3].index.to_list()
brand_order_top3

['Allegra K', 'Calvin Klein', 'Carhartt']

In [37]:
print('[Chi-square Analysis Result Report]\n------------------------------------')
for i,brand_name in enumerate(brand_order_top3, start=1):

    test = treatment[treatment.brand==brand_name].order_items_id.nunique()
    ctrl = control[control.brand==brand_name].order_items_id.nunique()

    ## Part1. Chi-square Test Report
    purchase_brand1 = [test, ctrl] # A와 B의 특정 category를 구매한 주문 수
    unpurchase_brand1 = [treatment.order_items_id.nunique() - test, 
                          control.order_items_id.nunique() - ctrl]  # A와 B의 특정 category를 구매하지 않은 주문 수
    
    ## Part2. Contingency Table & Chi-squaure Model
    ## 기초 테이블 형성
    cont_table = pd.DataFrame([purchase_brand1, unpurchase_brand1], columns=['treatment', 'control'], index=[f'purchase_{brand_name}', f'unpurchase_{brand_name}'])
    
    ## Part3. 카이제곱 독립성 검정 모델 선언
    chi2, p_val, _, _= chi2_contingency([purchase_brand1, unpurchase_brand1])
    expected = chi2_contingency([purchase_brand1, unpurchase_brand1])[3]
    
    ## 기대값 표 형성
    ex = pd.DataFrame(expected, columns = ['treatment', 'control'], index = [f'purchase_{brand_name}', f'unpurchase_{brand_name}'])

    print(
      'Brand {}: {}'.format(i, brand_name)
    , 'Chi-square: {}'.format(round(chi2, 2))
    , 'P-value: {}'.format(round(p_val, 2))
    , '--------------------------'
    , 'Expected Values'
    , ex
    , '--------------------------'
    , 'Observed Values'
    , cont_table
    ,  '=========================='
    , sep = '\n'
    )

[Chi-square Analysis Result Report]
------------------------------------
Brand 1: Allegra K
Chi-square: 0.59
P-value: 0.44
--------------------------
Expected Values
                        treatment       control
purchase_Allegra K      82.266937   2813.733063
unpurchase_Allegra K  2258.733063  77254.266937
--------------------------
Observed Values
                      treatment  control
purchase_Allegra K           75     2821
unpurchase_Allegra K       2266    77247
Brand 2: Calvin Klein
Chi-square: 0.06
P-value: 0.81
--------------------------
Expected Values
                           treatment       control
purchase_Calvin Klein      41.957274   1435.042726
unpurchase_Calvin Klein  2299.042726  78632.957274
--------------------------
Observed Values
                         treatment  control
purchase_Calvin Klein           44     1433
unpurchase_Calvin Klein       2297    78635
Brand 3: Carhartt
Chi-square: 1.72
P-value: 0.19
--------------------------
Expected Values
        