In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import ast
from scipy.stats import ttest_ind
import seaborn as sns
import time
import networkx as nx
import warnings


# SettingWithCopyWarning 무시
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# 모든 출력을 보여주도록 설정
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


In [2]:
parquet_file = 'C:/Users/jjccyy9741/OneDrive - postech.ac.kr/바탕 화면/LG/데이터/postech_company_project.parquet'
# Parquet 파일 열기
table = pq.read_table(parquet_file)

# Table에서 데이터를 DataFrame으로 변환
data = table.to_pandas()
print(data.shape)

(13881284, 71)


In [3]:
df = data.copy()
df = df[['PREV_RECEIPT_NUMBER','RECEIPT_NUMBER']]
print(df.shape)

(13881284, 2)


In [4]:
df11 = df.copy()

In [39]:
n = 50000

df12 = df11.head(n)

# 시작 시간 기록
start_time = time.time()

print('데이터 크기(n):', n)
print(' ')

# 네트워크 그래프 생성
G = nx.Graph()
for _, row in df12.iterrows():
    if row['PREV_RECEIPT_NUMBER'] is not None:
        G.add_edge(row['PREV_RECEIPT_NUMBER'], row['RECEIPT_NUMBER'])

# 같은 그룹 찾기
connected_components = list(nx.connected_components(G))

# 결과 출력
result_df12 = pd.DataFrame()
if not connected_components:
    # 시퀀스가 하나도 없는 경우
    result_df12 = df12.copy()
    result_df12['group'] = 0
else:
    for idx, component in enumerate(connected_components):
        matching_group = df12[df12['RECEIPT_NUMBER'].isin(component)].copy()  # .copy() 추가
        matching_group['group'] = idx + 1
        result_df12 = pd.concat([result_df12, matching_group], ignore_index=True)

    # 'group' 컬럼으로 정렬
    result_df12 = result_df12.sort_values(by='group')

# 종료 시간 기록
end_time = time.time()
print('실행 시간:', end_time - start_time)
print(' ')

# 빈도가 2 이상인 값을 출력
seq_group = result_df12['group'].value_counts()[result_df12['group'].value_counts() >= 2].index.tolist()
print('시퀀스 그룹 개수:', len(seq_group))
# print('시퀀스 그룹:',seq_group)
print(' ')
print('데이터 크기:', len(result_df12))

# 'group' 열 값의 빈도 계산 및 정렬
group_counts = result_df12['group'].value_counts().sort_values(ascending=False)

# 결과 출력
print('재서비스 그룹:',group_counts[group_counts >= 2])

# Sort result_df12_filtered within each group based on 'RECEIPT_NUMBER'
result_df12_filtered = result_df12[result_df12['group'].isin(group_counts[group_counts >= 2].index)]

result_df12_filtered = result_df12_filtered.groupby('group', group_keys=False).apply(lambda x: x.sort_values('RECEIPT_NUMBER'))


데이터 크기(n): 50000
 
실행 시간: 8.323346376419067
 
시퀀스 그룹 개수: 1066
 
데이터 크기: 6591
재서비스 그룹: group
4062    4
4209    4
3447    3
3005    3
2873    3
5008    3
5001    3
5057    3
4302    3
3403    3
2697    3
3207    3
2898    3
1551    3
3310    3
3507    3
4668    3
2030    3
1665    3
3293    3
3461    3
1254    3
3610    3
3571    3
2502    3
2897    3
4064    3
3630    3
1369    3
2491    3
3481    3
3597    3
1244    3
1631    3
4510    3
4189    3
1669    3
5348    3
1900    3
1221    3
4748    3
1633    3
1134    3
5449    3
1135    3
2801    3
1220    3
2264    3
3493    3
1660    2
2768    2
5201    2
2767    2
2770    2
3126    2
5194    2
3844    2
1661    2
2772    2
5198    2
3993    2
5188    2
3127    2
5426    2
2761    2
5183    2
5306    2
3065    2
2728    2
3001    2
1745    2
5420    2
66      2
64      2
5423    2
5428    2
2936    2
5205    2
1666    2
5153    2
5206    2
2730    2
5159    2
1644    2
5157    2
5156    2
3148    2
5430    2
5151    2
1641    2
3152    

Unnamed: 0,PREV_RECEIPT_NUMBER,RECEIPT_NUMBER,group
4,LGE_REC_6091,LGE_REC_4,5
5,,LGE_REC_6091,5
18,LGE_REC_61,LGE_REC_18,18
19,,LGE_REC_61,18
21,,LGE_REC_13312,19


## 시퀀스 데이터 안에서 순서 정렬하기

In [59]:
result_df12_filtered[result_df12_filtered['group']==4209]

Unnamed: 0,PREV_RECEIPT_NUMBER,RECEIPT_NUMBER,group
5013,,LGE_REC_33965,4209
5014,LGE_REC_33965,LGE_REC_36795,4209
5015,LGE_REC_33965,LGE_REC_36857,4209
5016,LGE_REC_36795,LGE_REC_46918,4209


In [60]:
ff=result_df12_filtered[result_df12_filtered['group']==4209]


def custom_sort(df):
    # 새로운 열 생성 (현재 행의 인덱스와 해당 행의 PREV_RECEIPT_NUMBER가 매핑되는 인덱스의 차이)
    df['SORT_KEY'] = df.apply(lambda row: row.name - df[df['RECEIPT_NUMBER'] == row['PREV_RECEIPT_NUMBER']].index[0] if not pd.isnull(row['PREV_RECEIPT_NUMBER']) else np.nan, axis=1)
    
    # 정렬 후 SORT_KEY 열 제거
    df_sorted = df.sort_values(by=['SORT_KEY']).drop(columns=['SORT_KEY']).reset_index(drop=True)
    
    return df_sorted

# 데이터 프레임 정렬
ff_sorted = custom_sort(ff)

# 결과 출력
ff_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SORT_KEY'] = df.apply(lambda row: row.name - df[df['RECEIPT_NUMBER'] == row['PREV_RECEIPT_NUMBER']].index[0] if not pd.isnull(row['PREV_RECEIPT_NUMBER']) else np.nan, axis=1)


Unnamed: 0,PREV_RECEIPT_NUMBER,RECEIPT_NUMBER,group
0,LGE_REC_33965,LGE_REC_36795,4209
1,LGE_REC_33965,LGE_REC_36857,4209
2,LGE_REC_36795,LGE_REC_46918,4209
3,,LGE_REC_33965,4209


In [63]:
ff=result_df12_filtered[result_df12_filtered['group']==4209]

def custom_sort(df):
    # 새로운 열 생성 (현재 행의 인덱스와 같은 그룹 내에서의 순서)
    df['SORT_KEY'] = df.groupby(['group', 'PREV_RECEIPT_NUMBER']).cumcount()
    
    # 정렬 후 SORT_KEY 열 제거
    df_sorted = df.sort_values(by=['group', 'SORT_KEY']).drop(columns=['SORT_KEY']).reset_index(drop=True)
    
    return df_sorted

# 데이터 프레임 정렬
ff_sorted = custom_sort(ff)


# 결과 출력
ff_sorted

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SORT_KEY'] = df.groupby(['group', 'PREV_RECEIPT_NUMBER']).cumcount()


Unnamed: 0,PREV_RECEIPT_NUMBER,RECEIPT_NUMBER,group
0,LGE_REC_33965,LGE_REC_36795,4209
1,LGE_REC_36795,LGE_REC_46918,4209
2,LGE_REC_33965,LGE_REC_36857,4209
3,,LGE_REC_33965,4209


In [28]:
result_df12_filtered[result_df12_filtered['group']==4062]

Unnamed: 0,PREV_RECEIPT_NUMBER,RECEIPT_NUMBER,group
4823,LGE_REC_36666,LGE_REC_36635,4062
4824,LGE_REC_48938,LGE_REC_36666,4062
4825,LGE_REC_36635,LGE_REC_46529,4062
4826,,LGE_REC_48938,4062


In [6]:
result_df12[result_df12['group']==8741]

Unnamed: 0,PREV_RECEIPT_NUMBER,RECEIPT_NUMBER,group


In [7]:
result_df2 = result_df12.groupby('group').last().reset_index()
print(result_df2.shape)
print(result_df2['group'].nunique())
result_df2.head(10)

(37, 3)
37


Unnamed: 0,group,PREV_RECEIPT_NUMBER,RECEIPT_NUMBER
0,1,LGE_REC_14193717,LGE_REC_0
1,2,LGE_REC_14193718,LGE_REC_1
2,3,LGE_REC_14193719,LGE_REC_2
3,4,LGE_REC_14193720,LGE_REC_3
4,5,LGE_REC_6091,LGE_REC_4
5,6,LGE_REC_14193721,LGE_REC_5
6,7,LGE_REC_14193722,LGE_REC_6
7,8,LGE_REC_14193723,LGE_REC_7
8,9,LGE_REC_14193724,LGE_REC_8
9,10,LGE_REC_14193725,LGE_REC_9


In [8]:
# result_df2에 있는 RECEIPT_NUMBER 값만 갖는 데이터프레임 생성
data2 = data[data['RECEIPT_NUMBER'].isin(result_df2['RECEIPT_NUMBER'])]
data2.shape

(37, 71)