# Exploratory Data Analysis

- 데이터 전처리
    - 데이터 탐색
    - 데이터 클렌징

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from numpy.linalg import svd
from scipy.sparse.linalg import spsolve
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sklearn.preprocessing import MinMaxScaler

pd.options.display.max_columns = 200

In [2]:
# Load all four competition tables. The online-action and client-demographic
# frames are read as well because later cells (df_action.tail, the df_action
# missing-value chart, df_client_demo.tail) reference them and would otherwise
# fail with NameError on a fresh kernel.
df_action = pd.read_csv('../input/제6회 L.POINT Big Data Competition-분석용데이터-01.온라인 행동 정보.csv')
df_transaction = pd.read_csv('../input/제6회 L.POINT Big Data Competition-분석용데이터-02.거래 정보.csv')
df_client_demo = pd.read_csv('../input/제6회 L.POINT Big Data Competition-분석용데이터-03.고객 Demographic 정보.csv')
df_product = pd.read_csv('../input/제6회 L.POINT Big Data Competition-분석용데이터-04.상품분류 정보.csv')

# Online Action columns
- clnt_id : 클라이언트 ID, 고객을 고유하게 식별할 수 있도록 랜덤으로 부여된 ID
- sess_id : 세션 ID, Web/App에 접속 후 세션이 시작될 때 부여된 순번 ID (클라이언트 ID에 여러개의 세션 ID 발급될 수 있음)
- hit_seq : 조회일련번호, 조회 순서를 알 수 있도록 부여된 일련번호
- action_type : 행동유형, 총 8가지의 행동 유형을 구분한 코드 [0. 검색/ 1. 제품 목록/ 2. 제품 세부정보 보기/ 3. 장바구니 제품 추가/ 4. 장바구니 제품 삭제/ 5. 결제시도/ 6. 구매 완료/ 7. 구매환불/ 8. 결제옵션]
- biz_unit : 업종단위, 온라인 및 오프라인 이용처를 구분하는 단위코드
- sess_dt : 세션일자, (YYYYMMDD 형식)
- hit_tm : 조회시간 (HH:MM 형식)
- hit_pss_tm : 조회경과시간, 세션이 시작된 이후 해당 조회까지 경과한 시간(단위:밀리초), 첫번째 조회하면 0으로 설정
- trans_id : 거래 ID, 구매내역을 고유하게 식별할 수 있도록 랜덤으로 부여된 ID
- srch_kwd : 검색 키워드, 고객이 검색한 키워드
- tot_pag_view_ct : 홈페이지조회건수, 세션 내의 총 페이지(화면)뷰 수
- tot_sess_hr_v : 총세션시간값, 세션 내 총 시간(단위: 초)
- trfc_src : 유입채널, 고객이 유입된 채널, [DIRECT/ PUSH/ WEBSITE/ PORTAL_1/ PORTAL_2/ PORTAL_3/ unknown]
- dvc_ctg_nm : 기기유형, [mobile_web/ mobile_app/ PC]

In [None]:
# Preview the last 10 rows of the online-action table.
df_action.tail(10)

### Missing Data

In [None]:
# Tally null cells per df_action column and keep only the columns that have any.
df_missing = (
    df_action.isnull()
    .sum(axis=0)
    .rename_axis('column_name')
    .reset_index(name='missing_count')
)
df_missing = df_missing[df_missing['missing_count'] > 0]

# Horizontal bars: one per affected column, length = percentage of rows missing.
ind = np.arange(len(df_missing))
fig, ax = plt.subplots(figsize=(9, 5))
rects = ax.barh(ind, df_missing['missing_count'].values / len(df_action) * 100, color='r')
ax.set_yticks(ind)
ax.set_yticklabels(df_missing['column_name'].values, rotation='horizontal')
ax.set_xlabel("% of missing values")
plt.show()

# Transaction columns
- clnt_id : 클라이언트 ID 
- trans_id : 거래 ID
- trans_seq: 거래일련번호, 해당 거래 ID의 구매 내역 내 구매 순서를 알 수 있도록 부여된 일련번호
- biz_unit : 업종단위  
- pd_c : 상품소분류코드
- de_dt : 상품을 구매한 일자 (YYYYMMDD)
- de_tm : 상품을 구매한 시각 (HH:MM)
- buy_am : 구매한 상품의 금액
- buy_ct : 구매한 상품의 수량

In [None]:
# Preview the last 10 rows of the transaction table.
df_transaction.tail(10)

# Client Demographic
- clnt_id : 클라이언트 ID
- clnt_gender : 성별 [M/F/unknown]
- clnt_age : 연령대 [10대 이하/ 20대/ 30대/ 40대/ 50대/ 60대 이상/ 정보없음 : unknown]

In [None]:
# Preview the last 10 rows of the client demographic table.
df_client_demo.tail(10)

# Product columns
- pd_c : 상품 소분류코드 (최소단위)
- clac_nm1 : 상품 대분류명
- clac_nm2 : 상품 중분류명
- clac_nm3 : 상품 소분류명

In [None]:
# Preview the last 10 rows of the product classification table.
df_product.tail(10)

### Missing Data

In [None]:
# Tally null cells per df_product column and keep only the columns that have any.
df_missing = (
    df_product.isnull()
    .sum(axis=0)
    .rename_axis('column_name')
    .reset_index(name='missing_count')
)
df_missing = df_missing[df_missing['missing_count'] > 0]

# Horizontal bars: one per affected column, length = percentage of rows missing.
ind = np.arange(len(df_missing))
fig, ax = plt.subplots(figsize=(9, 1))
rects = ax.barh(ind, df_missing['missing_count'].values / len(df_product) * 100, color='r')
ax.set_yticks(ind)
ax.set_yticklabels(df_missing['column_name'].values, rotation='horizontal')
ax.set_xlabel("% of missing values")
plt.show()

In [None]:
# Count, per customer, how many transaction rows fall in each product
# sub-category name (clac_nm3).

# .copy() makes df1 an independent frame, so the column edits below never write
# into a slice of df_transaction (avoids SettingWithCopyWarning / lost updates).
df1 = df_transaction[['clnt_id', 'pd_c']].copy()

# 'unknown' product codes are mapped to the placeholder 0 so the column can be
# cast to int for the join.
df1.loc[df1['pd_c'] == 'unknown', 'pd_c'] = '0000'
df1['clnt_id'] = df1['clnt_id'].astype(int)
df1['pd_c'] = df1['pd_c'].astype(int)

# Join on the product code explicitly rather than on "whatever columns happen
# to be shared" between the two frames.
df = pd.merge(df1, df_product, how='left', on='pd_c')
df2 = df[['clnt_id', 'clac_nm3']]
df3 = df2.groupby(["clnt_id", "clac_nm3"]).size().reset_index(name="Counts")
df3.tail(2)

# Creation of Sparse matrix(User x Item/MCAT)
- https://medium.com/@ashutoshsingh93/recommendation-system-for-e-commerce-using-collaborative-filtering-fa04d6ab1fd8

In [3]:
# Convert df_product['pd_c'] from int to a zero-padded 4-character string
# (e.g. 565 -> '0565') so it matches the string product codes in df_transaction.
# Values that are already strings are left untouched, so re-running this cell
# no longer raises a format error.
df_product['pd_c'] = df_product['pd_c'].apply(
    lambda code: code if isinstance(code, str) else "{:04n}".format(code)
)

# Left-join the product category names onto every transaction row by product code.
df_0 = pd.merge(df_transaction, df_product, how='left', on='pd_c')

# Keep only the columns used downstream, in this order.
column_titles = ['clnt_id', 'de_dt', 'pd_c', 'clac_nm1', 'clac_nm2', 'clac_nm3', 'buy_ct']
df = df_0.reindex(columns=column_titles)

In [4]:
# Summarize dtypes and non-null counts of the merged transaction/product frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 599961 entries, 0 to 599960
Data columns (total 7 columns):
clnt_id     599961 non-null int64
de_dt       599961 non-null int64
pd_c        599961 non-null object
clac_nm1    582309 non-null object
clac_nm2    582309 non-null object
clac_nm3    582283 non-null object
buy_ct      599961 non-null int64
dtypes: int64(3), object(4)
memory usage: 36.6+ MB


In [5]:
# Give every customer id and every product code an integer category code
df['clnt_id_cat'] = pd.Categorical(df['clnt_id']).codes
df['pd_c_cat'] = pd.Categorical(df['pd_c']).codes

In [6]:
# Display the merged frame, now including the clnt_id_cat / pd_c_cat code columns.
df

Unnamed: 0,clnt_id,de_dt,pd_c,clac_nm1,clac_nm2,clac_nm3,buy_ct,clnt_id_cat,pd_c_cat
0,21922,20190920,unknown,,,,1,3401,1667
1,21279,20190920,unknown,,,,1,3291,1667
2,39423,20190920,unknown,,,,1,6172,1667
3,18362,20190920,unknown,,,,1,2851,1667
4,39423,20190920,0565,Fruits,Imported Fruits,Bananas,1,6172,564
...,...,...,...,...,...,...,...,...,...
599956,35311,20190927,0339,Cosmetics / Beauty Care,Skin Care,Facial Masks,1,5494,338
599957,35311,20190927,0339,Cosmetics / Beauty Care,Skin Care,Facial Masks,2,5494,338
599958,35311,20190927,0339,Cosmetics / Beauty Care,Skin Care,Facial Masks,1,5494,338
599959,35311,20190927,0339,Cosmetics / Beauty Care,Skin Care,Facial Masks,1,5494,338


In [None]:
# Preview the last 15 rows of the merged frame.
df.tail(15)

In [None]:
# Lookup table from product code (kept as a string) to sub-category name,
# used later to turn matrix column indices back into readable names
item_lookup = (
    df[['pd_c_cat', 'clac_nm3']]
    .drop_duplicates()
    .astype({'pd_c_cat': str})
)
item_lookup.tail(5)

In [None]:
# Lookup table from customer code (kept as a string) to the original clnt_id
user_lookup = (
    df[['clnt_id_cat', 'clnt_id']]
    .drop_duplicates()
    .astype({'clnt_id_cat': str})
)
user_lookup.tail(4)

In [None]:
# The raw id columns are no longer needed now that the integer codes exist
df = df.drop(columns=['clnt_id', 'pd_c'])
df.tail(5)

In [None]:
# Sorted distinct user codes, sorted distinct product codes, and the purchase
# quantity of every transaction row
users = sorted(df['clnt_id_cat'].unique())
products = sorted(df['pd_c_cat'].unique())
purchases = list(df['buy_ct'])

In [None]:
# Number of distinct users, distinct products, and transaction rows.
len(users), len(products), len(purchases)

In [None]:
# Matrix coordinates of each transaction row: user code -> row, product code -> column
rows = df['clnt_id_cat'].astype(int)
cols = df['pd_c_cat'].astype(int)

In [None]:
import scipy.sparse as sparse

# Users x products matrix of purchase quantities, built from
# (value, (row, col)) triplets
data_sparse_new = sparse.csr_matrix(
    (purchases, (rows, cols)),
    shape=(len(users), len(products)),
)

## Calculation of User vectors and MCAT vectors

In [None]:
def _als_solve_rows(conf_rows, fixed, target, reg):
    """Overwrite each row of ``target`` with its regularized least-squares solution.

    Args:
        conf_rows: CSR matrix (n_target x n_fixed) of confidence values with no
            stored zeros; row r lists the interactions of target entity r.
        fixed: dense (n_fixed x features) factors held constant in this half-step.
        target: dense (n_target x features) factors, updated in place.
        reg: dense (features x features) regularization matrix (lambda * I).
    """
    # Gram matrix of the *current* fixed factors, shared by every row's system.
    gram = fixed.T.dot(fixed)
    indptr, indices, data = conf_rows.indptr, conf_rows.indices, conf_rows.data

    for row in range(conf_rows.shape[0]):
        lo, hi = indptr[row], indptr[row + 1]
        weights = data[lo:hi]            # confidence of each observed interaction
        seen = fixed[indices[lo:hi]]     # factors of the interacted-with entities

        # (F^T F + F^T (C - I) F + lambda I) x = F^T C p, where only the
        # observed entries contribute to the (C - I) and C p terms.
        lhs = gram + (seen.T * weights).dot(seen) + reg
        rhs = seen.T.dot(weights + 1.0)
        target[row] = np.linalg.solve(lhs, rhs)


def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10, seed=None):
    """Factorize an implicit-feedback matrix with alternating least squares.

    Confidence is ``sparse_data * alpha_val``; the binary preference of an entry
    is 1 where the confidence is non-zero and 0 elsewhere. User and item factors
    are initialised from a standard normal distribution and then solved for
    alternately, ``iterations`` times. One progress line is printed per iteration.

    Args:
        sparse_data: users x items interaction matrix (scipy sparse, or anything
            ``scipy.sparse.csr_matrix`` accepts after scaling).
        alpha_val: multiplier turning interaction values into confidence values.
        iterations: number of alternating user/item sweeps.
        lambda_val: regularizer value added as ``lambda_val * I`` to every system.
        features: number of latent features per user/item vector.
        seed: optional seed for the random initialisation. ``None`` (default)
            draws from NumPy's global random state, as before.

    Returns:
        ``(X, Y)``: CSR matrices of shape (users x features) and (items x features).

    Raises:
        numpy.linalg.LinAlgError: if one of the per-row systems is singular
            (possible e.g. with ``lambda_val=0``).
    """
    # Work on a private float CSR copy so stored zeros / duplicates can be
    # normalised without touching the caller's matrix.
    confidence = sparse.csr_matrix(sparse_data * alpha_val, dtype=float)
    confidence.sum_duplicates()
    confidence.eliminate_zeros()
    # Same data organised by item, so item rows can be sliced as cheaply as user rows.
    confidence_by_item = confidence.T.tocsr()

    user_size, item_size = confidence.shape

    # User vectors X (users x features) and item vectors Y (items x features),
    # randomly initialised. They are kept dense while solving because row-wise
    # assignment into a CSR matrix is slow.
    rng = np.random if seed is None else np.random.RandomState(seed)
    X = rng.normal(size=(user_size, features))
    Y = rng.normal(size=(item_size, features))

    # lambda * I
    lI = lambda_val * np.eye(features)

    for iteration in range(iterations):
        print('iteration %d of %d' % (iteration + 1, iterations))

        # Update all users against the current items, then all items against the
        # freshly updated users. Each half-step recomputes its own Gram matrix,
        # so the item step never uses a stale X^T X.
        _als_solve_rows(confidence, Y, X, lI)
        _als_solve_rows(confidence_by_item, X, Y, lI)

    return sparse.csr_matrix(X), sparse.csr_matrix(Y)

## Recommendation of mcats to users

In [None]:
# Fit user and item vectors with 20 latent features over 5 ALS iterations,
# scaling purchase counts by alpha=40 to obtain confidence values.
user_vecs, item_vecs = implicit_als(data_sparse_new, iterations=5, features=20, alpha_val=40)

In [None]:
# Demo target: the user stored at row index 11 of the interaction matrix
user_id = 11

#-----------------------------------------
# Show what this user has already purchased
#-----------------------------------------

# Column indices with a non-zero entry in the user's row, converted to strings
# because item_lookup.pd_c_cat holds its codes as strings
consumed_idx = data_sparse_new[user_id, :].nonzero()[1].astype(str)
consumed_items = item_lookup[item_lookup['pd_c_cat'].isin(consumed_idx)]
print(consumed_items)

#----------------------------------------
# Lets create users recommendations
#----------------------------------------

def recommend(user_id, data_sparse_new, user_vecs1, item_vecs1, item_lookup, num_items=10):
    """Return the top-scoring items the user has not interacted with yet.

    Scores are the dot product of the user's vector with every item vector,
    min-max scaled to [0, 1] over all items (a constant score vector scales to
    all zeros). Items with any non-zero entry in the user's interaction row are
    excluded from the result entirely, so fewer than ``num_items`` rows are
    returned when the user has fewer unconsumed items than that.

    Args:
        user_id: row index of the user in ``data_sparse_new`` / ``user_vecs1``.
        data_sparse_new: users x items sparse interaction matrix.
        user_vecs1: users x features factor matrix (sparse, or dense together
            with a dense ``item_vecs1``).
        item_vecs1: items x features factor matrix.
        item_lookup: DataFrame with a string column ``pd_c_cat`` (item column
            index) and a ``clac_nm3`` column (item name).
        num_items: maximum number of recommendations to return.

    Returns:
        DataFrame with columns ``clac_nm3`` and ``score``, best score first.

    Raises:
        IndexError: if a recommended item index has no row in ``item_lookup``.
    """
    # Items already consumed: any non-zero interaction (negative net counts
    # included), rather than relying on "count + 1 > 1".
    consumed = np.asarray(data_sparse_new[user_id, :].toarray()).reshape(-1) != 0

    # Raw preference scores: user vector dotted with every item vector.
    raw_scores = user_vecs1[user_id, :].dot(item_vecs1.T)
    if sparse.issparse(raw_scores):
        raw_scores = raw_scores.toarray()
    raw_scores = np.asarray(raw_scores, dtype=float).reshape(-1)

    if raw_scores.size == 0:
        return pd.DataFrame({'clac_nm3': [], 'score': []})

    # Scale scores to [0, 1] to make them easier to interpret; a zero range is
    # treated as 1 so a constant vector maps to zeros instead of dividing by 0.
    lowest = raw_scores.min()
    span = raw_scores.max() - lowest
    scaled_scores = (raw_scores - lowest) / (span if span > 0 else 1.0)

    # Rank only the unconsumed items, so a consumed item (which previously tied
    # at score 0 with the worst item) can never leak into the output.
    candidates = np.flatnonzero(~consumed)
    ranking = np.argsort(scaled_scores[candidates], kind='stable')[::-1]
    item_idx = candidates[ranking[:num_items]]

    products = []
    product_scores = []

    # Translate each recommended column index back into its item name.
    for idx in item_idx:
        products.append(item_lookup.clac_nm3.loc[item_lookup.pd_c_cat == str(idx)].iloc[0])
        product_scores.append(scaled_scores[idx])

    return pd.DataFrame({'clac_nm3': products, 'score': product_scores})

# Build the recommendation table for user_id (default number of items) and print it
recommendations = recommend(
    user_id,
    data_sparse_new,
    user_vecs1=user_vecs,
    item_vecs1=item_vecs,
    item_lookup=item_lookup,
)
print(recommendations)