In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# 1. Read the data

In [None]:
# Read the member-level table from CSV into a DataFrame
memb_info_csv = '/data/user/workspace/model-irt/memb_info_v1.0.csv'
memb_info = pd.read_csv(memb_info_csv)

# 2. Preprocessing: generating derived variables

In [None]:
# Keep only the variables that are needed and use them as the clustering data set.
# .copy() turns the column subset into an independent frame, so the derived
# columns that are added to memb_info afterwards are written to it directly
# instead of raising pandas' SettingWithCopyWarning.
memb_info = memb_info[['학년',
                       '완료학습개수', '출석횟수',
                       '국어_평균_점수', '영어_평균_점수', '수학_평균_점수', '사회_평균_점수', '과학_평균_점수',
                       '평균출석율', '계획완료개수_코스', '계획완료개수_차시추가', '비계획완료', '국어_오답_수행율', '영어_오답_수행율', '수학_오답_수행율', '사회_오답_수행율', '과학_오답_수행율',
                       '주중출석횟수', '강의타입:(영상+문제)_수', '강의타입:첨삭_수', '강의타입:문제_수', '강의타입:영상_수', '강의타입:T_ABILITY_EVAL_SS_OL없음_수', '강의타입:CORE없음_수', '강의타입:WHY없음_수', '강의타입:T_PENG없음_수']].copy()

In [None]:
# Split total attendance into weekend and weekday shares
total_visits = memb_info['출석횟수']
weekday_visits = memb_info['주중출석횟수']

memb_info['주말출석율'] = (total_visits - weekday_visits) / total_visits
memb_info['주중출석율'] = weekday_visits / total_visits

In [None]:
# Preparation for the ratio 강의타입:XX_수 / 완료학습개수.
# The four lecture-type counts listed in del_list are null for most students, so
# they are excluded: wherever a student has a value, that count is subtracted
# from the student's 완료학습개수. A missing count is treated as 0, i.e. nothing
# is subtracted (the original note says the null case needs no special handling).
# NOTE: this cell is not idempotent - running it twice subtracts the counts twice.

del_list = ['강의타입:T_ABILITY_EVAL_SS_OL없음_수', '강의타입:CORE없음_수', '강의타입:WHY없음_수', '강의타입:T_PENG없음_수']

user_list = []

for lecture_type in del_list:
    print("type name: {}".format(lecture_type))
    # Whole-column subtraction replaces the former per-row loop, which depended on
    # Series.iteritems() (removed in pandas 2.0) and on chained-indexing assignment
    # (memb_info[col][idx] = ...), which is not guaranteed to write back to memb_info.
    memb_info['완료학습개수'] = memb_info['완료학습개수'] - memb_info[lecture_type].fillna(0)

In [None]:
# Share of each lecture type among the completed lessons: 강의타입:XX_수 / 완료학습개수

type_list = ['강의타입:(영상+문제)_수', '강의타입:첨삭_수', '강의타입:문제_수', '강의타입:영상_수']
type_prefix = '강의타입:'

# The loop variable is deliberately not called `type`, so the builtin type()
# stays usable in the rest of the notebook.
for lecture_type in type_list:
    # Drop the common prefix so the new column is named '타입율:XX_수'
    rate_column = '타입율:{}'.format(lecture_type[len(type_prefix):])
    memb_info[rate_column] = memb_info[lecture_type] / memb_info['완료학습개수']

In [None]:
# Completed lessons per attendance
memb_info['일일_공부량'] = memb_info['완료학습개수'].div(memb_info['출석횟수'])

In [None]:
# Planned completion rate: (course + added-lesson planned completions) / completed lessons
planned_completed = memb_info['계획완료개수_코스'] + memb_info['계획완료개수_차시추가']
memb_info['계획완료율'] = planned_completed / memb_info['완료학습개수']

In [None]:
# Unplanned completion rate: unplanned completions / completed lessons
memb_info['비계획완료율'] = memb_info['비계획완료'].div(memb_info['완료학습개수'])

In [None]:
# Final set of columns that is passed on to outlier removal and scaling
cluster_features = [
    '학년',
    '일일_공부량',
    '국어_평균_점수', '영어_평균_점수', '수학_평균_점수', '사회_평균_점수', '과학_평균_점수',
    '계획완료율', '비계획완료율',
    '국어_오답_수행율', '영어_오답_수행율', '수학_오답_수행율', '사회_오답_수행율', '과학_오답_수행율',
    '평균출석율',
]
memb_info = memb_info[cluster_features]
memb_info

In [None]:
# Remove every row that holds an outlier in at least one feature.
# The z-score is taken per column (axis=0), so a value is compared with the
# distribution of its own feature across all members. axis=1 would instead
# standardise the mixed features inside a single row, which says nothing about
# whether a feature value is unusual.
outlier_threshold = 3.5

# Infinite ratios (division by zero upstream) are treated like missing values;
# nan_policy='omit' keeps one NaN from turning a whole column's z-scores into NaN.
feature_values = memb_info.replace([np.inf, -np.inf], np.nan).astype(float).values
z_scores = stats.zscore(feature_values, axis=0, nan_policy='omit')
abs_z_scores = np.abs(z_scores)

# A NaN z-score compares False, so rows with a missing/infinite value are dropped too
filtered_entries = (abs_z_scores < outlier_threshold).all(axis=1)
memb_info = memb_info[filtered_entries]
memb_info

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with MinMaxScaler and rebuild a DataFrame that keeps
# the original column names
mms = MinMaxScaler()
mms_values = mms.fit_transform(memb_info.values)
mms_set = pd.DataFrame(data=mms_values, columns=memb_info.columns)
mms_set

# 3. clustering

## 3.1 K-Means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Data set to cluster
X = mms_set

# Candidate numbers of clusters
range_n_clusters = list(range(3, 16))

for n_clusters in range_n_clusters:
    # A fixed random_state makes the centroid initialisation, and therefore the
    # reported scores, reproducible between runs
    clusterer = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = clusterer.fit_predict(X)

    # Silhouette score
    silhouette_avg = silhouette_score(X, cluster_labels)

    # Davies-Bouldin score
    davies_avg = davies_bouldin_score(X, cluster_labels)

    # Calinski-Harabasz score
    calinski_avg = calinski_harabasz_score(X, cluster_labels)

    print("for n_clusters = {} | silhouettet_socre = {} | davies_score = {} | calinski_score = {}".format(n_clusters, round(silhouette_avg, 4), round(davies_avg, 4), round(calinski_avg, 2)))

In [None]:
# Elbow plot: KMeans inertia as a function of the number of clusters
ks = range(1, 15)
inertias = []

for k in ks:
    # Fixed random_state so the curve is identical on every run
    model = KMeans(n_clusters=k, random_state=0)

    # Fit the model to the samples
    model.fit(X)

    # Collect the inertia of the fitted model
    inertias.append(model.inertia_)

plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

## 3.2 Hierarchical

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Data set to cluster
X = mms_set

# Candidate numbers of clusters
range_n_clusters = list(range(3, 16))

report_line = "for n_clusters = {} | silhouettet_socre = {} | davies_score = {} | calinski_score = {}"

for n_clusters in range_n_clusters:
    # Agglomerative clustering with average linkage for this number of clusters
    clusterer = AgglomerativeClustering(linkage='average', n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(X)

    # Silhouette, Davies-Bouldin and Calinski-Harabasz scores of the labelling
    silhouette_avg = silhouette_score(X, cluster_labels)
    davies_avg = davies_bouldin_score(X, cluster_labels)
    calinski_avg = calinski_harabasz_score(X, cluster_labels)

    rounded_scores = (round(silhouette_avg, 4), round(davies_avg, 4), round(calinski_avg, 2))
    print(report_line.format(n_clusters, *rounded_scores))

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Data set to cluster
X = mms_set

# Candidate numbers of clusters
range_n_clusters = list(range(3, 16))

for n_clusters in range_n_clusters:
    # Agglomerative clustering with complete linkage for this number of clusters
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete') # linkage = 'complete'
    cluster_labels = clusterer.fit_predict(X)

    # Silhouette score
    silhouette_avg = silhouette_score(X, cluster_labels)

    # Davies-Bouldin score
    davies_avg = davies_bouldin_score(X, cluster_labels)

    # Calinski-Harabasz score
    calinski_avg = calinski_harabasz_score(X, cluster_labels)

    # Same precision (4, 4, 2 decimals) as the other score sweeps in this notebook,
    # so results for different algorithms/linkages can be compared directly
    print("for n_clusters = {} | silhouettet_socre = {} | davies_score = {} | calinski_score = {}".format(n_clusters, round(silhouette_avg, 4), round(davies_avg, 4), round(calinski_avg, 2)))

## 3.3 DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Data set to cluster
X = mms_set

# Grid search over min_samples (5..20) and eps (0.5..1.9 in steps of 0.1)
for min_samples in range(5, 21):
    for eps_tenths in range(5, 20):
        eps = eps_tenths / 10

        # min_samples & eps
        print("############################################")
        print("min_samples: {} | eps: {:.1f}".format(min_samples, eps))

        # X is already a DataFrame, so it is passed to DBSCAN as is. The former
        # re-wrapping used the columns of `stdsc_set`, a name that is never
        # defined in this notebook, and therefore raised NameError on a fresh kernel.
        dbscan = DBSCAN(min_samples=min_samples, n_jobs=-1, eps=eps)
        dbscan_labels = dbscan.fit_predict(X)

        # Number of distinct labels; this count includes the noise label -1
        # whenever DBSCAN marks points as noise
        n_labels = len(pd.Series(dbscan_labels).unique())

        try:
            # Silhouette score
            silhouette_avg = silhouette_score(X, dbscan_labels)
            print("For n_clusters =", n_labels,
                "The silhouette_score is :", round(silhouette_avg, 2))

            # Davies-Bouldin score
            davies_avg = davies_bouldin_score(X, dbscan_labels)
            print("For n_clusters =", n_labels,
                "The davies_bouldin_score is :", round(davies_avg, 2))

            # Calinski-Harabasz score
            calinski_avg = calinski_harabasz_score(X, dbscan_labels)
            print("For n_clusters =", n_labels,
                "The calinsk_harabasz_socre is :", round(calinski_avg, 2))

            print("############################################\n")

        # Depending on min_samples and eps, DBSCAN may fail to form clusters.
        # In that case the scores cannot be computed and a ValueError is raised.
        except ValueError:
            print("ValueError... for n_clusters = {}".format(n_labels))

## 3.4 SOM

In [None]:
# Import the library
import SimpSOM as sps

In [None]:
# Rebind mms_set to a NumPy array copy of the scaled data; the SOM cells
# that follow consume this array
mms_set = np.array(mms_set)

In [None]:
# Build a 10x10 network on the scaled data with PBC=True and PCI=True
# (per the original note, PBC activates periodic boundary conditions).
som_height, som_width = 10, 10
net = sps.somNet(som_height, som_width, mms_set, PBC=True, PCI=True)

# Train with the arguments (0.01, 10000) - per the original note, an initial
# learning rate of 0.01 and 10000 epochs.
start_learning_rate, n_epochs = 0.01, 10000
net.train(start_learning_rate, n_epochs)

# Project the data points onto the trained 2D network map
projected_points = net.project(pd.DataFrame(mms_set).values)
prj = np.array(projected_points)

In [None]:
# Draw the network map with diff_graph().
# NOTE(review): the original note also described colouring the nodes by the first
# feature (column 0), but this cell only calls diff_graph() - presumably the map of
# distances between each node and its neighbours, used to spot cluster centres.
# Confirm against the SimpSOM documentation.
net.diff_graph()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Cluster the SOM projection of the data
X = prj

# Candidate numbers of clusters
range_n_clusters = list(range(3, 15))

report_line = "for n_clusters = {} | silhouettet_socre = {} | davies_score = {} | calinski_score = {}"

for n_clusters in range_n_clusters:
    # KMeans with a fixed seed for this number of clusters
    clusterer = KMeans(random_state=0, n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(X)

    # Silhouette, Davies-Bouldin and Calinski-Harabasz scores of the labelling
    silhouette_avg = silhouette_score(X, cluster_labels)
    davies_avg = davies_bouldin_score(X, cluster_labels)
    calinski_avg = calinski_harabasz_score(X, cluster_labels)

    rounded_scores = (round(silhouette_avg, 4), round(davies_avg, 4), round(calinski_avg, 2))
    print(report_line.format(n_clusters, *rounded_scores))