### 2024년 1학기 Python 기초입문 - v 2.0


[colaboratory로 실행](https://colab.research.google.com/drive/1APtsENilV7Igkr0nDuSrwCmd15Q6HGub?usp=sharing)

# 파이썬 데이터 분석

## 기초적인 통계

In [None]:
from itertools import permutations, combinations, product, combinations_with_replacement
import numpy as np
import pandas as pd
import scipy.stats as sp
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Permutations and combinations with itertools.
#
# Separator lines are built with plain string arithmetic ("-" * 70) rather than
# f"\n{"":-<70}": reusing the same quote type inside an f-string expression is
# a SyntaxError on every Python version before 3.12.

# permutations: pick r items out of an iterable of n items without repetition;
# order matters, so (1, 2) and (2, 1) are different results.
print("permutations")
for p in permutations([1, 2, 3, 4], 2):
    print(p, end=" ")
print("\n" + "-" * 70)

# combinations: pick r items out of an iterable of n items without repetition;
# order does not matter, so (1, 2) and (2, 1) are the same result.
print("combinations")
for c in combinations([1, 2, 3, 4], 2):
    print(c, end=" ")
print("\n" + "-" * 70)

# product: Cartesian product of the input iterables.
print("product")
for i in product([1, 2, 3, 4], 'ab'):
    print(i, end=" ")
print("\n" + "-" * 70)

# Three iterables -> every (a, b, c) triple.
for i in product(range(3), range(3), range(3)):
    print(i, end=" ")
print("\n" + "-" * 70)

# repeat=2 is shorthand for product([1, 2, 3], [1, 2, 3]).
for i in product([1, 2, 3], repeat=2):
    print(i, end=" ")
print("\n" + "-" * 70)

# combinations_with_replacement: combinations in which an item may be repeated.
print("combination with replacemnet")
for r in combinations_with_replacement([1, 2, 3, 4], 2):
    print(r, end=" ")
print("\n" + "-" * 70)

In [None]:
# Grid for the continuous-distribution plots: 1000 evenly spaced points
# covering the observation range 0 to 30.
x = np.linspace(start=0, stop=30, num=1000)
# The grid is symmetric about its midpoint, so mean and median essentially
# coincide; for a skewed sample the two would differ.
grid_mean = np.mean(x)
grid_median = np.median(x)
print(f"sample's mean : {grid_mean}")
print(f"sample's median : {grid_median}")


In [None]:
# Histogram of 10,000 draws from a normal distribution with mean mu and
# standard deviation sig, obtained by scaling and shifting standard-normal draws.
mu = 15
sig = 3
np.random.seed(2024)  # fixed seed so the figure is reproducible
samples = mu + sig * np.random.randn(10000)
plt.hist(samples, bins=50)

# Central limit theorem:
# when samples of sufficient size (commonly 30 or more observations each) are
# drawn repeatedly (the original note suggests roughly 150 or more draws,
# depending on the population size), the distribution of the sample means
# approaches a normal distribution centred on the population mean as the
# number of draws grows.

In [None]:

# Back to a dense grid so the curve looks continuous.
x = np.linspace(0, 30, 10000)

# norm_dist is used from this cell onwards but was never created anywhere in
# the notebook, so every use raised NameError. Build the frozen normal
# distribution from the mu / sig set in the histogram cell; with mu=15 and
# sig=3 its cdf(17) is about 0.7475, the probability the later cells use.
norm_dist = sp.norm(loc=mu, scale=sig)

# Probability density function (PDF) evaluated over the grid.
density = norm_dist.pdf(x)
plt.plot(x, density)

# Mark the density at x = 17 and draw guide lines to both axes.
pdf_at_17 = norm_dist.pdf(17)
plt.scatter(17, pdf_at_17, c='r', marker='o')
plt.plot([0, 17], [pdf_at_17, pdf_at_17], 'r--')
plt.axvline(17, ls=':')
plt.xlabel('x')
plt.ylabel('density')
plt.xlim(0, 30)

print(pdf_at_17)  # value of the PDF at that point
# Plain string instead of f"{"":-<70}", which is a SyntaxError before Python 3.12.
print("-" * 70)

In [None]:
# CDF: cumulative distribution function, P(X <= x).
cdf_at_17 = norm_dist.cdf(17)
print(cdf_at_17)
c_density = norm_dist.cdf(x)
plt.plot(x, c_density, c='g')

# Mark the cumulative probability at x = 17 and draw guide lines to both axes.
plt.scatter(17, cdf_at_17, c='r', marker='o')
plt.plot([0, 17], [cdf_at_17, cdf_at_17], 'r--')
plt.axvline(17, ls=':')
plt.xlabel('x')
# The curve is a cumulative probability, not a density, so label it as such.
plt.ylabel('cumulative probability')
plt.xlim(0, 30)

In [None]:
# PPF (percent point function): the inverse of the CDF, i.e. the x value up to
# which the given cumulative probability is reached.
ppf_at_p = norm_dist.ppf(0.7475)
print(ppf_at_p)  # about 17.0 if norm_dist is the normal distribution with mean 15, sd 3

# ppf only accepts probabilities in [0, 1]. The original evaluated it on the
# 0..30 grid `x`, which yields NaN for everything above 1. Use a probability
# grid instead, stopping short of 0 and 1 where ppf is -inf / +inf.
prob = np.linspace(0.001, 0.999, 999)
p_density = norm_dist.ppf(prob)
plt.plot(prob, p_density)

# Mark the point p = 0.7475 and draw guide lines to both axes.
plt.scatter(0.7475, ppf_at_p, c='r')
plt.plot([-0.05, 0.7475], [ppf_at_p, ppf_at_p], 'r--')
plt.axvline(0.7475, ls=':')
# The horizontal axis is a cumulative probability, not a density.
plt.xlabel('probability')
plt.ylabel('x')
plt.xlim(0, 1)

In [None]:
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset and list the fields it exposes.
diabetes = load_diabetes()
print(f"data.keys : {diabetes.keys()}")
# Feature matrix as a DataFrame, with the regression target appended as 'target'.
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

In [None]:
# Column dtypes and non-null counts, followed by a preview of the first ten rows.
df.info()
# df.describe() would show summary statistics instead.
df.iloc[:10]

In [None]:
# Scatter plot of target against BMI to inspect their relationship.
_, ax = plt.subplots(figsize=(5, 3))
ax.scatter(x=df['bmi'], y=df['target'], s=10, alpha=0.6)
ax.set(title='TARGET ~ BMI', xlabel='BMI', ylabel='TARGET')

In [None]:
# Covariance with NumPy: np.cov returns the 2x2 covariance matrix of the two
# series; the off-diagonal entry is cov(bmi, target).
cov_df = np.cov(df.bmi, df.target)
cov_df[0][1]

In [None]:
# Correlation coefficient: off-diagonal entry of the 2x2 correlation matrix.
np.corrcoef(df.bmi, df.target)[0][1]

In [None]:
# Same correlation via SciPy's pearsonr.
sp.pearsonr(x=df['bmi'], y=df['target'])

In [None]:
# Full pairwise correlation matrix with pandas (Pearson is the default method).
df.corr(method='pearson')

In [None]:
# Heat map of the correlation matrix, showing the lower triangle only.
plt.subplots(figsize=(7, 5))
corr = df.corr()
# Boolean mask that hides the upper triangle (diagonal included).
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('pearson Corr HeatMap', fontdict={'fontsize': 10}, pad=8);

In [None]:
# Scatter plot with a fitted regression line (seaborn lmplot).
# Plain-matplotlib scatter, kept for reference:
# _,ax = plt.subplots(figsize=(5,3))
# ax.scatter(df['bmi'], df['target'], alpha=0.6, s =10)
# ax.set(title='TARGET ~ BMI', xlabel='BMI', ylabel='TARGET')
sns.lmplot(data=df, x='bmi', y='target')
# Variant split by the 'sex' column:
#sns.lmplot(x='bmi', y='target', hue='sex', data=df);

In [None]:
# Pair plot of bmi and target; then draw regression plots above the diagonal,
# KDE plots below it, and histograms on it.
pair = sns.pairplot(df, vars=['bmi', 'target'])  # optional: hue=, palette=, markers=[]
# The map_* calls are chained; pair_grid ends up bound to the result of map_diag.
pair_grid = (
    pair.map_upper(sns.regplot)
    .map_lower(sns.kdeplot)
    .map_diag(sns.histplot)
)

## 군집분석 Clustering (군집화)

### K-means

In [None]:
from sklearn.preprocessing import scale
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
iris = load_iris()
# Convert to a DataFrame for more convenient data handling.
irisDF = pd.DataFrame(
    data=iris.data,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
irisDF.info()
irisDF.head(5)

In [None]:
# Show the class label array of the iris dataset.
iris["target"]

In [None]:
# First two iris features plotted against each other, coloured by class label.
_, ax = plt.subplots()
x_name, y_name = iris.feature_names[:2]
scatter = ax.scatter(iris.data[:, 0], iris.data[:, 1], c=iris.target)
ax.set(xlabel=x_name, ylabel=y_name)
# One legend handle per colour, labelled with the class names.
handles = scatter.legend_elements()[0]
_ = ax.legend(handles, iris.target_names, loc="lower right", title="Classes")

In [None]:
# Source updated for the revised edition (2019-12-24).
# Cluster the iris measurements into three groups with k-means.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, random_state=0)
# Fit on the four measurement columns only. Fitting on the whole frame works on
# the first run, but once this cell has added 'target' / 'km_cluster' (and later
# cells add more columns) a re-run would feed the labels back in as features.
# The hierarchical-clustering cells select irisDF.iloc[:,0:4] the same way.
kmeans.fit(irisDF.iloc[:, 0:4])

irisDF['target'] = iris.target
irisDF['km_cluster'] = kmeans.labels_
# Number of samples for each (true class, assigned cluster) pair.
iris_result = irisDF.groupby(['target', 'km_cluster'])['sepal_length'].count()
print(iris_result)

In [None]:
# Preview the first five rows, including the columns added so far.
irisDF.head(n=5)

In [None]:
# Sepal length vs. sepal width, coloured by the k-means cluster assignment.
_, ax = plt.subplots()
scatter = ax.scatter(
    irisDF['sepal_length'],
    irisDF['sepal_width'],
    c=kmeans.labels_,
    cmap='rainbow',
)
ax.set(xlabel=iris.feature_names[0], ylabel=iris.feature_names[1])
# Legend left disabled:
# _ = ax.legend(
#    scatter.legend_elements()[0], iris.target_names, loc="lower right", title="Classes"
#)

### Hierarchical clustering

In [None]:
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Dendrogram of the four measurement columns using Ward linkage.
plt.figure(figsize=(10, 7))
plt.title("iris Dendograms")
ward_links = shc.linkage(irisDF.iloc[:, :4], method='ward')
dend = shc.dendrogram(ward_links)

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into three clusters with Ward linkage, fitted on the
# four measurement columns; the predicted labels are the cell's output.
hc_cluster = AgglomerativeClustering(linkage='ward', n_clusters=3)
hc_cluster.fit_predict(irisDF.iloc[:, :4])

In [None]:
# Sepal length vs. sepal width, coloured by the hierarchical cluster assignment.
_, ax = plt.subplots()
scatter = ax.scatter(
    irisDF['sepal_length'],
    irisDF['sepal_width'],
    c=hc_cluster.labels_,
    cmap='rainbow',
)
ax.set(xlabel=iris.feature_names[0], ylabel=iris.feature_names[1])
# Legend left disabled:
# _ = ax.legend(
#    scatter.legend_elements()[0], iris.target_names, loc="lower right", title="Classes"
# )

### GMM

In [None]:
from sklearn.mixture import GaussianMixture

# Fit a three-component Gaussian mixture to the iris features and assign each
# sample to a component.
gmm = GaussianMixture(n_components=3, random_state=0).fit(iris.data)
gmm_cluster_labels = gmm.predict(iris.data)

# Store the clustering result in irisDF under the 'gmm_cluster' column.
irisDF['gmm_cluster'] = gmm_cluster_labels
irisDF['target'] = iris.target

# Check how the gmm_cluster values map onto each target value.
iris_result = irisDF.groupby(['target'])['gmm_cluster'].value_counts()
# The result was computed but never shown; print it like the K-means and
# DBSCAN cells do.
print(iris_result)

### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the iris features.
dbscan = DBSCAN(metric='euclidean', eps=0.6, min_samples=8)
dbscan_labels = dbscan.fit_predict(iris.data)

irisDF['target'] = iris.target
irisDF['dbscan_cluster'] = dbscan_labels

# How the DBSCAN labels are distributed within each true class.
by_target = irisDF.groupby(['target'])
iris_result = by_target['dbscan_cluster'].value_counts()
print(iris_result)

In [None]:
# 2x2 comparison of the four clusterings on the first two iris columns.
_, ax = plt.subplots(2, 2, figsize=(8, 8))
panels = [
    ("k-mean", kmeans.labels_),
    ("hierarchical", hc_cluster.labels_),
    ("GMM", gmm_cluster_labels),
    ("DBSCAN", dbscan_labels),
]
# ax.flat walks the grid row by row, matching the order of `panels`.
for panel, (title, cluster_ids) in zip(ax.flat, panels):
    panel.scatter(irisDF.iloc[:, 0], irisDF.iloc[:, 1], c=cluster_ids, cmap='rainbow')
    panel.set(title=title, ylabel=iris.feature_names[1])
# Only the bottom row carries an x-axis label.
for panel in ax[1]:
    panel.set(xlabel=iris.feature_names[0])

## 차원축소 (dimensionality reduction)
### PCA

In [None]:
from sklearn.decomposition import PCA

# Project the iris features onto three principal components and show the
# shape of the transformed array.
pca = PCA(n_components=3)
pca_transformed = pca.fit_transform(iris.data)
np.shape(pca_transformed)

In [None]:
# Store the three principal-component coordinates as columns of irisDF.
for axis_idx, col in enumerate(('pca_x', 'pca_y', 'pca_z')):
    irisDF[col] = pca_transformed[:, axis_idx]
irisDF.head()

In [None]:
import mpl_toolkits.mplot3d  # noqa: F401

# 3-D scatter of the first three principal components, coloured by class label.
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
pc1, pc2, pc3 = X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2]
ax.scatter(pc1, pc2, pc3, c=iris.target, s=40)
ax.set_title("First three PCA dimensions")
# Label each axis and hide its tick labels.
for set_label, axis_obj, text in (
    (ax.set_xlabel, ax.xaxis, "1st Eigenvector"),
    (ax.set_ylabel, ax.yaxis, "2nd Eigenvector"),
    (ax.set_zlabel, ax.zaxis, "3rd Eigenvector"),
):
    set_label(text)
    axis_obj.set_ticklabels([])
plt.show()

### K-means (digits 데이터)

In [None]:
# Imports for the digits clustering / dimensionality-reduction section.
import numpy as np
import matplotlib.pyplot as plt
# Optional matplotlib font setup, left disabled:
# from matplotlib import font_manager
# plt.rc('axes', unicode_minus=False)
import warnings
# Silences every warning for the rest of the session, not just this section.
warnings.filterwarnings('ignore')

import seaborn as sns

# mode: used to pick the most frequent true label inside each cluster.
from scipy.stats import mode
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Digits dataset: load it, report its fields and shape, and preview ten images.
from sklearn.datasets import load_digits

digits = load_digits()
print(f"data- keys :\n {digits.keys()}")
print(f"data.shape :{digits.data.shape}")

# Alternative loading as (data, labels) arrays, left disabled:
# data, labels = load_digits(return_X_y=True)
# (n_samples, n_features), n_digits = data.shape, np.unique(labels).size
# print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")

# 2x5 grid without axis ticks; zip stops after the first ten images.
fig, axes = plt.subplots(
    nrows=2,
    ncols=5,
    figsize=(10, 5),
    subplot_kw=dict(xticks=(), yticks=()),
)
for ax, imgs in zip(axes.flat, digits.images):
    ax.imshow(imgs)

In [None]:
# Container type of a slice holding the first three samples.
type(digits.data[:3])

In [None]:
# K-means on the digit feature vectors, with ten clusters.
from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=10, random_state=0)
km_clusters = k_means.fit_predict(digits.data)
# A bare expression in the middle of a cell is evaluated and then discarded, so
# the centroid shape was never shown; print it explicitly.
print(k_means.cluster_centers_.shape)
print(km_clusters.shape)

In [None]:
# Show each of the ten cluster centroids as an 8x8 image.
fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers = k_means.cluster_centers_.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers):
    axi.set_xticks([])
    axi.set_yticks([])
    axi.imshow(center, cmap='binary', interpolation='nearest')

In [None]:
# Map every cluster id to the most frequent true digit among its members.
km_labels = np.zeros_like(km_clusters)  # placeholder, filled in below
print(f"The labels are : {km_labels}")
print(f"\nThe size of labels is : {km_labels.shape}")
print("The mask values are : ")
for i in range(10):
    # Boolean selector for the samples assigned to cluster i.
    mask = np.equal(km_clusters, i)
    print(mask)
    majority = mode(digits.target[mask])
    km_labels[mask] = majority.mode

In [None]:
# Share of samples whose mapped cluster label equals the true digit, as a
# percentage string with two decimals.
accuracy = f"{accuracy_score(digits.target, km_labels) * 100:.2f}"
print(f"Accuracy: {accuracy}%")

In [None]:
# Confusion matrix of true digits vs. mapped cluster labels. It is transposed
# before plotting, so true labels run along x and predictions along y.
matrix = confusion_matrix(digits.target, km_labels)
sns.heatmap(
    matrix.transpose(),
    annot=True,
    fmt='d',
    square=True,
    cbar=False,
    xticklabels=digits.target_names,
    yticklabels=digits.target_names,
)
plt.xlabel('True Label')
plt.ylabel("Predicted Label")

### PCA

In [None]:
# PCA of the digits data.
from sklearn.decomposition import PCA

# Reduce to two dimensions, then project the digits onto the first two
# principal components.
pca = PCA(n_components=2)
digits_pca = pca.fit(digits.data).transform(digits.data)

# One colour per digit class 0-9.
colors = ["#476A2A", "#7851B8", "#BD3430", "#4A2D4E", "#875525",
          "#A83683", "#4E655E", "#853541", "#3A3120", "#535D8E"]
plt.figure(figsize=(10, 10))
pc_first, pc_second = digits_pca[:, 0], digits_pca[:, 1]
plt.xlim(pc_first.min(), pc_first.max())
plt.ylim(pc_second.min(), pc_second.max())
# Draw every sample as its own digit character at its projected position.
text_style = {'weight': 'bold', 'size': 9}
for (px, py), digit in zip(digits_pca, digits.target):
    plt.text(px, py, str(digit), color=colors[digit], fontdict=text_style)
plt.xlabel("첫 번째 주성분")
plt.ylabel("두 번째 주성분")

### t-SNE

In [None]:
from sklearn.manifold import TSNE

plt.rc('axes', unicode_minus=False)
tsne = TSNE(random_state=42)
# TSNE has no transform method, so fit_transform is used instead.
digits_tsne = tsne.fit_transform(digits.data)

plt.figure(figsize=(10, 10))
tsne_first, tsne_second = digits_tsne[:, 0], digits_tsne[:, 1]
plt.xlim(tsne_first.min(), tsne_first.max() + 1)
plt.ylim(tsne_second.min(), tsne_second.max() + 1)
# Draw every sample as its own digit character, coloured with the `colors`
# list defined in the PCA cell.
text_style = {'weight': 'bold', 'size': 9}
for (tx, ty), digit in zip(digits_tsne, digits.target):
    plt.text(tx, ty, str(digit), color=colors[digit], fontdict=text_style)
plt.xlabel("t-SNE 특성 0")
plt.ylabel("t-SNE 특성 1")

### SOM (자기조직화 지도) - 예제 그래프 수정 필요, 별도의 som 모듈 파일 필요

In [None]:
# Train a self-organizing map on the digits data and plot its distance map.
# Som comes from a separate local `som` module, not an installed package.
from som import Som
from pylab import plot,axis,show,pcolor,colorbar,bone
# load_digits is imported in the digits k-means section; that cell must have run.
digits = load_digits()
data = digits.data
labels = digits.target

# NOTE(review): presumably a 16x16 map over 64-dimensional inputs - confirm
# against the Som constructor in the som module.
som = Som(16,16,64,sigma=1.0,learning_rate=0.5)
som.random_weights_init(data)
print("Initiating SOM.")
# NOTE(review): 10000 looks like the number of training iterations - confirm.
som.train_random(data,10000)
print("\n. SOM Processing Complete")

# Draw the transposed distance map with the bone colormap and a colour bar.
bone()
pcolor(som.distance_map().T)
colorbar()

In [None]:
# Plot each sample's winning SOM node, using one marker shape and one colour
# per digit class.
#
# The original cell started with ten lines of the form
# `labels[labels == '0'] = 0`. labels comes from digits.target, which already
# holds integer class ids, so comparing it with strings never matched anything
# and the lines have been dropped.
markers = ['o', 'v', '1', '3', '8', 's', 'p', 'x', 'D', '*']
colors = ["r", "g", "b", "y", "c", (0, 0.1, 0.8), (1, 0.5, 0), (1, 1, 0.3), "m", (0.4, 0.6, 0)]
for xx, label in zip(data, labels):
    w = som.winner(xx)
    # Offset by 0.5 to place the marker in the middle of the winning cell.
    plot(w[0] + .5, w[1] + .5, markers[label],
         markerfacecolor='None', markeredgecolor=colors[label],
         markersize=12, markeredgewidth=2)
# Set the axis limits and render once, after all markers are drawn. These two
# calls used to sit inside the loop, so the figure was rendered again for every
# single sample.
axis([0, som.weights.shape[0], 0, som.weights.shape[1]])
show()