## EDA


### Print the number of unique values for each column


In [None]:
# Report how many distinct values each column of the training set holds.
# DataFrame.nunique() yields one count per column, indexed by column name.
for col, n_distinct in train.nunique().items():
    print(f'{col} has {n_distinct} values')

### Target distribution


In [None]:
# Bar chart with the number of training rows in each Target class
plt.figure(figsize=(10, 8))
ax = sns.countplot(x='Target', data=train, palette='pastel')

# Write the row count just above each bar, centred horizontally on it
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.text(bar_center, bar_height + 3, f'{int(bar_height)}', ha="center")

ax.set_xlabel('Target')
ax.set_ylabel('Count')
ax.set_title('Target Distribution')
plt.show()

### Feature distributions


In [None]:
# Make pandas treat the categorical columns as categories rather than numbers.

# Columns that hold categorical data
cat_features = [
    'Marital status',
    'Application mode',
    'Course',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

# Give each of these columns the same unordered categorical dtype in all three
# frames; its categories are the union of the values seen in train, test and
# original.
for feature in cat_features:
    all_levels = set(train[feature]) | set(test[feature]) | set(original[feature])
    shared_dtype = pd.CategoricalDtype(categories=list(all_levels), ordered=False)
    for df in (train, test, original):
        df[feature] = df[feature].astype(shared_dtype)

In [None]:
# Visualize every feature of the training set, one subplot per feature:
#   float columns    -> green density histogram
#   category columns -> black bar chart of relative frequencies (x = category code)
#   other columns    -> blue bar chart of relative frequencies (treated as integers)
N_PLOT_COLS = 4
# Ceiling division: enough rows for every feature, so zip() below never drops one.
n_plot_rows = max(1, -(-len(initial_features) // N_PLOT_COLS))
# 20 inches for 9 rows reproduces the original 12x20 figure; height scales with rows.
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
_, axs = plt.subplots(n_plot_rows, N_PLOT_COLS,
                      figsize=(12, 20 * n_plot_rows / 9), squeeze=False)
flat_axes = axs.ravel()
for col, ax in zip(initial_features, flat_axes):
    if pd.api.types.is_float_dtype(train[col]):
        # Matches any float width (float32, float64, ...), not only float64
        ax.hist(train[col], bins=300, density=True, color='g')
    elif train[col].dtype == 'category':
        # Plot against the integer category codes; tick labels are suppressed
        vc = train[col].cat.codes.value_counts() / len(train)
        ax.bar(vc.index, vc, color='k')
        ax.yaxis.set_major_formatter('{x:.0%}')
        ax.set_xticks([])
    else: # integer
        vc = train[col].value_counts() / len(train)
        ax.bar(vc.index, vc, color='b')
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.yaxis.set_major_formatter('{x:.0%}')
    ax.set_title(col, fontsize=10)
# Hide grid cells that received no feature so they do not render as empty frames
for unused_ax in flat_axes[len(initial_features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

### Feature correlations


In [None]:
from sklearn.preprocessing import LabelEncoder

# Target class names; not used within this cell.
categories = ['dropout', 'enrolled', 'graduate']

# Replace the labels in train['Target'] with integer codes.
# The fitted encoder is kept in `label_encoder` so the codes can be mapped back.
label_encoder = LabelEncoder()
target_codes = label_encoder.fit_transform(train['Target'])
train['Target'] = target_codes

In [None]:
# Correlation matrix over the initial features plus the target column.
# The column selector must be one flat list of names; nesting `initial_features`
# inside another list is not a valid selector for train[...].
corr_features = list(initial_features) + ['Target']
# Cast to float so category-dtype columns contribute their numeric values and
# np.corrcoef receives a purely numeric array.
# NOTE(review): assumes every selected column holds numeric values — confirm.
cc = np.corrcoef(train[corr_features].astype(float), rowvar=False)

plt.figure(figsize=(21, 18))
# Coefficients are multiplied by 10 for readability of the cell annotations
sns.heatmap(cc * 10, center=0, cmap='coolwarm', annot=True, fmt='.0f',
            xticklabels=corr_features, yticklabels=corr_features)
plt.title('Correlation matrix', fontsize=40)
plt.show()