In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier # 결정트리 분류모델
from xgboost import XGBClassifier, plot_importance
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

#####################################################################
# Web crawling - BeautifulSoup
import requests
from bs4 import BeautifulSoup

# Fetch the web page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() prevents an HTTP error page from
# being parsed as if it were the real content.
url = "https://example.com"
response = requests.get(url, timeout=10)
response.raise_for_status()
html_content = response.content

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Locate the <table class="data"> element that holds the rows to extract
table = soup.find("table", {"class": "data"})
if table is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the .find_all() call below.
    raise ValueError(f'No <table class="data"> element found at {url}')

# Collect the non-empty <td> texts of every table row
data = []
for row in table.find_all("tr"):
    cells = [cell.text.strip() for cell in row.find_all("td")]
    cells = [text for text in cells if text]
    if cells:
        # Rows without any <td> text (for example header rows made of <th>)
        # would otherwise show up as all-empty rows in the DataFrame.
        data.append(cells)

# Convert to a DataFrame
df = pd.DataFrame(data, columns=["Column1", "Column2", "Column3"])
print(df)

# Visualize the data (example: distribution of Column1)
plt.figure(figsize=(10, 6))
sns.histplot(df['Column1'], kde=True)
plt.title('Distribution of Column1')
plt.xlabel('Column1')
plt.ylabel('Frequency')
plt.show()

#####################################################################
# konlpy - noun-frequency word cloud
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from konlpy.tag import Kkma  # morphological analyzer
from PIL import Image
from wordcloud import WordCloud

kkma = Kkma()  # instantiate the Kkma morphological analyzer

df = pd.read_excel("result.xlsx")
# Empty spreadsheet cells are read as NaN, and NaN + str stays NaN, which
# would then be handed to the analyzer. Treat missing cells as empty text.
df['SUM'] = (
    df['제목'].fillna("").astype(str)
    + " "
    + df['내용'].fillna("").astype(str)
)

# Gather the nouns of every title/body text into one flat list
total = []
for text in df['SUM']:
    if not text.strip():
        # Nothing to analyze when both title and body were empty
        continue
    total.extend(kkma.nouns(text))

# Frequency of each noun found in the titles/bodies
dic = Counter(total)

# Load the heart image and convert it to an array to use as the cloud mask.
# The context manager closes the underlying file once the pixels are copied.
with Image.open("heart.png") as image:
    image2 = np.array(image)

wc = WordCloud(
    background_color="white",
    font_path="BMDOHYEON_otf.otf",
    colormap="Blues",
    mask=image2,
).generate_from_frequencies(dic)

plt.axis('off')
plt.imshow(wc)
plt.show()

#####################################################################
# Machine learning - basic process

# Load the data
data = pd.read_csv('your_dataset.csv')

# Inspect and preprocess the data
print(data.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit an extra "None" line.
data.info()
data = data.dropna()

# Separate features and target
X = data.drop('target_column', axis=1).values
y = data['target_column'].values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: fit on the training split only, then apply the same
# transformation to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature correlation heatmap. Only numeric columns are correlated because
# recent pandas versions raise on non-numeric columns instead of dropping them.
plt.figure(figsize=(10, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

# Target distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='target_column', data=data)
plt.title('Target Distribution')
plt.show()


#####################################################################
# Logistic Regression -- trained and evaluated on the scaled features
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Class predictions and headline metrics on the test split
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Confusion matrix heatmap, with both axes labelled by the fitted classes
cm = confusion_matrix(y_test, y_pred)
class_labels = model.classes_
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# ROC curve and AUC, scored with the probability column at index 1
# (presumably the positive class of a binary target -- confirm for your data)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

roc_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

#####################################################################
# K-Nearest Neighbors (KNN) -- uses the scaled features

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)
# The former bare knn.score(...) call was dropped: its result was discarded
# and it is the same mean accuracy that accuracy_score reports below.
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {accuracy}")

# Confusion matrix heatmap. The tick labels come from the KNN estimator
# itself rather than from whatever object `model` happens to hold.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=knn.classes_, yticklabels=knn.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

#####################################################################
# DecisionTreeClassifier
# plot_tree is not part of the file-level imports, so bring it in here.
from sklearn.tree import plot_tree

dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)  # no scaling needed
# The former bare dtc.score(...) call was dropped: its result was discarded
# and it is the same mean accuracy that accuracy_score reports below.
y_pred = dtc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Classifier Accuracy: {accuracy}")

# Take the feature names from the same column selection that produced X, so
# they stay aligned even if 'target_column' is not the last column, and pass
# them as a plain list.
feature_names = data.drop('target_column', axis=1).columns.tolist()
# One label per class the tree was fitted on (was hard-coded to two classes).
class_names = [f'Class {label}' for label in dtc.classes_]

plt.figure(figsize=(20, 10))
plot_tree(dtc, filled=True, feature_names=feature_names, class_names=class_names)
plt.show()

#####################################################################
# Random Forest

rf = RandomForestClassifier(n_estimators=100, random_state=42, oob_score=True)
rf.fit(X_train, y_train)  # no scaling needed
# A bare `rf.oob_score_` expression shows nothing when run as a script,
# so print the out-of-bag score explicitly.
print(f"Random Forest OOB Score: {rf.oob_score_}")
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")

# Feature importance plot
feature_importances = rf.feature_importances_
# Use the same column selection that produced X so the bar labels match the
# importances even if 'target_column' is not the last column.
features = data.drop('target_column', axis=1).columns
plt.figure(figsize=(10, 6))
plt.barh(features, feature_importances)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importances in Random Forest Model')
plt.show()


#####################################################################
# KNeighborsRegressor -- uses the scaled features
knr = KNeighborsRegressor(n_neighbors=3)
knr.fit(X_train_scaled, y_train)
y_pred = knr.predict(X_test_scaled)
# The R^2 score used to be computed by a bare knr.score(...) call and thrown
# away; report it alongside the mean squared error instead.
r2 = r2_score(y_test, y_pred)
print(f"KNN Regressor R2 Score: {r2}")
mse = mean_squared_error(y_test, y_pred)
print(f"KNN Regressor Mean Squared Error: {mse}")


#####################################################################
# RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100, random_state=42, oob_score=True)
rfr.fit(X_train, y_train)  # no scaling needed
# A bare `rfr.oob_score_` expression shows nothing when run as a script,
# so print the out-of-bag score explicitly.
print(f"Random Forest Regressor OOB Score: {rfr.oob_score_}")
y_pred = rfr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Random Forest Regressor Mean Squared Error: {mse}")

# Feature importance plot
feature_importances = rfr.feature_importances_
# Use the same column selection that produced X so the bar labels match the
# importances even if 'target_column' is not the last column.
features = data.drop('target_column', axis=1).columns
plt.figure(figsize=(10, 6))
plt.barh(features, feature_importances)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importances in Random Forest Regressor Model')
plt.show()

#####################################################################
# Deep learning - basic process
# `mnist` is not part of the file-level imports, so bring it in here.
from tensorflow.keras.datasets import mnist

# Load and preprocess the data (MNIST example): flatten each 28x28 image into
# a 784-long vector, divide by 255.0, and one-hot encode the 10 labels
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(-1, 28 * 28) / 255.0
X_test = X_test.reshape(-1, 28 * 28) / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Define the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(784,)),
    Dense(64, activation='relu'),
    Dense(10, activation='softmax')
])
# Binary classification model : output activation - sigmoid
# Multi-class classification model : output activation - softmax
# Regression model : output activation - none, or relu

# Compile the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2, batch_size=32)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Visualize the training history: loss on the left, accuracy on the right
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

#####################################################################
# CNN (Convolutional Neural Networks)
# `mnist` is not part of the file-level imports, so bring it in here.
from tensorflow.keras.datasets import mnist

# Load and preprocess the data (MNIST example): add a trailing channel axis so
# each image is 28x28x1, divide by 255.0, and one-hot encode the 10 labels
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(-1, 28, 28, 1) / 255.0
X_test = X_test.reshape(-1, 28, 28, 1) / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Define the model: two Conv2D + MaxPooling2D stages, then a dense classifier
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2, batch_size=32)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Visualize the training history: loss on the left, accuracy on the right
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

#####################################################################
# XGBoost - basic process
# The file-level import only brings in XGBClassifier and plot_importance;
# the `xgb` module alias used below has to be imported explicitly.
import xgboost as xgb

# Load and preprocess the data
data = pd.read_csv('pima_indians.csv')
X = data.drop('target_column', axis=1).values
y = data['target_column'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the DMatrix objects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Hyperparameters
params = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

# Train the model.
# NOTE(review): the test split doubles as the early-stopping set here and is
# then used for the accuracy below, so that accuracy is likely optimistic;
# consider carving a separate validation split out of the training data.
bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtest, 'test')], early_stopping_rounds=10)

# Predict and evaluate: the booster outputs floats, which are rounded to
# 0/1 labels before scoring
y_pred = bst.predict(dtest)
accuracy = accuracy_score(y_test, y_pred.round())
print(f"XGBoost Accuracy: {accuracy}")

# Feature importance plot
plot_importance(bst)
plt.show()

# Configure and run GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 150]
}

grid_search = GridSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                           param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best accuracy found: {grid_search.best_score_}")

# Evaluate the best model. GridSearchCV refits the best estimator on the
# whole training set by default (refit=True), so fitting it again here would
# only repeat that work.
best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best XGBoost Accuracy: {accuracy}")
