In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Professor names: used below to filter df['professor'] and to draw one panel
# per professor in a 2x3 subplot grid, so those plots assume at most 6 entries.
professor_names = ['김유섭', '김은주', '이정근', '양은샘', '신미영', '김선정']

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU plus current/all CUDA
    devices), exports PYTHONHASHSEED, and puts cuDNN into deterministic mode
    with benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, the current CUDA device, then every CUDA device (multi-GPU).
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # cuDNN: deterministic kernels, benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed once, up front, with the notebook-wide seed value.
SEED = 42
set_seed(SEED)

In [None]:
# Install the Noto CJK font package and rebuild the system font cache.
# NOTE(review): these are shell commands that need apt and sudo
# (presumably a Colab/Debian-style environment — confirm before running elsewhere).
!sudo apt-get -q install -y fonts-noto-cjk
!sudo fc-cache -fv
# Delete matplotlib's cache directory — presumably so the newly installed font is picked up.
!rm ~/.cache/matplotlib -rf

import matplotlib.font_manager as fm

# Register the font file explicitly, then make it the default family for both
# matplotlib and seaborn (the plots below use the Korean professor names as labels).
fm.fontManager.addfont('/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc')
plt.rcParams['font.family'] = 'Noto Sans CJK JP'
sns.set(font='Noto Sans CJK JP', font_scale=.8)

In [None]:
# Load the merged review table; the 'utf-8-sig' codec also drops a leading BOM if present.
reviews_csv = 'merged_reviews_by_professor.csv'
df = pd.read_csv(reviews_csv, encoding='utf-8-sig')

# Preview the first rows as the cell output.
df.head()

### Generate embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Multilingual sentence-embedding model used to vectorise the review text.
# NOTE: the name `model` is rebound to the neural network in the training cell
# further down, so this cell has to run before that one.
model = SentenceTransformer('distiluse-base-multilingual-cased')

# Encode all reviews in a single batched call rather than one `encode` call per
# row: the model batches the texts internally, which is much faster on large frames.
review_texts = df['review'].tolist()
review_embeddings = model.encode(review_texts)

# Store one 1-D embedding vector per row (positional, same order as `df`).
df['review_embedding'] = list(review_embeddings)

# EDA

### Display basic information


In [None]:
# Print a summary of the DataFrame (includes the 'review_embedding' column added above).
df.info()

### Analyze review length

In [None]:
# Character count of each review, stored as a column so later cells can reuse it.
df['review_length'] = df['review'].str.len()

# Histogram of review lengths over all professors, with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['review_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Review Lengths')
ax.set_xlabel('Review Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot of review length per professor. Relies on the 'review_length'
# column created in the previous cell.
# An explicit figure and plt.show() keep this cell consistent with the other
# plotting cells and stop the notebook from echoing the Text(...) repr of the
# last expression.
plt.figure(figsize=(10, 6))
sns.boxplot(x='professor', y='review_length', data=df)
plt.title('Review Lengths by Professor')
plt.xlabel('Professor')
plt.ylabel('Review Length')
plt.show()

### Analyze word frequency

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Top-10 most frequent tokens per professor, one panel each in a 2x3 grid.
plt.figure(figsize=(12, 6))

for i, professor in enumerate(professor_names):
    professor_df = df[df['professor'] == professor]
    # str.strip() returns a new string; the result must be assigned to have any effect.
    all_reviews = ' '.join(professor_df['review']).strip()

    plt.subplot(2, 3, i + 1)

    # CountVectorizer raises ValueError when it has no text to build a vocabulary
    # from, so a professor without review text gets an empty (but titled) panel
    # instead of aborting the whole cell.
    if all_reviews:
        vectorizer = CountVectorizer(max_features=1000)
        term_counts = vectorizer.fit_transform([all_reviews])

        # Only one document is vectorised, so row 0 already holds the total counts.
        frequency = term_counts.toarray()[0]
        words = vectorizer.get_feature_names_out()

        word_frequencies = pd.DataFrame({'word': words, 'frequency': frequency})
        word_frequencies = word_frequencies.sort_values(by='frequency', ascending=False)

        sns.barplot(x='frequency', y='word', data=word_frequencies.head(10))

    # Set after plotting: seaborn fills in axis labels from the column names.
    plt.title(professor)
    plt.xlabel('Frequency')
    plt.ylabel('Word')

plt.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud

# One word cloud per professor, laid out in a 2x3 grid.
plt.figure(figsize=(12, 6))

for i, professor in enumerate(professor_names):
    professor_df = df[df['professor'] == professor]
    # str.strip() returns a new string; the result must be assigned to have any effect.
    all_reviews = ' '.join(professor_df['review']).strip()

    plt.subplot(2, 3, i + 1)

    # WordCloud.generate raises ValueError when there are no words to draw, so a
    # professor without review text gets a blank titled panel instead of a crash.
    if all_reviews:
        wordcloud = WordCloud(
            width=800,
            height=500,
            random_state=21,
            max_font_size=110,
            # Same CJK font file that is registered with matplotlib in the font-setup cell.
            font_path='/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
            background_color='white',
            colormap='brg'
        ).generate(all_reviews)

        plt.imshow(wordcloud, interpolation='bilinear')

    plt.axis('off')
    plt.title(professor)

plt.tight_layout()
plt.show()

### Visualize embeddings

In [None]:
import umap
from sklearn.neighbors import LocalOutlierFactor

# Project the review embeddings down to two dimensions with UMAP.
umap_model = umap.UMAP(n_components=2, random_state=42, n_neighbors=5)
embedding = umap_model.fit_transform(df['review_embedding'].tolist())

# Run Local Outlier Factor on the 2-D projection (not on the original embeddings).
# The labels are used below as 1 for inliers and -1 for outliers.
lof = LocalOutlierFactor(n_neighbors=50, contamination=0.02)
outlier_labels = lof.fit_predict(embedding)

# One row per review: its 2-D coordinates plus the LOF label.
umap_df = pd.DataFrame(
    embedding,
    columns=['umap_dim_1', 'umap_dim_2'],
).assign(outlier=outlier_labels)

# Inliers are blue circles, outliers red crosses.
scatter_style = dict(
    hue='outlier',
    style='outlier',
    palette={1: 'blue', -1: 'red'},
    markers={1: 'o', -1: 'X'},
    s=40,
    alpha=0.7,
)

plt.figure(figsize=(8, 8))
sns.scatterplot(data=umap_df, x='umap_dim_1', y='umap_dim_2', **scatter_style)

plt.title('UMAP Embedding with LOF Outlier Detection')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)

plt.show()

In [None]:
# Print the text of every review that LOF labelled -1, each followed by a separator line.
outlier_mask = np.array(outlier_labels) == -1
for review_text in df.loc[outlier_mask, 'review']:
    print(review_text)
    print('---------')

# Modeling

### Prepare data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Feature matrix built from the per-review embedding vectors; target is the professor name.
embedding_rows = df['review_embedding'].tolist()
X = np.array(embedding_rows)
y = df['professor']

# Map professor names to integer class ids; `label_encoder.classes_` is reused
# later to turn ids back into names for the reports and plots.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed.
# NOTE(review): the split is not stratified by professor — consider
# `stratify=y_encoded` if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

### Train SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Fit an RBF-kernel SVM on the training embeddings (fit returns the estimator itself).
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report overall accuracy.
# `y_pred` is overwritten by each subsequent model cell.
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Test Accuracy: {accuracy:.4f}')

# Per-class metrics, labelled with the professor names.
svm_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(svm_report)

### Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline Random Forest with 100 trees (fit returns the estimator itself).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report overall accuracy.
# `y_pred` is overwritten by each subsequent model cell.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Test Accuracy: {accuracy:.4f}')

# Per-class metrics, labelled with the professor names.
rf_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(rf_report)

### Bayesian Optimization

In [None]:
!pip install scikit-optimize -q

In [None]:
from skopt import BayesSearchCV
from skopt.space import Integer, Real

# Hyper-parameter ranges explored for the Random Forest.
param_space = dict(
    n_estimators=Integer(100, 200),
    max_depth=Integer(10, 20),
    min_samples_split=Integer(2, 10),
    min_samples_leaf=Integer(1, 5),
    criterion=['gini', 'entropy'],
)

# Estimator whose hyper-parameters are being tuned.
rf = RandomForestClassifier(random_state=42)

search_settings = {
    'n_iter': 50,        # number of optimization iterations
    'cv': 5,             # number of cross-validation folds
    'random_state': 42,
    'n_jobs': -1,
}
bayes_search = BayesSearchCV(estimator=rf, search_spaces=param_space, **search_settings)

# Cross-validation is done inside the training split only.
bayes_search.fit(X_train, y_train)
print(f'Best parameters found: {bayes_search.best_params_}')

# Evaluate the best estimator found by the search on the held-out split.
# `y_pred` is overwritten by each subsequent model cell.
best_rf_model = bayes_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Bayes Optimized Random Forest Test Accuracy: {accuracy:.4f}')

# Per-class metrics, labelled with the professor names.
bayes_rf_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(bayes_rf_report)

### Train Neural Network

In [None]:
class ReviewDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        features: Indexable, sized collection of model inputs.
        labels: Indexable collection of targets, indexed the same way as `features`.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        """Return the number of samples, taken from the length of `features`."""
        sample_count = len(self.features)
        return sample_count

    def __getitem__(self, idx):
        """Return the `(features[idx], labels[idx])` pair."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

# Define the network
class MyNN(nn.Module):
    """Three-layer fully connected classifier: input -> 128 -> 64 -> num_classes.

    ReLU is applied after the first two linear layers; the last layer's output is
    returned without any activation.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Attribute names double as the state_dict keys written to and read from the checkpoint.
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output of the final linear layer for the batch `x`."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
# Convert the NumPy splits to tensors: float32 inputs, int64 class ids for CrossEntropyLoss.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = ReviewDataset(X_train_tensor, y_train_tensor)
test_dataset = ReviewDataset(X_test_tensor, y_test_tensor)

# Hold out part of the TRAINING data for validation. The best checkpoint is chosen
# by validation accuracy below; choosing it on the test split would leak test
# information into model selection and inflate the test accuracy reported later.
VAL_FRACTION = 0.2
val_size = max(1, int(len(train_dataset) * VAL_FRACTION))
fit_subset, val_subset = torch.utils.data.random_split(
    train_dataset,
    [len(train_dataset) - val_size, val_size],
    # Dedicated generator seeded with the notebook-wide SEED, so the split is reproducible.
    generator=torch.Generator().manual_seed(SEED),
)

train_loader = DataLoader(fit_subset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=16, shuffle=False)
# Kept for the final evaluation only; never used for model selection.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Instantiate
# NOTE: this rebinds `model`, which referred to the sentence-embedding model earlier in the notebook.
input_size = X_train.shape[1]
num_classes = len(label_encoder.classes_)
model = MyNN(input_size, num_classes)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train
num_epochs = 50
# Start below any achievable accuracy so that this run always writes a checkpoint.
best_val_accuracy = -1.0
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(1, num_epochs + 1):

    # Training loop
    model.train()
    running_loss = 0.0
    seen = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # `criterion` returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        running_loss += loss.item() * labels.size(0)
        seen += labels.size(0)

    # Validation loop (held-out part of the training data, no gradient tracking)
    model.eval()
    correct = 0
    total = 0
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Per-sample averages for the epoch.
    train_loss = running_loss / seen
    val_loss = val_loss / total
    val_accuracy = correct / total

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    if epoch % 5 == 0:
        print(
            f'Epoch {epoch}/{num_epochs}, '
            f'Training Loss: {train_loss:.4f}, '
            f'Validation Loss: {val_loss:.4f}, '
            f'Validation Accuracy: {val_accuracy:.4f}'
        )

    # Save the best model yet (selected on validation accuracy only)
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'nn_model.pth')
        print(f'New best: {best_val_accuracy:.4f}')

In [None]:
# The training loop numbers epochs from 1, so plot against 1..N rather than the
# 0-based list indices matplotlib would use by default.
epochs = range(1, len(train_losses) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Plot training and validation loss
loss_ax.plot(epochs, train_losses, label='Training Loss')
loss_ax.plot(epochs, val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.legend()
loss_ax.grid(True)

# Plot validation accuracy
acc_ax.plot(epochs, val_accuracies, label='Validation Accuracy', color='green')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Validation Accuracy')
acc_ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Load the model state: rebuild the architecture, then restore the weights that
# the training cell saved to 'nn_model.pth'.
loaded_model = MyNN(input_size, num_classes)
loaded_model.load_state_dict(torch.load('nn_model.pth'))
loaded_model.eval()

# Predict. This is inference only, so autograd is disabled and no computation
# graph is built for the forward pass.
with torch.no_grad():
    y_pred_tensor = loaded_model(X_test_tensor)

# Class id with the highest score per sample, as a NumPy array for scikit-learn.
# `y_pred` overwrites the predictions of the earlier model cells.
y_pred = y_pred_tensor.argmax(dim=1).numpy()

# Evaluate
accuracy_nn = accuracy_score(y_test, y_pred)
print(f'NN Test Accuracy: {accuracy_nn:.4f}')

# Classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

### Visualize result

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the test embeddings; perplexity is capped at
# len(X_test) - 1 so it stays below the number of samples.
tsne_test = TSNE(n_components=2, random_state=42, perplexity=min(30, len(X_test) - 1))
tsne_test_results = tsne_test.fit_transform(X_test)

# `y_pred` holds the predictions of whichever model cell assigned it last
# (the neural network when the notebook is run top to bottom).
tsne_test_df = pd.DataFrame(
    tsne_test_results,
    columns=['tsne_dim_1', 'tsne_dim_2'],
).assign(
    true_labels=label_encoder.inverse_transform(y_test),
    predicted_labels=label_encoder.inverse_transform(y_pred),
    is_correct=y_test == y_pred,
)

# Colour = predicted professor; marker = whether the prediction was correct.
point_style = dict(
    hue='predicted_labels',
    style='is_correct',
    markers={True: 'o', False: 'X'},
    s=100,
    alpha=0.7,
)

plt.figure(figsize=(10, 8))
sns.scatterplot(data=tsne_test_df, x='tsne_dim_1', y='tsne_dim_2', **point_style)

plt.title('t-SNE of Test Set Embeddings')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
# Put the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

plt.show()