In [None]:
# Model Training

from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the preprocessed ranking data (path is relative to the working directory).
df = pd.read_csv('../data/cleaned_university_ranking_2020.csv')

# Feature engineering: collapse 'Town' into a coarse two-way 'Region' flag.
region_one_towns = ('Town1', 'Town2')
df['Region'] = df['Town'].apply(
    lambda town: 'Region 1' if town in region_one_towns else 'Region 2'
)

# One-hot encode the categorical predictors; drop_first drops the first level
# of each column so the dummy columns are not perfectly collinear.
# NOTE(review): if every row has a distinct 'University', those dummy columns
# cannot generalise to held-out rows -- confirm they belong in the feature set.
X = pd.get_dummies(df[['University', 'Town', 'Region']], drop_first=True)

# Target: bucket the numeric rank into three tiers. The last bin is open-ended
# (+inf) instead of df['Rank'].max(): using the observed maximum as an edge
# makes pd.cut raise ValueError whenever that maximum is <= 50, because the
# edges are then no longer strictly increasing. Ranks above 50 are labelled
# exactly as before.
# NOTE(review): ranks <= 0 or missing fall outside every bin and become NaN
# labels -- presumably the cleaned CSV contains none; verify.
y = pd.cut(
    df['Rank'],
    bins=[0, 10, 50, float('inf')],
    labels=['Top 10', 'Top 50', 'Lainnya'],  # 'Lainnya' is Indonesian for "others"
)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the classifier. The forest is seeded as well: with a seeded split but
# an unseeded model, the accuracy and the saved artefact changed on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy}")

# Persist the fitted model. Create the target directory first so a missing
# '../model/' folder does not discard a finished training run.
model_path = '../model/university_ranking_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)
