# Data Exploration

In [54]:
import pandas as pd

# Load 'film.csv' (path relative to the working directory) into the working DataFrame `df`.
df = pd.read_csv('film.csv')

In [55]:
# Print the frame's summary: columns, non-null counts, dtypes and memory usage.
df.info()

In [56]:
# Count missing values per column.
df.isnull().sum()

In [57]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [58]:
# Distinct values of 'title' and of 'year', displayed together as a tuple of arrays.
df['title'].unique(), df['year'].unique()

In [59]:
# Distinct values of 'genre'.
df['genre'].unique()


In [60]:
# Re-read 'film.csv' from disk, replacing the in-memory frame.
# NOTE(review): the exploration cells above only read from `df`, so this
# reload yields the same data as the initial load unless the file changed on disk.
df = pd.read_csv('film.csv')

# Data Preparation

In [61]:
# Rating categorisation
def categorize_rating(rating):
    """Map a numeric rating to a coarse class label.

    Args:
        rating: Numeric rating value.

    Returns:
        'Low' when ``rating < 2.5``, 'Medium' when ``2.5 <= rating < 3.5``,
        and 'High' otherwise. A NaN rating fails both comparisons and
        therefore also yields 'High'.
    """
    if rating < 2.5:
        return 'Low'
    # Everything from here on is at least 2.5 (or not comparable, e.g. NaN).
    return 'Medium' if rating < 3.5 else 'High'

# Derive each row's rating class from its 'rating' value.
df['rating_class'] = df['rating'].map(categorize_rating)

In [62]:
from sklearn.preprocessing import LabelEncoder

# Encode 'rating_class' as ordinal integers that follow the rating order:
# Low=0, Medium=1, High=2. An alphabetical label encoding would instead give
# High=0, Low=1, Medium=2, and the regression metrics computed on this column
# further down would then measure distances between arbitrary codes.
RATING_CLASS_CODES = {'Low': 0, 'Medium': 1, 'High': 2}

# Only map while the column still holds the text labels, so re-running this
# cell leaves already-encoded values untouched instead of turning them into NaN.
if not pd.api.types.is_numeric_dtype(df['rating_class']):
    rating_class_codes = df['rating_class'].map(RATING_CLASS_CODES)
    # Fail loudly on a label outside the mapping rather than storing NaN codes.
    if rating_class_codes.isna().any():
        unexpected = sorted(
            df.loc[rating_class_codes.isna(), 'rating_class'].astype(str).unique()
        )
        raise ValueError(f"Unexpected rating_class labels: {unexpected}")
    df['rating_class'] = rating_class_codes.astype(int)

# Display the first few rows to verify the changes
df[['rating_class']].head()

In [63]:
# Save the modified DataFrame (without the index column).
# NOTE(review): this writes to the same 'film.csv' that the notebook loads at
# the start, so the on-disk input is overwritten with the derived
# 'rating_class' column included — consider a separate output file.
df.to_csv('film.csv', index=False)

In [64]:
# Re-read 'film.csv' from disk, replacing the in-memory frame; column dtypes
# are re-inferred from the CSV text.
df = pd.read_csv('film.csv')

# Data Modelling & Evaluation

In [65]:
#train test split

from sklearn.model_selection import train_test_split

# Feature matrix and target vector for the supervised models.
feature_columns = ['userId', 'movieId', 'year', 'genre']
X = df[feature_columns]
y = df['rating_class']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [66]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import numpy as np


# Build the user-item matrix: one row per user, one column per movie,
# NaN where a user has no rating for a movie.
cf_data = df.pivot_table(values='rating', index='userId', columns='movieId')

# Per-user normalisation: centre each user's ratings on that user's own mean.
user_means = cf_data.mean(axis=1)
cf_data_centered = cf_data.subtract(user_means, axis=0)

# SVD cannot take NaN; after centring, 0 stands for "at the user's mean".
cf_data_filled = cf_data_centered.fillna(0)

# Rank-50 truncated SVD, then rebuild the full matrix from the two factors.
svd = TruncatedSVD(n_components=50, random_state=42)
cf_matrix_reduced = svd.fit_transform(cf_data_filled)
cf_matrix_pred = cf_matrix_reduced @ svd.components_

# Add each user's mean back to return to the original rating scale.
cf_matrix_pred = cf_matrix_pred + user_means.to_numpy().reshape(-1, 1)

# RMSE over the observed ratings only. These are the same entries the SVD was
# fitted on, so the figure is an in-sample reconstruction error.
true_ratings = cf_data.to_numpy()
mask = cf_data.notna().to_numpy()
rmse = np.sqrt(mean_squared_error(true_ratings[mask], cf_matrix_pred[mask]))

print(f'RMSE (SVD CF dengan normalisasi): {rmse:.4f}')

In [67]:
from sklearn.tree import DecisionTreeRegressor

# Fit a depth-limited decision tree regressor (fixed seed) on the training split.
regressor = DecisionTreeRegressor(max_depth=10, random_state=42).fit(X_train, y_train)

# Predict the held-out split and report the RMSE.
y_pred = regressor.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

In [68]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Error metrics for `y_pred` against the held-out targets `y_test`.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred)

# Report the four scores, one per line, two decimals each.
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)


In [69]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

# Build and train a random forest regressor (default hyper-parameters, fixed seed).
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out split.
y_pred_rf = rf_model.predict(X_test)

# Evaluate: report the RMSE on the held-out split.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("RMSE (Random Forest):", rmse_rf)

In [70]:
# Error metrics for the random forest predictions `y_pred_rf` against `y_test`.
mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred_rf)

# Report the four scores, one per line, two decimals each.
print("\n".join([
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R² Score: {r2:.2f}",
]))

In [71]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Cluster on the numerical columns only (encoded categorical columns are left out).
# NOTE(review): this rebinds `X`, which previously held the supervised-model features.
numerical_features = ['rating', 'year']
X = df[numerical_features]

# Standardise the selected columns before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# K-Means with an arbitrary choice of 3 clusters and a fixed seed;
# store each row's cluster label in a new 'cluster' column.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_scaled)
df['cluster'] = kmeans.labels_

# Display the first few rows to verify the clustering results
df[['rating', 'year', 'cluster']].head()


In [72]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style.
sns.set(style="whitegrid")

# 2D scatter of rating against year, coloured by K-Means cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='year', y='rating', hue='cluster', palette='Set1', s=50, ax=ax)

ax.set_title('Hasil Clustering K-Means Berdasarkan Rating dan Tahun Rilis')
ax.set_xlabel('Tahun Rilis')
ax.set_ylabel('Rating')
ax.legend(title='Cluster')
fig.tight_layout()
plt.show()

In [73]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit Gaussian Naive Bayes on the training split.
model = GaussianNB().fit(X_train, y_train)

# Predict the held-out split.
# NOTE(review): this rebinds `y_pred`, which previously held the decision tree predictions.
y_pred = model.predict(X_test)

# Classification scores for the held-out predictions.
nb_accuracy = accuracy_score(y_test, y_pred)
nb_report = classification_report(y_test, y_pred)
nb_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", nb_accuracy)
print("Classification Report:\n", nb_report)
print("Confusion Matrix:\n", nb_confusion)

In [74]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current `y_pred` against the held-out targets,
# printed under its own heading line.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Deployment

In [75]:
import joblib

# Persist `model` (the GaussianNB classifier fitted in the previous section)
# to 'model.pkl' in the working directory.
joblib.dump(model, 'model.pkl')