In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Read the course catalogue CSV into a DataFrame.
# Point `file_path` elsewhere if the file is not in the working directory.
file_path = "udemy_courses.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

Data Analysis


In [None]:

# Exploratory Data Analysis (EDA): column overview, summary statistics and a
# per-column count of missing values.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nSummary Statistics:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())

Distribution visualization

In [None]:
# Visualizing distributions: every column listed below gets its own 10x5 figure.
# Histogram columns use 30 bins with a KDE overlay; count-plot columns show one
# bar per category.
histogram_titles = {
    'price': "Price Distribution",
    'num_subscribers': "Number of Subscribers Distribution",
}
countplot_titles = {
    'level': "Course Levels Distribution",
    'subject': "Subject Distribution",
}

for column, title in histogram_titles.items():
    plt.figure(figsize=(10, 5))
    sns.histplot(df[column], bins=30, kde=True)
    plt.title(title)
    plt.show()

for column, title in countplot_titles.items():
    plt.figure(figsize=(10, 5))
    sns.countplot(x=column, data=df)
    plt.title(title)
    plt.show()

Column Drop

In [None]:
# Remove the columns that are not used as model features.
# errors='ignore' makes drop() skip any listed column that is not present, so
# no separate membership check is needed and the cell is safe to re-run.
columns_to_drop = ["course_id", "url", "published_timestamp", "course_title"]
df = df.drop(columns=columns_to_drop, errors='ignore')


Encoding

In [None]:
# Replace the categorical `level` and `subject` columns with integer codes.
# Each fitted encoder is stored in `label_encoders` under its column name so
# the codes can be mapped back to the original labels later.
label_encoders = {}
for col in ("level", "subject"):
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le

Standardizing

In [None]:
# Standardize the numeric columns (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed later in the notebook, so test-set statistics influence the
# training features. Consider fitting on the training rows only.
num_cols = ["price", "num_subscribers", "num_reviews", "num_lectures", "content_duration"]
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])


Splitting Data

In [None]:
# Hold out 20% of the rows for testing. `subject` is the prediction target and
# every other remaining column is a feature. The split is stratified on the
# target and uses a fixed seed so it is repeatable.
y = df["subject"]  # Target variable
X = df.drop(columns=["subject"])  # Feature matrix
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


Model Initialization

In [None]:
# Initialize the candidate classifiers, keyed by the display name used in the
# result tables and plots.
# The tree-based estimators are randomized (bootstrap samples, feature
# sub-sampling, tie-breaking between equally good splits). They are seeded with
# the same value as the train/test split so the reported metrics are
# reproducible from one run to the next.
models = {
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}


Train and Evaluation

In [None]:
# Fit every candidate on the training split and score it on the held-out split.
# All three result containers are keyed by model name:
#   accuracy_results -> overall accuracy
#   reports          -> classification report as a nested dict
#   conf_matrices    -> confusion matrix
accuracy_results, reports, conf_matrices = {}, {}, {}

for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy_results[name] = accuracy_score(y_test, y_pred)
    reports[name] = classification_report(y_test, y_pred, output_dict=True)
    conf_matrices[name] = confusion_matrix(y_test, y_pred)

# Show results
print("\n✅ Accuracy Results:\n", accuracy_results)

Accuracy Comparison

In [None]:
# Bar chart of test accuracy, one bar per model.
model_names = list(accuracy_results)
model_scores = list(accuracy_results.values())

plt.figure(figsize=(8, 5))
plt.bar(model_names, model_scores, color=['blue', 'green', 'red', 'purple'])
plt.title("Model Accuracy Comparison")
plt.xlabel("Models")
plt.ylabel("Accuracy")
plt.ylim(0, 1)  # Accuracy ranges from 0 to 1
plt.show()


Confusion Matrices

In [None]:
# Draw one annotated heatmap per model on a 2x2 grid of axes.
# Rows of each matrix are the true labels, columns the predicted labels.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.ravel()  # flatten the grid so panels can be addressed by position
for idx, model_name in enumerate(conf_matrices):
    ax = axes[idx]
    sns.heatmap(conf_matrices[model_name], annot=True, fmt='d', ax=ax, cmap='Blues')
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
plt.tight_layout()
plt.show()

Precision, Recall, and F1 Score

In [None]:
# One figure per metric: x is the encoded class index, y is that class's score,
# and each model contributes one line.
metrics = ['precision', 'recall', 'f1-score']
# The report dicts are looked up by the string form of each encoded class label.
class_keys = [str(i) for i in range(len(label_encoders['subject'].classes_))]

for metric in metrics:
    plt.figure(figsize=(8, 5))
    for model_name, report in reports.items():
        values = [report[key][metric] for key in class_keys]
        plt.plot(range(len(values)), values, marker='o', label=model_name)
    plt.title(f"{metric.capitalize()} Comparison Across Models")
    plt.xlabel("Class Labels")
    plt.ylabel(metric.capitalize())
    plt.legend()
    plt.show()

Recommendation Logic

In [None]:
def recommend_courses(subject, level, budget, courses=None):
    """Return up to five of the most-subscribed courses matching the filters.

    The module-level ``df`` cannot be used for this: by the time this cell runs
    the notebook has label-encoded its ``subject``/``level`` columns to
    integers, standardized ``price`` and ``num_subscribers``, and dropped
    ``course_title``. The filters are therefore applied to the raw,
    untransformed catalogue.

    Parameters
    ----------
    subject : str
        Subject label to match, case-insensitively.
    level : str
        Level label to match, case-insensitively.
    budget : int or float
        Maximum price, in the same units as the raw ``price`` column.
    courses : pandas.DataFrame, optional
        Raw catalogue with at least ``subject``, ``level``, ``price`` and
        ``num_subscribers`` columns. When omitted, the CSV at ``file_path`` is
        read again on each call.

    Returns
    -------
    pandas.DataFrame
        Matching rows sorted by ``num_subscribers`` in descending order,
        truncated to the first five. Empty when nothing matches.
    """
    if courses is None:
        courses = pd.read_csv(file_path)

    # str() keeps the comparison from raising if a non-string value is passed.
    subject_key = str(subject).lower()
    level_key = str(level).lower()

    matches = courses[
        (courses['subject'].astype(str).str.lower() == subject_key) &
        (courses['level'].astype(str).str.lower() == level_key) &
        (courses['price'] <= budget)
    ]
    return matches.sort_values(by='num_subscribers', ascending=False).head(5)


In [None]:
import ipywidgets as widgets
from IPython.display import display, Markdown

# Interactive front end for recommend_courses(): two dropdowns, a budget
# slider, a button and an output area.
#
# `df['subject']` and `df['level']` hold integer codes at this point (they were
# label-encoded earlier in the notebook), so the dropdown choices are taken
# from the fitted encoders' `classes_`, which keep the original labels.

# Dropdown for subject
subject_dropdown = widgets.Dropdown(
    options=label_encoders['subject'].classes_.tolist(),
    description='Subject:'
)

# Dropdown for level
level_dropdown = widgets.Dropdown(
    options=label_encoders['level'].classes_.tolist(),
    description='Level:'
)

# Budget slider
budget_slider = widgets.IntSlider(
    value=50,
    min=0,
    max=200,
    step=5,
    description='Budget ($):'
)

# Button to trigger recommendation
recommend_button = widgets.Button(description="Recommend")

# Output area
output = widgets.Output()

# Event handler
def on_recommend_clicked(b):
    """Render the recommendations for the current widget values.

    `b` is the clicked button passed in by ipywidgets; it is not used.
    """
    with output:
        # Replace the previous result instead of appending to it.
        output.clear_output()
        recs = recommend_courses(subject_dropdown.value, level_dropdown.value, budget_slider.value)
        if recs.empty:
            display(Markdown("**No courses found. Try adjusting your filters.**"))
        else:
            display(Markdown("### 🎓 Recommended Courses:"))
            for _, row in recs.iterrows():
                # str() guards against a missing (NaN) title, which has no .title().
                title = str(row['course_title']).title()
                display(Markdown(f"- **{title}** (${row['price']}) – {row['num_subscribers']} subscribers"))

recommend_button.on_click(on_recommend_clicked)

# Display UI
display(subject_dropdown, level_dropdown, budget_slider, recommend_button, output)
