In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the datasets
course_data = pd.read_csv('Course_Data.csv')
recommendation_data = pd.read_csv('Recommendation_EducationalPlatform_1.csv')

# Clean column names in the recommendation dataset by stripping any trailing spaces
recommendation_data.columns = recommendation_data.columns.str.strip()

# Extracting course category in Course_Data
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing to course titles
course_data['course_title_clean'] = course_data['course_title'].apply(preprocess_text)

# Define the keywords for different categories
category_keywords = {
    'Data Science': ['data science', 'data'],
    'Machine Learning' : ['machine learning', 'deep learning', 'ai', 'artificial intelligence'],
    'Programming': ['programming', 'coding', 'python', 'java', 'javascript', 'c++', 'c#', 'ruby', 'php'],
    'Business': ['business', 'management', 'entrepreneurship', 'marketing'],
    'Finance': ['finance', 'financial', 'investment', 'stock market', 'cryptocurrency', 'accounting'],
    'Education': ['education', 'teaching', 'learning', 'school', 'university'],
    'Design': ['design', 'graphic design', 'ux', 'ui', 'photoshop', 'illustrator'],
    'Self-Development': ['self-development', 'self-improvement', 'mindfulness', 'habits', 'happiness'],
    'Arts': ['arts', 'art', 'culture' 'photography', 'painting', 'music', 'dance'],
    'Health': ['health', 'fitness', 'nutrition', 'medicine', 'wellness'],
    'Mathematics': ['math', 'calculus', 'algebra', 'geometry', 'statistics'],
    'Cloud Computing': ['cloud computing', 'aws', 'azure', 'gcp'],
    # Add more categories and keywords as needed
}

# Function to match course to a category based on keywords
def categorize_course(title, keyword_dict):
    for category, keywords in keyword_dict.items():
        if any(keyword in title for keyword in keywords):
            return category
    return 'Other'  # If no category matches

# Categorize each course based on its title
course_data['Category'] = course_data['course_title_clean'].apply(lambda title: categorize_course(title, category_keywords))

# Display the categorized courses
print(course_data[['CourseID', 'course_title', 'Category']].head())

# Save the categorized courses to a new CSV file
course_data.to_csv('Course_Data_Categorized.csv', index=False)

# Merge course data with recommendation data on 'CourseID'
merged_data = pd.merge(recommendation_data, course_data, on='CourseID', how='left')

# Preprocessing: Combine relevant features into a single string (e.g., 'Category', 'course_title')
merged_data['combined_features'] = merged_data['Category'] + ' ' + merged_data['course_title']

# Create a TF-IDF vectorizer to convert the combined features into a vector space
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(merged_data['combined_features'])

# Compute cosine similarity between courses based on their TF-IDF vectors
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get course recommendations based on user-selected course's subject/category
def get_recommendations(user_id, merged_data, cosine_sim, top_n=5):
    # Get the courses that the user has interacted with
    user_courses = merged_data[merged_data['UserID'] == user_id]

    # Get the indices of these courses in the dataset
    user_course_indices = user_courses.index.tolist()

    if not user_course_indices:
        print(f"No courses found for User {user_id}")
        return pd.DataFrame()  # Return an empty DataFrame if no courses found

    # Calculate the average similarity score for each course based on user preferences
    sim_scores = cosine_sim[user_course_indices].mean(axis=0)

    # Sort the courses based on similarity scores
    sim_scores_indices = sim_scores.argsort()[::-1]

    # Remove courses the user has already interacted with
    sim_scores_indices = [idx for idx in sim_scores_indices if idx not in user_course_indices]

    # Get the top N recommended courses
    recommended_courses = merged_data.iloc[sim_scores_indices[:top_n]]

    return recommended_courses[['CourseID', 'course_title', 'course_organization', 'Category']]

# Example: Recommend courses for a user with UserID = 2
user_id = 2
recommendations = get_recommendations(user_id, merged_data, cosine_sim)

if not recommendations.empty:
    print(f"Top course recommendations for User {user_id}:")
    print(recommendations)
else:
    print(f"No recommendations available for User {user_id}.")

   CourseID                                       course_title  \
0       743  A Crash Course in Causality:  Inferring Causal...   
1       874                     A Crash Course in Data Science   
2       413                            A Law Student's Toolkit   
3       635                A Life of Happiness and Fulfillment   
4       661  ADHD: Everyday Strategies for Elementary Students   

           Category  
0      Data Science  
1      Data Science  
2             Other  
3  Self-Development  
4             Other  
Top course recommendations for User 2:
    CourseID                                       course_title  \
17       209                          Foundations of Management   
6        104                                  Business Strategy   
21       211                                 Marketing Strategy   
33       310                       Marketing in a Digital World   
24       301  Corporate Sustainability. Understanding and Se...   

                           co

In [None]:
def evaluate_recommendations(merged_data, cosine_sim, n=5):
    y_true = []
    y_pred = []

    for user_id in merged_data['UserID'].unique():
        # Get the courses that the user has actually interacted with (ground truth)
        user_courses = set(merged_data[merged_data['UserID'] == user_id]['CourseID'])

        # Get the top N recommended courses (predicted)
        recommended_courses = get_recommendations(user_id, merged_data, cosine_sim, top_n=5)

        for course in user_courses.union(recommended_courses):
            y_true.append(1 if course in user_courses else 0)
            y_pred.append(1 if course in recommended_courses else 0)

    # Calculate precision, recall, and F1-score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return precision, recall, f1

# Calculate and display precision, recall, and F1-score
precision, recall, f1 = evaluate_recommendations(merged_data, cosine_sim, n=5)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Precision: 0.00
Recall: 0.00
F1-Score: 0.00
