# Content Based Filtering

## Read data

In [2]:
import pandas as pd

# If 'courses' might be a slice of another DataFrame, explicitly create a copy to work on:
courses = pd.read_csv("./CleanedDataset/final_courses.csv")

courses = courses.drop_duplicates(subset='name')

## Preprocessing

In [3]:
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

additional_stopwords = ['learn', 'course', 'students', 'introduction', 'understand', 'overview', 
    'concepts', 'designed', 'including', 'using', 'used', 'provides', 
    'students', 'aimed', 'teach', 'taught', 'include', 'includes', 'including', 
    'understanding', 'module', 'modules', 'cover', 'covers', 'covered', 
    'introduce', 'introduces', 'introduced', 'overview', 'knowledge', 'skills', 
    'learning', 'lecture', 'lectures', 'topic', 'topics', 'area', 'areas', 
    'focus', 'focuses', 'focused', 'approach', 'approaches', 'principles', 
    'principle', 'content', 'contents', 'key', 'features', 'feature', 
    'element', 'elements', 'basis', 'basic', 'basics', 'foundation', 'foundations',
    'beginner', 'intermediate', 'advanced', 'level', 'levels', 'intended',
    'objective', 'objectives', 'goal', 'goals', 'outcome', 'outcomes', 'aim', 'aims',
    'gain', 'gains', 'apply', 'effective', 'efficient', 'specialize', 'specialization',
    'specialise', 'certificate', 'certification', 'you', 'will', 'need',
    'want', 'have', 'able', 'well', 'one', 'two', 'three', 'first', 'second',
    'third', 'success', 'successful', 'sucessfully', 'like', 'many' , 'much', 'also', 'use', 'uses', 'used',
    'work', 'works', 'working', 'workshop', 'workshops', 'provide', 'provides', 'provided']

all_stop_words = stopwords.words('english') + additional_stopwords

[nltk_data] Downloading package punkt to C:\Users\Quang
[nltk_data]     Anh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Quang
[nltk_data]     Anh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Quang
[nltk_data]     Anh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Initialize the PorterStemmer
stemmer = PorterStemmer()

# Preprocessing function with N-gram consideration
def preprocess_text(text, lemmatizer=WordNetLemmatizer(), stop_words=all_stop_words):
    text = text.lower()  # Lowercasing
    # text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenization
    alphabetic_tokens = [token for token in tokens if re.match("^[a-zA-Z]+$", token)] # check for alphabetic tokens
    filtered_tokens = [word for word in alphabetic_tokens if word not in stop_words]  # Remove stopwords
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]  # Lemmatization
    return ' '.join(lemmatized_tokens)

In [5]:
# Now, safely apply the preprocessing and assign to the 'processed_description' column
courses['processed_description'] = courses['description'].apply(preprocess_text)
courses['processed_skills'] = courses['skills'].apply(preprocess_text)

### BERT Embeddings

In [6]:
from transformers import BertModel, BertTokenizer
import torch

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def get_bert_embeddings(texts):
    # Tokenize the text
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    
    # Move encoded input to the correct device
    encoded_input = encoded_input.to(device)
    
    # Generate embeddings
    with torch.no_grad():
        output = model(**encoded_input)
    
    # Retrieve the embeddings for the [CLS] token (first token) as sentence embeddings
    embeddings = output.last_hidden_state[:, 0, :].detach().cpu().numpy()
    
    return embeddings



  from .autonotebook import tqdm as notebook_tqdm


In [7]:
description_embeddings = get_bert_embeddings(courses['processed_description'].tolist())
skills_embeddings = get_bert_embeddings(courses['processed_skills'].tolist())


In [8]:
courses['description_embeddings'] = description_embeddings.tolist()
courses['skills_embeddings'] = skills_embeddings.tolist()

In [9]:
from sklearn.decomposition import PCA
import numpy as np

# Assuming courses['description_embeddings'] is a list of embeddings
embeddings_array = np.array(courses['description_embeddings'].tolist())

# Initialize PCA, here we are reducing to 128 components
pca = PCA(n_components=128)
reduced_embeddings = pca.fit_transform(embeddings_array)

# Store the reduced embeddings back into the DataFrame
courses['reduced_embeddings'] = list(reduced_embeddings)


In [10]:
from sklearn.decomposition import PCA
import numpy as np

# Assuming courses['description_embeddings'] is a list of embeddings
embeddings_array = np.array(courses['skills_embeddings'].tolist())

# Initialize PCA, here we are reducing to 128 components
pca = PCA(n_components=128)
reduced_embeddings = pca.fit_transform(embeddings_array)

# Store the reduced embeddings back into the DataFrame
courses['skills_embeddings'] = list(reduced_embeddings)


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming `courses_embeddings` and `skills_embeddings` are NumPy arrays
# If they are lists, first convert them: np.array(list_of_arrays)
description_similarity = cosine_similarity(description_embeddings)
skills_similarity = cosine_similarity(skills_embeddings)

# Combine the matrices, here we're using a simple average
combined_similarity = description_similarity * 0.3 + skills_similarity * 0.7

In [12]:
courses.reset_index(drop=True, inplace=True)

In [20]:
import random 

def recommend_course(course_index, combined_similarity, top_n=5):
    # Scores are the similarity of the selected course to all others
    scores = combined_similarity[course_index]

    # Sort the scores in descending order while keeping track of the index
    sorted_indices = scores.argsort()[::-1]
    
    # Exclude the course itself (highest score), and get the top_n recommendations
    top_course_indices = sorted_indices[1:top_n+1]

    # The scores for the top recommended courses
    top_scores = scores[top_course_indices]
    
    # You can then use `top_course_indices` to fetch the course names or IDs if you have them
    # For example, assuming you have a DataFrame `courses` with a course ID or name:
    # recommended_courses = courses.iloc[top_course_indices]
    
    return top_course_indices, top_scores

# Example: Recommend courses similar to the course at index 0
course_index = random.randint(0, len(courses)-1)
top_n = 5
recommended_indices, recommended_scores = recommend_course(course_index, combined_similarity, top_n)

print("Recommended Course Indices:", recommended_indices)
print("Scores:", recommended_scores)

print(f"Rcommendations for {course_index}: {courses['name'][course_index]}")

print("\n")

for i in recommended_indices:
    print(courses['name'][i])

Recommended Course Indices: [ 70  71  57  32 214]
Scores: [0.92423487 0.9199227  0.91198486 0.9068167  0.8935572 ]
Rcommendations for 122: natural language processing with sequence models


natural language processing in tensorflow
natural language processing with probabilistic models
convolutional neural networks
natural language processing with classification and vector spaces
introduction to machine learning
