In [68]:
%%capture
!pip install scikit-learn

In [69]:
# Import necessary libraries
import transformers
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [42]:
# Display the GPU status information.
# torch.cuda.get_device_name() raises when no CUDA device is present, so it is
# only queried when CUDA is available; this keeps the cell runnable on the CPU
# fallback that `device` allows for.
print(f'Number of GPUs: {torch.cuda.device_count()}')
if torch.cuda.is_available():
  print(f'Name of GPUs: {torch.cuda.get_device_name()}')
else:
  print('Name of GPUs: none (running on CPU)')

Number of GPUs: 1
Name of GPUs: Tesla T4


In [4]:
# Access google drive
import os
from google.colab import drive

In [5]:
# Mount Google Drive at /content/drive (without forcing a remount)
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [76]:
# Load the course reviews CSV from the mounted Drive into a DataFrame
# and preview its first rows
course_reviews_df = pd.read_csv('/content/drive/MyDrive/course_recommendation/course_reviews.csv')
course_reviews_df.head()

Unnamed: 0,course_name,course_source,keywords,course_rating,course_review
0,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",5.0,"Thank you very much for this awesome course, I..."
1,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",1.0,I didn't like that course because of:-weak exp...
2,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",5.0,Nice one.
3,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",1.0,poor course. never give any algorithm. No clue...
4,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",3.0,While I like the content of the course but I f...


In [7]:
# Show the DataFrame dimensions as (rows, columns)
course_reviews_df.shape

(55056, 5)

In [8]:
# Summarize the columns: names, non-null counts, dtypes and memory usage
course_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55056 entries, 0 to 55055
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   course_name    55056 non-null  object 
 1   course_source  55056 non-null  object 
 2   keywords       55056 non-null  object 
 3   course_rating  55056 non-null  float64
 4   course_review  55056 non-null  object 
dtypes: float64(1), object(4)
memory usage: 2.1+ MB


In [22]:
# Define the sentiment classes.
# predict_sentiment indexes this list with the position of the model's highest
# score, so the order must match the label ids used when the model was fine-tuned
# (presumably 0=negative, 1=neutral, 2=positive — TODO confirm against the training code).
sentiment_classes = ['negative', 'neutral', 'positive']

In [10]:
# Define the Sentiment Classifier class
class BertSentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  The attribute names (`pretrained_bert`, `drop`, `out`) form the keys of the
  module's state dict, so they must stay unchanged for saved weights to load.
  """

  def __init__(self, n_classes):
    """Create the 'bert-base-cased' encoder and a head with `n_classes` outputs."""
    super().__init__()
    self.pretrained_bert = AutoModel.from_pretrained('bert-base-cased')
    self.drop = nn.Dropout(p=0.3)
    hidden_size = self.pretrained_bert.config.hidden_size
    self.out = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the raw class scores for the given token ids and attention mask."""
    # With return_dict=False the encoder returns a pair; only its second
    # element (the pooled output) feeds the classification head.
    _sequence_output, pooled_output = self.pretrained_bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False,
    )
    return self.out(self.drop(pooled_output))

In [11]:
# Load the tokenizer for 'bert-base-cased', the same checkpoint name the classifier's encoder uses
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [23]:
# Load the finetuned model weights.
# map_location=device remaps the saved tensors onto the device selected for this
# session, so a checkpoint written on a GPU machine also loads on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted
# source (or pass weights_only=True on PyTorch versions that support it).
model = BertSentimentClassifier(len(sentiment_classes))
model.load_state_dict(
    torch.load(
        '/content/drive/MyDrive/course_recommendation/best_bert_model_state.bin',
        map_location=device,
    )
)
model = model.to(device)

# Set the model into evaluation mode
model.eval()

BertSentimentClassifier(
  (pretrained_bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [24]:
# Maximum sequence length in tokens: predict_sentiment pads or truncates every review to this length
MAX_LEN = 200

In [48]:
# Define three hand-written sample reviews used to try out the classifier
first_sample = 'I really love this course, it was awesome'
second_sample = 'This course is ok'
third_sample = 'This course was terrible, I really hated it'

In [38]:
# Define a function to encode text and get sentiment predictions
def predict_sentiment(review):
  """Classify a single review and return its sentiment label.

  Args:
    review: Text of the review to classify.

  Returns:
    The entry of `sentiment_classes` at the position of the model's highest score.
  """
  # Encode the text: add special tokens, then pad/truncate to exactly MAX_LEN tokens
  encoded_course_review = tokenizer.encode_plus(
      review,
      max_length = MAX_LEN,
      add_special_tokens = True,
      truncation = True,
      padding = 'max_length',
      return_attention_mask = True,
      return_tensors = 'pt',
  )

  # Move the inputs to the same device as the model
  input_ids = encoded_course_review['input_ids'].to(device)
  attention_mask = encoded_course_review['attention_mask'].to(device)

  # Inference only: without no_grad() every call records an autograd graph,
  # which wastes memory and time when the function is applied to a whole DataFrame.
  with torch.no_grad():
    output = model(input_ids, attention_mask)

  # The batch holds a single review; convert the index of its largest score
  # to a plain int before using it to index the Python list of class names.
  _, prediction = torch.max(output, dim=1)
  return sentiment_classes[prediction.item()]

In [50]:
# Run the classifier on each sample review and print "<review>: <label>"
for sample_review in (first_sample, second_sample, third_sample):
  print(f'{sample_review}: {predict_sentiment(sample_review)}')

I really love this course, it was awesome: positive
This course is ok: neutral
This course was terrible, I really hated it: negative


In [81]:
# Predict the sentiment of each of the reviews in the course review dataframe.
# predict_sentiment is called once per row (one model forward pass per review)
# and the resulting labels are stored in a new 'sentiment' column.
course_reviews_df['sentiment'] = course_reviews_df['course_review'].apply(predict_sentiment)
course_reviews_df.head()

Unnamed: 0,course_name,course_source,keywords,course_rating,course_review,sentiment
0,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",5.0,"Thank you very much for this awesome course, I...",positive
1,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",1.0,I didn't like that course because of:-weak exp...,negative
2,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",5.0,Nice one.,positive
3,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",1.0,poor course. never give any algorithm. No clue...,negative
4,advanced algorithms and complexity,coursera,"['advanced', 'algorithms', 'and', 'complexity']",3.0,While I like the content of the course but I f...,positive


In [82]:
# Remove the 'keywords' variable from the DataFrame.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# drop would otherwise raise KeyError.
course_reviews_df = course_reviews_df.drop(columns=['keywords'], errors='ignore')

In [83]:
# Preview the first rows of the DataFrame after the 'keywords' column was dropped
course_reviews_df.head()

Unnamed: 0,course_name,course_source,course_rating,course_review,sentiment
0,advanced algorithms and complexity,coursera,5.0,"Thank you very much for this awesome course, I...",positive
1,advanced algorithms and complexity,coursera,1.0,I didn't like that course because of:-weak exp...,negative
2,advanced algorithms and complexity,coursera,5.0,Nice one.,positive
3,advanced algorithms and complexity,coursera,1.0,poor course. never give any algorithm. No clue...,negative
4,advanced algorithms and complexity,coursera,3.0,While I like the content of the course but I f...,positive


In [121]:
# Save the dataframe into a csv file we can use later.
# index=False keeps the RangeIndex out of the file, so reading it back does not
# produce an extra 'Unnamed: 0' column (the input CSV has no index column either).
course_reviews_df.to_csv('/content/drive/MyDrive/course_recommendation/course_sentiments.csv', index=False)

In [106]:
# Create a search functionality
def course_search(query, top_n=10):
  """Return the reviews of the courses whose names best match a text query.

  The unique course names are vectorised with TF-IDF and ranked by cosine
  similarity to the query.

  Args:
    query: Free-text search string.
    top_n: Maximum number of distinct courses to keep (default 10).

  Returns:
    The rows of `course_reviews_df` that belong to the selected courses. The
    result is empty when no course name has a non-zero similarity to the query.

  Raises:
    ValueError: If `top_n` is smaller than 1.
  """
  if top_n < 1:
    raise ValueError('top_n must be at least 1')

  courses = pd.Series(course_reviews_df['course_name'].unique())

  # Fit once on the course names and reuse that vocabulary for the query
  vectorizer = TfidfVectorizer()
  course_vectors = vectorizer.fit_transform(courses)
  query_vector = vectorizer.transform([query])
  similarities = cosine_similarity(query_vector, course_vectors).flatten()

  # Positions of the top_n highest similarities, best match first
  top_indices = similarities.argsort()[-top_n:][::-1]
  # A similarity of zero means the name has nothing in common with the query;
  # such courses are not matches and would otherwise be picked arbitrarily.
  top_indices = top_indices[similarities[top_indices] > 0]

  # argsort yields positions, so select with .iloc rather than label-based .loc
  top_courses = courses.iloc[top_indices].tolist()
  return course_reviews_df[course_reviews_df['course_name'].isin(top_courses)]

In [116]:
# Create sentiment aggregation for recommendation
def calculate_recommendation_score(df):
  """Aggregate review sentiments into one row per course.

  Args:
    df: DataFrame with 'course_name', 'course_source' and 'sentiment' columns.

  Returns:
    DataFrame with the columns course_name, course_source (first value found
    for the course), positive_ratio (share of the course's reviews labelled
    'positive') and total_reviews, ordered by total_reviews and then
    positive_ratio, both descending.
  """
  def positive_share(sentiments):
    # Fraction of the group's reviews whose label is exactly 'positive'
    return (sentiments == 'positive').sum() / len(sentiments)

  per_course = df.groupby('course_name').agg(
      course_source=('course_source', 'first'),
      positive_ratio=('sentiment', positive_share),
      total_reviews=('sentiment', 'size'),
  )
  recommendation_df = per_course.reset_index()
  return recommendation_df.sort_values(
      by=['total_reviews', 'positive_ratio'],
      ascending=False,
  )

In [108]:
# Integrate search and recommendation
def recommend_courses(query):
  """Search for courses matching `query` and rank them by review sentiment.

  The matching reviews come from `course_search`; they are then aggregated and
  ordered by `calculate_recommendation_score`.
  """
  return calculate_recommendation_score(course_search(query))

In [118]:
# Recommend courses for an example query
recommend_courses('machine learning')

Unnamed: 0,course_name,course_source,positive_ratio,total_reviews
7,machine learning,coursera,0.940668,7062
2,deep learning,udacity,0.839523,754
3,deep reinforcement learning,udacity,0.821818,275
5,introduction to machine learning with tensorflow,udacity,0.856481,216
4,intro to machine learning with pytorch,udacity,0.871429,210
9,practical machine learning,coursera,0.640625,192
6,machine design1,coursera,0.864865,37
0,aws machine learning engineer nanodegree,udacity,0.742857,35
1,big data machine learning,coursera,0.705882,34
8,machine learning data analysis,coursera,0.541667,24
