In [20]:
import os
import pandas as pd
# import tensorflow_datasets as tfds
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [2]:
# Mount Google Drive inside the Colab VM at /content/drive so that files stored
# on Drive can be accessed through the local filesystem.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Change into the dataset folder on the mounted Drive.
# The path is absolute so the cell can be re-run safely: the previous relative
# path only resolved while the working directory was still /content and raised
# FileNotFoundError once this cell had already been executed.
os.chdir('/content/drive/MyDrive/CSCI5502_dataset')
# Show the working directory to confirm the change.
os.getcwd()

/content/drive/MyDrive/CSCI5502_dataset


In [164]:
import numpy as np
import pandas as pd

class ItemBasedCollaborativeFiltering:
    """Item-based collaborative filtering on a course-by-student score matrix.

    Intended call order: ``preprocess`` -> ``create_item_user_matrix`` ->
    ``fit`` -> ``predict_score`` / ``predict``.  Throughout the class the
    matrix has one row per course (``CourseName``) and one column per student
    (``StudedntId`` -- spelled exactly as in the dataset).
    """

    def __init__(self, k_neighbors=5):
        """
        Parameters:
        k_neighbors (int): Maximum number of most-similar courses used for a single prediction.
        """
        self.k_neighbors = k_neighbors
        # Course-by-course similarity DataFrame; populated by fit().
        self.item_similarity = None
        # Reserved for per-course mean-centring; nothing in this class sets it yet.
        self.matrix_mean = None

    def _require_fitted(self):
        """Raise a clear error when a similarity lookup is attempted before fit()."""
        if self.item_similarity is None:
            raise RuntimeError("Model is not fitted; call fit() before requesting predictions.")

    def preprocess(self, scores_df, courses_df, min_scores=100):
        """
        Clean and join the raw tables, keeping only courses with enough scores.

        Neither input frame is modified, so the method can be called repeatedly
        on the same frames (the earlier in-place version raised KeyError on a
        second call because 'Prerequisite' had already been removed).

        Parameters:
        scores_df (pd.DataFrame): Score records; must contain 'CourseId' and 'Score'.
        courses_df (pd.DataFrame): Course metadata; must contain 'CourseId'. The
            'Prerequisite' and 'CourseName' columns are dropped when present.
        min_scores (int): A course is kept only if it has strictly more than this
            many score records after the join.

        Returns:
        pd.DataFrame: Inner join of the cleaned tables on 'CourseId', restricted to
            the retained courses.
        """
        # Work on copies: dropna()/drop() without inplace return new frames.
        scores_clean = scores_df.dropna(axis=0)
        # Columns are removed before dropna so that missing values in them do not
        # discard otherwise complete course rows.
        courses_clean = (
            courses_df
            .drop(columns=['Prerequisite', 'CourseName'], errors='ignore')
            .dropna(axis=0)
        )

        merged = pd.merge(scores_clean, courses_clean, on='CourseId', how='inner')

        # Number of score records per course.
        score_counts = (
            merged.groupby('CourseId')
            .agg(number_of_scores=('Score', 'count'))
            .reset_index()
        )
        popular_courses = score_counts[score_counts['number_of_scores'] > min_scores]

        return pd.merge(merged, popular_courses[['CourseId']], on='CourseId', how='inner')

    def create_item_user_matrix(self, preprocessed_df):
        """
        Pivot the long score table into a course-by-student matrix.

        Parameters:
        preprocessed_df (pd.DataFrame): Must contain 'CourseName', 'StudedntId' and 'Score'.

        Returns:
        pd.DataFrame: Rows are course names, columns are student ids, values are
            scores. Cells without a score are NaN; several scores for the same
            (course, student) pair are averaged (pivot_table's default aggregation).
            No normalisation is applied.
        """
        return preprocessed_df.pivot_table(index='CourseName', columns='StudedntId', values='Score')

    def fit(self, item_user_matrix):
        """
        Fit the model by computing the course-to-course similarity matrix.

        Parameters:
        item_user_matrix (pd.DataFrame): Matrix where rows are items (courses),
            columns are users (students) and values are scores.

        Returns:
        None
        """
        self.item_similarity = self.compute_item_similarity(item_user_matrix)

    def predict(self, item_user_matrix, missing_only=False):
        """
        Fill cells of the item-user matrix with neighbourhood-based predictions.

        Parameters:
        item_user_matrix (pd.DataFrame): Matrix where rows are items (courses),
            columns are users (students) and values are scores.
        missing_only (bool): With the default ``False`` every cell that already
            holds a score is replaced by its prediction and missing cells stay
            NaN (the behaviour this method has always had; useful for comparing
            predictions with known scores). With ``True`` only missing cells are
            filled and observed scores are left untouched.

        Returns:
        pd.DataFrame: A copy of ``item_user_matrix`` with the selected cells replaced.
        """
        self._require_fitted()
        fill_missing = bool(missing_only)
        predicted_ratings = item_user_matrix.copy()

        # One predict_score() call per selected cell; this is a plain Python loop.
        for course, ratings in item_user_matrix.iterrows():
            for student_index, rating in ratings.items():
                if bool(pd.isnull(rating)) != fill_missing:
                    continue
                predicted_ratings.at[course, student_index] = self.predict_score(
                    item_user_matrix, student_index, course
                )

        return predicted_ratings

    def compute_item_similarity(self, user_item_matrix):
        """
        Compute pairwise cosine similarity between the rows of the matrix.

        Parameters:
        user_item_matrix (pd.DataFrame): Matrix whose rows are the items to compare.
            Missing values are replaced by 0 before the similarity is computed.

        Returns:
        pd.DataFrame: Square matrix indexed and labelled by the input's row index.
        """
        row_labels = user_item_matrix.index
        similarity_values = cosine_similarity(user_item_matrix.fillna(0))
        return pd.DataFrame(similarity_values, index=row_labels, columns=row_labels)

    def predict_score(self, item_user_matrix, student_id, course_name):
        """
        Predict one student's score for one course.

        The prediction is the similarity-weighted average of the student's scores
        in the (at most) ``k_neighbors`` courses most similar to ``course_name``,
        excluding ``course_name`` itself.

        Parameters:
        item_user_matrix (pd.DataFrame): Course-by-student score matrix.
        student_id: Column label of the student in ``item_user_matrix``.
        course_name: Label of the target course in the fitted similarity matrix.

        Returns:
        float: Predicted score rounded to 6 decimals, or 0.0 when the student has
            no usable neighbouring course (no other scores, or all similarities
            sum to zero).

        Raises:
        RuntimeError: If fit() has not been called.
        """
        self._require_fitted()

        # Scores of the courses the student has taken, highest first.
        student_scores = item_user_matrix[student_id].dropna().sort_values(ascending=False)

        # Similarity of the target course to every other course (itself excluded).
        similarities = self.item_similarity[course_name].drop(labels=[course_name], errors='ignore')

        # Keep only courses present on both sides, i.e. an inner join on the course label.
        rated = student_scores[student_scores.index.isin(similarities.index)]
        neighbors = pd.DataFrame({
            'score': rated,
            'similarity_score': similarities.loc[rated.index],
        })
        neighbors = neighbors.sort_values('similarity_score', ascending=False).iloc[:self.k_neighbors]

        # np.average raises when the weights sum to zero, so guard that case.
        if neighbors['similarity_score'].sum() != 0:
            predicted_rating = round(np.average(neighbors['score'],
                                                weights=neighbors['similarity_score']), 6)
        else:
            predicted_rating = 0.0

        return predicted_rating

    def get_top_similar_items(self, item_index, relevant_items):
        """
        Get the top k items most similar to a given item among a set of candidates.

        Parameters:
        item_index: Row label of the target item in the fitted similarity matrix.
        relevant_items (pd.Series): Series indexed by the candidate item labels
            (e.g. the items rated by the target user).

        Returns:
        pd.Index: Up to ``k_neighbors`` candidate labels ordered by decreasing
            similarity. Candidates without a similarity value are omitted. The
            target item itself is not excluded if it is among the candidates.

        Raises:
        RuntimeError: If fit() has not been called.
        """
        self._require_fitted()
        similar_items = self.item_similarity.loc[item_index].dropna()
        # reindex() yields NaN for candidates absent from the similarity matrix;
        # drop them so they cannot be returned as "similar" items.
        ranked = similar_items.reindex(relevant_items.index).dropna().sort_values(ascending=False)
        return ranked.index[:self.k_neighbors]

In [165]:
# Load the two Excel workbooks from the current working directory.
# Score records; supplies the 'CourseId' and 'Score' columns used by preprocess().
scores_df = pd.read_excel('CourseSelectionTable.xlsx')
# Course metadata; joined to the score records on 'CourseId' by preprocess().
courses_df = pd.read_excel('CourseInformationTable.xlsx')

In [166]:
# Create the recommender; each prediction uses at most 5 neighbouring courses.
collab_filtering = ItemBasedCollaborativeFiltering(k_neighbors=5)
# Join scores with course metadata and keep only courses with more than 100 scores.
preprocessed_df = collab_filtering.preprocess(scores_df, courses_df)

In [167]:
# Preview the first rows of the joined, filtered table.
preprocessed_df.head()

Unnamed: 0,StudedntId,AcademicYear,Semester,CourseId,CourseName,CourseCollege,Score,College,Type,Grade,Introduction
0,1115,18-19,1.0,146,Advanced Mathematics (B) (1),National School of Development,81.0,National School of Development,Required major,5.0,Explanation: Comparing with the introductions ...
1,1108,18-19,1.0,146,Advanced Mathematics (B) (1),National School of Development,79.0,National School of Development,Required major,5.0,Explanation: Comparing with the introductions ...
2,1192,19-20,1.0,146,Advanced Mathematics (B) (1),National School of Development,89.0,National School of Development,Required major,5.0,Explanation: Comparing with the introductions ...
3,1193,19-20,1.0,146,Advanced Mathematics (B) (1),National School of Development,74.0,National School of Development,Required major,5.0,Explanation: Comparing with the introductions ...
4,1293,19-20,1.0,146,Advanced Mathematics (B) (1),National School of Development,89.0,National School of Development,Required major,5.0,Explanation: Comparing with the introductions ...


In [168]:
# Pivot into a course-by-student score matrix (rows: CourseName, columns: StudedntId).
matrix = collab_filtering.create_item_user_matrix(preprocessed_df)
# Compute the course-to-course cosine similarity used by the predictions below.
collab_filtering.fit(matrix)

In [169]:
# Preview the matrix; NaN marks (course, student) pairs without a score.
matrix.head()

StudedntId,1,2,3,4,5,6,7,8,9,10,...,4556,4557,4558,4559,4560,4564,4565,4566,4567,4568
CourseName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A General Survey of World History,,,,,,,,,,,...,,,,,,,,,,
A Guide to Chinese Classic,,,,,,,,,,,...,,,,,,,,,,
A Survey of Mao Tsetung Thoughts and Theory of Socialism with Chinese Characteristics,,,66.0,75.0,67.0,80.0,79.0,88.0,,,...,,,,,,,,,,
A Visual Survey of British and American Culture,,,,,42.0,,,,,,...,,,,,,,,,,
Abstract Algebra,,,,,,,,,,,...,,,,,,,,,,


In [170]:
# Pick a student ID (a column label of `matrix`)
student_id = 3
# Pick a course (a row label of `matrix`)
course_name = 'A Survey of Mao Tsetung Thoughts and Theory of Socialism with Chinese Characteristics'

# Estimate this student's score for the course from the most similar courses they have scores for
score = collab_filtering.predict_score(matrix, student_id, course_name)

In [171]:
# Display the predicted score for the selected student and course.
score

73.93721

In [None]:
# Run the prediction over the whole matrix. predict() visits every
# (course, student) cell in a Python loop, so this may take a long time on the full data.
# NOTE(review): the variable name is misspelled ("preidcted"); left unchanged in
# case other cells reference it.
preidcted_matrix = collab_filtering.predict(matrix)