In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import recommendations
from sklearn.cluster import KMeans

In [None]:
# Load the preprocessed records and keep only the columns this notebook works with.
# The column order is significant: some later cells address columns by position
# (for example, everything from position 7 onward is treated as a clustering feature).
df = pd.read_csv('../datasets/processed_dataset.csv').loc[:, [
    'Student Number', 'Course Code', 'Letter Grade', 'Semester',
    'Course Credit', 'GPA', 'Completed Credits', 'Department Code',
]]
df

In [None]:
# One-hot encode 'Department Code' and swap the indicator columns in for the original column.
# The indicator columns are appended after the remaining columns, so they sit at the end of df.
df = pd.concat(
    [
        df.drop(columns=['Department Code']),
        pd.get_dummies(df['Department Code'], prefix='Department Code'),
    ],
    axis=1,
)

In [None]:
# Grade-point value used for each letter grade (note that 'A+' maps to 4.1, above 'A').
numerical_grades = {
    'A+': 4.1,
    'A': 4.0,
    'A-': 3.7,
    'B+': 3.3,
    'B': 3.0,
    'B-': 2.7,
    'C+': 2.3,
    'C': 2.0,
    'C-': 1.7,
    'D+': 1.3,
    'D': 1.0,
    'D-': 0.5,
    'F': 0.0,
}

In [None]:
# Map every course code to its credit value. When a course appears on several rows,
# the value from the last row wins (the same outcome as overwriting inside a row loop).
# The two columns are zipped directly rather than read with one df.iloc call per cell:
# that is much faster and does not rely on the index labels being equal to row positions.
# .to_numpy() keeps the credit values as NumPy scalars, as scalar .iloc access returns them.
course_credits = dict(zip(df['Course Code'].to_numpy(), df['Course Credit'].to_numpy()))

In [None]:
def get_semester_data(semester_name):
    """Collect the numerical grades of every student for one semester.

    Args:
        semester_name: value compared against the 'Semester' column of the global ``df``.

    Returns:
        dict shaped ``{student_number: {course_code: numerical_grade, ...}, ...}``.
        If a student has several rows for the same course in that semester, the last
        row wins. An empty dict is returned when no row matches ``semester_name``.

    Raises:
        KeyError: if a row's letter grade is not a key of ``numerical_grades``.
    """
    semester_rows = df[df['Semester'] == semester_name]

    semester_data = {}
    # Walk the three needed columns in lockstep instead of doing three scalar
    # .iloc lookups per row (and re-labelling the index of a filtered slice).
    for student_number, course_code, letter_grade in zip(
        semester_rows['Student Number'].to_numpy(),
        semester_rows['Course Code'].to_numpy(),
        semester_rows['Letter Grade'].to_numpy(),
    ):
        semester_data.setdefault(student_number, {})[course_code] = numerical_grades[letter_grade]

    return semester_data

In [None]:
def get_avg_gpa(train_semester, student):
    """Return the credit-weighted average grade of one student.

    Args:
        train_semester: dict shaped ``{student: {course_code: numerical_grade}}``.
        student: key of ``train_semester`` whose average is wanted.

    Returns:
        sum(grade * credit) / sum(credit) over the student's courses, with the
        credits looked up in the global ``course_credits`` mapping.

    Note:
        Nothing guards against a zero credit total (no courses, or only
        zero-credit courses); the division is performed as-is.
    """
    credit_sum = 0
    weighted_grade_sum = 0
    for course_code, grade in train_semester[student].items():
        credit = course_credits[course_code]
        credit_sum += credit
        weighted_grade_sum += grade * credit

    return weighted_grade_sum / credit_sum

In [None]:
def get_grade_stats(semester_data, student):
    """Return the mean and standard deviation of one student's numerical grades.

    Args:
        semester_data: dict shaped ``{student: {course_code: numerical_grade}}``.
        student: key of ``semester_data`` to summarise.

    Returns:
        ``(mean, std_dev)`` computed with ``np.mean`` and ``np.std`` over the
        student's grades (unweighted; ``np.std`` with its default arguments).
    """
    grades = list(semester_data[student].values())
    return np.mean(grades), np.std(grades)

In [None]:
def fit_cluster(train_sems, num_clusters, training_data, cluster_model):
    """Fit a clustering model on the training semesters and group students by cluster.

    Args:
        train_sems: iterable of semester names whose rows of the global ``df`` form
            the training set. Rows are stacked in the order the semesters are given.
        num_clusters: passed to ``cluster_model`` as ``n_clusters``.
        training_data: dict ``{student_number: {course_code: numerical_grade}}``; must
            contain every student that appears in the selected rows.
        cluster_model: estimator class that is called as ``cluster_model(n_clusters=...)``,
            whose ``fit`` returns the fitted model exposing ``labels_`` (e.g. KMeans).

    Returns:
        ``(cluster_dataset, fitted_cluster_model)`` where ``cluster_dataset`` is
        ``{cluster_label: {student_number: training_data[student_number]}}``.

    Notes:
        * The features are 'GPA', 'Completed Credits' and every column from
          position 7 onward.
        * One label is used per row of the training rows, not per student: a
          student whose rows receive different labels is placed in each of those
          clusters, always with the complete ``training_data`` entry.
        * The model is built with ``n_clusters`` only, so no random seed is set here.
    """
    # Build the training frame with a single concat. Growing a frame inside the loop,
    # starting from an empty object-dtype frame, is quadratic and depends on how pandas
    # treats empty frames when it decides the result dtypes.
    semester_frames = [df[df['Semester'] == sem] for sem in train_sems]
    if semester_frames:
        train_dataset = pd.concat(semester_frames, ignore_index=True)
    else:
        train_dataset = df.iloc[:0]   # no semesters requested: empty frame, same columns

    cluster_features = train_dataset[['GPA', 'Completed Credits'] + list(train_dataset.columns[7:])]

    # fitting a clustering model based on GPA, Completed Credits and Departments
    fitted_cluster_model = cluster_model(n_clusters=num_clusters).fit(cluster_features)
    cluster_labels = fitted_cluster_model.labels_

    # assigning each row's student to the cluster predicted for that row
    cluster_dataset = {}
    student_numbers = train_dataset['Student Number'].to_numpy()
    for cluster_label, student_number in zip(cluster_labels, student_numbers):
        cluster_dataset.setdefault(cluster_label, {})[student_number] = training_data[student_number]

    return cluster_dataset, fitted_cluster_model

In [None]:
def cluster_test_data(cluster_model, semester_name):
    """Assign the students of one semester to clusters using an already fitted model.

    Args:
        cluster_model: fitted model exposing ``predict``; it is given the
            'GPA', 'Completed Credits' and position-7-onward columns of the rows.
        semester_name: semester whose rows of the global ``df`` are clustered
            (matched against the column at position 3).

    Returns:
        dict ``{cluster_label: {student_number: {course_code: numerical_grade}}}``
        where the inner grades come from ``get_semester_data(semester_name)``.
        Labels are per row, so a student may appear under more than one label.
    """
    # rows of the requested semester, re-indexed from 0
    semester_rows = df[df.iloc[:, 3] == semester_name].reset_index(drop=True)

    # label every row with the model fitted on the training data
    feature_columns = ['GPA', 'Completed Credits'] + list(semester_rows.columns[7:])
    predicted_labels = cluster_model.predict(semester_rows[feature_columns])

    # grades of the students that have records in this semester
    grades_by_student = get_semester_data(semester_name)

    # group each row's student under the label predicted for that row
    clustered = {}
    for position, label in enumerate(predicted_labels):
        clustered.setdefault(label, {})
        student = semester_rows.iloc[position, 0]
        clustered[label][student] = grades_by_student[student]

    return clustered

In [None]:
def get_errors(train_semester, test_semester, sim, item_based):
    """Pair actual test grades with predicted grades for one cluster.

    Args:
        train_semester: training data, ``{student: {course_code: numerical_grade}}``.
        test_semester: test data in the same shape.
        sim: similarity function handed to ``recommendations.getRecommendations``
            (user-based path only).
        item_based: if true, predictions come from
            ``recommendations.getRecommendedItems``; otherwise from
            ``recommendations.getRecommendations``.

    Returns:
        ``(y_true, y_pred)``: two equally long lists with, for every kept
        (student, test course) pair, the actual grade and the predicted grade.

    How a prediction is chosen:
        * Only students present in both ``train_semester`` and ``test_semester``
          are evaluated.
        * For each of the student's test courses the recommended grade is used when
          the course was recommended (first occurrence wins), otherwise the
          student's credit-weighted GPA from the training data.
        * A pair is dropped entirely when the prediction lies outside
          mean +/- 2 * std_dev of the student's training grades. With a std_dev of
          0 this keeps only predictions exactly equal to the mean.

    NOTE(review): ``sim`` is not passed to ``calculateSimilarItems`` or
    ``getRecommendedItems``, so on the item-based path the result does not depend
    on ``sim`` in this function — confirm whether that is intended.
    """
    y_true = []
    y_pred = []

    # credit-weighted GPA of every training student; the user-based recommender
    # receives the whole mapping, and it is the fallback prediction below
    gpa = {student: get_avg_gpa(train_semester, student) for student in train_semester}

    # the item-item similarity table is only read by the item-based recommender,
    # so it is not built for user-based runs
    item_sims = recommendations.calculateSimilarItems(train_semester) if item_based else None

    for student in train_semester:
        # skipping students from training data who have not taken courses in test data
        # (checked first so no recommendations are computed for them)
        if student not in test_semester:
            continue

        if item_based:
            recs = recommendations.getRecommendedItems(train_semester, item_sims, student)
        else:
            recs = recommendations.getRecommendations(train_semester, student, sim, dgpa=True, gpa=gpa, delta=0.7)

        recommended_courses = {}
        for rec_grade, rec_course in recs:
            recommended_courses.setdefault(rec_course, rec_grade)

        # accepted range for a prediction: within two standard deviations of the mean
        mean, std_dev = get_grade_stats(train_semester, student)
        lower_bound = mean - (2 * std_dev)
        upper_bound = mean + (2 * std_dev)

        for course_code, true_grade in test_semester[student].items():
            # predicted grade if the course was recommended, the student's GPA otherwise
            rec_grade = recommended_courses.get(course_code, gpa[student])
            if rec_grade < lower_bound or rec_grade > upper_bound:
                continue
            y_pred.append(rec_grade)
            y_true.append(true_grade)

    assert len(y_true) == len(y_pred)
    return y_true, y_pred

In [None]:
def predict(sim, cluster_model, item_based=False, cluster_counts=None):
    """Run the expanding-window evaluation for several cluster counts.

    For every cluster count and every semester index ``N >= 1`` (semesters ordered
    with ``sorted``), the first ``N`` semesters are the training data and semester
    ``N`` (0-based) is the test data. Training rows are clustered with
    ``fit_cluster``, test rows are labelled with ``cluster_test_data``, and
    ``get_errors`` is run for every cluster label present on both sides.

    Args:
        sim: similarity function forwarded to ``get_errors``.
        cluster_model: clustering estimator class forwarded to ``fit_cluster``.
        item_based: forwarded to ``get_errors``; selects the item-based recommender.
        cluster_counts: iterable of cluster counts to evaluate. Defaults to
            ``range(10, 31, 5)``, i.e. 10, 15, 20, 25 and 30.

    Returns:
        dict shaped
        ``{str(num_clusters): {str(N): {'y_true': [...], 'y_pred': [...]}, ...}, ...}``
        where ``N`` is the number of training semesters.

    Note:
        Semesters are ordered by plain ``sorted`` on their names, which is
        chronological only if the names sort that way — TODO confirm for the dataset.
    """
    if cluster_counts is None:
        cluster_counts = range(10, 31, 5)

    predictions = {}
    sorted_semesters = sorted(set(df['Semester']))

    for num_clusters in cluster_counts:
        per_semester = predictions.setdefault(str(num_clusters), {})
        train_semester = {}   # {student_number: {course_code: numerical_grade, ...}, ...}

        for sem_idx in range(1, len(sorted_semesters)):
            results = per_semester.setdefault(str(sem_idx), {'y_true': [], 'y_pred': []})

            # fold the newest training semester into the accumulated training data
            new_semester = get_semester_data(sorted_semesters[sem_idx - 1])
            for student, grades in new_semester.items():
                if student in train_semester:   # merge with the student's earlier records
                    train_semester[student].update(grades)
                else:   # first record of this student
                    train_semester[student] = grades

            training_semesters_name = sorted_semesters[:sem_idx]   # names of all training semesters

            # cluster model fitted on the training rows, plus each cluster's training data
            train_cluster_data, fitted_cluster_model = fit_cluster(
                training_semesters_name, num_clusters, train_semester, cluster_model)

            # test data of the next semester, grouped by predicted cluster
            test_semester_name = sorted_semesters[sem_idx]
            test_cluster_data = cluster_test_data(fitted_cluster_model, test_semester_name)

            # evaluate each cluster label that occurs in both training and test data
            for cluster_label in train_cluster_data:
                if cluster_label not in test_cluster_data:
                    continue
                y_true, y_pred = get_errors(
                    train_cluster_data[cluster_label], test_cluster_data[cluster_label], sim, item_based)
                results['y_true'] += y_true
                results['y_pred'] += y_pred

    return predictions

### User-based Collaborative Filtering

In [None]:
# Results of the user-based runs, keyed by the name of the similarity metric.
model_predictions = dict()

In [None]:
# User-based run with recommendations.sim_distance as the similarity function and
# KMeans as the clustering model; stored under the 'Euclidean Distance' key.
predictions = predict(sim=recommendations.sim_distance, cluster_model=KMeans)
model_predictions['Euclidean Distance'] = predictions

In [None]:
# User-based run with recommendations.sim_jaccard as the similarity function and
# KMeans as the clustering model; stored under the 'Jaccard Index' key.
predictions = predict(sim=recommendations.sim_jaccard, cluster_model=KMeans)
model_predictions['Jaccard Index'] = predictions

In [None]:
# User-based run with recommendations.sim_pearson as the similarity function and
# KMeans as the clustering model; stored under the 'Pearson Correlation' key.
predictions = predict(sim=recommendations.sim_pearson, cluster_model=KMeans)
model_predictions['Pearson Correlation'] = predictions

In [None]:
# Write the collected user-based results to a JSON file in the working directory.
with open(
    'clustering_user_based_collaborative_filtering_WE_results (Student based with KMeans).json',
    mode='w',
) as fw:
    json.dump(obj=model_predictions, fp=fw)

### Item-based Collaborative Filtering

In [None]:
# Start a fresh result container for the item-based runs, keyed by similarity metric name.
model_predictions = dict()

In [None]:
# Item-based run with KMeans clustering; stored under the 'Euclidean Distance' key.
# NOTE(review): with item_based=True, get_errors does not pass `sim` on to the
# recommendations module, so this run may not depend on sim_distance — confirm.
predictions = predict(sim=recommendations.sim_distance, cluster_model=KMeans, item_based=True)
model_predictions['Euclidean Distance'] = predictions

In [None]:
# Item-based run with KMeans clustering; stored under the 'Jaccard Index' key.
# NOTE(review): with item_based=True, get_errors does not pass `sim` on to the
# recommendations module, so this run may not depend on sim_jaccard — confirm.
predictions = predict(sim=recommendations.sim_jaccard, cluster_model=KMeans, item_based=True)
model_predictions['Jaccard Index'] = predictions

In [None]:
# Item-based run with KMeans clustering; stored under the 'Pearson Correlation' key.
# NOTE(review): with item_based=True, get_errors does not pass `sim` on to the
# recommendations module, so this run may not depend on sim_pearson — confirm.
predictions = predict(sim=recommendations.sim_pearson, cluster_model=KMeans, item_based=True)
model_predictions['Pearson Correlation'] = predictions

In [None]:
# Write the collected item-based results to a JSON file in the working directory.
with open(
    'clustering_item_based_collaborative_filtering_results (Student based with KMeans).json',
    mode='w',
) as fw:
    json.dump(obj=model_predictions, fp=fw)