In [5]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import recommendations
from sklearn.cluster import KMeans

In [7]:
df = pd.read_csv("./processed_data_course_based.csv")

In [9]:
df

Unnamed: 0,Student Number,Course Title,Course Credit,Grades,Course Semester,GPA,Completed Credits,Semester GPA,Semester Credit,Grade 1 Rate,Grade 2 Rate,Grade 3 Rate,Grade 4 Rate,Grade 5 Rate,Mean Grade,STDEV Grade,Mean GPA,STDEV GPA,Department Code
0,0,Ideological and Moral Cultivation and Legal Fo...,2.5,4,1,3.102564,19.5,3.102564,19.5,0.000000,0.023560,0.486911,0.408377,0.081152,3.547120,0.677055,3.307021,0.516455,BLG
1,0,Success: Career Planning,1.0,3,1,3.102564,19.5,3.102564,19.5,0.005236,0.086387,0.557592,0.052356,0.298429,3.552356,1.025211,3.307021,0.516455,BLG
2,0,Introduction to Computer Science,2.0,1,1,3.102564,19.5,3.102564,19.5,0.007853,0.107330,0.358639,0.429319,0.096859,3.500000,0.841040,3.307021,0.516455,BLG
3,0,Advanced Mathematics A(1),5.5,4,1,3.102564,19.5,3.102564,19.5,0.143979,0.193717,0.204188,0.240838,0.217277,3.193717,1.359003,3.307021,0.516455,BLG
4,0,College English A(1),4.0,3,1,3.102564,19.5,3.102564,19.5,0.028796,0.256545,0.439791,0.253927,0.020942,2.981675,0.843178,3.307021,0.516455,BLG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19095,381,J2EE Framework,4.5,3,6,2.790076,131.0,2.800000,20.0,0.007853,0.120419,0.348168,0.400524,0.123037,3.510471,0.886555,3.220775,0.515869,BLG
19096,381,Intellectual Property and Software Protection,1.0,4,7,2.838129,139.0,3.625000,8.0,0.000000,0.151832,0.227749,0.366492,0.253927,3.722513,1.007305,3.241516,0.494006,BLG
19097,381,Human-Computer Interaction Technology,2.0,5,7,2.838129,139.0,3.625000,8.0,0.000000,0.060209,0.413613,0.408377,0.117801,3.583770,0.774968,3.241516,0.494006,BLG
19098,381,Software Development and Testing Training,3.0,3,7,2.838129,139.0,3.625000,8.0,0.000000,0.018325,0.476440,0.463351,0.041885,3.528796,0.608736,3.241516,0.494006,BLG


In [11]:
df = pd.concat([df, pd.get_dummies(df['Department Code'], dtype=int, prefix='Department Code'), pd.get_dummies(df['Course Semester'], dtype=int, prefix='Course Semester')], axis=1)

# get rid of student based columns
df.drop(['Department Code', 'Semester GPA', 'Semester Credit', 'Completed Credits', 'GPA'], axis=1, inplace=True)

# rearrange order of columns
all_cols = df.columns.tolist()
cols_to_move = ["Student Number", "Course Title", "Grades", "Course Semester", "Course Credit"]
remaining_cols = [col for col in all_cols if col not in cols_to_move]
df = df[cols_to_move + remaining_cols]

In [13]:
df

Unnamed: 0,Student Number,Course Title,Grades,Course Semester,Course Credit,Grade 1 Rate,Grade 2 Rate,Grade 3 Rate,Grade 4 Rate,Grade 5 Rate,...,Mean GPA,STDEV GPA,Department Code_BLG,Course Semester_1,Course Semester_2,Course Semester_3,Course Semester_4,Course Semester_5,Course Semester_6,Course Semester_7
0,0,Ideological and Moral Cultivation and Legal Fo...,4,1,2.5,0.000000,0.023560,0.486911,0.408377,0.081152,...,3.307021,0.516455,1,1,0,0,0,0,0,0
1,0,Success: Career Planning,3,1,1.0,0.005236,0.086387,0.557592,0.052356,0.298429,...,3.307021,0.516455,1,1,0,0,0,0,0,0
2,0,Introduction to Computer Science,1,1,2.0,0.007853,0.107330,0.358639,0.429319,0.096859,...,3.307021,0.516455,1,1,0,0,0,0,0,0
3,0,Advanced Mathematics A(1),4,1,5.5,0.143979,0.193717,0.204188,0.240838,0.217277,...,3.307021,0.516455,1,1,0,0,0,0,0,0
4,0,College English A(1),3,1,4.0,0.028796,0.256545,0.439791,0.253927,0.020942,...,3.307021,0.516455,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19095,381,J2EE Framework,3,6,4.5,0.007853,0.120419,0.348168,0.400524,0.123037,...,3.220775,0.515869,1,0,0,0,0,0,1,0
19096,381,Intellectual Property and Software Protection,4,7,1.0,0.000000,0.151832,0.227749,0.366492,0.253927,...,3.241516,0.494006,1,0,0,0,0,0,0,1
19097,381,Human-Computer Interaction Technology,5,7,2.0,0.000000,0.060209,0.413613,0.408377,0.117801,...,3.241516,0.494006,1,0,0,0,0,0,0,1
19098,381,Software Development and Testing Training,3,7,3.0,0.000000,0.018325,0.476440,0.463351,0.041885,...,3.241516,0.494006,1,0,0,0,0,0,0,1


In [15]:
course_credits = {}
for row_idx in df.index:
    course_title = df.iloc[row_idx, 1]
    credit = df.iloc[row_idx, 4]    
    course_credits[course_title] = credit

In [17]:
def get_semester_data(semester_num):
    semester_data = {}   # semester data in shape {student_number: {course_code: letter_grade, ...}, ...}
    
    # extracting the instances with the given semester_num from the main dataFrame
    dataset = df[df.iloc[:, 3] == semester_num]
    dataset.index = range(len(dataset))
    
    # filling the semester_data dictionary
    for row_idx in dataset.index:
        student_number = dataset.iloc[row_idx, 0]
        course_title = dataset.iloc[row_idx, 1]
        grade = dataset.iloc[row_idx, 2]
        
        semester_data.setdefault(student_number, {})
        semester_data[student_number][course_title] = int(grade)
    
    return semester_data

In [19]:
def get_avg_gpa(train_semester, student):
    courses = train_semester[student]
    total_credit = 0
    weights = 0
    for course in courses:
        total_credit += course_credits[course]
        weights += courses[course] * course_credits[course]
    
    return weights / total_credit

In [21]:
def get_grade_stats(semester_data, student):
    grade_list = []
    
    for course in semester_data[student]:
        numerical_grade = semester_data[student][course]
        grade_list.append(numerical_grade)
    
    mean = np.mean(grade_list)
    std_dev = np.std(grade_list)
    
    return mean, std_dev

In [23]:
def fit_cluster(train_sems, num_clusters, training_data, cluster_model):
    train_dataset = pd.DataFrame(columns=df.columns)
    
    # extracting instances from the dataset which should be in training data
    for sem in train_sems:
        train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)

    cluster_features = train_dataset[list(train_dataset.columns[4:])]
    
    # fitting a clustering model based on GPA, Completed Credits and Departments
    fitted_cluster_model = cluster_model(n_clusters=num_clusters).fit(cluster_features)
    cluster_labels = fitted_cluster_model.labels_
    
    cluster_dataset = {}   # splitting the train dataset into sub-dicts based on their predicted cluster label
    
    # assigning each students' data to their predicted clusters
    for i in range(len(cluster_labels)):
        cluster_dataset.setdefault(cluster_labels[i], {})
        student_number = train_dataset.iloc[i, 0]
        cluster_dataset[cluster_labels[i]][student_number] = training_data[student_number]
    
    return cluster_dataset, fitted_cluster_model

In [25]:
def cluster_test_data(cluster_model, semester_num):
    # extracting all instances with the given semester_name from the main dataFrame
    test_dataset = df[df.iloc[:, 3] == semester_num]
    test_dataset.index = range(len(test_dataset))
    
    # predicting the cluster labels of test data using a cluster model fitted on the train data so far
    cluster_features = test_dataset[list(test_dataset.columns[4:])]
    cluster_labels = cluster_model.predict(cluster_features)
    
    # getting the semester data of available students in test semester
    semester_data = get_semester_data(semester_num)
    
    cluster_dataset = {}   # splitting the test dataset into sub-dicts based on their predicted cluster label
    
    # assigning each students' data to their predicted clusters
    for i in range(len(cluster_labels)):
        cluster_dataset.setdefault(cluster_labels[i], {})
        student_number = test_dataset.iloc[i, 0]
        cluster_dataset[cluster_labels[i]][student_number] = semester_data[student_number]
        
    return cluster_dataset    

In [27]:
def get_errors(train_semester, test_semester, sim, item_based):
    average_gpa = {}
    y_true = []
    y_pred = []
    gpa = {}
    
    for student in train_semester:
        gpa[student] = get_avg_gpa(train_semester, student)
    
    item_sims = recommendations.calculateSimilarItems(train_semester)
    # predicting recommended courses for each student in training data
    for student in train_semester:
        recommended_courses = {}
        
        if item_based:
            recs = recommendations.getRecommendedItems(train_semester, item_sims, student)
        else:
            recs = recommendations.getRecommendations(train_semester, student, sim, dgpa=True, gpa=gpa, delta=0.7)
        
        for rec_grade, rec_course in recs:
            recommended_courses.setdefault(rec_course, rec_grade)
            
        average_gpa.setdefault(student, get_avg_gpa(train_semester, student))
        
        # skipping students from training data who do not have not taken courses in test data
        if student not in test_semester:
            continue
        
        mean, std_dev = get_grade_stats(train_semester, student)
                    
        # checking for students' test data records in recommended courses
        for course_title in test_semester[student]:
            if course_title in recommended_courses:   # considering the predicted grade if course is availabel
                rec_grade = recommended_courses[course_title]
                if rec_grade < mean - (2 * std_dev) or rec_grade > mean + (2 * std_dev):
                    continue
                y_pred.append(rec_grade)
            else:   # considering the average GPA if course is not available in recommended courses
                rec_grade = average_gpa[student]
                if rec_grade < mean - (2 * std_dev) or rec_grade > mean + (2 * std_dev):
                    continue
                y_pred.append(rec_grade)
            y_true.append(test_semester[student][course_title])
            
    assert len(y_true) == len(y_pred)
    return y_true, y_pred

In [29]:
def predict(sim, cluster_model, item_based=False):
    predictions = {} # storing error scores in a dict with shape: 
                      # {num_clusters (k=2,3,...,7): 
                        # {num_training_semesters (N=1,2,...,7): {'y_true': [], 'y_pred': []}, ...},
                      #...}
    
    sorted_semesters = sorted(set(df.iloc[:, 3]))   # sorting semesters in a time series manner
    for num_clusters in range(10, 31, 5):
        # print(num_clusters)
        predictions.setdefault(str(num_clusters), {})
        train_semester = {}   # {student_number: {course_title: grade, ...}, ...}
        for sem_idx in range(1, len(sorted_semesters)):
            predictions[str(num_clusters)].setdefault(str(sem_idx), {'y_true': [], 'y_pred': []})
            
            # combining all previous semesters in dataset and consider it as training semester
            new_semester = get_semester_data(sorted_semesters[sem_idx-1])
            for student in new_semester:
                if student in train_semester:   # combine data if a student already exist
                    train_semester[student].update(new_semester[student])
                else:   # create a new key-value pair for students with no record
                    train_semester[student] = new_semester[student]
            
            training_semesters_name = sorted_semesters[:sem_idx]   # names of all training semesters
            
            # getting the cluster model fitted on training data and each clusters' training data
            train_cluster_data, fitted_cluster_model = fit_cluster(training_semesters_name, num_clusters, train_semester, cluster_model)
            
            # getting the clustered test data
            test_semester_name = sorted_semesters[sem_idx]
            test_cluster_data = cluster_test_data(fitted_cluster_model, test_semester_name)
            
            # fitting each cluster label with a similarity metric, and measure the error between the same
            # cluster labels in training and test data
            for cluster_label in train_cluster_data:
                if cluster_label not in test_cluster_data:
                    continue
                y_true, y_pred = get_errors(train_cluster_data[cluster_label], test_cluster_data[cluster_label], sim, item_based)
                predictions[str(num_clusters)][str(sem_idx)]['y_true'] += y_true
                predictions[str(num_clusters)][str(sem_idx)]['y_pred'] += y_pred
            
    return predictions

### User-based Collaborative Filtering

In [31]:
model_predictions = {}

In [33]:
predictions = predict(recommendations.sim_distance, KMeans)
model_predictions['Euclidean Distance'] = predictions

  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
found 0 physical cores < 1
  File "C:\Users\casperrrr\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset,

In [35]:
predictions = predict(recommendations.sim_jaccard, KMeans)
model_predictions['Jaccard Index'] = predictions

  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  tr

In [37]:
predictions = predict(recommendations.sim_pearson, KMeans)
model_predictions['Pearson Correlation'] = predictions

  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  tr

In [46]:
with open('clustering_user_based_collaborative_filtering_WE_results (Course based with KMeans).json', 'w') as fw:
    json.dump(model_predictions, fw)

### Item-based Collaborative Filtering

In [49]:
model_predictions = {}

In [51]:
predictions = predict(recommendations.sim_distance, KMeans, item_based=True)
model_predictions['Euclidean Distance'] = predictions

  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  tr

In [53]:
predictions = predict(recommendations.sim_jaccard, KMeans, item_based=True)
model_predictions['Jaccard Index'] = predictions

  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  tr

In [55]:
predictions = predict(recommendations.sim_pearson, KMeans, item_based=True)
model_predictions['Pearson Correlation'] = predictions

  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  return fit_method(estimator, *args, **kwargs)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  train_dataset = pd.concat([train_dataset, df[df.iloc[:, 3] == sem]], ignore_index=True)
  tr

In [57]:
with open('clustering_item_based_collaborative_filtering_results (Course based with KMeans).json', 'w') as fw:
    json.dump(model_predictions, fw)