In [1]:
import pandas as pd
import numpy as np
import scipy as sp

import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity

import sys

In [2]:
student_data = pd.read_excel("dataset/StudentInformationTable.xlsx")
course_data = pd.read_excel("dataset/CourseInformationTable.xlsx")
career_data = pd.read_excel("dataset/CourseSelectionTable.xlsx")

display(student_data.describe())
display(student_data.head(5))

Unnamed: 0,StudentId,EnrollmentYear
count,4568.0,4568.0
mean,2284.5,2018.295972
std,1318.812344,1.191886
min,1.0,2000.0
25%,1142.75,2018.0
50%,2284.5,2019.0
75%,3426.25,2019.0
max,4568.0,2020.0


Unnamed: 0,StudentId,EnrollmentYear,Education,Major
0,1115,2018,Undergraduate,Biological Science
1,1108,2018,Undergraduate,Biological Science
2,1192,2018,Undergraduate,Urban and Rural Planning
3,1193,2018,Undergraduate,Urban and Rural Planning
4,1293,2018,Undergraduate,World History


In [3]:
display(course_data.describe())
display(course_data.head(5))

Unnamed: 0,CourseId,Grade
count,5591.0,5225.0
mean,2796.0,2.436842
std,1614.127009,0.939362
min,1.0,0.0
25%,1398.5,2.0
50%,2796.0,2.0
75%,4193.5,3.0
max,5591.0,12.0


Unnamed: 0,CourseId,CourseName,College,Type,Grade,Prerequisite,Introduction
0,362,Fascinating Robot,College of Engineering,Whole school optional,2.0,,This course is open to all students in the sch...
1,1045,Introduction to Seismology,School of Earth and Space Sciences,General elective course,2.0,,This course is a quality education general cou...
2,1647,Speeches and oral cultures in China,Department of Chinese Language and Literature,Whole school optional,2.0,,The course is based on the introduction and re...
3,1830,Modern Chinese History,Department of History,Required major,4.0,ancient Chinese history,This course is based on a large number of orig...
4,1834,Chinese Historiography,Department of History,optional,3.0,,This course is a compulsory course for undergr...


In [4]:
display(career_data.describe())
display(career_data.head(5))

Unnamed: 0,StudentId,Semester,CourseId,Score
count,208949.0,208941.0,208949.0,149223.0
mean,1878.77259,1.505344,2578.111147,81.154792
std,1245.936537,0.532999,1732.925391,13.84162
min,1.0,1.0,1.0,0.0
25%,778.0,1.0,750.0,78.0
50%,1695.0,1.0,2569.0,84.0
75%,2914.0,2.0,4151.0,90.0
max,4568.0,3.0,5591.0,100.0


Unnamed: 0,StudentId,AcademicYear,Semester,CourseId,CourseName,CourseCollege,Score
0,1115,18-19,1.0,146,Advanced Mathematics (B) (1),National School of Development,81.0
1,1115,18-19,1.0,148,Problem-solving on Higher Mathematics (B),School of Economics,
2,1115,18-19,1.0,654,General Chemistry Practice,College of Engineering,
3,1115,18-19,1.0,681,General Chemistry (B),Department of Medicine Teaching office,72.0
4,1115,18-19,1.0,684,General Chemistry Lab.（B）,Department of Medicine Teaching office,83.5


In [5]:
career_data_clean = career_data.dropna()

In [6]:
print("Number of students in data:  ", career_data_clean.StudentId.nunique())
print("Range of scores:             ", career_data_clean.Score.min(), career_data_clean.Score.max())
print("Unique scores in dataset:    ")
print(np.array(sorted(career_data_clean.Score.unique())))

Number of students in data:   4546
Range of scores:              0.0 100.0
Unique scores in dataset:    
[  0.    1.    1.5   2.    2.5   3.    3.5   4.    5.    5.5   6.    7.
   7.5   8.    9.    9.5  10.   11.   12.   12.5  13.   14.   14.5  15.
  16.   16.5  17.   18.   19.   19.5  20.   20.5  21.   22.   22.5  23.
  23.5  23.6  24.   25.   25.5  26.   27.   27.5  28.   28.5  29.   30.
  30.5  31.   32.   33.   34.   34.5  35.   35.5  36.   36.5  37.   37.5
  38.   38.5  39.   39.5  40.   40.5  41.   41.5  42.   42.5  43.   43.5
  44.   44.5  45.   45.5  46.   46.5  47.   47.5  48.   48.5  49.   49.5
  50.   50.5  51.   51.5  52.   52.5  53.   53.5  54.   54.5  55.   55.5
  56.   56.5  57.   57.5  58.   59.   59.5  60.   60.5  61.   61.5  62.
  62.5  63.   63.5  64.   64.5  65.   65.5  66.   66.5  66.6  67.   67.5
  68.   68.5  69.   69.5  70.   70.5  70.7  71.   71.5  72.   72.5  73.
  73.5  74.   74.5  75.   75.5  76.   76.5  77.   77.5  78.   78.5  79.
  79.5  80.   80.5  80.8  

In [7]:
career_student_data = pd.merge(career_data_clean, student_data, 'inner', 'StudentId')

In [8]:
display(career_student_data.describe())
display(career_student_data.head(5))

Unnamed: 0,StudentId,Semester,CourseId,Score,EnrollmentYear
count,149021.0,149021.0,149021.0,149021.0,149021.0
mean,1631.926044,1.430362,2723.731306,81.136139,2017.849874
std,1163.655105,0.527811,1690.240923,13.840801,1.213044
min,1.0,1.0,2.0,0.0,2000.0
25%,617.0,1.0,1103.0,78.0,2017.0
50%,1411.0,1.0,2740.0,84.0,2018.0
75%,2592.0,2.0,4152.0,90.0,2019.0
max,4568.0,3.0,5591.0,100.0,2020.0


Unnamed: 0,StudentId,AcademicYear,Semester,CourseId,CourseName,CourseCollege,Score,EnrollmentYear,Education,Major
0,1115,18-19,1.0,146,Advanced Mathematics (B) (1),National School of Development,81.0,2018,Undergraduate,Biological Science
1,1115,18-19,1.0,681,General Chemistry (B),Department of Medicine Teaching office,72.0,2018,Undergraduate,Biological Science
2,1115,18-19,1.0,684,General Chemistry Lab.（B）,Department of Medicine Teaching office,83.5,2018,Undergraduate,Biological Science
3,1115,18-19,1.0,748,Physiology,College of Life Sciences,85.0,2018,Undergraduate,Biological Science
4,1115,18-19,1.0,844,Physiology Lab.,College of Life Sciences,75.0,2018,Undergraduate,Biological Science


In [9]:
all_student_ids = career_student_data.StudentId.unique()
training_students = np.random.choice(all_student_ids, int(all_student_ids.size * .8), False)
testing_students = np.array([i for i in all_student_ids if i not in training_students])
training_data = career_student_data[career_student_data["StudentId"].isin(training_students)]
testing_data = career_student_data[career_student_data["StudentId"].isin(testing_students)]
print("Number of total students:    ", all_student_ids.size)
print("Number of training students: ", training_students.size)
print("Number of testing students:  ", testing_students.size)
display(training_data.describe())
display(testing_data.describe())

Number of total students:     4546
Number of training students:  3636
Number of testing students:   910


Unnamed: 0,StudentId,Semester,CourseId,Score,EnrollmentYear
count,119041.0,119041.0,119041.0,119041.0,119041.0
mean,1642.321015,1.430129,2728.229568,81.215581,2017.859208
std,1166.630566,0.527416,1688.917476,13.754281,1.215396
min,1.0,1.0,2.0,0.0,2000.0
25%,624.0,1.0,1110.0,78.0,2017.0
50%,1424.0,1.0,2762.0,84.0,2018.0
75%,2625.0,2.0,4152.0,90.0,2019.0
max,4568.0,3.0,5591.0,100.0,2020.0


Unnamed: 0,StudentId,Semester,CourseId,Score,EnrollmentYear
count,29980.0,29980.0,29980.0,29980.0,29980.0
mean,1590.650934,1.431288,2705.870147,80.8207,2017.812809
std,1150.857732,0.529381,1695.396165,14.174979,1.202967
min,5.0,1.0,9.0,0.0,2014.0
25%,595.0,1.0,1045.0,77.0,2017.0
50%,1365.0,1.0,2687.0,84.0,2018.0
75%,2466.0,2.0,4152.0,90.0,2019.0
max,4559.0,3.0,5591.0,100.0,2020.0


In [10]:
training_data.head()

Unnamed: 0,StudentId,AcademicYear,Semester,CourseId,CourseName,CourseCollege,Score,EnrollmentYear,Education,Major
98,1192,18-19,1.0,165,Advanced Mathematics (C) (I),Department of Medicine Teaching office,99.0,2018,Undergraduate,Urban and Rural Planning
99,1192,18-19,1.0,1121,Human Geography,College of Urban and Environmental Sciences,91.0,2018,Undergraduate,Urban and Rural Planning
100,1192,18-19,1.0,1138,Introduction to Physical Geography,College of Urban and Environmental Sciences,82.0,2018,Undergraduate,Urban and Rural Planning
101,1192,18-19,1.0,1258,Introduction to Psychology,School of Psychological and Cognitive Sciences,85.5,2018,Undergraduate,Urban and Rural Planning
102,1192,18-19,1.0,1656,University-level Chinese,Department of Chinese Language and Literature,94.0,2018,Undergraduate,Urban and Rural Planning


In [12]:
training_data["CourseName"].nunique()

3231

In [13]:
training_data["CourseName"].value_counts().head()

Military Theory                                            2824
An Introduction to Ideological & Moral Culture and Laws    2598
Outline of Chinese Modern History                          2514
Events and Policies                                        2032
Advanced Mathematics (B) (1)                               1817
Name: CourseName, dtype: int64

In [17]:
score_counts = pd.DataFrame(training_data["CourseName"].value_counts())
rare_courses = score_counts[score_counts["CourseName"] <= 200].index

common_courses = training_data[training_data["CourseName"].isin(rare_courses)]
common_courses.shape

(60559, 10)

In [18]:
common_courses["CourseName"].nunique()

3118

In [19]:
user_course_df = common_courses.pivot_table(index=["StudentId"], columns=["CourseName"], values="Score")

user_course_df.shape

(3159, 3118)

In [23]:
user_course_df.head(10)

CourseName,\nHealth Physics,American Poetry,China's Foreign Relations from Ambassadors' Perspective,Consumer Health Information,Daoyin,Death Talks in China and the West,Enterprise & Corporation Law,Environmental Geosciences,Graduation Screenwriting（1）,Graduation Screenwriting（2）,...,the Evidence Law,the development of modern news,"the leading edges of genetics, development and cell biology",the reform of admissions of china,the study of society,the study on the mechanism of Chinese internet literature,traditional Chinese Literary Theory,workshop for energy and environment engineeing,Ｉｎｔｅｒｎａｔｉｏｎａｌ Ｉｎｖｅｓｔｍｅｎｔ,Ｉｎｔｅｒｎａｔｉｏｎａｌ　Ｅｃｏｎｏｍｉｃｓ
StudentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
13,,82.0,,,,,,,,,...,,,,,,,,,,


In [25]:
course_name = "the development of modern news"
course_name = user_course_df[course_name]
user_course_df.corrwith(course_name).sort_values(ascending=False).head(10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


CourseName
Ｉｎｔｅｒｎａｔｉｏｎａｌ　Ｅｃｏｎｏｍｉｃｓ                                 1.0
Introduction to Information Resource Management         1.0
Social Anthropology                                     1.0
Physiography                                            1.0
Experiments of Experimental Psychology                  1.0
Basic Reporting and Writing Skills                      1.0
Philosophy and Human Life                               1.0
Introduction to Contemporary Art                        1.0
Beijing and Shanghai: Twin Cities in Chinese History    1.0
Studies on Famous Works of Modern Chinese Literature    1.0
dtype: float64

In [31]:
random_user = 1192
random_user_df = user_course_df[user_course_df.index == random_user]
random_user_df

CourseName,\nHealth Physics,American Poetry,China's Foreign Relations from Ambassadors' Perspective,Consumer Health Information,Daoyin,Death Talks in China and the West,Enterprise & Corporation Law,Environmental Geosciences,Graduation Screenwriting（1）,Graduation Screenwriting（2）,...,the Evidence Law,the development of modern news,"the leading edges of genetics, development and cell biology",the reform of admissions of china,the study of society,the study on the mechanism of Chinese internet literature,traditional Chinese Literary Theory,workshop for energy and environment engineeing,Ｉｎｔｅｒｎａｔｉｏｎａｌ Ｉｎｖｅｓｔｍｅｎｔ,Ｉｎｔｅｒｎａｔｉｏｎａｌ　Ｅｃｏｎｏｍｉｃｓ
StudentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1192,,,,,,,,,,,...,,,,,,,,,,


In [33]:
courses_taken = random_user_df.columns[random_user_df.notna().any()].tolist()
print(len(courses_taken))
courses_taken

39


['Advanced Mathematics (C) (I)',
 'Advanced Mathematics (C) (II)',
 'Architecture Design I',
 'Art of Novel',
 'Chinese SanShou',
 'Comparative Readings of ？English Classics and Their ChineseTranslations',
 'Cultural Geography',
 'Economic Geography',
 'Fine Arts: Sketch and Color',
 'History of Chinese and Foreign City Planning',
 'Human Geography',
 'Intriguing science of the solar system',
 'Introduction to Microelectronics',
 'Introduction to Philosophy',
 'Introduction to Physical Geography',
 'Japan and Japanese Studies',
 'Japanese Economy',
 'Kendo',
 'Modern and Contemporary Architecture',
 'Population Geography',
 'Practice of Fine Arts',
 'Principles of Geographic Information System',
 'Principles of Urban Planning (1)',
 'Principles of Urban Planning (2)',
 'Quantitative Geography and System Engineering for City Planning',
 'Selective Readings in Western Civilization',
 'Social History of Medieval Western Europe',
 'Special Topics in China’s Reform',
 'Statistics',
 'Urban 

In [35]:
user_course_df.loc[user_course_df.index == random_user, user_course_df.columns == "Art of Novel"]

CourseName,Art of Novel
StudentId,Unnamed: 1_level_1
1192,89.0


In [36]:
courses_taken_df = user_course_df[courses_taken]
courses_taken_df.head()
courses_taken_df.shape

(3159, 39)

In [37]:
user_course_count = courses_taken_df.T.notnull().sum()

user_course_count = user_course_count.reset_index()
user_course_count.columns = ["studentId","course_count"]
user_course_count.head()

Unnamed: 0,studentId,course_count
0,1,0
1,2,0
2,3,3
3,4,1
4,7,0


In [40]:
perc = len(courses_taken) * 60 / 100
perc

23.4

In [41]:
users_same_courses = user_course_count[user_course_count["course_count"] > perc]["studentId"]
users_same_courses.count()

1

In [42]:
# Let's combine the data of user #28491 and similar users:
final_df = pd.concat([courses_taken_df[courses_taken_df.index.isin(users_same_courses)],
                      random_user_df[courses_taken]])

final_df.shape
final_df.T.corr()

StudentId,1192,1192
StudentId,Unnamed: 1_level_1,Unnamed: 2_level_1
1192,1.0,1.0
1192,1.0,1.0


In [43]:
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()
corr_df.head()

Unnamed: 0,user_id_1,user_id_2,corr
0,1192,1192,1.0
