In [1]:
import pandas as pd

## Preprocess step

In [2]:
# Read the data
file_path = 'task_priority_data_cleaned.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,task_name,university,school_year,course_name,course_code,credit,task_mode,task_type,task_weight_percent,due_date,due_time,time_required_hours,difficulty,current_date_(today),current_progress_percent,time_spent_hours,priority_level,days_until_due
0,Assignment 3,McMaster University,4,Scientific Computation,SFWRENG 4X03,3,Individual,Assignment,8.0,2023-11-15,11:59 pm,18.0,5.0,2023-11-11,10.0,0.0,5,4
1,Quiz 7,McMaster University,4,"Ethics, Equity and Law in Engineering",ENGINEER 4A03,3,Individual,Quiz,4.29,2023-11-13,11:59 pm,0.25,2.0,2023-11-12,0.0,0.0,3,1
2,Project Part II: Case Study,McMaster University,4,"Ethics, Equity and Law in Engineering",ENGINEER 4A03,3,Group,Project,5.0,2023-11-17,11:59 pm,2.0,3.0,2023-11-12,0.0,0.0,3,5
3,Lab 2/PPA2,McMaster University,2,Principles of Programming,COMPENG 2SH4,4,Individual,Lab,10.0,2023-10-27,11:59 pm,12.0,4.0,2023-10-20,50.0,12.0,5,7
4,Assignment 4,McMaster University,4,Scientific Computation,SFWRENG 4X03,3,Individual,Assignment,8.0,2023-11-30,11:59 pm,25.0,5.0,2023-11-12,0.0,0.0,1,18


In [3]:
for column in data.columns:
    print(column, data[column].dtype)


task_name object
university object
school_year int64
course_name object
course_code object
credit int64
task_mode object
task_type object
task_weight_percent float64
due_date object
due_time object
time_required_hours float64
difficulty float64
current_date_(today) object
current_progress_percent float64
time_spent_hours float64
priority_level int64
days_until_due int64


In [4]:
# Convert column with limited number, discrete set of values to category datatype.
for col in ['school_year', 'task_mode', 'task_type', 'difficulty', 'priority_level']:
    data[col] = data[col].astype('category')

In [5]:
# Converting date columns to datetime format
date_columns = ['due_date', 'current_date_(today)']
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce', format='%Y/%m/%d')

In [6]:
for column in data.columns:
    print(column, data[column].dtype)

task_name object
university object
school_year category
course_name object
course_code object
credit int64
task_mode category
task_type category
task_weight_percent float64
due_date datetime64[ns]
due_time object
time_required_hours float64
difficulty category
current_date_(today) datetime64[ns]
current_progress_percent float64
time_spent_hours float64
priority_level category
days_until_due int64


In [7]:
data.head()

Unnamed: 0,task_name,university,school_year,course_name,course_code,credit,task_mode,task_type,task_weight_percent,due_date,due_time,time_required_hours,difficulty,current_date_(today),current_progress_percent,time_spent_hours,priority_level,days_until_due
0,Assignment 3,McMaster University,4,Scientific Computation,SFWRENG 4X03,3,Individual,Assignment,8.0,2023-11-15,11:59 pm,18.0,5.0,2023-11-11,10.0,0.0,5,4
1,Quiz 7,McMaster University,4,"Ethics, Equity and Law in Engineering",ENGINEER 4A03,3,Individual,Quiz,4.29,2023-11-13,11:59 pm,0.25,2.0,2023-11-12,0.0,0.0,3,1
2,Project Part II: Case Study,McMaster University,4,"Ethics, Equity and Law in Engineering",ENGINEER 4A03,3,Group,Project,5.0,2023-11-17,11:59 pm,2.0,3.0,2023-11-12,0.0,0.0,3,5
3,Lab 2/PPA2,McMaster University,2,Principles of Programming,COMPENG 2SH4,4,Individual,Lab,10.0,2023-10-27,11:59 pm,12.0,4.0,2023-10-20,50.0,12.0,5,7
4,Assignment 4,McMaster University,4,Scientific Computation,SFWRENG 4X03,3,Individual,Assignment,8.0,2023-11-30,11:59 pm,25.0,5.0,2023-11-12,0.0,0.0,1,18


In [8]:
# Feature Selection - select relevant features for priority classification
# I don't think university is relevant considering the values are all McMaster, but in the future
# if the user group enlarges, we might need to add it 
# We don't include course_code as we have course_name, school_year and credit 
# We don't need due_date and current_data as it is infered by days_until_due
# In the future, we might need to add a feature about program, complimentary or compulsory course etc.
feature_selected_columns = [
    'task_name', 'school_year', 'course_name', 'credit', 'task_mode', 'task_type', 'task_weight_percent', 
    'time_required_hours', 'difficulty', 'current_progress_percent', 'time_spent_hours', 
    'days_until_due'
]
label_colunm = ['priority_level']
selected_data = data[feature_selected_columns + label_colunm]
selected_data.head()

Unnamed: 0,task_name,school_year,course_name,credit,task_mode,task_type,task_weight_percent,time_required_hours,difficulty,current_progress_percent,time_spent_hours,days_until_due,priority_level
0,Assignment 3,4,Scientific Computation,3,Individual,Assignment,8.0,18.0,5.0,10.0,0.0,4,5
1,Quiz 7,4,"Ethics, Equity and Law in Engineering",3,Individual,Quiz,4.29,0.25,2.0,0.0,0.0,1,3
2,Project Part II: Case Study,4,"Ethics, Equity and Law in Engineering",3,Group,Project,5.0,2.0,3.0,0.0,0.0,5,3
3,Lab 2/PPA2,2,Principles of Programming,4,Individual,Lab,10.0,12.0,4.0,50.0,12.0,7,5
4,Assignment 4,4,Scientific Computation,3,Individual,Assignment,8.0,25.0,5.0,0.0,0.0,18,1


In [9]:
# Standardize numerical data

In [10]:
from sklearn.preprocessing import StandardScaler

numerical_columns = ['school_year', 'credit', 'task_weight_percent', 'time_required_hours', 
                     'difficulty', 'current_progress_percent', 'time_spent_hours', 'days_until_due']
scaler = StandardScaler()
selected_data.loc[:, numerical_columns] = scaler.fit_transform(selected_data.loc[:, numerical_columns])
selected_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


Unnamed: 0,task_name,school_year,course_name,credit,task_mode,task_type,task_weight_percent,time_required_hours,difficulty,current_progress_percent,time_spent_hours,days_until_due,priority_level
0,Assignment 3,-0.267506,Scientific Computation,-0.305219,Individual,Assignment,-0.249243,1.477311,1.48703,-0.528431,-0.517525,-0.089773,5
1,Quiz 7,-0.267506,"Ethics, Equity and Law in Engineering",-0.305219,Individual,Quiz,-0.602651,-0.734103,-1.094995,-0.817216,-0.517525,-0.318396,3
2,Project Part II: Case Study,-0.267506,"Ethics, Equity and Law in Engineering",-0.305219,Group,Project,-0.535017,-0.516076,-0.23432,-0.817216,-0.517525,-0.013566,3
3,Lab 2/PPA2,-2.311251,Principles of Programming,0.962102,Individual,Lab,-0.058726,0.729791,0.626355,0.626708,1.793566,0.138849,5
4,Assignment 4,-0.267506,Scientific Computation,-0.305219,Individual,Assignment,-0.249243,2.349418,1.48703,-0.817216,-0.517525,0.977133,1


In [11]:
category_columns = ['task_mode', 'task_type']

In [12]:
for col in category_columns:  
    unique_values = selected_data[col].unique()
    print("====")
    print(f"{col} values:")
    print("====")
    for val in unique_values:
        print(val)
    print("====")
    print(f"VALUE COUNTS:\n{selected_data[col].value_counts()}")
    print("\n")
# From the result we are see that our data is very skewed, we need to increase
# data sample size to include more different types of values

====
task_mode values:
====
Individual
Group
Hybrid
====
VALUE COUNTS:
Individual    129
Group          61
Hybrid          1
Name: task_mode, dtype: int64


====
task_type values:
====
Assignment
Quiz
Project
Lab
Test Review
Midterm
Report
Presentation
Exam Review
Exam
Document
Essay
Survey
====
VALUE COUNTS:
Assignment      76
Project         29
Lab             22
Report          16
Exam            15
Presentation    11
Quiz             9
Test Review      4
Midterm          3
Essay            2
Exam Review      2
Document         1
Survey           1
Name: task_type, dtype: int64




In [13]:
# Encode unordered categorical data using Ont-Hot Encoding

In [14]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
encoded_cols = encoder.fit_transform(selected_data[category_columns])

encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names(category_columns))

encoded_data = pd.concat([selected_data, encoded_df], axis=1)

encoded_data.drop(category_columns, axis=1, inplace=True)

encoded_data.head()

Unnamed: 0,task_name,school_year,course_name,credit,task_weight_percent,time_required_hours,difficulty,current_progress_percent,time_spent_hours,days_until_due,...,task_type_Exam,task_type_Exam Review,task_type_Lab,task_type_Midterm,task_type_Presentation,task_type_Project,task_type_Quiz,task_type_Report,task_type_Survey,task_type_Test Review
0,Assignment 3,-0.267506,Scientific Computation,-0.305219,-0.249243,1.477311,1.48703,-0.528431,-0.517525,-0.089773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Quiz 7,-0.267506,"Ethics, Equity and Law in Engineering",-0.305219,-0.602651,-0.734103,-1.094995,-0.817216,-0.517525,-0.318396,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Project Part II: Case Study,-0.267506,"Ethics, Equity and Law in Engineering",-0.305219,-0.535017,-0.516076,-0.23432,-0.817216,-0.517525,-0.013566,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Lab 2/PPA2,-2.311251,Principles of Programming,0.962102,-0.058726,0.729791,0.626355,0.626708,1.793566,0.138849,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Assignment 4,-0.267506,Scientific Computation,-0.305219,-0.249243,2.349418,1.48703,-0.817216,-0.517525,0.977133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
encoded_data.columns

Index(['task_name', 'school_year', 'course_name', 'credit',
       'task_weight_percent', 'time_required_hours', 'difficulty',
       'current_progress_percent', 'time_spent_hours', 'days_until_due',
       'priority_level', 'task_mode_Group', 'task_mode_Hybrid',
       'task_mode_Individual', 'task_type_Assignment', 'task_type_Document',
       'task_type_Essay', 'task_type_Exam', 'task_type_Exam Review',
       'task_type_Lab', 'task_type_Midterm', 'task_type_Presentation',
       'task_type_Project', 'task_type_Quiz', 'task_type_Report',
       'task_type_Survey', 'task_type_Test Review'],
      dtype='object')

In [16]:
# Vectorize text data

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

vectorized_task = vectorizer.fit_transform(encoded_data['task_name'])
task_cols = [f"task_name_{feature}" for feature in vectorizer.get_feature_names()]
vectorized_task_df = pd.DataFrame(vectorized_task.toarray(), columns=task_cols)

vectorized_course = vectorizer.fit_transform(encoded_data['course_name'])
course_cols = [f"course_name_{feature}" for feature in vectorizer.get_feature_names()]
vectorized_course_df = pd.DataFrame(vectorized_course.toarray(), columns=course_cols)

vectorized_data = pd.concat([encoded_data, vectorized_task_df, vectorized_course_df], axis=1)
text_cols = ['task_name', 'course_name']
vectorized_data.drop(text_cols, axis=1, inplace=True)

vectorized_data.head()


Unnamed: 0,school_year,credit,task_weight_percent,time_required_hours,difficulty,current_progress_percent,time_spent_hours,days_until_due,priority_level,task_mode_Group,...,course_name_system,course_name_systems,course_name_the,course_name_theoretical,course_name_theory,course_name_therapy,course_name_time,course_name_to,course_name_uav,course_name_unsupervised
0,-0.267506,-0.305219,-0.249243,1.477311,1.48703,-0.528431,-0.517525,-0.089773,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.267506,-0.305219,-0.602651,-0.734103,-1.094995,-0.817216,-0.517525,-0.318396,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.267506,-0.305219,-0.535017,-0.516076,-0.23432,-0.817216,-0.517525,-0.013566,3,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.311251,0.962102,-0.058726,0.729791,0.626355,0.626708,1.793566,0.138849,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.267506,-0.305219,-0.249243,2.349418,1.48703,-0.817216,-0.517525,0.977133,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
from sklearn.model_selection import train_test_split
# Train test split, split the dataset into training and testing
# Training dataset will be used for model training 
# Testing dataset is for evaluating the model's performance 
# (several metrics will be used to ensure the trained model meets our expectation)
X = vectorized_data.drop('priority_level', axis=1)
y = vectorized_data['priority_level']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Check class distribution
train_class_distribution = y_train.value_counts(normalize=True)
test_class_distribution = y_test.value_counts(normalize=True)

In [21]:
# The distributions of priority levels of training and testing dataset seem reasonable
train_class_distribution

5    0.236842
2    0.210526
4    0.210526
1    0.190789
3    0.151316
Name: priority_level, dtype: float64

In [22]:
test_class_distribution

5    0.282051
1    0.230769
4    0.230769
2    0.128205
3    0.128205
Name: priority_level, dtype: float64

In [23]:
X.shape

(191, 244)

In [24]:
y.shape

(191,)

In [26]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((152, 244), (39, 244), (152,), (39,))

In [27]:
X_train.to_csv('task_priority_x_train.csv', index=False)

In [28]:
X_test.to_csv('task_priority_x_test.csv', index=False)

In [29]:
y_train.to_csv('task_priority_y_train.csv', index=False)

In [31]:
y_test.to_csv('task_priority_y_test.csv', index=False)