#### Name and Surname: Janice Xerri 
#### ID: 0437903L
##### Thesis - Create a personalized and accurate course recommendation system for users in a virtual world based on User Profiling

Importing libraries

In [138]:
import pandas as pd
import os
import numpy as np

Loading the datasets 

In [139]:
columns_to_use = ['Userid', 'Degree 1', 'Degree 1 Specializations', 'Known Languages', 'Key Skills', 'Career Objective']
user_profile_df = pd.read_csv('Datasets/user_profile_set_21.csv',encoding='latin-1', usecols=columns_to_use)
user_ratings_df = pd.read_csv('Datasets/User_Ratings.csv',encoding='latin-1')
master_dataset_df = pd.read_csv('Datasets/Master_dataset_Feb22.csv',encoding='latin-1')

Checking the databases to see if they are loaded correctly.

In [140]:
user_profile_df.head()

Unnamed: 0,Userid,Degree 1,Degree 1 Specializations,Known Languages,Key Skills,Career Objective
0,,,,,,
1,1001.0,B.E.,Computer Science & Engineering,"English, Marathi, Hindi",C; Java; Keras; Flask; Deep Learning; Selenium...,Computer Engineering student with good technic...
2,1002.0,B.E.,Computer Science & Engineering,Hindi English,Java; Neural Networks; AI; Python; Html5; CPP,Interested in working under company offering A...
3,1003.0,B.E.,Computer Science & Engineering,,,
4,1004.0,B.E.,Computer Science & Engineering,"English, Hindi, Marathi, Marwari",XML; C; Java; Data Structures; Python; MongoDB...,Currently a final year student of Computer Eng...


In [141]:
user_ratings_df.head()

Unnamed: 0,User_id,Course_id,Unnamed: 2,...,Unnamed: 18,Unnamed: 19,Unnamed: 20
0,,1001,1002,...,1018,1019,1020
1,2001.0,5,3,...,0,1,3
2,2002.0,3,5,...,3,0,0
3,2003.0,4,1,...,4,3,4
4,2004.0,2,4,...,2,3,0


In [142]:
master_dataset_df.head()

Unnamed: 0,Sr,Degree 1,Degree 1 SpeCializations,Campus,Key Skills
0,1001,B E,Mechanical,MITCOE,CATIA
1,1002,B E,Mechanical,MITCOE,CATIA
2,1003,B E,Mechanical,MITAOE,CATIA
3,1004,B E,Mechanical,MITCOE,CATIA
4,1005,B E,Mechanical,MITCOE,CATIA


Data Preprocessing 

Functions created. 

In [143]:
# Column names: remove white spaces which is extra and convert to lower case
def clean_column_names(df):
    df.columns = df.columns.str.strip().str.lower()
    #replace white spaces with underscore if needed.
    df.columns = list(map(lambda x: x.replace(' ', '_'),df.columns))
    return df

def clean_dataframe(df):
    # Convert string values to lowercase
    df = df.apply(lambda x: x.str.strip().str.lower() if x.dtype == "object" else x)
    
    # Apply replace operation to all string columns
    string_columns = df.select_dtypes(include='object').columns
    df[string_columns] = df[string_columns].applymap(lambda x: x.replace(' ', '_') if isinstance(x, str) else x)
    
    return df


def check_for_null_values(df):
    null_values = ['none', 'null', 'nan', 'n/a', 'na', '', 'NA', 'NaN']

    if df.isnull().values.any():
        df = df.fillna(0)  # Replace null values with 0, or any other appropriate value

    # Convert any potential NoneType to string representation
    null_values = [str(val).lower() for val in null_values]

    # Replace specific null values within string columns
    string_columns = df.select_dtypes(include='object').columns
    for col in string_columns:
        df[col] = df[col].apply(lambda x: 'unknown' if str(x).lower() in null_values else x)

    if (df.values.flatten() == 'unknown').any():
        print("Null values replaced with 'unknown'")
    else:
        print("No null values found")
    
    return df



def check_for_duplicates(df, column_name):
    print(df.columns)
    duplicate_column = df[df[column_name].duplicated()]
    if duplicate_column.empty:
        print("There are no duplicate values in the '{}' column".format(column_name))
    else:
        print("Duplicate values found in the '{}' column:".format(column_name))
        print(duplicate_column)
        # Generate a unique number and assign it to duplicates
        unique_number = max(df[column_name]) + 1
        df.loc[duplicate_column.index, column_name] = unique_number
        print("Duplicates assigned with unique number:", unique_number)
    return df

def replace_with_commas(df, column_name):
    # Replacing whitespaces or 'and' with a comma and eliminating extra commas
    df[column_name] = df[column_name].str.replace(r'\s+and\s+|\s+', ',').str.replace(r',+', ',').str.strip(',')
    return df


def handle_multiple_values(df, column_name):
    # Check if the column contains multiple values
    if df[column_name].str.contains(',|;').any():
        # Split the values by comma or semicolon and create new rows for each value
        df = df.assign(**{column_name: df[column_name].str.split(',|;')}).explode(column_name)
        # Strip leading and trailing whitespaces from the values
        df[column_name] = df[column_name].str.strip()
    return df


### Data handling and cleaning for the user profile dataset.

In [144]:
#Cleaning user_profile_df columns
user_profile_df = clean_column_names(user_profile_df)
print(user_profile_df.columns)
print("\nHead of the user profile dataframe.")
print(user_profile_df.head())

Index(['userid', 'degree_1', 'degree_1_specializations', 'known_languages', 'key_skills', 'career_objective'], dtype='object')

Head of the user profile dataframe.
   userid degree_1        degree_1_specializations                   known_languages                                         key_skills                                   career_objective
0     NaN      NaN                             NaN                               NaN                                                NaN                                                NaN
1  1001.0     B.E.  Computer Science & Engineering           English, Marathi, Hindi  C; Java; Keras; Flask; Deep Learning; Selenium...  Computer Engineering student with good technic...
2  1002.0     B.E.  Computer Science & Engineering                     Hindi English      Java; Neural Networks; AI; Python; Html5; CPP  Interested in working under company offering A...
3  1003.0     B.E.  Computer Science & Engineering                               NaN    

In [145]:
#Cleaning the data in user_profile_df
#check for null values. 
user_profile_df = check_for_null_values(user_profile_df)


#since there are no null values, we check for duplicates for the user identifiers. 
#user_profile_df = check_for_duplicates(user_profile_df, 'userid')

#keeping the data consistent.
user_profile_df = replace_with_commas(user_profile_df, 'known_languages')
user_profile_df = handle_multiple_values(user_profile_df, 'known_languages')
user_profile_df = handle_multiple_values(user_profile_df, 'key_skills')

#cleaning the dataframe
user_profile_df = clean_dataframe(user_profile_df)

#to view that the database is set correctly.
pd.options.display.max_rows = user_profile_df.shape[0]
pd.options.display.max_columns = user_profile_df.shape[1]
pd.options.display.width = 1000

No null values found


Viewing the modifications

In [146]:

# Printing the  DataFrame
print("DataFrame:\n")
print(user_profile_df)

DataFrame:

      userid degree_1        degree_1_specializations                               known_languages                                         key_skills                                   career_objective
0        0.0      NaN                             NaN                                           NaN                                                NaN                                                NaN
1     1001.0     b.e.  computer_science_&_engineering                                       english                                                  c  computer_engineering_student_with_good_technic...
1     1001.0     b.e.  computer_science_&_engineering                                       english                                               java  computer_engineering_student_with_good_technic...
1     1001.0     b.e.  computer_science_&_engineering                                       english                                              keras  computer_engineering_student

Data handling and cleaning for the user rating dataset.

Handling Missing Values: Dropping rows with missing ratings ensures the data used for ratings analysis is complete and reliable.

Check Data Consistency: Verifying consistency in user and course IDs ensures data integrity between the ratings and other datasets.

In [147]:
# Transforming the data.
user_ratings_df = clean_column_names(user_ratings_df)

print(user_ratings_df.columns)
user_ratings_df = user_ratings_df.T.reset_index()


# Display the updated DataFrame
print(user_ratings_df)

#to view that the database is set correctly.
pd.options.display.max_rows = user_ratings_df.shape[0]
pd.options.display.max_columns = user_ratings_df.shape[1]
pd.options.display.width = 1000

Index(['user_id', 'course_id', 'unnamed:_2', 'unnamed:_3', 'unnamed:_4', 'unnamed:_5', 'unnamed:_6', 'unnamed:_7', 'unnamed:_8', 'unnamed:_9', 'unnamed:_10', 'unnamed:_11', 'unnamed:_12', 'unnamed:_13', 'unnamed:_14', 'unnamed:_15', 'unnamed:_16', 'unnamed:_17', 'unnamed:_18', 'unnamed:_19', 'unnamed:_20'], dtype='object')
          index       0       1  ...     422     423     424
0       user_id     NaN  2001.0  ...  2422.0  2423.0  2424.0
1     course_id  1001.0     5.0  ...     0.0     2.0     0.0
2    unnamed:_2  1002.0     3.0  ...     4.0     5.0     0.0
3    unnamed:_3  1003.0     1.0  ...     4.0     2.0     2.0
4    unnamed:_4  1004.0     0.0  ...     5.0     0.0     4.0
5    unnamed:_5  1005.0     2.0  ...     2.0     5.0     1.0
6    unnamed:_6  1006.0     1.0  ...     4.0     3.0     3.0
7    unnamed:_7  1007.0     0.0  ...     5.0     2.0     5.0
8    unnamed:_8  1008.0     0.0  ...     1.0     0.0     4.0
9    unnamed:_9  1009.0     0.0  ...     2.0     1.0     1.0
10  

View results for the user rating dataset.

In [148]:
# Printing the  DataFrame
print("DataFrame:\n")
# Display the resulting DataFrame
print(user_ratings_df)

DataFrame:

          index       0       1       2       3       4       5       6       7       8       9      10      11      12      13      14      15      16      17      18      19      20      21      22      23      24      25      26      27      28      29      30      31      32      33      34      35      36      37      38      39      40      41      42      43      44      45      46      47      48      49      50      51      52      53      54      55      56      57      58      59      60      61      62      63      64      65      66      67      68      69      70      71      72      73      74      75      76      77      78      79      80      81      82      83      84      85      86      87      88      89      90      91      92      93      94      95      96      97      98      99     100     101     102     103     104     105     106     107     108     109     110     111     112     113     114     115     116     117     118     119     120     

Data handling and cleaning for the master dataset.

Handling Missing Values.

Cleaning Text Data: Lowercasing course names and stripping extra spaces in degree specializations standardizes the text for consistency.

Feature Engineering: Creating dummy variables for degree specializations and campus converts categorical data into a format suitable for modeling. - ASK ABOUT THIS SECTION.

In [149]:
# Handling the column names cleaning and manipulation.
master_dataset_df = clean_column_names(master_dataset_df)
print(master_dataset_df.columns)
print(master_dataset_df.head())


Index(['sr', 'degree_1', 'degree_1_specializations', 'campus', 'key_skills'], dtype='object')
     sr degree_1 degree_1_specializations  campus key_skills
0  1001     B E                Mechanical  MITCOE      CATIA
1  1002     B E                Mechanical  MITCOE      CATIA
2  1003     B E                Mechanical  MITAOE      CATIA
3  1004     B E                Mechanical  MITCOE      CATIA
4  1005     B E                Mechanical  MITCOE      CATIA


In [150]:
#Course Identifier.

#check for null values. 
master_dataset_df = check_for_null_values(master_dataset_df)


#since there are no null values, we check for duplicates for the degree identifiers. 
# Check for duplicates in the 'Sr' column
#master_dataset_df = check_for_duplicates(master_dataset_df, 'sr')

master_dataset_df = handle_multiple_values(master_dataset_df, 'key_skills')
master_dataset_df = handle_multiple_values(master_dataset_df, 'campus')

#cleaning the dataframe
master_dataset_df = clean_dataframe(master_dataset_df)

#to view that the database is set correctly.
pd.options.display.max_rows = master_dataset_df.shape[0]
pd.options.display.max_columns = master_dataset_df.shape[1]
pd.options.display.width = 1000


No null values found


View results

In [151]:
# Printing the  DataFrame
print("DataFrame:\n")
print(master_dataset_df)

DataFrame:

         sr degree_1                    degree_1_specializations    campus                         key_skills
0      1001      b_e                                  mechanical    mitcoe                              catia
1      1002      b_e                                  mechanical    mitcoe                              catia
2      1003      b_e                                  mechanical    mitaoe                              catia
3      1004      b_e                                  mechanical    mitcoe                              catia
4      1005      b_e                                  mechanical    mitcoe                              catia
5      1006      b_e                                  mechanical    mitcoe                              catia
6      1007      b_e                                  mechanical    mitaoe                              catia
7      1008      b_e                                  mechanical    mitcoe                              cati