### Importing libraries


In [24]:
import tiktoken
import pandas as pd
import openai
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import numpy as np
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4")

#set up OpenAI API credentials
openai.api_key = "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"

### Accessing the datasets

In [25]:
#accessing the course dataset
course_dataset = pd.read_csv("./cleanedDatasets/cleaned_course_dataset.csv", encoding= 'unicode_escape')
course_dataset.head()

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA


In [26]:
#accessing the user profile datasets
user_profile = pd.read_csv("./cleanedDatasets/cleaned_user_profile.csv", encoding= 'unicode_escape')
user_profile.head()

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."
1,1002,B.E.,Computer Science Engineering,['Hindi English'],"['Java', ' Neural Networks', ' AI', ' Python',...",Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ..."
2,1003,B.E.,Computer Science Engineering,['Unknown'],['Unknown'],Unknown,Unknown
3,1004,B.E.,Computer Science Engineering,"['English', ' Hindi', ' Marathi', ' Marwari']","['XML', ' C', ' Java', ' Data Structures', ' P...",Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo..."
4,1005,B.E.,Computer Science Engineering,"['English', 'Hindi ', 'Kashmiri ', ' Urdu']","['XML', ' Word', ' Data Structures', ' Communi...",To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ..."


In [27]:
#accessing the user rating
user_rating = pd.read_csv("./cleanedDatasets/cleaned_user_ratings.csv", encoding= 'unicode_escape')
user_rating.head()

Unnamed: 0,course_id,user_id,rating
0,2001,1001,5
1,2001,1002,3
2,2001,1003,1
3,2001,1004,0
4,2001,1005,2


In [28]:
#creating meaningful embeddings for each course based on their characteristics 

#creating a nfew field - description - includes all the information about a course
# Assuming 'course_dataset' is your DataFrame containing course data.
course_dataset['description'] = (course_dataset['degree_1'].astype(str) + " in " +
                                 course_dataset['degree_1_specializations'] + " at " +
                                 course_dataset['campus'] + ". Key skills: " +
                                 course_dataset['key_skills_str'])
course_dataset

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str,description
0,1001,B.E.,Mechanical,MITCOE,['CATIA'],CATIA,B.E. in Mechanical at MITCOE. Key skills: CATIA
1,1002,B.E.,Mechanical,MITCOE,['CATIA'],CATIA,B.E. in Mechanical at MITCOE. Key skills: CATIA
2,1003,B.E.,Mechanical,MITAOE,['CATIA'],CATIA,B.E. in Mechanical at MITAOE. Key skills: CATIA
3,1004,B.E.,Mechanical,MITCOE,['CATIA'],CATIA,B.E. in Mechanical at MITCOE. Key skills: CATIA
4,1005,B.E.,Mechanical,MITCOE,['CATIA'],CATIA,B.E. in Mechanical at MITCOE. Key skills: CATIA
...,...,...,...,...,...,...,...
9995,10996,B.E.,Electronics Telecommunication Engineering,MITCOE,"['EmbeddedC, MATLAB, Cprogramming, Keil']","EmbeddedC, MATLAB, Cprogramming, Keil",B.E. in Electronics Telecommunication Engine...
9996,10997,B.E.,Electronics Telecommunication Engineering,"MIT,Pune","['EmbeddedC, MATLAB, Cprogramming, Keil']","EmbeddedC, MATLAB, Cprogramming, Keil",B.E. in Electronics Telecommunication Engine...
9997,10998,M TeCh,Electronics Telecommunication Engineering,MIT WPU,"['EmbeddedC, MATLAB, Cprogramming, Keil']","EmbeddedC, MATLAB, Cprogramming, Keil",M TeCh in Electronics Telecommunication Engi...
9998,10999,B.E.,Electronics Telecommunication Engineering,MITAOE,"['AmazonWebServiCes, C CPP, Arduino, MongoDB, ...","AmazonWebServiCes, C CPP, Arduino, MongoDB, Li...",B.E. in Electronics Telecommunication Engine...


In [29]:
# Retrieve the list of available models
models = openai.Model.list()

# Print the list of models
print(models)

{
  "object": "list",
  "data": [
    {
      "id": "gpt-3.5-turbo-16k",
      "object": "model",
      "created": 1683758102,
      "owned_by": "openai-internal"
    },
    {
      "id": "gpt-3.5-turbo-16k-0613",
      "object": "model",
      "created": 1685474247,
      "owned_by": "openai"
    },
    {
      "id": "whisper-1",
      "object": "model",
      "created": 1677532384,
      "owned_by": "openai-internal"
    },
    {
      "id": "davinci-002",
      "object": "model",
      "created": 1692634301,
      "owned_by": "system"
    },
    {
      "id": "gpt-3.5-turbo",
      "object": "model",
      "created": 1677610602,
      "owned_by": "openai"
    },
    {
      "id": "dall-e-2",
      "object": "model",
      "created": 1698798177,
      "owned_by": "system"
    },
    {
      "id": "tts-1-hd-1106",
      "object": "model",
      "created": 1699053533,
      "owned_by": "system"
    },
    {
      "id": "tts-1-hd",
      "object": "model",
      "created": 1699046015,
 

In [30]:
#to reorganize and simplify the data for analysis
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
llm_user_dataset = pd.DataFrame()
llm_user_dataset['userid'] = user_profile['userid']
llm_user_dataset['degree_1'] = user_profile['degree_1']
llm_user_dataset['degree_1_specializations'] = user_profile['degree_1_specializations']
llm_user_dataset['career_objective'] = user_profile['career_objective']
llm_user_dataset['key_skills_str'] = user_profile['key_skills_str']

#store the career objective and key skills in description
llm_user_dataset['description'] = llm_user_dataset['degree_1'] +" "+ llm_user_dataset['degree_1_specializations'] 
llm_user_dataset.head(5)

Unnamed: 0,userid,degree_1,degree_1_specializations,career_objective,key_skills_str,description
0,1001,B.E.,Computer Science Engineering,Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel...",B.E. Computer Science Engineering
1,1002,B.E.,Computer Science Engineering,Interested in working under company offering A...,"Java, Neural Networks, AI, Python, Html5, ...",B.E. Computer Science Engineering
2,1003,B.E.,Computer Science Engineering,Unknown,Unknown,B.E. Computer Science Engineering
3,1004,B.E.,Computer Science Engineering,Currently a final year student of Computer Eng...,"XML, C, Java, Data Structures, Python, Mo...",B.E. Computer Science Engineering
4,1005,B.E.,Computer Science Engineering,To have a growth oriented and challenging care...,"XML, Word, Data Structures, Communication, ...",B.E. Computer Science Engineering


In [31]:
#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
llm_course_dataset = pd.DataFrame()
llm_course_dataset['sr_'] = course_dataset['sr_']
llm_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
llm_course_dataset['degree_1'] = course_dataset['degree_1']
llm_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
llm_course_dataset['key_skills_str'] = course_dataset['key_skills_str']

#store the career objective and key skills in description
llm_course_dataset['description'] = llm_course_dataset['degree_1'] + llm_course_dataset['degree_1_specializations'] 
llm_course_dataset.head(5)


Unnamed: 0,sr_,campus,degree_1,degree_1_specializations,key_skills_str,description
0,1001,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical
1,1002,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical
2,1003,MITAOE,B.E.,Mechanical,CATIA,B.E. Mechanical
3,1004,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical
4,1005,MITCOE,B.E.,Mechanical,CATIA,B.E. Mechanical


In [32]:
#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = llm_user_dataset['description'] + llm_course_dataset['description']
comb.head(5)

Unnamed: 0,description
0,B.E. Computer Science EngineeringB.E. Mechanical
1,B.E. Computer Science EngineeringB.E. Mechanical
2,B.E. Computer Science EngineeringB.E. Mechanical
3,B.E. Computer Science EngineeringB.E. Mechanical
4,B.E. Computer Science EngineeringB.E. Mechanical


In [33]:
# TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer is used to transform the textual 'description' data into a numerical format that machine learning algorithms can process. 
#This step converts the corpus of descriptions into a matrix of TF-IDF features, highlighting the importance of specific terms in differentiating the documents (descriptions).

#load and preprocess data
data = llm_course_dataset
data = data.dropna()
corpus = data["description"].tolist()
X_train, X_test = train_test_split(data, test_size=0.2)


In [34]:
#extract features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train["description"].tolist())
X_test_features = vectorizer.transform(X_test["description"].tolist())

In [35]:
#compute pairwise similarities using cosine similarity
similarity_matrix = cosine_similarity(X_test_features, X_train_features)

In [37]:
#generate recommendations for each test user
recommendations = []
#print(corpus)
llm_user_dataset = llm_user_dataset.dropna()# removes any rows from the llm_user_dataset DataFrame that contain missing values

for i in range(len(llm_user_dataset)): #iterate over each row in the llm_user_dataset
    user_input = llm_user_dataset.iloc[i]["description"] #extracts the 'description' text of the current user. It uses iloc[i] to get the row at index i (the current user) and ["description"] to get the value from the 'description' column for that row.
    #print(user_input)
    user_index = corpus.index(user_input)
    recommended_items = similarity_matrix[i].argsort()[:-6:-1]
    recommendations.append(recommended_items)

In [38]:
#Compute precision and recall for degree
relevant_items = []
for i in range(len(llm_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1"] == X_test.iloc[i]["degree_1"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

In [39]:
#Print evaluation metrics
print("Evaluation for Degree: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

Evaluation for Degree: 
Accuracy: 0.001
Precision: 1.000
Recall: 0.001
F1 Score: 0.001


In [40]:
#Compute precision and recall for degree spec
relevant_items = []
for i in range(len(llm_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1_specializations"] == X_test.iloc[i]["degree_1_specializations"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)


In [41]:
#Print evaluation metrics
print()
print("Evaluation for Degree Specializations: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")


Evaluation for Degree Specializations: 
Accuracy: 0.004
Precision: 1.000
Recall: 0.002
F1 Score: 0.004


In [42]:
#Compute precision and recall for key skills
relevant_items = []
for i in range(len(llm_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["key_skills_str"] == X_test.iloc[i]["key_skills_str"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

In [43]:
#Print evaluation metrics
print()
print("Evaluation for Key Skills: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")


Evaluation for Key Skills: 
Accuracy: 0.004
Precision: 0.236
Recall: 0.002
F1 Score: 0.004


In [44]:
#CHECKING THE RECOMMENDATIONS OBTAINED 

#load and preprocess data
data = llm_course_dataset
#data = data.dropna()
corpus = data["description"].tolist()

#extract features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

#compute pairwise similarities using cosine similarity
similarity_matrix = cosine_similarity(X)

In [45]:
#generate recommendations based on user input
def llm_recommender(user):
    course_recommendations = []
    
    user_input = llm_user_dataset["description"].loc[llm_user_dataset["userid"] == user].values[0] 
    
    user_index = corpus.index(user_input)
    recommendations = similarity_matrix[user_index].argsort()[:-6:-1]
    
    
    #Print top 5 recommendations
    print("Top 5 Recommendations:")
    for i, index in enumerate(recommendations):
        print(f"{i+1}. {data.loc[index]['sr_']}: {data.loc[index]['description']}")
        course_recommendations.append(data.loc[index]['sr_'])
    
    #return the course ids
    return course_recommendations

In [46]:
#this function does not use key skills
llm_recommender("1001")

Top 5 Recommendations:
1. 7408: B.E. Computer Science Engineering
2. 2954: B.E. Computer Science Engineering
3. 8949: B.E. Computer Science Engineering
4. 8948: B.E. Computer Science Engineering
5. 8947: B.E. Computer Science Engineering


[7408, 2954, 8949, 8948, 8947]

In [47]:
user_profile[user_profile["userid"] == "1001"]

Unnamed: 0,userid,degree_1,degree_1_specializations,known_languages,key_skills,career_objective,key_skills_str
0,1001,B.E.,Computer Science Engineering,"['English', ' Marathi', ' Hindi']","['C', ' Java', ' Keras', ' Flask', ' Deep Lear...",Computer Engineering student with good technic...,"C, Java, Keras, Flask, Deep Learning, Sel..."


In [48]:
course_dataset[course_dataset["sr_"] == 7408]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str,description
6407,7408,B.E.,Computer Science Engineering,MITCOE,"['Java, JavasCript, CPP, Laravel Phpframework ...","Java, JavasCript, CPP, Laravel Phpframework ,...",B.E. in Computer Science Engineering at MITCO...


In [49]:
course_dataset[course_dataset["sr_"] == 2954]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str,description
1953,2954,B.E.,Computer Science Engineering,MITCOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL",B.E. in Computer Science Engineering at MITCO...


In [50]:
course_dataset[course_dataset["sr_"] == 8949]

Unnamed: 0,sr_,degree_1,degree_1_specializations,campus,key_skills,key_skills_str,description
7948,8949,B.E.,Computer Science Engineering,MITAOE,"['C, Java, CPP, HTML, CMStool, MYSQL']","C, Java, CPP, HTML, CMStool, MYSQL",B.E. in Computer Science Engineering at MITAO...
