In [1]:
import pandas as pd
import numpy as np 

In [2]:
import json 

In [3]:

json_path = '../datasets/program_details.json'

with open(json_path, 'r', encoding="utf8") as file:
    program_details = json.load(file)

valid_program_details = [
    entry for entry in program_details if entry is not None and isinstance(entry, dict)
]

pdf = pd.DataFrame(valid_program_details)




In [4]:
pdf

Unnamed: 0,id,Description/content,Course Organisation
0,8305,The Master's in Computer Science offers a comp...,"In the Master's programme, the schedule can be..."
1,4439,Combining the breadth of a traditional course ...,"In the first semester, the module theoretical ..."
2,4870,The Master's in Applied Computer Science progr...,Three semesters:Semesters one and two: theory ...
3,5616,Based on their undergraduate experience the st...,"The programme is organised in modules, with an..."
4,4455,Computer Science is one of the drivers of tech...,You will choose modules from the following fiv...
...,...,...,...
137,6262,The MSc Cognitive Systems programme is a two-y...,The study programme starts off with lectures t...
138,4407,Medical technology is one of the main research...,Besides compulsory and elective modules with a...
139,9595,New digital technologies provide companies wit...,The MSc in Information Engineering programme c...
140,3724,Environmental information technologies such as...,The first semester takes place at the Eberswal...


In [5]:
pdf.isnull().sum()

id                     0
Description/content    0
Course Organisation    0
dtype: int64

In [6]:
pdf.duplicated().sum()

0

# Merge both 


In [7]:
pdf['tags'] = pdf['Course Organisation'] + pdf['Description/content']

In [8]:
pdf.head()

Unnamed: 0,id,Description/content,Course Organisation,tags
0,8305,The Master's in Computer Science offers a comp...,"In the Master's programme, the schedule can be...","In the Master's programme, the schedule can be..."
1,4439,Combining the breadth of a traditional course ...,"In the first semester, the module theoretical ...","In the first semester, the module theoretical ..."
2,4870,The Master's in Applied Computer Science progr...,Three semesters:Semesters one and two: theory ...,Three semesters:Semesters one and two: theory ...
3,5616,Based on their undergraduate experience the st...,"The programme is organised in modules, with an...","The programme is organised in modules, with an..."
4,4455,Computer Science is one of the drivers of tech...,You will choose modules from the following fiv...,You will choose modules from the following fiv...


In [9]:
pdf['tags'][0]

'In the Master\'s programme, the schedule can be arranged flexibly. Students individually choose their areas of specialisation from a wide variety of subjects.Mandatory Modules:Computer ScienceComputer Science LabResearch Methods & EthicsCritical Reading and DiscussionElective Modules: Professional SkillsChoose from:Design ThinkingEntrepreneurship and InnovationLaw and ComplianceManagement and LeadershipTechnology Communication and TransferFocus AreasData and AIAlgorithms and FoundationsSystemsDigital HealthSecurity EngineeringOpen TrackMaster\'s thesisThe Master\'s in Computer Science offers a comprehensive and challenging education. The course content is tailor-made for students who want to take their IT expertise to a new level.Students can choose from six different tracks. This\xa0track structure allows our students to discover the diversity of computer science while specialising in the area that excites them the most. The tracks are based on our interdisciplinary research clusters

In [10]:
df = pd.read_csv("../datasets/MergedData.csv")
df
df.rename(columns={'Id':'id'}, inplace=True)

In [11]:
final_df = pd.merge(pdf,df, on = 'id')

In [12]:
Basic_Df = final_df[['id','tags','CourseNameShort','Academy']]

In [13]:
Basic_Df

Unnamed: 0,id,tags,CourseNameShort,Academy
0,8305,"In the Master's programme, the schedule can be...",Computer Science,University Of Potsdam
1,8305,"In the Master's programme, the schedule can be...",Computer Science,University Of Potsdam
2,8305,"In the Master's programme, the schedule can be...",Computer Science,University Of Potsdam
3,4439,"In the first semester, the module theoretical ...",Computer Science,University Of Stuttgart
4,4439,"In the first semester, the module theoretical ...",Computer Science,University Of Stuttgart
...,...,...,...,...
637,9595,The MSc in Information Engineering programme c...,Information Engineering,Technical University Of Munich
638,9595,The MSc in Information Engineering programme c...,Information Engineering,Technical University Of Munich
639,3724,The first semester takes place at the Eberswal...,Forest Information Technology (FIT),Eberswalde University For Sustainable Development
640,3724,The first semester takes place at the Eberswal...,Forest Information Technology (FIT),Eberswalde University For Sustainable Development


In [14]:
Basic_Df.duplicated().sum()

500

In [15]:
Basic_Df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Basic_Df.drop_duplicates(inplace=True)


## NLTK 

!pip install nltk

In [16]:
import nltk 

In [17]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [18]:
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [None]:
Basic_Df['tags'] = Basic_Df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Basic_Df['tags'] = Basic_Df['tags'].apply(stem)


In [21]:

Basic_Df.reset_index(drop=True, inplace=True)


# By Paragraph 

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = tfidf.fit_transform(Basic_Df['tags']).toarray()

def recommend_for_paragraph(paragraph):
    paragraph_vector = tfidf.transform([paragraph]).toarray()
    
    similarity_scores = cosine_similarity(paragraph_vector, vectors).flatten()
    
    top_indices = sorted(list(enumerate(similarity_scores)), reverse=True, key=lambda x: x[1])[:10]
    
    print(f"{'ID':<10} {'Course Name':<70} {'Academy':<40} {'Similarity (%)':<20}")
    print("-" * 160)
    
    for i in top_indices:
        course_id = Basic_Df.iloc[i[0]].id
        course_name_short = Basic_Df.iloc[i[0]].CourseNameShort
        academy = Basic_Df.iloc[i[0]].Academy
        similarity_percentage = round(i[1] * 100, 2)  
        
        print(f"{course_id:<10} {course_name_short:<70} {academy:<40} {similarity_percentage:<20.2f}")


In [42]:

# student_paragraph = """
# I am passionate about artificial intelligence, machine learning, and data analysis. 
# I have worked on several projects involving predictive modeling and computer vision. 
# I also have a keen interest in natural language processing and software development.
# """

# # Web
# student_paragraph = """I have a strong interest in web engineering and have developed several projects involving full-stack web development. My expertise includes building scalable web applications using modern frameworks like React, Angular, and Node.js. I am proficient in designing and implementing RESTful APIs, database management with SQL and NoSQL technologies, and ensuring responsive, user-friendly designs. I have experience in deploying web applications using cloud platforms like AWS and Azure, and I am passionate about exploring web security, performance optimization, and progressive web apps (PWAs)."""

# # data science
# student_paragraph = """I am passionate about data science and analytics, with hands-on experience in data preprocessing, statistical modeling, and predictive analytics. I have worked on projects involving machine learning algorithms like regression, classification, and clustering, using tools such as Python, R, and SQL. I have a strong understanding of data visualization techniques with libraries like Matplotlib and Tableau. Additionally, I am skilled in working with large datasets, data wrangling, and creating dashboards to derive actionable insights."""

# AI 
# student_paragraph = """"My interests lie in artificial intelligence and machine learning. I have developed several projects focusing on neural networks, deep learning, and reinforcement learning. I have experience using frameworks like TensorFlow and PyTorch for building and deploying models. My projects include image recognition, NLP-based sentiment analysis, and predictive modeling. I am also keen to explore the ethical and societal impacts of AI and its applications across industries."""

# # Game 
student_paragraph = """I am highly enthusiastic about game development and have created several 2D and 3D games using Unity and Unreal Engine. My expertise includes programming gameplay mechanics, designing levels, and integrating animations. I have experience in languages like C# and C++, as well as working with game physics and AI for character behavior. I am particularly interested in exploring VR/AR technologies and their applications in gaming and beyond."""

# # Bioinformatik
# student_paragraph = """I have a keen interest in bioinformatics and its application in analyzing biological data. I have worked on projects involving genome sequence analysis, protein structure prediction, and phylogenetic tree construction. My skills include working with tools like BLAST, Biopython, and R for statistical analysis of biological datasets. I am passionate about exploring how computational approaches can drive advancements in genetics, drug discovery, and personalized medicine.""" 

# # Robotics
# student_paragraph = """My interest lies in robotics and automation, with hands-on experience in building autonomous robots using Arduino and Raspberry Pi. I have worked on projects involving robotic arm programming, obstacle detection using sensors, and line-following robots. I am proficient in control systems, kinematics, and programming languages like Python and C. I am excited about exploring applications of robotics in industries like healthcare, manufacturing, and space exploration"""

# Blockchain 
# student_paragraph = """"I am fascinated by blockchain technology and its potential applications beyond cryptocurrency. I have experience in building decentralized applications (dApps) using Ethereum and smart contracts written in Solidity. My projects include creating secure voting systems and supply chain management platforms. I am also interested in exploring consensus algorithms, blockchain scalability, and integrating blockchain with emerging technologies like IoT and AI."""

# # # Cloud Computing
# student_paragraph = """"My focus area is cloud computing, with experience in deploying and managing applications on cloud platforms like AWS, Google Cloud, and Azure. I have worked on designing scalable architectures, setting up virtual machines, and implementing serverless solutions. My knowledge includes storage solutions, load balancing, and containerization technologies like Docker and Kubernetes. I am particularly interested in exploring the intersection of cloud computing with AI and IoT."""

# # Cybersecurity 
# student_paragraph = """"I am highly interested in cybersecurity and its applications in protecting sensitive information and systems. I have worked on securing networks, implementing encryption algorithms, and identifying vulnerabilities in software applications. My skills include penetration testing, ethical hacking, and working with tools like Wireshark, Metasploit, and Kali Linux. I am also passionate about learning advanced topics like cryptography, intrusion detection, and cyber forensics."""


In [43]:
recommend_for_paragraph(student_paragraph)

ID         Course Name                                                            Academy                                  Similarity (%)       Top Words                                         
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
7711       Cybersecurity (MSc)                                                    Saarland University                      16.89                networks, cryptography, cybersecurity             
6236       Applied Research In Computer Science (MSc)                             Hof University Of Applied Sciences       8.68                 systems, algorithms, cyber, like                  
4775       Master'S In Industrial Informatics                                     University Of Applied Sciences Emden/Leer 7.28                 systems, like, information, cyber                 
4591    

# For Frontend 


In [None]:
import pickle

In [None]:
pickle.dump(Basic_Df,open('../frontend/university_df.pkl','wb'))
pickle.dump(similarity,open('../frontend/similarity.pkl','wb'))

In [None]:
pickle.dump(Basic_Df.to_dict() ,open('university_dict.pkl','wb'))
