## IMPORT DATASET FROM GOOGLE DRIVE

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive under /content/gdrive so the dataset path below resolves.
# (The `drive` helper comes from google.colab, so this cell only runs in Colab.)
drive.mount('/content/gdrive')

# Read the MastersPortal CSV export from the mounted Drive into a DataFrame.
# `data` is the raw frame that the cleaning and hashtable cells below work on.
data = pd.read_csv('/content/gdrive/My Drive/ADS/201709301651_masters_portal.csv')

# Display the first few rows of the DataFrame as a quick sanity check of the load
data.head()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,country_name,country_code,university_name,university_rank,program_name,program_type,deadline,duration,language,tution_1_currency,...,tution_2_money,tution_2_type,tuition_price_specification,start_date,ielts_score,structure,academic_req,facts,city,program_url
0,Armenia,ARM,American University of Armenia,,Economics,MSc,2004-07-18T00:00:00Z,,English,EUR,...,2108.0,EU/EEA,Tuition (Year),2018-09-01 00:00:00,6.5,['Quantitative Methods for Economists (Mathema...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-09-01 00:00:00 You can...,['Yerevan'],http://www.mastersportal.eu/studies/71101/econ...
1,Armenia,ARM,American University of Armenia,,Political Science and International Affairs,Master,2031-07-18T00:00:00Z,24 months,English,EUR,...,2500.0,National,Tuition (Year),2018-08-22 00:00:00,6.5,,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-08-22 00:00:00 You can...,['Yerevan'],http://www.mastersportal.eu/studies/71085/poli...
2,Armenia,ARM,American University of Armenia,,Business Administration,MBA,2004-07-18T00:00:00Z,,English,EUR,...,2499.0,EU/EEA,Tuition (Year),2018-09-01 00:00:00,6.5,['Managers with practical knowledge of account...,"<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-09-01 00:00:00 You can...,['Yerevan'],http://www.mastersportal.eu/studies/71102/busi...
3,Armenia,ARM,American University of Armenia,,Computer and Information Science,MSc,,24 months,English,EUR,...,2500.0,National,Tuition (Year),,6.5,['Introduction to Object-Oriented Programming'...,"<section id=""AcademicRequirements""> <h2>Academ...",['Deadline and start date Application deadline...,['Yerevan'],http://www.mastersportal.eu/studies/71104/comp...
4,Armenia,ARM,American University of Armenia,,Industrial Engineering and Systems Management,MEng,2031-07-18T00:00:00Z,24 months,English,EUR,...,2500.0,National,Tuition (Year),2018-08-22 00:00:00,6.5,"['Probability Theory', 'Analysis and Design of...","<section id=""AcademicRequirements""> <h2>Academ...",['Starting in 2018-08-22 00:00:00 You can...,['Yerevan'],http://www.mastersportal.eu/studies/71103/indu...


## CLEANING THE DATA 



In [None]:
## Normalise the text columns to lowercase so that they can be compared
## case-insensitively with the (lowercased) user input in the filters.
import numpy as np

# Columns holding text values that the filters compare against
cols = ['program_name', 'program_type', 'language', 'tution_1_currency', 'tution_2_type']

# Lowercase every string cell; anything that is not a string (e.g. NaN) is left untouched.
# A column-wise Series.map is used because DataFrame.applymap is deprecated in recent pandas.
data[cols] = data[cols].apply(
    lambda column: column.map(lambda cell: cell.lower() if isinstance(cell, str) else cell)
)


## Column duration: some values are in months, some in days, some not specified.
## We keep only the number of months (as a string of digits) so it can be compared
## with the user input.

# Average month length used to convert day-based durations into months
DAYS_PER_MONTH = 30.5

duration_raw = data['duration']

# The unit must be detected BEFORE the text is stripped down to its digits,
# otherwise the word 'day' is already gone and nothing is ever converted.
given_in_days = duration_raw.str.contains('day', case=False, na=False)

# Numerical part of the duration (NaN where no number is present)
duration_num = duration_raw.str.extract(r'(\d+)', expand=False).astype(float)

# Convert day-based durations to (rounded) months
duration_num.loc[given_in_days] = (duration_num.loc[given_in_days] / DAYS_PER_MONTH).round()

# Write the result back as digit strings ('24'); missing durations stay NaN
data['duration'] = duration_num.map(
    lambda months: str(int(months)) if pd.notna(months) else np.nan
)


## Fixing the tuition column: 'free' is denoted in the currency column while the
## amount itself is NaN, so those amounts are replaced with 0.
# NOTE(review): this fills 'tution_2_money', while filter_tuition_cost reads
# 'tution_1_money' - confirm which column is intended.
data.loc[(data['tution_1_currency'] == 'free') & (data['tution_2_money'].isna()), 'tution_2_money'] = 0




## DEFINE DATA STRUCTURE

In [None]:
# Initialize Program class
class MasterProgram:
    """One master's program; every keyword argument passed in becomes an attribute."""

    def __init__(self, **attributes):
        # **attributes accepts any number of named values, so the set of fields
        # does not have to be fixed in advance.
        for name in attributes:
            setattr(self, name, attributes[name])

    def __str__(self):
        """Return '<program_name> at <university_name>'."""
        return "{} at {}".format(self.program_name, self.university_name)


## CREATE HASH TABLES

In [None]:
def create_hashtable(data):
    """Group the rows of ``data`` into a dict of MasterProgram lists.

    Every row is turned into a MasterProgram built from all of its columns.
    The dict key is "<program_name>_<university_name>"; rows that share a key
    are collected in the same list, in the order they are encountered.
    """
    buckets = {}
    for _, record in data.iterrows():
        bucket_key = "{}_{}".format(record['program_name'], record['university_name'])
        entry = MasterProgram(**record)
        buckets.setdefault(bucket_key, []).append(entry)
    return buckets

# Build the lookup table once from the cleaned DataFrame; the filter functions
# and main() below read from this module-level variable.
mastermatch_hashtable = create_hashtable(data)


## DEFINE POSSIBLE FILTERS


In [None]:
# Define filter functions


def filter_country(mastermatch_hashtable, country):
    """Return the programs whose country name contains ``country`` (case-insensitive).

    Raises ValueError when no program matches.
    """
    needle = str(country).lower()
    matches = []
    for programs in mastermatch_hashtable.values():
        for program in programs:
            if needle in str(program.country_name).lower():
                matches.append(program)
    if len(matches) == 0:
        raise ValueError(f"No programs found in {country}. Please try again with a different country.")
    return matches


def filter_program_name(mastermatch_hashtable, program_name):
    """Filter programs by field of study.

    Returns the programs whose program name contains ``program_name``
    (case-insensitive substring match).

    Raises:
        ValueError: if no program matches. The sibling filters signal "no match"
            the same way, and the retry loop in main() relies on this exception
            to ask the user again (previously this function printed a message
            and returned None, which ended the search silently).
    """
    wanted = str(program_name).lower()
    filtered_programs = [
        program
        for program_list in mastermatch_hashtable.values()
        for program in program_list
        if wanted in str(program.program_name).lower()
    ]
    if not filtered_programs:
        raise ValueError(f"No programs found in {program_name}. Please try again.")
    return filtered_programs


def filter_program_type(mastermatch_hashtable, program_type):
    """Filter programs by program type.

    Returns the programs whose program type equals ``program_type``
    (case-insensitive, surrounding whitespace of the input ignored).
    Programs whose program type is not a string (e.g. NaN for a missing
    value) are skipped instead of crashing on ``.lower()``.

    Raises:
        ValueError: if no program matches.
    """
    wanted = str(program_type).strip().lower()
    filtered_programs = [
        program
        for program_list in mastermatch_hashtable.values()
        for program in program_list
        if isinstance(program.program_type, str) and program.program_type.lower() == wanted
    ]
    if not filtered_programs:
        raise ValueError(f"No programs found with '{program_type}' program type. Please try again.")
    return filtered_programs


def filter_language(mastermatch_hashtable, language):
    """Return the programs whose language contains ``language`` (case-insensitive).

    Raises ValueError when no program matches.
    """
    needle = str(language).lower()
    matches = []
    for programs in mastermatch_hashtable.values():
        matches.extend(
            program for program in programs
            if needle in str(program.language).lower()
        )
    if matches:
        return matches
    raise ValueError(f"No programs found with '{language}' language. Please try again.")

def filter_ielts(mastermatch_hashtable, min_ielts_score, max_ielts_score):
    """Return the programs whose IELTS score lies in [min_ielts_score, max_ielts_score].

    Both bounds are inclusive. Raises ValueError when no program matches.
    """
    in_range = [
        program
        for programs in mastermatch_hashtable.values()
        for program in programs
        if min_ielts_score <= program.ielts_score <= max_ielts_score
    ]
    if in_range:
        return in_range
    raise ValueError(f"No programs found with an IELTS score between '{min_ielts_score}' and '{max_ielts_score}'. Please try again.")

# Note: the university name is also part of the hashtable key, but this filter compares against each program's university_name attribute.
def filter_uni_name(mastermatch_hashtable, university_name):
    """Return the programs whose university name contains ``university_name``.

    The comparison is a case-insensitive substring match.
    Raises ValueError when no program matches.
    """
    needle = str(university_name).lower()
    matches = []
    for programs in mastermatch_hashtable.values():
        for program in programs:
            if needle in str(program.university_name).lower():
                matches.append(program)
    if matches:
        return matches
    raise ValueError(f"No programs found at {university_name}. Please Try again. ")


def filter_duration(mastermatch_hashtable, duration):
    """Filter programs by duration in months.

    The first whole number found in ``duration`` (e.g. 24, "24" or "24 months")
    is compared for equality with the first whole number found in each
    program's ``duration`` attribute. Comparing numbers instead of substrings
    prevents an input of "2" from also matching "12" or "24".

    Raises:
        ValueError: if no program has the requested duration, or if
            ``duration`` contains no number at all.
    """
    import re

    def months(value):
        # First run of digits in the text form of the value, or None if there is none
        found = re.search(r'\d+', str(value))
        return int(found.group()) if found else None

    wanted = months(duration)
    filtered_programs = []
    if wanted is not None:
        filtered_programs = [
            program
            for program_list in mastermatch_hashtable.values()
            for program in program_list
            if months(program.duration) == wanted
        ]
    if not filtered_programs:
        raise ValueError(f"No programs found with duration {duration}. Please try again with a different duration.")
    return filtered_programs

def filter_currency(mastermatch_hashtable, currency):
    """Filter programs by tuition currency.

    The comparison is case-insensitive and ignores surrounding whitespace in
    ``currency``. The set of valid currencies is derived from the programs in
    ``mastermatch_hashtable`` itself (not from a global DataFrame), so the
    function works on any table of programs it is given.

    Raises:
        TypeError: if ``currency`` is not a string.
        ValueError: if no program in the table uses ``currency``; the message
            lists the currencies that are available.
    """
    if not isinstance(currency, str):
        raise TypeError("The 'currency' argument must be a string. Please input a string")
    currency = currency.strip().lower()  # Convert to lowercase to be able to compare it

    # Only real strings count as currencies; missing values (NaN) are ignored
    valid_currencies = {
        program.tution_1_currency.lower()
        for program_list in mastermatch_hashtable.values()
        for program in program_list
        if isinstance(program.tution_1_currency, str)
    }
    if currency not in valid_currencies:
        raise ValueError(f"'{currency}' is not a valid currency. Valid options are: {sorted(valid_currencies)}")

    return [
        program
        for program_list in mastermatch_hashtable.values()
        for program in program_list
        if isinstance(program.tution_1_currency, str) and program.tution_1_currency.lower() == currency
    ]


def filter_tuition_cost(mastermatch_hashtable, min_cost, max_cost):
    """Return the programs whose tuition lies in [min_cost, max_cost] (inclusive).

    If either bound is not an int or float, a message is printed and None is
    returned. Raises ValueError when the bounds are valid but nothing matches.
    """
    for bound in (min_cost, max_cost):
        if not isinstance(bound, (int, float)):
            print("Invalid input. Minimum and maximum cost must be a number.")
            return None

    affordable = [
        program
        for programs in mastermatch_hashtable.values()
        for program in programs
        if min_cost <= program.tution_1_money <= max_cost
    ]
    if affordable:
        return affordable
    raise ValueError("No programs found with that tuition range. Please try again :")



In [None]:
# Define a ranking function (sorts programs by university rank)
def rank_programs(programs):
    """Sort programs by university rank, best (lowest) rank first.

    Programs without a usable rank (None, NaN, or a value that cannot be
    converted to a number) are placed after all ranked programs, keeping their
    original relative order. Sorting directly on the raw value is unreliable:
    every comparison involving NaN is False, which leaves the result in an
    arbitrary order.

    Args:
        programs: iterable of objects with a ``university_rank`` attribute,
            or None.

    Returns:
        A new sorted list; an empty list when ``programs`` is None.
    """
    if programs is None:
        return []

    def rank_key(program):
        try:
            rank = float(program.university_rank)
        except (TypeError, ValueError):
            return (1, 0.0)
        if rank != rank:  # NaN is the only value that is not equal to itself
            return (1, 0.0)
        return (0, rank)

    return sorted(programs, key=rank_key)


In [None]:
# Define a function to create a box around text
import textwrap
def create_box(text):
    """Return ``text`` surrounded by a frame of box-drawing characters.

    The text is split on newlines; every line is left-aligned and padded to the
    length of the longest line so the right border lines up.
    """
    rows = text.split('\n')
    span = max(map(len, rows))
    horizontal = "─" * span
    framed = ['┌' + horizontal + '┐']
    # format(row, str(span)) pads exactly like the f-string spec {row:{span}}
    framed.extend('│' + format(row, str(span)) + '│' for row in rows)
    framed.append('└' + horizontal + '┘')
    return '\n'.join(framed)

In [None]:
# Main Function 

def main():
    """Run the interactive MasterMatch search.

    Steps:
      1. Show the available filters and ask which ones to use
         (comma-separated numbers).
      2. Apply the chosen filters one after the other. Each filter works on the
         result of the previous one, so the filters combine (logical AND).
         A filter that finds nothing - or receives invalid input - is asked
         again until it yields at least one program.
      3. Rank the remaining programs by university rank and print the first
         ``n`` of them, where ``n`` is entered by the user.

    Reads from ``input()`` and prints to stdout; returns None.
    """
    # Display available filters
    print("Welcome to MasterMatch! We will help you to find the MatersProgram that best suits your needs and wishes. \n"
          "You will now be able to filter a large database after your preferences. \n"
          "Available Filters:\n"
          "1. Country\n"
          "2. Program Name\n"
          "3. Program Type\n"
          "4. Language\n"
          "5. IELTS Score Required\n"
          "6. University Name\n"
          "7. Duration\n"
          "8. Tuition Currency\n"
          "9. Tuition Cost\n")

    # Get the filter numbers from the user; keep asking until they all parse as integers
    while True:
        raw_numbers = input("Please enter the above indicated numbers of the filters you would like to use, separated by commas: ")
        try:
            filter_numbers = [int(num.strip()) for num in raw_numbers.split(',')]
            break
        except ValueError:
            print("Invalid input. Please enter only numbers separated by commas.")

    def ask_text(prompt):
        # Builds a function that asks for one free-text filter argument
        return lambda: (input(prompt),)

    def ask_range(min_prompt, max_prompt):
        # Builds a function that asks for a numeric (minimum, maximum) pair;
        # float() raises ValueError on bad input, which triggers a retry below
        return lambda: (float(input(min_prompt)), float(input(max_prompt)))

    # Filter number -> (filter function, function that asks for its arguments)
    available_filters = {
        1: (filter_country, ask_text("Enter the country: ")),
        2: (filter_program_name, ask_text("Enter the program name: ")),
        3: (filter_program_type, ask_text("Enter the program type: ")),
        4: (filter_language, ask_text("Enter the language: ")),
        5: (filter_ielts, ask_range("Enter the minimum IELTS score that the program should require: ",
                                    "Enter the maximum IELTS score that the program should require: ")),
        6: (filter_uni_name, ask_text("Enter the University Name:  ")),
        7: (filter_duration, ask_text("Please enter the desired duration in Months: ")),
        8: (filter_currency, ask_text("Enter the currency: ")),
        9: (filter_tuition_cost, ask_range("Enter the minimum tuition cost (currencies may vary, best is to also filter currency ): ",
                                           "Enter the maximum tuition cost(currencies may vary, best is to also filter currency ): ")),
    }

    # Start with all programs of the hashtable, then narrow the selection filter by filter
    filtered_programs = [program for program_list in mastermatch_hashtable.values() for program in program_list]

    for filter_number in filter_numbers:
        if filter_number not in available_filters:
            print(f"Invalid filter number ({filter_number}). Skipping this filter.")
            continue

        filter_function, ask_arguments = available_filters[filter_number]
        while True:  # keep asking until the input is valid and there is at least one match
            try:
                arguments = ask_arguments()
                # The filter functions expect a {key: [programs]} mapping, so the
                # current selection is wrapped in a one-entry dict. Passing the
                # full hashtable here would discard every earlier filter.
                result = filter_function({"current": filtered_programs}, *arguments)
            except ValueError as error:
                print(error)
                continue
            if result is None:
                # Some filters report a problem by printing and returning None
                continue
            filtered_programs = result
            break

    # Rank the filtered programs
    ranked_programs = rank_programs(filtered_programs)
    if not ranked_programs:
        print("\nUnfortunately, no Program matches your criteria. Change the filters and try again!\n")
        return

    # Ask the user how many programs they want to see
    n = input("Enter the maximum number of Master's Programs you would like to be shown: ")
    try:
        n = int(n)
    except ValueError:
        print("n is not a number! Please enter an integer number")
        return
    if n < 1:
        # A zero or negative slice bound would show nothing or cut programs off the end
        print("Please enter a number of at least 1.")
        return

    # Display the best-ranked programs, each in its own box
    shown_programs = ranked_programs[:n]
    print(f"\nThe List Below shows {len(shown_programs)} Adequate Universities According To Your Preferences, sorted after University Rank:")
    for rank, program in enumerate(shown_programs, start=1):
        program_details = f"{rank}. {program}"
        print(create_box(textwrap.fill(program_details, width=80)))




In [None]:
# Start the interactive search when this code is executed directly
# (__name__ is "__main__" in that case).
if __name__ == "__main__":  
  main()

Welcome to MasterMatch! We will help you to find the MatersProgram that best suits your needs and wishes. 
You will now be able to filter a large database after your preferences. 
Available Filters:
1. Country
2. Program Name
3. Program Type
4. Language
5. IELTS Score Required
6. University Name
7. Duration
8. Tuition Currency
9. Tuition Cost
10. Tuition Type
Please enter the above indicated numbers of the filters you would like to use, separated by commas: kjansk
Invalid input. Please enter only numbers separated by commas.
Please enter the above indicated numbers of the filters you would like to use, separated by commas: 1^89
Invalid input. Please enter only numbers separated by commas.
Please enter the above indicated numbers of the filters you would like to use, separated by commas: 8293874
Invalid filter number (8293874). Skipping this filter.


KeyboardInterrupt: ignored