In [190]:
import pandas as pd
import numpy as np

# Load the source spreadsheet into the working DataFrame `df` used by the later cells.
# NOTE(review): this is an absolute path on one user's machine, so the notebook
# cannot be re-run elsewhere as-is — consider a configurable, relative data path.
df = pd.read_excel('/Users/andreavento/Downloads/dataframe_scaped2.xlsx')

In [191]:
def get_user_preferences(test = 0, importance_of_weights = 0.3):
    """Collect importance ratings for the numerical factors and convert them to weights.

    Args:
        test: When 0, the user is prompted for a 1-10 rating per factor.
            Any other value skips the prompts and uses a fixed set of ratings.
        importance_of_weights: The amount the returned weights sum to.

    Returns:
        dict: Maps each factor's column name to
        ``rating / sum(ratings) * importance_of_weights``.
    """
    if test == 0:
        print("Please rate the importance of the following factors from 1 (least important) to 10 (most important). Each factor has a corresponding number from 1 to 10.")
        factors = {
            'temperature_rating': "Climate: How important is a favorable climate to you?",
            'Cost of Living Index': "Cost Index (the lower, the better): How important is affordability?",
            'Opportunity to make friends (proportion of youth aged 15-29) ': "Opportunity of making friends: How important are social opportunities?",
            'Personal Safety': "Safety: How important is the safety of the campus and surrounding area?"
        }
        weights = {}

        for factor, message in factors.items():
            print(f"\n{message}")
            print("1: Not important, 2: Slightly important, 3: Moderately important, 4: Important, 5: Very important, 6-10: Increasingly critical")
            # Keep asking until a rating in the accepted range is entered.
            while True:
                try:
                    user_input = int(input("Select your importance rating (1-10): "))
                except ValueError:
                    print("Invalid input. Please enter a valid number.")
                    continue
                if 1 <= user_input <= 10:
                    weights[factor] = user_input
                    break
                print("Please select a number between 1 and 10.")
    else:
        weights = {'temperature_rating': 8, 'Cost of Living Index': 7,
                   'Opportunity to make friends (proportion of youth aged 15-29) ': 6, 'Personal Safety': 5}

    # The denominator is built from the accepted ratings only, so rejected
    # (out-of-range) entries can never distort the normalization.
    total = sum(weights.values())
    for factor in weights:
        weights[factor] = (weights[factor] / total) * importance_of_weights

    print(weights)

    return weights

# Build the factor weights non-interactively (test=1 uses the fixed ratings
# defined inside get_user_preferences); the weights sum to 0.3.
weights = get_user_preferences(test = 1, importance_of_weights=0.3)

{'temperature_rating': 0.09230769230769231, 'Cost of Living Index': 0.08076923076923076, 'Opportunity to make friends (proportion of youth aged 15-29) ': 0.06923076923076923, 'Personal Safety': 0.057692307692307696}


In [192]:
def normalize_column(column):
    """Min-max scale a numeric pandas Series so its values span [0, 1].

    Args:
        column: Numeric Series to rescale.

    Returns:
        Series: ``(column - min) / (max - min)``. When every value is the same
        (max == min) the result is all zeros instead of NaN, so a constant
        column contributes nothing to a score rather than invalidating it.
    """
    lowest = column.min()
    value_range = column.max() - lowest
    if value_range == 0:
        # Avoid 0/0: a constant column carries no ranking information.
        return (column - lowest).astype(float)
    return (column - lowest) / value_range

# Columns to min-max normalize; each result is stored alongside the original
# in a new column named '<column>_normalized'.
numerical_columns = ['Personal Safety', 'Opportunity to make friends (proportion of youth aged 15-29) ', 'temperature_rating', 'Cost of Living Index']
for col in numerical_columns:
    df[col + '_normalized'] = normalize_column(df[col])

In [193]:
# calculate the partial scores of the numerical features

def calculate_score(row, weights):
    """Return the weighted sum of a row's normalized factor values.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding one
            '<factor>_normalized' entry per factor in ``weights``.
        weights: dict mapping factor name -> weight.

    Returns:
        The sum over all factors of ``row['<factor>_normalized'] * weight``.
    """
    running_total = 0
    for factor_name in weights:
        # Normalized values live under the factor name plus a fixed suffix.
        factor_value = row[f"{factor_name}_normalized"]
        running_total = running_total + factor_value * weights[factor_name]
    return running_total

# Create the 'score' column: the weighted sum of the normalized numerical
# factors, computed row by row.
df['score'] = df.apply(calculate_score, args=(weights,), axis=1)

In [194]:
# get ranking preferences

def get_ranking_preferences(test=1, importance_of_rankings=0.3):
    """Return normalized weights for the subject-ranking columns.

    Args:
        test: 0 asks the user for a 1-10 rating per ranking column; any other
            value uses a predefined set of ratings.
        importance_of_rankings: The amount the returned weights sum to.

    Returns:
        dict: ranking column name -> ``rating / sum(ratings) * importance_of_rankings``.
    """
    if test != 0:
        # Test scenario: predefined ratings, no prompting.
        ratings = {
            'Rankings for CS.pdf': 10,
            'Rankings for Econ.pdf': 8,
            'Rankings for Stats.pdf': 6,
            'Rankings for DS.pdf': 4,
            'Rankings for Acc&Fin.pdf': 7,
            'Rankings for Business&Management.pdf': 9,
            'Rankings for SocialPolicy.pdf': 3,
        }
    else:
        print("Please rate the importance of the following university rankings from 1 (least important) to 10 (most important). Each ranking type has a corresponding number from 1 to 10.")
        questions = {
            'Rankings for CS.pdf': "Ranking for Computer Science: How important is the CS ranking of the university?",
            'Rankings for Econ.pdf': "Ranking for Economics: How important is the economics ranking of the university?",
            'Rankings for Stats.pdf': "Rankings for Statistics: How important is the Statistics ranking?",
            'Rankings for DS.pdf': "Rankings for Data Science: How important is the university's Data Science reputation?",
            'Rankings for Acc&Fin.pdf': "Rankings for Accounting and Finance: How important is the university's reputation?",
            'Rankings for Business&Management.pdf': "Rankings for Management: How important is the university's reputation?",
            'Rankings for SocialPolicy.pdf': "Rankings for Political Sciences: How important is the university's reputation?",
        }
        ratings = {}
        for column_name, question in questions.items():
            print(f"\n{question}")
            print("1: Not important, 2: Slightly important, 3: Moderately important, 4: Important, 5: Very important, 6-10: Increasingly critical")
            accepted = None
            # Re-prompt until an integer within 1..10 has been entered.
            while accepted is None:
                try:
                    candidate = int(input("Select your importance rating (1-10): "))
                except ValueError:
                    print("Invalid input. Please enter a valid number.")
                    continue
                if candidate < 1 or candidate > 10:
                    print("Please select a number between 1 and 10.")
                else:
                    accepted = candidate
            ratings[column_name] = accepted

    rating_sum = sum(ratings.values())
    return {
        column_name: (rating / rating_sum) * importance_of_rankings
        for column_name, rating in ratings.items()
    }
    
# Use the function defaults (test=1): predefined ranking ratings, no prompting.
ranking_weights = get_ranking_preferences()

In [195]:
# Scale of the ranking boost; passed as `max_rank_boost` to apply_ranking_boost.
max_ranking_boost = 0.02

def parse_rank(rank):
    """Turn a ranking cell into a number.

    Missing values yield None, a string range such as '101-150' yields the
    midpoint of its two bounds, and anything else is converted with int().
    """
    if pd.isna(rank):
        return None

    looks_like_range = isinstance(rank, str) and '-' in rank
    if not looks_like_range:
        # Single position: coerce to an integer.
        return int(rank)

    # Range: average the lower and upper bound.
    lower, upper = (int(bound) for bound in rank.split('-'))
    return (lower + upper) / 2

def apply_ranking_boost(row, ranking_weights, max_rank_boost=0.02):
    """Return the row's score increased by a boost derived from its rankings.

    Args:
        row: Record exposing ``.get(column, default)`` and ``['score']``.
        ranking_weights: dict mapping ranking column name -> weight.
        max_rank_boost: Amount the weights are rescaled to sum to.

    Returns:
        ``row['score']`` plus, for every ranking column with a usable rank,
        ``(weight / sum(weights)) * max_rank_boost / ln(rank + 1)``.
        Note that 1 / ln(rank + 1) is greater than 1 for rank 1, so the total
        boost is not strictly capped at ``max_rank_boost``.
    """
    weight_sum = sum(ranking_weights.values())
    accumulated = 0
    for column, raw_weight in ranking_weights.items():
        position = parse_rank(row.get(column, None))
        if position is None:
            # No ranking recorded for this subject: it contributes nothing.
            continue
        # Share of the boost budget assigned to this ranking column.
        share = (raw_weight / weight_sum) * max_rank_boost
        # The logarithm makes the boost fall off smoothly as the rank worsens.
        accumulated = accumulated + share * (1 / np.log(position + 1))
    return row['score'] + accumulated

# Overwrite 'score' with the previous score plus the ranking boost.
# NOTE(review): this cell is not idempotent — re-running it adds the boost again.
df['score'] = df.apply(apply_ranking_boost, axis=1, args=(ranking_weights, max_ranking_boost))

Now we turn our attention to the categorical variables. Let's first capture the user's preferences.

In [196]:
def get_user_choices(data, category, max_choices=3):
    """Let the user pick preferred values of a categorical column from a numbered list.

    Args:
        data: DataFrame containing the column ``category``.
        category: Name of the column whose unique non-null values are offered.
        max_choices: Maximum number of values the user may select.

    Returns:
        tuple: ``(choices, importance)`` where ``choices`` is the list of
        selected values in the order chosen and ``importance`` is an int in
        1..10, or None when nothing was selected or the rating was invalid.
    """
    unique_values = data[category].dropna().unique()
    # Numbered menu: '1' -> first value, '2' -> second value, ...
    options = {str(position): value for position, value in enumerate(unique_values, start=1)}
    print(f"Available {category}:")
    for k, v in options.items():
        print(f"{k}: {v}")  # Display options as '1: English', '2: French', etc.

    choices = []
    print(f"Select up to {max_choices} preferences for {category}, in order of importance. Type 'skip' to finish.")
    # Stop once every available option has been picked, so the user is never
    # left with nothing valid to enter when there are fewer options than max_choices.
    selectable = min(max_choices, len(options))
    while len(choices) < selectable:
        choice_number = input(f"Enter your choice number ({len(choices) + 1}/{max_choices}) or 'skip': ").strip()
        if choice_number.lower() == 'skip':
            break
        if choice_number in options and options[choice_number] not in choices:
            choices.append(options[choice_number])  # Store the value, not its menu number
        else:
            print("Invalid choice or already selected. Please try again.")

    if not choices:
        print(f'No preferences selected for {category}.')
        return choices, None  # No importance is asked for when nothing was selected

    print(f"You have chosen the following {category}: {choices}")
    # The question is asked once, through the input prompt itself.
    answer = input(f"How important is the category '{category}' to you on a scale from 1 (least important) to 10 (most important)? ")
    try:
        importance = int(answer)
    except ValueError:
        print("Invalid input. Please enter a number.")
        return choices, None

    if not 1 <= importance <= 10:
        print("Please enter a number between 1 and 10.")
        importance = None

    return choices, importance

In [197]:
# Collect the user's language preferences (up to 3 values) and their importance rating
language_choices, language_importance = get_user_choices(df, 'Language')

Available Language:
1: English
2: French
3: German
4: Spanish
5: English and Portuguese
6: Portuguese
Select up to 3 preferences for Language, in order of importance. Type 'skip' to finish.
Invalid choice or already selected. Please try again.
Invalid choice or already selected. Please try again.
Invalid choice or already selected. Please try again.
Invalid choice or already selected. Please try again.
Invalid choice or already selected. Please try again.
Invalid choice or already selected. Please try again.
Invalid choice or already selected. Please try again.
No preferences selected for Language.


In [198]:
# Collect the user's region preferences (up to 3 values) and their importance rating
region_choices, region_importance = get_user_choices(df, 'Region')

Available Region:
1: Europe
2: Middle East & North Africa
3: North America
4: Latin America & Caribbean
5: East Asia & Pacific
6: Sub-Saharan Africa
Select up to 3 preferences for Region, in order of importance. Type 'skip' to finish.
No preferences selected for Region.


In [199]:
# Collect the user's climate preferences (up to 3 values) and their importance rating
climate_choices, climate_importance = get_user_choices(df, 'Climate')

Available Climate:
1: Marine west coast climate
2: Humid continental climate, no dry season, warm summer
3: Subarctic climate, no dry season, cool summer
4: Mediterranean climate, dry summer
5: Mediterranean climate
6: Hot desert climate
7: Humid continental climate, hot summer, no dry season
8: Humid subtropical climate, hot summer
9: Cold semi-arid climate
10: Tropical monsoon climate
11: Tropical savanna climate
12: Temperate oceanic climate, dry winter
13: Hot semi-arid climate
14: Humid subtropical climate, dry winter
15: Humid continental climate, hot summer
16: Tropical rainforest climate
Select up to 3 preferences for Climate, in order of importance. Type 'skip' to finish.
No preferences selected for Climate.


In [200]:
# Display the collected choices and importance ratings for a quick sanity check
language_choices, language_importance, region_choices, region_importance, climate_choices, climate_importance

([], None, [], None, [], None)

In [201]:
# Let's modify the dataframe according to user preferences and importance rankings

def apply_preferences(data, language_choices, language_importance, region_choices, region_importance, climate_choices, climate_importance):
    """
    Apply filtering and boosting based on user preferences for Language, Region, and Climate.
    Safely handle cases where importance might be None.

    Language choices both filter the rows (only chosen languages are kept) and
    boost their score; Region and Climate choices only boost. Within each
    category earlier choices receive a larger boost than later ones. The input
    DataFrame is never modified: all changes are made on a copy.

    Args:
    data (DataFrame): The DataFrame containing university data, with the columns
        'Language', 'Region', 'Climate' and 'score'.
    language_choices (list): List of chosen languages in order of preference.
    language_importance (int | None): Importance rating for language choices (1-10).
    region_choices (list): List of chosen regions.
    region_importance (int | None): Importance rating for region choices (1-10).
    climate_choices (list): List of chosen climates.
    climate_importance (int | None): Importance rating for climate choices (1-10).

    Returns:
    DataFrame: A new DataFrame with the preferences applied.
    """
    # Define maximum base boosts
    max_base_boost = 0.02  # Adjust as necessary based on typical scores in your data

    # Ensure all importance ratings are integers or have a fallback value
    language_importance = int(language_importance) if language_importance else 5
    region_importance = int(region_importance) if region_importance else 5
    climate_importance = int(climate_importance) if climate_importance else 5

    def calculate_boosts(choices, importance, max_boost):
        """Boost per choice: full boost for the first choice, shrinking linearly after it."""
        count = len(choices)
        return [max_boost * ((count - i) / count) * (importance / 10) for i in range(count)]

    def boost_matching_rows(frame, column, choices, importance):
        """Add each choice's boost to the score of the rows whose `column` equals it."""
        for choice, boost in zip(choices, calculate_boosts(choices, importance, max_base_boost)):
            frame.loc[frame[column] == choice, 'score'] += boost

    # Work on an independent copy so the caller's frame is left untouched and
    # the .loc assignments below never write into a slice of another frame.
    if language_choices:
        # Language is the only category that filters rows.
        data = data[data['Language'].isin(language_choices)].copy()
        boost_matching_rows(data, 'Language', language_choices, language_importance)
    else:
        data = data.copy()

    # Apply Region preferences: Boost without filtering
    if region_choices:
        boost_matching_rows(data, 'Region', region_choices, region_importance)

    # Apply Climate preferences: Boost without filtering
    if climate_choices:
        boost_matching_rows(data, 'Climate', climate_choices, climate_importance)

    return data

# Rebind `df` to the result: rows filtered by language (if any were chosen)
# and scores boosted according to the collected preferences.
df = apply_preferences(df, language_choices, language_importance, region_choices, region_importance, climate_choices, climate_importance)

In [202]:
# Ask for the exchange score and output the best 10 feasible recommendations

def find_universities(df):
    """
    Asks the user for their Exchange score and outputs the top 10 universities
    available to them, sorted by 'score' in descending order, displaying only the
    university name and the minimum and maximum Exchange score requirements.

    A university counts as available when its 'Min Score' is less than or equal
    to the score entered by the user.

    Args:
    df (DataFrame): The DataFrame containing university data with the columns
                    'University', 'score', 'Min Score' and 'Max Score'.

    Returns:
    DataFrame | None: The (up to 10) best-scoring available universities, which is
                      empty when none is available, or None when the entered
                      Exchange score is not a valid number.
    """
    print('Please indicate you Exchange score: ')

    # Only the parsing of the user's input is guarded, so that unrelated
    # errors are not misreported as invalid input.
    try:
        user_score = float(input("Please enter your Exchange score: "))
    except ValueError:
        print("Invalid input. Please enter a valid number for your Exchange score.")
        return None

    # Filter the DataFrame for universities where the user's score meets the requirement
    available_universities = df[df['Min Score'] <= user_score]

    # Sort these universities by 'score' in descending order and select the top 10
    top_universities = available_universities.sort_values(by='score', ascending=False).head(10)

    if top_universities.empty:
        print("No universities available based on your Exchange score.")
    else:
        print("Here are the top 10 universities available to you:")
        # Select and print only the relevant columns
        print(top_universities[['University', 'Max Score', 'Min Score']])

    return top_universities

    
# find_universities prints its own report, so its result is stored rather than
# printed a second time.
top_universities = find_universities(df)

Please indicate you Exchange score: 
Here are the top 10 universities available to you:
                           University  Max Score  Min Score
66                 Universität Zürich      33269      32696
64             Université de Lausanne      31898      29449
65             Universität St. Gallen      34471      33494
240   Singapore Management University      34665      33415
236  Nanyang Technological University      35280      33103
237  Nanyang Technological University      35635      32466
111      American University in Dubai      32943      32943
250     University of New South Wales      35112      34295
255          The University of Sydney      34370      34370
254          The University of Sydney      35449      31009
None


In [203]:
# TODO: also ask for the course of study and filter the universities based on it