In [21]:
import pandas as pd

# Load the two datasets 
ratings_data = pd.read_csv('Dataset/last/XWines_Slim_150K_ratings.csv')
wine_data = pd.read_csv('Dataset/last/XWines_Slim_1K_wines.csv')
group_data = pd.read_csv('Dataset/last/group_composition.csv')  

# Merge the two datasets on 'WineID'
merged_data = pd.merge(ratings_data, wine_data, on='WineID')

# Function to find the best-rated wine or suggest a completely different one if rating < 4
def recommend_wine_for_user(user_id, merged_data):
    # Filter wines rated by the specific user
    user_wines = merged_data[merged_data['UserID'] == user_id]

    if user_wines.empty:
        print(f"No wines found for user {user_id}")
        return f"No wines found for user {user_id}.", None

    # Find the wine with the highest rating by the user
    best_rated_wine = user_wines.loc[user_wines['Rating'].idxmax()]

    # If the best rating is 4 or higher, return that wine
    if best_rated_wine['Rating'] >= 4:
        return best_rated_wine['WineID'], best_rated_wine['Rating']

    # If no wine has a rating of 4 or higher, find a completely different wine
    else:

        characteristics = ['Type', 'Body']

        # Filter out wines that are similar to the one the user rated poorly
        different_wines = merged_data
        for char in characteristics:
            different_wines = different_wines[different_wines[char] != best_rated_wine[char]]

        # If there are still wines left, choose one randomly or based on rating
        if not different_wines.empty:
            recommended_wine = different_wines.sample().iloc[0]  # Sample one random different wine
            return recommended_wine['WineID'], None

        return f"No sufficiently different wines found for user {user_id}.", None

# Function to recommend wine for a group and output in a DataFrame
def recommend_wine_for_group(group_id, group_data, merged_data):
    group_info = group_data[group_data['group_id'] == group_id].iloc[0]
    group_members = eval(group_info['group_members'])  
    recommendations = []

    # Loop through each member of the group and get their favorite wine
    for user_id in group_members:
        wine_id, rating = recommend_wine_for_user(user_id, merged_data)
        print(wine_id, rating)
        recommendations.append({
            'group_id': group_id,
            'user_id': user_id,
            'wine_id': wine_id,
            'rating': rating if rating is not None else 'Suggested different wine'
        })

    recommendation_df = pd.DataFrame(recommendations)
    return recommendation_df

group_id = int(group_data.loc[:,"group_id"].sample().iloc[0]) 
print(f"Group ID: {group_id}")
result_df = recommend_wine_for_group(group_id, group_data, merged_data)

# Output the DataFrame with the recommendations
print(result_df)


  ratings_data = pd.read_csv('Dataset/last/XWines_Slim_150K_ratings.csv')


Group ID: 15
112247 4.5
162502 5.0
   group_id  user_id  wine_id  rating
0        15  1443462   112247     4.5
1        15  1019006   162502     5.0


In [26]:
# create average rating for each wine

ratings = {}
for index, row in ratings_data.iterrows():
    if row['WineID'] not in ratings:
        ratings[row['WineID']] = {"total": row['Rating'], "count": 1}
    else:
        ratings[row['WineID']]["total"] += row['Rating']
        ratings[row['WineID']]["count"] += 1

# add the averages to the df
wine_data["AvgRating"] = 0.0

for index, row in wine_data.iterrows():
    wine_data.loc[index, "AvgRating"] = ratings[row["WineID"]]["total"]/ratings[row["WineID"]]["count"]

In [27]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import numpy as np

wine_ids = wine_data['WineID']
features = wine_data.drop(columns=['WineID', 'Grapes', 'Harmonize', 'Code', 'RegionID', 'RegionName', 'WineryID', 'WineryName', 'Website', 'Vintages'])

# Encode categorical features
label_encoders = {}
for column in features.columns:
    le = LabelEncoder()
    features[column] = le.fit_transform(features[column].astype(str))  
    label_encoders[column] = le

# Convert features to NumPy array 
features_array = features.values

# Train a Decision Tree Classifier
tree = DecisionTreeClassifier(random_state=42)
tree.fit(features_array, wine_ids)

# Create a function to find similar wines
def recommend_similar_wines(wine_features, tree, features_array, wine_ids, num_recommendations=10):
    # Predict the leaf node for the given wine
    leaf_node = tree.apply(wine_features)[0]

    # Find all wines grouped by their leaf nodes
    wines_in_leaf_nodes = defaultdict(list)
    for i, leaf in enumerate(tree.apply(features_array)):
        wines_in_leaf_nodes[leaf].append(wine_ids.iloc[i])

    # Get wines from the same leaf node
    similar_wines = wines_in_leaf_nodes[leaf_node]

    # If there are not enough recommendations, add wines from nearby nodes
    if len(similar_wines) < num_recommendations:
        remaining_count = num_recommendations - len(similar_wines)
        # Sort other nodes by their proximity to the target node
        nearby_nodes = sorted(wines_in_leaf_nodes.keys(), key=lambda x: abs(x - leaf_node))
        for node in nearby_nodes:
            if node != leaf_node:
                similar_wines.extend(wines_in_leaf_nodes[node])
                if len(similar_wines) >= num_recommendations:
                    break

    # Return the top 10 recommendations 
    return [int(wine) for wine in similar_wines if wine != wine_ids.iloc[0]][:num_recommendations]

def recommend_similar_wine_for_user(wine_id, user_id, num_recommendations=10):
    wine_to_recommend_for = wine_data[wine_data['WineID'] == wine_id].iloc[0]
    wine_features = wine_to_recommend_for.drop(['WineID', 'Grapes', 'Harmonize', 'Code', 'RegionID', 'RegionName', 'WineryID', 'WineryName', 'Website', 'Vintages'])

    # Encode the feature vector using the label encoders, handling unseen labels
    encoded_wine_features = []
    for column in features.columns:
        value = wine_features[column]

        # If the value is not seen during training, expand the classes
        if value not in label_encoders[column].classes_:
            # Expand the label encoder's classes to include the unseen value
            label_encoders[column].classes_ = np.append(label_encoders[column].classes_, value)

        # Encode the value
        encoded_value = label_encoders[column].transform([value])[0]
        encoded_wine_features.append(encoded_value)
    encoded_wine_features = np.array(encoded_wine_features).reshape(1, -1)

    recommended_wine_ids = recommend_similar_wines(
        encoded_wine_features,
        tree,
        features_array,
        wine_ids,
        num_recommendations=num_recommendations
    )

    
    recommendations = []
    for wine_id in recommended_wine_ids:
        wine_info = wine_data[wine_data['WineID'] == wine_id].iloc[0]
        wine_id = int(wine_id)
        user_id = int(user_id)
        original_wine_ratings = ratings_data[(ratings_data['WineID'] == wine_id) & (ratings_data['UserID'] == user_id)]
        if not original_wine_ratings.empty:
            original_wine_rating = original_wine_ratings.iloc[0]['Rating']
        else:
            original_wine_rating = None
            
        try:
            rating = int(original_wine_rating)
        except TypeError:
            rating = original_wine_rating
            
        recommendations.append({
            "WineID": int(wine_info['WineID']),
            "Type": wine_info['Type'],       
            "Body": wine_info['Body'],       
            "ABV": int(wine_info['ABV']),       
            "Country": wine_info['Country'],
            "Rating": rating 
        })
        
    print(recommendations)

    return recommendations

# Function to find decision tree recommendations for all wines in the group recommendation DataFrame
def recommend_similar_wines_for_group(result_df, merged_data, k=10):
    decision_tree_recommendations = []

    # Iterate over each wine recommended to the group
    for index, row in result_df.iterrows():
        wine_id = row['wine_id']
        user_id = row['user_id']

        # Find decision tree recommendations for the wine
        decision_tree_result = recommend_similar_wine_for_user(wine_id, user_id, k)
        decision_tree_recommendations.append({
            'user_id': int(user_id),
            'wine_id': int(wine_id),
            'decision_tree_recommendations': [
                {
                    "WineID": rec['WineID'],
                    "Type": rec['Type'],       # Wine type
                    "Body": rec['Body'],       # Wine body
                    "ABV": int(rec['ABV']),         # Alcohol by volume
                    "Country": rec['Country'], # Country of origin
                    "Rating": rec['Rating']   # User's rating for the wine (if available)
                } for rec in decision_tree_result  
            ]
        })

    decision_tree_recommendation_df = pd.DataFrame(decision_tree_recommendations)
    return decision_tree_recommendation_df


result_df = recommend_wine_for_group(group_id=group_id, group_data=group_data, merged_data=merged_data)

decision_tree_recommendation_df = recommend_similar_wines_for_group(result_df, merged_data, k=10)

decision_tree_recommendation_df


112247 4.5
162502 5.0
[{'WineID': 112247, 'Type': 'Red', 'Body': 'Medium-bodied', 'ABV': 15, 'Country': 'France', 'Rating': 4}, {'WineID': 162456, 'Type': 'Red', 'Body': 'Medium-bodied', 'ABV': 14, 'Country': 'Ukraine', 'Rating': None}, {'WineID': 100002, 'Type': 'Red', 'Body': 'Medium-bodied', 'ABV': 12, 'Country': 'Brazil', 'Rating': None}, {'WineID': 184540, 'Type': 'Red', 'Body': 'Very full-bodied', 'ABV': 12, 'Country': 'United States', 'Rating': None}, {'WineID': 198090, 'Type': 'Red', 'Body': 'Light-bodied', 'ABV': 11, 'Country': 'Romania', 'Rating': None}, {'WineID': 144081, 'Type': 'Red', 'Body': 'Medium-bodied', 'ABV': 15, 'Country': 'Italy', 'Rating': None}, {'WineID': 184116, 'Type': 'Red', 'Body': 'Very full-bodied', 'ABV': 14, 'Country': 'United States', 'Rating': None}, {'WineID': 162457, 'Type': 'Red', 'Body': 'Medium-bodied', 'ABV': 13, 'Country': 'Ukraine', 'Rating': None}, {'WineID': 163332, 'Type': 'Red', 'Body': 'Medium-bodied', 'ABV': 12, 'Country': 'Chile', 'Rati

Unnamed: 0,user_id,wine_id,decision_tree_recommendations
0,1443462,112247,"[{'WineID': 112247, 'Type': 'Red', 'Body': 'Me..."
1,1019006,162502,"[{'WineID': 163557, 'Type': 'Red', 'Body': 'Fu..."


In [28]:
decision_tree_recommendation_df.loc[:,'decision_tree_recommendations']

0    [{'WineID': 112247, 'Type': 'Red', 'Body': 'Me...
1    [{'WineID': 163557, 'Type': 'Red', 'Body': 'Fu...
Name: decision_tree_recommendations, dtype: object

In [29]:
def score_characteristics(recommendations):
    # Initialization of the scores dictionary for all features
    scores = {
        "Type": {},
        "Body": {},
        "ABV": {},
        "Country": {}
    }
    
    weights = {
        "Type": 1,      # Highest weight
        "Body": 0.75,      # Second highest
        "ABV": 0.50,       # Third highest
        "Country": 0.25    # Lowest weight
    }
    
    # Loop through each wine recommendation
    for wine in recommendations:
        for key, value in wine.items():
            # Skip certain keys
            if key in ["WineID", "Rating"]:
                continue
            
            # Increment the weighted counts for each relevant feature
            if key in scores:
                weight = weights[key]  
                if value not in scores[key]:
                    scores[key][value] = weight
                else:
                    scores[key][value] += weight
    
    return scores

# Loop over the recommendations for each user and calculate the characteristic weights
category_weights_by_user = []
for index, row in decision_tree_recommendation_df.iterrows():
    x = score_characteristics(row.decision_tree_recommendations)
    category_weights_by_user.append(x)
    print(x)


{'Type': {'Red': 10}, 'Body': {'Medium-bodied': 5.25, 'Very full-bodied': 1.5, 'Light-bodied': 0.75}, 'ABV': {15: 1.0, 14: 1.0, 12: 2.0, 11: 0.5, 13: 0.5}, 'Country': {'France': 0.25, 'Ukraine': 0.5, 'Brazil': 0.25, 'United States': 0.5, 'Romania': 0.25, 'Italy': 0.25, 'Chile': 0.25, 'Malta': 0.25}}
{'Type': {'Red': 8, 'White': 2}, 'Body': {'Full-bodied': 7.5}, 'ABV': {14: 3.0, 13: 1.0, 12: 1.0}, 'Country': {'Chile': 2.0, 'France': 0.5}}


In [30]:
# Initialize category weights for all relevant features
category_weights = {
    "Type": {},
    "Body": {},
    "ABV": {},
    "Country": {}
}

# Aggregate weights from all users
for i in category_weights_by_user:
    # Aggregate weights for Type
    for key, value in i["Type"].items():
        if key not in category_weights["Type"]:
            category_weights["Type"][key] = value
        else:
            category_weights["Type"][key] += value
    
    # Aggregate weights for Body
    for key, value in i["Body"].items():
        if key not in category_weights["Body"]:
            category_weights["Body"][key] = value
        else:
            category_weights["Body"][key] += value
    
    # Aggregate weights for ABV
    for key, value in i["ABV"].items():
        if key not in category_weights["ABV"]:
            category_weights["ABV"][key] = value
        else:
            category_weights["ABV"][key] += value
    
    # Aggregate weights for Country
    for key, value in i["Country"].items():
        if key not in category_weights["Country"]:
            category_weights["Country"][key] = value
        else:
            category_weights["Country"][key] += value

category_weights


{'Type': {'Red': 18, 'White': 2},
 'Body': {'Medium-bodied': 5.25,
  'Very full-bodied': 1.5,
  'Light-bodied': 0.75,
  'Full-bodied': 7.5},
 'ABV': {15: 1.0, 14: 4.0, 12: 3.0, 11: 0.5, 13: 1.5},
 'Country': {'France': 0.75,
  'Ukraine': 0.5,
  'Brazil': 0.25,
  'United States': 0.5,
  'Romania': 0.25,
  'Italy': 0.25,
  'Chile': 2.25,
  'Malta': 0.25}}

In [31]:
# Initialize sorted category weights for all relevant features
category_weights_sorted = {
    "Type": {},
    "Body": {},
    "ABV": {},
    "Country": {}
}

# Sort each feature's weights in descending order
for key, value in category_weights.items():
    category_weights_sorted[key] = dict(sorted(category_weights[key].items(), key=lambda item: item[1], reverse=True))

category_weights_sorted


{'Type': {'Red': 18, 'White': 2},
 'Body': {'Full-bodied': 7.5,
  'Medium-bodied': 5.25,
  'Very full-bodied': 1.5,
  'Light-bodied': 0.75},
 'ABV': {14: 4.0, 12: 3.0, 13: 1.5, 15: 1.0, 11: 0.5},
 'Country': {'Chile': 2.25,
  'France': 0.75,
  'Ukraine': 0.5,
  'United States': 0.5,
  'Brazil': 0.25,
  'Romania': 0.25,
  'Italy': 0.25,
  'Malta': 0.25}}

In [32]:
def get_wine_with_top_categories(category_weights_sorted=category_weights_sorted, top_type_index=1, top_body_index=1, top_abv_index=1, top_country_index=1):
    # Get top categories for each feature
    if top_type_index-1 < len(list(category_weights_sorted["Type"].keys())):
        top_type = list(category_weights_sorted["Type"].keys())[top_type_index-1]
    else: top_type = None
    if top_body_index-1 < len(list(category_weights_sorted["Body"].keys())):
        top_body = list(category_weights_sorted["Body"].keys())[top_body_index-1]
    else: top_body = None
    if top_abv_index-1 < len(list(category_weights_sorted["ABV"].keys())):
        top_abv = list(category_weights_sorted["ABV"].keys())[top_abv_index-1]
    else: top_abv = None
    if top_country_index-1 < len(list(category_weights_sorted["Country"].keys())):
        top_country = list(category_weights_sorted["Country"].keys())[top_country_index-1]
    else: top_country = None
    
    # if a value is None, do not filter by that category
    # Initialize a boolean mask with all True (no filtering applied initially)
    mask = True
    
    # Apply filters only if the corresponding variable is not None
    if top_type is not None:
        mask &= (wine_data['Type'] == top_type)
    if top_body is not None:
        mask &= (wine_data['Body'] == top_body)
    if top_abv is not None:
        mask &= (wine_data['ABV'] == top_abv)
    if top_country is not None:
        mask &= (wine_data['Country'] == top_country)
    
    selection = wine_data[mask].dropna()
    
    # If no wines match, adjust indices to find alternatives
    if selection.shape[0] == 0:
        # Increment indices if no match is found
        try:
            if top_type_index == top_body_index == top_abv_index == top_country_index:
                selection = get_wine_with_top_categories(
                    category_weights_sorted=category_weights_sorted,
                    top_type_index=top_type_index+1,
                    top_body_index=top_body_index,
                    top_abv_index=top_abv_index,
                    top_country_index=top_country_index
                )
            elif top_body_index < top_type_index:
                selection = get_wine_with_top_categories(
                    category_weights_sorted=category_weights_sorted,
                    top_type_index=top_type_index,
                    top_body_index=top_body_index+1,
                    top_abv_index=top_abv_index,
                    top_country_index=top_country_index
                )
            elif top_abv_index < top_body_index:
                selection = get_wine_with_top_categories(
                    category_weights_sorted=category_weights_sorted,
                    top_type_index=top_type_index,
                    top_body_index=top_body_index,
                    top_abv_index=top_abv_index+1,
                    top_country_index=top_country_index
                )
            else:
                selection = get_wine_with_top_categories(
                    category_weights_sorted=category_weights_sorted,
                    top_type_index=top_type_index,
                    top_body_index=top_body_index,
                    top_abv_index=top_abv_index,
                    top_country_index=top_country_index+1
                )
        except IndexError:
            print("No wines found with the given category weights.")
            return pd.DataFrame(columns=wine_data.columns)
    
    
    
    # Randomize the selection so it is not always the same recommendation
    selection.sample(frac=1).reset_index(drop=True)

    return selection.head(1)

get_wine_with_top_categories()


Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,Country,RegionID,RegionName,WineryID,WineryName,Website,Vintages,AvgRating
560,162502,Montes Alpha Cabernet Sauvignon,Red,Varietal/100%,['Cabernet Sauvignon'],"['Beef', 'Lamb', 'Game Meat', 'Poultry']",14.0,Full-bodied,High,CL,Chile,2268,Colchagua Valley,39051,Montes,http://www.monteswines.com,"[2020, 2019, 2018, 2017, 2016, 2015, 2014, 201...",3.818328


In [33]:
get_wine_with_top_categories()["WineID"]

560    162502
Name: WineID, dtype: int64

In [34]:
group_data

Unnamed: 0.1,Unnamed: 0,group_id,group_size,group_similarity,group_members,avg_similarity
0,0,0,2,random,"[1110381, 1212243]",-0.006469
1,1,1,2,random,"[1180876, 1006585]",-0.015370
2,2,2,2,random,"[1174271, 1159745]",-0.015015
3,3,3,2,random,"[1223585, 1150327]",0.105930
4,4,4,2,random,"[1822155, 1138489]",0.163505
...,...,...,...,...,...,...
235,235,235,8,similar_one_divergent,"[1174766, 1226880, 1059478, 1062080, 1039599, ...",0.376073
236,236,236,8,similar_one_divergent,"[1203902, 1166271, 1078943, 2056705, 1204723, ...",0.400792
237,237,237,8,similar_one_divergent,"[1198066, 1041042, 1086821, 1293899, 1134438, ...",0.248949
238,238,238,8,similar_one_divergent,"[1908334, 1714244, 1018208, 1054770, 1642793, ...",0.361038


In [35]:
#evaluate_recommender(decision_tree_recommendation_df, group_data, ratings_data)

In [36]:
decision_tree_recommendation_df

Unnamed: 0,user_id,wine_id,decision_tree_recommendations
0,1443462,112247,"[{'WineID': 112247, 'Type': 'Red', 'Body': 'Me..."
1,1019006,162502,"[{'WineID': 163557, 'Type': 'Red', 'Body': 'Fu..."


In [37]:
group_data["group_members"][0]

'[1110381, 1212243]'

In [38]:
ratings_data

Unnamed: 0,RatingID,UserID,WineID,Vintage,Rating,Date
0,143,1356810,103471,1950,4.5,2021-11-02 20:52:59
1,199,1173759,111415,1951,5.0,2015-08-20 17:46:26
2,348,1164877,111395,1952,5.0,2020-11-13 05:40:26
3,374,1207665,111433,1953,5.0,2017-05-05 06:44:13
4,834,1075841,111431,1955,5.0,2016-09-14 20:18:38
...,...,...,...,...,...,...
149995,21013438,1000052,111468,N.V.,4.5,2021-12-22 21:03:51
149996,21013467,1180844,111461,N.V.,4.0,2017-04-23 21:07:55
149997,21013494,1218581,113690,N.V.,3.5,2019-04-14 17:45:08
149998,21013505,1106198,111468,N.V.,4.5,2021-07-10 07:00:15


In [39]:
import pandas as pd
import ast  # Import to safely parse string representation of lists

def get_wines_rated_by_group(group_df, group_id, ratings_df, wine_id=None):
    """
    Get all wines rated by a specific group based on group_id, with optional filtering for a specific wine_id.

    Parameters:
    group_df (pd.DataFrame): DataFrame containing group data with columns like ['group_id', 'group_members'].
    group_id (int): The ID of the group to analyze.
    ratings_df (pd.DataFrame): DataFrame containing ratings data with columns ['UserID', 'WineID', 'Rating'].
    wine_id (int, optional): The ID of a specific wine to filter. Defaults to None.

    Returns:
    pd.DataFrame: DataFrame containing all wines rated by users in the specified group, optionally filtered by the given WineID.
    """
    # Step 1: Extract the group_members string using the group_id
    group_row = group_df[group_df['group_id'] == group_id]

    # Step 2: Check if group_members list is found
    if group_row.empty:
        print(f"Group ID {group_id} not found.")
        return pd.DataFrame(columns=['UserID', 'WineID', 'Rating'])
    
    # Step 3: Convert the group_members string into an actual list
    group_members_str = group_row['group_members'].values[0]
    group_members = ast.literal_eval(group_members_str)  # Safely convert string to list
    
    # Step 4: Filter ratings_data for rows where UserID is in the group_members list
    filtered_ratings = ratings_df[ratings_df['UserID'].isin(group_members)]
    
    # Step 5: If wine_id is provided, filter further for that specific WineID
    if wine_id is not None:
        filtered_ratings = filtered_ratings[filtered_ratings['WineID'] == wine_id]

    # Step 6: Return the filtered DataFrame
    return filtered_ratings[['UserID', 'WineID', 'Rating']]

# Example Usage:
# Assuming you have `group_data` and `ratings_data` DataFrames defined
# result_df = get_wines_rated_by_group(group_data, 5, ratings_data, wine_id=111544)


In [40]:
get_wines_rated_by_group(group_data,5,ratings_data,wine_id=193478)

Unnamed: 0,UserID,WineID,Rating
5881,1078014,193478,4.5
