Importing necessary libraries

In [24]:

import pandas as pd
import numpy as np

from typing import Dict, List

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

Data reading and column configuration

In [7]:
df=pd.read_csv("service_data.csv")
cc = [
    "Target_Business_Type",
    "Price_Category",
    "Language_Support",
    "Location_Area",
]
tc="Description"

Understanding the data

In [10]:
df.info()
df.describe
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Service_ID            1000 non-null   int64 
 1   Service_Name          1000 non-null   object
 2   Target_Business_Type  1000 non-null   object
 3   Price_Category        1000 non-null   object
 4   Language_Support      1000 non-null   object
 5   Location_Area         1000 non-null   object
 6   Description           1000 non-null   object
 7   Match_Quality         1000 non-null   object
dtypes: int64(1), object(7)
memory usage: 62.6+ KB


Unnamed: 0,Service_ID,Service_Name,Target_Business_Type,Price_Category,Language_Support,Location_Area,Description,Match_Quality
0,1001,Payroll Processing,E-commerce,High,Both,Delhi,"Monthly employee salary calculation, disbursem...",High
1,1002,Financial Audit,Restaurant,High,Both,Remote,Independent annual financial statement verific...,Low
2,1003,Business Registration,Restaurant,Low,Hindi,Remote,Complete guidance and filing for establishing ...,Low
3,1004,Advanced Tax Filing,Restaurant,Medium,Regional,Remote,Comprehensive annual tax submission and financ...,Medium
4,1005,SEO Optimization,Tech Startup,High,Regional,Delhi,Advanced search engine ranking strategy and im...,Medium


Data loading and cleaning

In [None]:
df.columns=[c.strip() for c in df.columns]

#Remove duplicate services
df=df.drop_duplicates(subset=["Service_ID"]).reset_index(drop=True)

#Handle missing categorical values
for c in cc:
    df[c]=df[c].fillna("Unknown").astype(str).str.strip()
    

#Handle missing descriptions
df[tc]=df[tc].fillna("")

#Normalize categories
for c in cc:
    df[c]=df[c].str.strip().str.title()
 

df.head()


Unnamed: 0,Service_ID,Service_Name,Target_Business_Type,Price_Category,Language_Support,Location_Area,Description,Match_Quality
0,1001,Payroll Processing,E-Commerce,High,Both,Delhi,"Monthly employee salary calculation, disbursem...",High
1,1002,Financial Audit,Restaurant,High,Both,Remote,Independent annual financial statement verific...,Low
2,1003,Business Registration,Restaurant,Low,Hindi,Remote,Complete guidance and filing for establishing ...,Low
3,1004,Advanced Tax Filing,Restaurant,Medium,Regional,Remote,Comprehensive annual tax submission and financ...,Medium
4,1005,SEO Optimization,Tech Startup,High,Regional,Delhi,Advanced search engine ranking strategy and im...,Medium


Module 1: Feature Encoder

In [27]:
class FeatureEncoder:
    """
    Handles:
    - Categorical one-hot encoding (business type, price, language, location)
    - Text encoding (TF-IDF on description/keywords)
    - Builds combined feature vectors for services & user input
    """

    def __init__(self):
        self.cat_encoder = OneHotEncoder(sparse_output=True,handle_unknown="ignore")
        self.text_vectorizer = TfidfVectorizer(
            max_features=1000,
            ngram_range=(1, 2),
            stop_words="english",
        )
        self.fitted = False
        self.service_features = None  # sparse matrix aligned with df rows

    def fit(self, df: pd.DataFrame):
        cat_data = df[cc]
        text_data = df[tc]

        self.cat_encoder.fit(cat_data)
        self.text_vectorizer.fit(text_data)

        cat_matrix = self.cat_encoder.transform(cat_data)
        text_matrix = self.text_vectorizer.transform(text_data)

        self.service_features = sp.hstack([cat_matrix, text_matrix]).tocsr()
        self.fitted = True

    def transform_services(self, df: pd.DataFrame):
        if not self.fitted:
            raise RuntimeError("FeatureEncoder not fitted")
        cat_matrix = self.cat_encoder.transform(df[cc])
        text_matrix = self.text_vectorizer.transform(df[tc])
        return sp.hstack([cat_matrix, text_matrix]).tocsr()

    def transform_user(self, user_prefs: Dict) -> sp.csr_matrix:
        """
        user_prefs keys:
        - Target_Business_Type
        - Price_Category
        - Language_Support
        - Location_Area
        - Keywords (optional free text)
        """
        if not self.fitted:
            raise RuntimeError("FeatureEncoder not fitted")

        user_cat = pd.DataFrame(
            {
                "Target_Business_Type": [user_prefs["Target_Business_Type"]],
                "Price_Category": [user_prefs["Price_Category"]],
                "Language_Support": [user_prefs["Language_Support"]],
                "Location_Area": [user_prefs["Location_Area"]],
            }
        )

        user_cat = user_cat.fillna("Unknown")
        for c in user_cat.columns:
            user_cat[c] = user_cat[c].astype(str).str.strip().str.title()

        cat_matrix = self.cat_encoder.transform(user_cat)

        user_text = user_prefs.get("Keywords", "") or ""
        text_matrix = self.text_vectorizer.transform([user_text])

        user_features = sp.hstack([cat_matrix, text_matrix]).tocsr()
        return user_features


# Fit encoder on the dataset (run this cell)
encoder = FeatureEncoder()
encoder.fit(df)

encoder.service_features.shape


(1000, 187)

Filtering, Scoring, Match Quality

In [28]:
def filter_services(df: pd.DataFrame, user_prefs: Dict) -> pd.DataFrame:
    """
    Initial filtering based on user preferences.
    'Any' means no filtering on that field.
    """
    filtered = df.copy()

    if user_prefs["Target_Business_Type"] != "Any":
        filtered = filtered[
            filtered["Target_Business_Type"] == user_prefs["Target_Business_Type"]
        ]

    if user_prefs["Price_Category"] != "Any":
        filtered = filtered[
            filtered["Price_Category"] == user_prefs["Price_Category"]
        ]

    if user_prefs["Language_Support"] != "Any":
        filtered = filtered[
            filtered["Language_Support"] == user_prefs["Language_Support"]
        ]

    if user_prefs["Location_Area"] != "Any":
        filtered = filtered[
            filtered["Location_Area"] == user_prefs["Location_Area"]
        ]

    # If filtering removes everything, fall back to original dataset
    if filtered.empty:
        return df

    return filtered


def compute_similarity_scores(
    user_features: sp.csr_matrix,
    service_features: sp.csr_matrix,
) -> np.ndarray:
    """
    Cosine similarity between user vector and all service vectors.
    Returns 1D numpy array of similarity scores.
    """
    sims = cosine_similarity(user_features, service_features)[0]
    return sims


def quality_from_score(score: float) -> str:
    """
    Convert similarity score (0–1) to qualitative Match Quality.
    Tune thresholds as part of evaluation/optimization.
    """
    if score >= 0.7:
        return "High"
    elif score >= 0.4:
        return "Medium"
    else:
        return "Low"


Explaination Generator

In [29]:
def generate_explanation(row: pd.Series, user_prefs: Dict) -> str:
    """
    Generate human-readable reason for recommendation using
    matched features + optional keyword relevance.
    """
    reasons: List[str] = []

    if (
        user_prefs["Target_Business_Type"] != "Any"
        and row["Target_Business_Type"] == user_prefs["Target_Business_Type"]
    ):
        reasons.append("it exactly matches your business type")

    if (
        user_prefs["Price_Category"] != "Any"
        and row["Price_Category"] == user_prefs["Price_Category"]
    ):
        reasons.append("the price range aligns with your budget")

    if (
        user_prefs["Language_Support"] != "Any"
        and row["Language_Support"] == user_prefs["Language_Support"]
    ):
        reasons.append("it supports your preferred language")

    if (
        user_prefs["Location_Area"] != "Any"
        and row["Location_Area"] == user_prefs["Location_Area"]
    ):
        reasons.append("it is available in your chosen location")

    if user_prefs.get("Keywords"):
        reasons.append("its description is similar to your keywords")

    if not reasons:
        return (
            "This service is recommended based on overall high similarity across "
            "multiple features such as business type, budget, language, and location."
        )

    if len(reasons) == 1:
        body = reasons[0]
    else:
        body = ", ".join(reasons[:-1]) + " and " + reasons[-1]

    return f"This service is recommended because {body}."


Recommendation function

In [30]:
def get_recommendations(
    df: pd.DataFrame,
    encoder: FeatureEncoder,
    user_prefs: Dict,
    top_k: int = 3,
) -> pd.DataFrame:
    """
    Full pipeline:
    1. Filter services using user preferences
    2. Build feature matrix for filtered services
    3. Compute cosine similarity
    4. Add match score & quality
    5. Generate explanations
    6. Return top_k recommendations
    """
    # 1. Filtering
    filtered_df = filter_services(df, user_prefs)

    # 2. Service features (subset rows from global feature matrix)
    service_features = encoder.service_features[filtered_df.index]

    # 3. User feature vector
    user_features = encoder.transform_user(user_prefs)

    # 4. Similarity scores
    scores = compute_similarity_scores(user_features, service_features)

    # 5. Attach scores & match quality
    filtered_df = filtered_df.copy()
    filtered_df["Match_Score"] = scores
    filtered_df["Match_Quality"] = filtered_df["Match_Score"].apply(quality_from_score)

    # 6. Sort and take Top K
    filtered_df = filtered_df.sort_values(by="Match_Score", ascending=False)
    top_df = filtered_df.head(top_k).reset_index(drop=True)

    # 7. Explanations
    explanations = []
    for _, row in top_df.iterrows():
        explanations.append(generate_explanation(row, user_prefs))
    top_df["Explanation"] = explanations

    return top_df


In [31]:
# Helper: show unique options (for interactive use)
print("Unique business types:", sorted(df["Target_Business_Type"].unique()))
print("Unique price categories:", sorted(df["Price_Category"].unique()))
print("Unique languages:", sorted(df["Language_Support"].unique()))
print("Unique locations:", sorted(df["Location_Area"].unique()))


Unique business types: ['Clinic', 'E-Commerce', 'Freelancer', 'Restaurant', 'Retail', 'Tech Startup']
Unique price categories: ['High', 'Low', 'Medium', 'Premium']
Unique languages: ['Both', 'English', 'Hindi', 'Regional']
Unique locations: ['Bengaluru', 'Chennai', 'Delhi', 'Mumbai', 'Remote']


In [32]:
# Example user preferences (change as needed)
user_prefs = {
    "Target_Business_Type": "Restaurant",  # or "Any"
    "Price_Category": "Medium",           # or "Any"
    "Language_Support": "English",        # or "Any"
    "Location_Area": "Bangalore",         # or "Any"
    "Keywords": "online marketing, social media, SEO",  # optional
}

top_k = 3  # Top N services

recommendations = get_recommendations(df, encoder, user_prefs, top_k=top_k)
recommendations[[
    "Service_ID",
    "Service_Name",
    "Target_Business_Type",
    "Price_Category",
    "Language_Support",
    "Location_Area",
    "Match_Score",
    "Match_Quality",
    "Explanation",
]]


Unnamed: 0,Service_ID,Service_Name,Target_Business_Type,Price_Category,Language_Support,Location_Area,Match_Score,Match_Quality,Explanation
0,1696,Digital Marketing Strategy,Restaurant,Medium,English,Remote,0.75588,High,This service is recommended because it exactly...
1,1977,Social Media Setup,Restaurant,Medium,English,Remote,0.739335,High,This service is recommended because it exactly...
2,1900,Advanced Tax Filing,Restaurant,Medium,English,Remote,0.67082,Medium,This service is recommended because it exactly...


Simple evaluation

# Evaluation & Optimization Ideas

- Tune the score → quality thresholds in `quality_from_score`.
- Separate similarity into:
  - Categorical similarity (only one-hot features),
  - Text similarity (only TF-IDF features),
  and combine them as: `0.7 * cat_sim + 0.3 * text_sim`.
- Experiment with:
  - Different `max_features` in `TfidfVectorizer`,
  - Using bi-grams vs tri-grams,
  - Different filtering strategies (hard vs soft filtering).
- Add a column `Match_Quality` in the dataset using your model, and compare with
  existing/expected quality labels.
