# **Vendor Qualification System**

### Mounting Google Drive for data connection

In [5]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored on the Drive can be
# read through ordinary filesystem paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
pip install nltk



In [2]:
import json
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Loading stopwords and wordnet for text pre-processing

In [3]:
# Fetch the NLTK corpora used by the text pre-processing below: the stopword
# lists (read via stopwords.words('english')) and the WordNet data.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Shared text-normalisation objects; they are passed to preprocess_text()
# by the TF-IDF and similarity functions.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Stored as a set so membership tests during token filtering are cheap.
stop_words = set(stopwords.words('english'))

### Loading data in pandas DataFrame

In [6]:
# Location of the G2 software product overview CSV on the mounted Google Drive.
file_path = "/content/drive/MyDrive/Pyramyd OA/G2 software product overview.csv"

In [10]:
def load_data(file_path):
    """
    Read a CSV source into a Pandas DataFrame.

    Args:
        file_path (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed contents of the CSV file.
    """
    return pd.read_csv(file_path)

### Data info to check all attributes

In [11]:
# Load the dataset and print a summary of its columns (names, non-null
# counts and dtypes) to see which attributes are available.
df = load_data(file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 45 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   url                           1000 non-null   object 
 1   product_name                  1000 non-null   object 
 2   rating                        1000 non-null   float64
 3   description                   996 non-null    object 
 4   product_url                   1000 non-null   object 
 5   seller                        1000 non-null   object 
 6   ownership                     230 non-null    object 
 7   seller_website                1000 non-null   object 
 8   headquarters                  1000 non-null   object 
 9   total_revenue                 169 non-null    object 
 10  social_media_profiles         1000 non-null   object 
 11  seller_description            1000 non-null   object 
 12  reviews_count                 1000 non-null   int64  
 13  disc

### Defining user query

In [17]:
# Example user request: the software category to search within and the
# capabilities wanted. vendor_qualification() reads both of these globals.
software_category = "Accounting & Finance Software"
capabilities = ["Budgeting"]

### Function to pre-process text

In [9]:
def preprocess_text(stemmer, lemmatizer, stop_words, text):
    """
    Normalise free text into a space-separated string of stemmed tokens.

    Steps, in order:
    - lowercase the text;
    - delete every character that is not a letter, digit or whitespace;
    - split on whitespace;
    - discard tokens found in ``stop_words``;
    - lemmatize each remaining token, then stem the lemma.

    Args:
        stemmer (PorterStemmer): Object whose ``stem`` method is applied last.
        lemmatizer (WordNetLemmatizer): Object whose ``lemmatize`` method is applied first.
        stop_words (set): Tokens to drop from the text.
        text (str): The raw text to normalise.

    Returns:
        str: The surviving tokens, lemmatized and stemmed, joined by single spaces.
    """
    # Special characters are deleted outright (not replaced by a space).
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

    kept_tokens = (token for token in cleaned.split() if token not in stop_words)

    return ' '.join(
        stemmer.stem(lemmatizer.lemmatize(token)) for token in kept_tokens
    )

def clean_json_for_csv(json_data):
    """
    Flatten JSON text onto one line so it can sit inside a single CSV field.

    Every newline and carriage-return character is replaced by one space.

    Args:
        json_data (str): The JSON text to flatten.

    Returns:
        str: The same text with line-break characters turned into spaces.
    """
    line_breaks_to_spaces = str.maketrans({'\n': ' ', '\r': ' '})
    return json_data.translate(line_breaks_to_spaces)

def get_query(software_category, capabilities):
    """
    Build the search query text from a software category and its capabilities.

    With no capabilities the query is just the category. With one capability it
    is "<category> with <capability>". With several, the capabilities are listed
    comma-separated with 'and' before the last one.

    Args:
        software_category (str): The category of software.
        capabilities (list): Capabilities to mention in the query; may be empty.

    Returns:
        str: The query string describing the category and any capabilities.
    """
    if not capabilities:
        return software_category

    leading = capabilities[:-1]
    final = capabilities[-1]

    if leading:
        described = ', '.join(leading) + ' and ' + final
    else:
        described = final

    return f"{software_category} with {described}"

### Function to generate TF-IDF vectors

In [None]:
def _text_or_empty(value):
    """Return ``value`` as text, mapping a missing (None) value to ''."""
    return '' if value is None else str(value)


def generate_tfidf_per_row(df_tfidf, directory_path=None):
    """
    Generates TF-IDF vectors for each row in the DataFrame based on text features
    provided in a JSON-formatted 'Features' column.

    For every feature the text "<description> <name> <Category> <main_category>"
    is pre-processed and fitted with its own single-document ``TfidfVectorizer``.

    Args:
        df_tfidf (pd.DataFrame): DataFrame with a 'Features' column holding JSON
            strings (a list of category objects, each with a 'features' list) and
            a 'main_category' column. The DataFrame is modified in place.
        directory_path (str, optional): Unused; kept for interface compatibility.

    Returns:
        pd.DataFrame: The same DataFrame with two new columns:
            - 'vectors': dict mapping feature name -> TF-IDF vector (nested list),
              or None for a feature that produced no usable text.
            - 'vectorizers': dict mapping feature name -> fitted TfidfVectorizer,
              or None for a feature that produced no usable text.
            Rows whose 'Features' value is empty or not valid JSON keep None in
            both columns.

    Notes:
        - A feature whose text contains no indexable terms after pre-processing
          (TfidfVectorizer raises ValueError for an empty vocabulary) is stored
          as None instead of aborting the whole run.
        - Missing (null) names, descriptions and categories are treated as ''.
        - Features that share a name within a row overwrite each other.
    """

    # Initialise both columns with None; rows skipped below keep that value.
    df_tfidf['vectors'] = None
    df_tfidf['vectorizers'] = None

    for idx, row in df_tfidf.iterrows():
        column_value_as_string = str(row['Features'])

        # Nothing to parse for an empty / whitespace-only cell.
        if not column_value_as_string.strip():
            print(f"Empty or invalid JSON for row {idx}")
            continue

        try:
            features_values = json.loads(column_value_as_string)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for row {idx}: {e}")
            continue

        # Valid JSON that is not a list of categories cannot be iterated as such.
        if not isinstance(features_values, list):
            print(f"Empty or invalid JSON for row {idx}")
            continue

        # A missing main category (e.g. NaN) contributes no text.
        main_category = row['main_category']
        if not isinstance(main_category, str):
            main_category = ''

        vectors = {}       # feature name -> TF-IDF vector (nested list) or None
        vectorizers = {}   # feature name -> fitted TfidfVectorizer or None

        for category in features_values:
            if not isinstance(category, dict):
                continue
            category_name = _text_or_empty(category.get('Category'))

            for feature in category.get('features') or []:
                if not isinstance(feature, dict):
                    continue

                name = _text_or_empty(feature.get('name'))
                description = ' '.join((
                    _text_or_empty(feature.get('description')),
                    name,
                    category_name,
                    main_category,
                ))

                if not description.strip():
                    vectors[name] = None
                    vectorizers[name] = None
                    continue

                processed_description = preprocess_text(
                    stemmer, lemmatizer, stop_words, description
                )
                vectorizer = TfidfVectorizer()
                try:
                    tfidf_matrix = vectorizer.fit_transform([processed_description])
                except ValueError as e:
                    # Raised when pre-processing left no indexable terms.
                    print(f"Skipping feature '{name}' in row {idx}: {e}")
                    vectors[name] = None
                    vectorizers[name] = None
                    continue

                # Store the vector and the vectorizer object
                vectors[name] = tfidf_matrix.toarray().tolist()
                vectorizers[name] = vectorizer

        # Store in DataFrame
        df_tfidf.at[idx, 'vectors'] = vectors
        df_tfidf.at[idx, 'vectorizers'] = vectorizers

    return df_tfidf

### Function to calculate similarity

In [12]:
def calculate_similarity(query, df):
    """
    Calculates similarity scores between a preprocessed user query and text-based feature vectors
    stored in a DataFrame for each row.

    Args:
        query (str): The input query string to compare against feature vectors.
        df (pd.DataFrame): A DataFrame containing the following columns:
            - 'vectors': A dictionary of precomputed feature vectors for each row.
            - 'vectorizers': A dictionary of fitted vectorizer objects corresponding to each feature.
            The DataFrame is modified in place.

    Returns:
        pd.DataFrame: The same DataFrame with the following added columns:
            - 'similarity_scores': A dictionary of cosine similarity scores per feature.
            - 'avg_similarity_scores': The average similarity score over the features
              that were successfully scored in a row (0 when none were).

    Notes:
        - The query is first preprocessed using stemming, lemmatization, and stop word removal.
        - The query is vectorized with each feature's own fitted vectorizer and compared
          to the stored feature vector with cosine similarity.
        - Features with no vector, no vectorizer or an unfitted vectorizer score 0 and
          do not count towards the average; a dimension mismatch scores 0 and does count.
        - Rows whose 'vectors' / 'vectorizers' cells are not non-empty dicts get an
          empty score dict and an average of 0.
        - Drops 'vectors' and 'vectorizers' columns (when present) before returning.
        - Requires ``numpy`` to be imported as ``np`` at file level.
    """

    processed_query = preprocess_text(stemmer, lemmatizer, stop_words, query)

    similarity_results = []
    avg_similarity_scores = []

    for idx, row in df.iterrows():
        feature_vectors = row.get('vectors')
        feature_vectorizers = row.get('vectorizers')

        # None / NaN / empty cells mean the row has nothing to score.
        if not (
            isinstance(feature_vectors, dict)
            and isinstance(feature_vectorizers, dict)
            and feature_vectors
            and feature_vectorizers
        ):
            avg_similarity_scores.append(0)
            similarity_results.append({})
            continue

        similarity_scores = {}
        total_score = 0
        count = 0

        for feature_name, vector in feature_vectors.items():
            print(f"Processing feature '{feature_name}' in row {idx}")

            if not vector:
                similarity_scores[feature_name] = 0
                continue

            vectorizer = feature_vectorizers.get(feature_name)

            if vectorizer is None:
                print(f"No vectorizer for feature '{feature_name}' in row {idx}")
                similarity_scores[feature_name] = 0
                continue

            if not hasattr(vectorizer, 'vocabulary_'):
                print(f"Error: Vectorizer not fitted for feature '{feature_name}' in row {idx}")
                similarity_scores[feature_name] = 0
                continue

            try:
                # Transform the query
                query_vector = vectorizer.transform([processed_query])
                print(f"Query Vector Shape: {query_vector.shape}")

                # Convert stored feature vector to numpy array
                feature_vector = np.array(vector).reshape(1, -1)

                if query_vector.shape[1] == feature_vector.shape[1]:
                    similarity = cosine_similarity(query_vector, feature_vector)[0][0]
                else:
                    print(f"Dimension mismatch in row {idx}, feature '{feature_name}'")
                    similarity = 0
            except (ValueError, TypeError) as e:
                # Only data-shape/type problems are tolerated per feature;
                # programming errors (e.g. NameError) must surface.
                print(f"Error processing feature '{feature_name}' in row {idx}: {e}")
                similarity_scores[feature_name] = 0
                continue

            total_score += similarity
            count += 1
            similarity_scores[feature_name] = similarity

        # Average over the features that were actually scored.
        avg_similarity_scores.append(total_score / count if count > 0 else 0)
        similarity_results.append(similarity_scores)

    # Drop the vectors once their work is done (tolerate them being absent).
    df.drop(['vectors', 'vectorizers'], axis=1, inplace=True, errors='ignore')

    df['similarity_scores'] = similarity_results
    df['avg_similarity_scores'] = avg_similarity_scores

    return df

### Function to keep rows where at least one feature meets the 0.6 similarity threshold

In [13]:
def filter_highly_similar_rows(df, threshold=0.6):
    """
    Keep only the rows that have at least one sufficiently similar feature.

    A row qualifies when any value in its 'similarity_scores' dictionary is
    greater than or equal to ``threshold``.

    Args:
        df (pd.DataFrame): Input with a 'similarity_scores' column (dict of
            per-feature scores) and an 'avg_similarity_scores' column.
        threshold (float, optional): Minimum per-feature score for a row to
            qualify. Defaults to 0.6.

    Returns:
        pd.DataFrame: The qualifying rows, ordered by 'avg_similarity_scores'
        from highest to lowest.
    """
    # Boolean mask: True where some feature score reaches the threshold.
    meets_threshold = df['similarity_scores'].apply(
        lambda scores: any(value >= threshold for value in scores.values())
    )

    relevant_rows = df[meets_threshold]
    return relevant_rows.sort_values('avg_similarity_scores', ascending=False)

### Function to rank all vendors

In [15]:
def _normalize_by_max(series):
    """Scale ``series`` by its maximum; all zeros when the maximum is not positive."""
    peak = series.max()
    if pd.isna(peak) or peak <= 0:
        # Avoid 0/0 -> NaN: with no positive value there is nothing to scale by.
        return pd.Series(0.0, index=series.index)
    return series / peak


def rank_vendors(df, weight_similarity=0.7, weight_rating=0.3):
    """
    Ranks vendors based on a weighted sum of normalized similarity and rating.

    Parameters:
    - df: DataFrame with column `avg_similarity_scores` and optionally `rating`
      (a missing `rating` column, or missing rating values, count as 0).
      The input is not modified.
    - weight_similarity: weight for average similarity score (default 0.7)
    - weight_rating: weight for vendor rating (default 0.3)

    Returns:
    - A copy of the DataFrame sorted by `final_score` (highest first) with the
      added columns `normalized_similarity`, `normalized_rating`,
      `final_score` and `rank` (1 = best).

    Notes:
    - Each column is normalized by dividing by its maximum. When that maximum
      is 0 (or the frame is empty) the normalized values are 0 rather than NaN.
    """
    df_ranked = df.copy()

    # Fill missing ratings with 0 if any
    if 'rating' in df_ranked.columns:
        df_ranked['rating'] = df_ranked['rating'].fillna(0)
    else:
        df_ranked['rating'] = 0  # Add rating column if not present

    # Normalize both columns by their maximum value
    df_ranked['normalized_similarity'] = _normalize_by_max(df_ranked['avg_similarity_scores'])
    df_ranked['normalized_rating'] = _normalize_by_max(df_ranked['rating'])

    # Compute final score using weighted sum
    df_ranked['final_score'] = (
        weight_similarity * df_ranked['normalized_similarity'] +
        weight_rating * df_ranked['normalized_rating']
    )

    # Best score first; rank follows the sorted order.
    df_ranked = df_ranked.sort_values(by='final_score', ascending=False)
    df_ranked['rank'] = range(1, len(df_ranked) + 1)

    return df_ranked

### Function to return ranked vendors

In [16]:
def get_qualifiedVendors(input_path, query, software_category, capabilities):
    """
    Filters and ranks vendors based on similarity to a query, within a specified software category,
    using TF-IDF vectorization and cosine similarity. Returns at most the top 10 vendors.

    Args:
        input_path (str): Path to the CSV file containing vendor information.
        query (str): The query text to compare against the vendors' features for similarity.
        software_category (str): Text that a vendor's main category must contain.
            Matched literally (not as a regular expression) and case-insensitively.
        capabilities (list): Requested capabilities. Not used here; the caller is
            expected to have folded them into ``query``.

    Returns:
        pd.DataFrame: Up to 10 vendors ordered by final score, with columns:
            - 'product_name': Name of the product/vendor.
            - 'rating': Vendor's rating.
            - 'seller': Vendor's seller.
            - 'main_category': Vendor's main software category.
            - 'Features': Features associated with the vendor.
            - 'avg_similarity_scores': The average similarity score of the vendor's features to the query.
            - 'final_score': Final score after ranking.
            - 'rank': Rank based on the final score.

    Notes:
        - Vendors are filtered by main category, vectorized per feature, scored against
          the query, filtered by the default similarity threshold, then ranked.
        - The input file must provide the columns 'product_name', 'rating', 'seller',
          'main_category' and 'Features'.
    """

    # Load vendor data from the specified file path
    df = load_data(input_path)

    # Select relevant columns from the data
    df = df[['product_name', 'rating', 'seller', 'main_category', 'Features']]

    # Filter vendors by category. regex=False keeps characters such as '+', '('
    # or '.' in the category from being interpreted as regular-expression syntax.
    in_category = df['main_category'].str.contains(
        software_category, case=False, na=False, regex=False
    )
    df = df[in_category]

    # Generate TF-IDF vectors for each row based on the 'Features' column
    df = generate_tfidf_per_row(df.copy())

    # Calculate similarity scores between the query and vendor feature vectors
    df_new = calculate_similarity(query, df.copy())

    # Keep vendors that have at least one highly similar feature
    filtereddf = filter_highly_similar_rows(df_new)

    # Rank the vendors based on similarity and rating
    rankedvendors = rank_vendors(filtereddf)

    # Return the top 10 vendors with the relevant information
    output_columns = [
        'product_name', 'rating', 'seller', 'main_category', 'Features',
        'avg_similarity_scores', 'final_score', 'rank',
    ]
    return rankedvendors[output_columns].head(10)


### Return final output

In [None]:
def vendor_qualification(input_path=None):
    """
    Qualify vendors for the notebook-level request and return the ranked result.

    Builds the query from the module-level ``software_category`` and
    ``capabilities`` values, then runs the qualification pipeline on the dataset.

    Args:
        input_path (str, optional): Path to the vendor CSV. Defaults to the
            notebook-level ``file_path`` so the function reads the same dataset
            as the rest of the notebook instead of a machine-specific location.

    Returns:
        pd.DataFrame: The top (at most 10) qualified vendors as returned by
        ``get_qualifiedVendors``.
    """
    data_path = file_path if input_path is None else input_path

    query = get_query(software_category, capabilities)

    qualifiedVendors = get_qualifiedVendors(data_path, query, software_category, capabilities)
    return qualifiedVendors