## NIC BMS ELIGBILITY ENGINE



In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
rec = pd.read_csv('recommendation_dataset.csv')

In [8]:
print(rec.head())

   user_id gender social_category         dob  age  scheme_id  \
0    72739      F              ST  2007-12-10   16        155   
1    48454      F              ST  2009-11-05   14        155   
2    45558      M              ST  2010-01-05   14        155   
3    29620      F              SC  2001-01-20   23        181   
4    90146      F              SC  2009-12-21   14        297   

                                         scheme_name domicile_of_tripura  \
0         Pre-Matric Scholarship for ST (VI to VIII)                   Y   
1         Pre-Matric Scholarship for ST (VI to VIII)                   Y   
2         Pre-Matric Scholarship for ST (VI to VIII)                   Y   
3                   Mukhyamantri Yuba Yogajog Yojana                   Y   
4  Pre-Matric Scholarship Class VI to VIII for SC...                   Y   

                                         description  
0  Pre-Matric Scholarship to ST Students for Clas...  
1  Pre-Matric Scholarship to ST Students f

In [9]:
print(rec.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114471 entries, 0 to 114470
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   user_id              114471 non-null  int64 
 1   gender               114471 non-null  object
 2   social_category      114471 non-null  object
 3   dob                  114471 non-null  object
 4   age                  114471 non-null  int64 
 5   scheme_id            114471 non-null  int64 
 6   scheme_name          114471 non-null  object
 7   domicile_of_tripura  114471 non-null  object
 8   description          114471 non-null  object
dtypes: int64(3), object(6)
memory usage: 7.9+ MB
None


In [10]:
# Drop rows with missing values (NaN values)
rec = rec.dropna()


In [11]:
# Check for any remaining missing values after dropping
missing_values = rec.isnull().sum()

In [12]:
if missing_values.any():
    print("There are still missing values after dropping.")
    print(missing_values)
else:
    print ("No missing values found after dropping.")

No missing values found after dropping.


In [26]:
column_name = 'scheme_name'  

# Get the value counts for the specified column
term_counts = rec[column_name].value_counts()
 
# Display the total occurrences of each term
print("Total occurrences of each term in column", column_name, "are:")
print(term_counts)

Total occurrences of each term in column scheme_name are:
scheme_name
Mukhyamantri Yuba Yogajog Yojana                                                                                                                       34728
Pre-Matric Scholarship for ST (VI to VIII)                                                                                                             24658
Pre-Matric Scholarship Class VI to VIII for SC Students                                                                                                13824
Dr. B. R. Ambedkar Merit Award                                                                                                                         11705
Merit Award for ST                                                                                                                                     11533
Ambedkar Merit Award                                                                                                                             

In [13]:
# Map age ranges to numerical codes
#This part converts age ranges specified as strings into numerical codes. For example, 'Below 10' is mapped to 0, '10-15' to 1, and so on.
age_mapping = {
    'Below 10': 0, '10-15': 1, '16-20': 2, '21-25': 3, '26-30': 4, 
    '31-35': 5, '36-40': 6, '41-45': 7, '46-50': 8, 'Above 50': 9
}
rec['age'] = rec['age'].map(age_mapping)

# Convert categorical variables to numerical codes, including SC, ST, and OBC for caste
caste_mapping = {'SC': 0, 'ST': 1, 'OBC': 2}
rec['social_category'] = rec['social_category'].map(caste_mapping)

# Convert categorical variables to numerical codes, including M, F, and T for gender
gender_mapping = {'M': 0, 'F': 1, 'T': 2}
rec['gender'] = rec['gender'].map(gender_mapping)

# Convert domicile to numerical (binary) variable
rec['domicile_of_tripura'] = rec['domicile_of_tripura'].map({'Y': 1, 'N': 0})

rec['scheme_text'] = rec['scheme_name'] + ' ' + rec['description']

In [14]:

# TDF-IDF VECTORIZATION for combined scheme text 

vectorizer=TfidfVectorizer()
tfidf_matrix=vectorizer.fit_transform(rec['scheme_text'])

#tfidf_matrix: This is a matrix where each row is a TF-IDF vector representing the text of a scheme (combining scheme name and description).


In [16]:
# Function for content-based filtering
def content_based_filtering (search_terms,age,social_category,gender,domicile,num_recommendations=5):
    
    # Purpose: to create a combined  single string that includes both the user's search terms and their demographic or categorical information(additional parameters)
    #This combined string is then used as input to the TF-IDF vectorizer to find schemes that are most similar to this combined query
    #thus making the search personalized and more relevant to the user's profile.

    params = ' '.join(filter(None, [age, social_category, gender, domicile_of_tripura]))

# filter:  creates an iterable that filters out any None values from the list 
#' '.join(...):  joins the remaining non-None elements in the list with a space (' ') as the separator, resulting in a single string

    search_terms_str = ' '.join(search_terms) + ' ' + params
# ' '.join(search_terms):   joins the elements of the search_terms list with a space (' ') as the separator, resulting in a single string. 
#  + ' ' + params:  concatenates the search_terms string with a space and the params string created in the previous line.


    # Transform search terms using TF-IDF vectorizer
    tfidf_search_terms = vectorizer.transform([search_terms_str])
    #This line takes the combined search terms string (search_terms_str) 
        #and converts it into a sparse TF-IDF vector using the vocabulary and IDF values
    #tfidf_search_terms, is a sparse matrix representing the TF-IDF vector for the input string. 
    #Each element in this vector corresponds to a term in the vectorizer’s vocabulary, and the value is the TF-IDF score for that term in the input document.
    
 # Calculate cosine similarities between search terms and scheme text TF-IDF vectors
    cosine_similarities = cosine_similarity(tfidf_search_terms, tfidf_matrix).flatten()


    
    # store it in binary form and write a function to read it 



    
    #.flatten(): This method converts the result from a 2D array to a 1D array (flattening the matrix). 
    
    # Get indices of top similar schemes
    top_similar_indices = cosine_similarities.argsort()[-num_recommendations:][::-1]
    #argsort(): This method returns the indices that would sort an array in ascending order
    #[-num_recommendations:]: This  selects the last num_recommendations elements from the sorted indices array.
# Print the suggested schemes
    print("Content-based Filtering:")
    print("The suggested schemes based on search terms and additional parameters are: \n")
    for i, index in enumerate(top_similar_indices):
        print(rec.iloc[index]['scheme_name'])
    

In [18]:
# Function to recommend schemes using both content-based and collaborative filtering
def recommend_schemes(search_terms, age, social_category, gender, domicile_of_tripura):
    # Content-based filtering
    content_based_filtering(search_terms, age, social_category, gender, domicile_of_tripura)

# Example usage:
search_terms = ['Pre-Matric']
age = '21-25'
social_category = 'OBC'
gender = 'M'
domicile_of_tripura = 'Y'
recommend_schemes(search_terms, age, social_category, gender, domicile_of_tripura)


Content-based Filtering:
The suggested schemes based on search terms and additional parameters are: 

DR. B. R. Ambedkar Merit Award for OBC
DR. B. R. Ambedkar Merit Award for OBC
DR. B. R. Ambedkar Merit Award for OBC
DR. B. R. Ambedkar Merit Award for OBC
DR. B. R. Ambedkar Merit Award for OBC
