In [1]:
import pandas as pd
import ast
import math
from datetime  import datetime,timezone
import random

### Exploring the available data

In [2]:
'''
progress contains the data for all the solved questions
questions contains the data for all available questons on leetcode

'''

progress=pd.read_csv('../data/progress.csv')
questions=pd.read_csv('../data/questions.csv')
progress.head(2)

Unnamed: 0.1,Unnamed: 0,frontendId,title,titleSlug,difficulty,lastSubmittedAt,numSubmitted,questionStatus,lastResult,topicTags
0,0,2476,Closest Nodes Queries in a Binary Search Tree,closest-nodes-queries-in-a-binary-search-tree,MEDIUM,2025-12-30T07:30:18+00:00,2,SOLVED,AC,"[{'name': 'Array', 'slug': 'array'}, {'name': ..."
1,1,756,Pyramid Transition Matrix,pyramid-transition-matrix,MEDIUM,2025-12-29T14:29:37+00:00,2,SOLVED,AC,"[{'name': 'Hash Table', 'slug': 'hash-table'},..."


In [3]:
questions.head(3)

Unnamed: 0.1,Unnamed: 0,frontendId,title,titleSlug,difficulty,topicTags,status
0,0,1,Two Sum,two-sum,EASY,"[{'name': 'Array', 'slug': 'array', '__typenam...",SOLVED
1,1,2,Add Two Numbers,add-two-numbers,MEDIUM,"[{'name': 'Linked List', 'slug': 'linked-list'...",SOLVED
2,2,3,Longest Substring Without Repeating Characters,longest-substring-without-repeating-characters,MEDIUM,"[{'name': 'Hash Table', 'slug': 'hash-table', ...",SOLVED


In [4]:
progress.drop(progress.columns[0],axis=1,inplace=True)
questions.drop(questions.columns[0],axis=1,inplace=True)

In [5]:
'''
Encoding difficulty column and converting topicTags to type list

'''


progress['difficulty']=progress['difficulty'].replace({
    'EASY':1,
    'MEDIUM':2,
    'HARD':3,
})

questions['difficulty']=questions['difficulty'].replace({
    'EASY':1,
    'MEDIUM':2,
    'HARD':3,
})

def to_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        return ast.literal_eval(x)
    return []

questions['topicTags'] = questions['topicTags'].apply(to_list)
progress['topicTags']=progress['topicTags'].apply(to_list)


  progress['difficulty']=progress['difficulty'].replace({
  questions['difficulty']=questions['difficulty'].replace({


In [6]:
progress.sample(2)

Unnamed: 0,frontendId,title,titleSlug,difficulty,lastSubmittedAt,numSubmitted,questionStatus,lastResult,topicTags
106,328,Odd Even Linked List,odd-even-linked-list,2,2025-10-13T12:43:52+00:00,1,SOLVED,AC,"[{'name': 'Linked List', 'slug': 'linked-list'}]"
125,1143,Longest Common Subsequence,longest-common-subsequence,2,2025-10-03T13:23:22+00:00,3,SOLVED,AC,"[{'name': 'String', 'slug': 'string'}, {'name'..."


### Feature Engineering

In [7]:
'''
Creating dict for topics that have done till now with the topic score

'''

topics = {}
def time_decay(x):
    return math.exp(-x/30)

progress['lastSubmittedAt']=pd.to_datetime(progress['lastSubmittedAt'],utc=True)
now=datetime.now(timezone.utc)
progress['days_ago']=(now - progress['lastSubmittedAt']).dt.days

for _, row in progress.iterrows():
    decay=time_decay(row['days_ago'])

    difficulty_weight = (row['difficulty'] / row['numSubmitted'])*decay 
    for topic in row['topicTags']:
        slug = topic['slug']
        topics[slug] = topics.get(slug, 0) + difficulty_weight



topic_mastery_series = pd.Series(topics)

# Keep only strong topics (top 40%)
CORE_PERCENTILE = 60
CORE_THRESHOLD = topic_mastery_series.quantile(CORE_PERCENTILE / 100)

core_topics = set(
    topic_mastery_series[
        topic_mastery_series >= CORE_THRESHOLD
    ].index
)

In [8]:
sorted(topics.items(), key=lambda x: -x[1])[:10]

[('array', 24.23851032858834),
 ('depth-first-search', 10.75352300584696),
 ('tree', 10.399713959148846),
 ('binary-tree', 10.399713959148846),
 ('sorting', 8.754636673163178),
 ('breadth-first-search', 8.138089515501377),
 ('two-pointers', 8.091196725644089),
 ('string', 7.1609749175772945),
 ('greedy', 6.438490146089634),
 ('math', 5.771111701421948)]

In [9]:
'''
Introducing topic_count column for number of topics in a particular question

'''

candidates = questions[questions['status'] != 'SOLVED'].copy()

topics_set = core_topics

candidates['topic_count'] = candidates['topicTags'].apply(
    lambda tags: sum(1 for t in tags if t['slug'] in topics_set)
)

candidates=candidates[candidates['topic_count']>=1]


In [10]:
candidates.sample(4)

Unnamed: 0,frontendId,title,titleSlug,difficulty,topicTags,status,topic_count
200,201,Bitwise AND of Numbers Range,bitwise-and-of-numbers-range,2,"[{'name': 'Bit Manipulation', 'slug': 'bit-man...",TO_DO,1
3265,3266,Final Array State After K Multiplication Opera...,final-array-state-after-k-multiplication-opera...,3,"[{'name': 'Array', 'slug': 'array', '__typenam...",TO_DO,1
2115,2116,Check if a Parentheses String Can Be Valid,check-if-a-parentheses-string-can-be-valid,2,"[{'name': 'String', 'slug': 'string', '__typen...",TO_DO,3
2085,2086,Minimum Number of Food Buckets to Feed the Ham...,minimum-number-of-food-buckets-to-feed-the-ham...,2,"[{'name': 'String', 'slug': 'string', '__typen...",TO_DO,3


In [11]:
'''
new topic count for topics that are not covered in progress till now

'''

candidates['new_topic_count']=candidates['topicTags'].apply(
    lambda tags:sum(1 for t in tags if t['slug'] not in topics_set)
)

candidates.head(3)

Unnamed: 0,frontendId,title,titleSlug,difficulty,topicTags,status,topic_count,new_topic_count
4,5,Longest Palindromic Substring,longest-palindromic-substring,2,"[{'name': 'Two Pointers', 'slug': 'two-pointer...",TO_DO,3,0
7,8,String to Integer (atoi),string-to-integer-atoi,2,"[{'name': 'String', 'slug': 'string', '__typen...",TO_DO,1,0
9,10,Regular Expression Matching,regular-expression-matching,3,"[{'name': 'String', 'slug': 'string', '__typen...",TO_DO,2,1


### Creating important features for recommendation

In [12]:
'''
This function gives a penalty for a number of new topic in question as more new topics mean more penalty so less reommendation

'''

def novelty_penalty(num_new):
    if num_new==0:
        return 0.0
    elif num_new==1:
        return 0.5
    else:
        return 2
    
candidates['novelty_penalty']=candidates['new_topic_count'].apply(novelty_penalty)

In [13]:
'''
This function gives a difficulty weight to question the weight depends on the current difficulty level --(here it is medium/2 )

'''

current_level=progress['difficulty'].median()

def difficulty_weight(diff):
    if diff == current_level:
        return 1.0
    elif diff == current_level+1:
        return 0.8
    elif diff == current_level-1:
        return 0.3
    else:
        return 0
    
candidates['difficulty_weight']=candidates['difficulty'].apply(difficulty_weight)

In [14]:
'''
This function gives a relevence for each question based on number of topics and sum their topic scores and creates a new feature topic_score

'''


def topic_relevance(tags,topics):
    return sum(topics.get(t['slug'],0) for t in tags)

candidates['topic_score']=candidates['topicTags'].apply(
    lambda tags: topic_relevance(tags,topics)
)

In [15]:
'''
This function helps to introduce a new feature that gives idea about the condition of a question and return safe,stretch and explore according to num of new topics in a question

'''


def bucket(x):
    if x==0:
        return "safe"
    elif x==1:
        return "stretch"
    else:
        return "explore"
    
candidates['bucket']=candidates['new_topic_count'].apply(bucket)

In [16]:
candidates[['title','topic_score']].sample(5).sort_values(by='topic_score',ascending=False)

Unnamed: 0,title,topic_score
3403,Count Special Subsequences,35.765641
2487,Count Subarrays With Median K,30.566527
2162,Minimum Difference in Sums After Removal of El...,28.904308
1928,Concatenation of Array,24.490329
3722,Maximize Sum of Squares of Digits,12.209602


In [17]:
# creating a new absolute feature based on features like topic_score,difficulty_weight and noveality_penalty

candidates['final_score']=candidates['topic_score']*candidates['difficulty_weight']-candidates['novelty_penalty']

In [18]:
''''

The function difficulty_bucket gives names based on wheather the question is easy:reinforce, medium:progress , hard:challenge
The function sample_selection is used to randomly choose difficulty buckets that will introduce randomness for difficulty in question

'''


probs={
    'reinforce':0.3,
    'progress':0.6,
    'challenge':0.1
    }

def difficulty_bucket(x):
    if(x==1):
        return 'reinforce'
    elif(x==2):
        return 'progress'
    else:
        return 'challenge'
    
def sample_selection(k=5):
    buckets=list(probs.keys())
    weights=list(probs.values())
    return random.choices(buckets,weights,k=k)
    
    
candidates['diff_bucket']=candidates['difficulty'].apply(difficulty_bucket)
x=sample_selection()

In [19]:

''''
The function difficulty_match_score is used to select the desire question if present else the next best question
The function select_question is used to selct question based on given topic_bucket and difficulty bucket
The function build_session is used to create top 5 recommendation and select questions using the above two functions 


Ex:
if topic_bucket is 'safe'
and diff_bucket is 'progress'
the recommended question will be of medium difficulty with no new topics

'''


topic_bucket=['safe','safe','safe','stretch','explore']


def difficulty_match_score(actual, desired):
    if actual == desired:
        return 1.0
    if actual == 'progress':
        return 0.7
    return 0.3


def select_question(candidates, topic_bucket, difficulty_bucket):
    pool = candidates[candidates['bucket'] == topic_bucket]

    if pool.empty:
        return None

    pool = pool.copy()
    pool['difficulty_match'] = pool['diff_bucket'].apply(
        lambda d: difficulty_match_score(d, difficulty_bucket)
    )

    pool['combined_score'] = pool['final_score'] * pool['difficulty_match']

    return pool.sort_values('combined_score', ascending=False).iloc[0]



def build_session(candidates, topic_plan, difficulty_plan):
    session = []
    remaining = candidates.copy()

    for t_bucket, d_bucket in zip(topic_plan, difficulty_plan):
        q = select_question(remaining, t_bucket, d_bucket)

        if q is None:
            continue

        session.append(q)
        remaining = remaining.drop(q.name)

    return session

recommendations=build_session(candidates,topic_bucket,x)

In [20]:
recommendations=pd.DataFrame(recommendations)
recommendations

Unnamed: 0,frontendId,title,titleSlug,difficulty,topicTags,status,topic_count,new_topic_count,novelty_penalty,difficulty_weight,topic_score,bucket,final_score,diff_bucket,difficulty_match,combined_score
1256,1257,Smallest Common Region,smallest-common-region,2,"[{'name': 'Array', 'slug': 'array', '__typenam...",TO_DO,6,0,0.0,1.0,65.982272,safe,65.982272,progress,1.0,65.982272
2444,2445,Number of Nodes With Value One,number-of-nodes-with-value-one,2,"[{'name': 'Array', 'slug': 'array', '__typenam...",TO_DO,5,0,0.0,1.0,63.929551,safe,63.929551,progress,0.7,44.750686
665,666,Path Sum IV,path-sum-iv,2,"[{'name': 'Array', 'slug': 'array', '__typenam...",TO_DO,5,0,0.0,1.0,61.082921,safe,61.082921,progress,0.7,42.758045
1201,1202,Smallest String With Swaps,smallest-string-with-swaps,2,"[{'name': 'Array', 'slug': 'array', '__typenam...",TO_DO,6,1,0.5,1.0,65.054644,stretch,64.554644,progress,1.0,64.554644
2367,2368,Reachable Nodes With Restrictions,reachable-nodes-with-restrictions,2,"[{'name': 'Array', 'slug': 'array', '__typenam...",TO_DO,5,2,2.0,1.0,61.326266,explore,59.326266,progress,1.0,59.326266
