# CP Recommendation System


## Content Based filtering

First, let's start with content-based filtering.
Here, we'll concatenate each problem's tags into a single string and run cosine similarity on that text to measure how similar a particular problem is to every other problem.


In [5]:
#importing stuff
import requests
import json
from pandas import json_normalize
import pandas as pd

# Load the Codeforces problem set from the public API and flatten it into a DataFrame.
tag_set = set()  # created empty; nothing in this cell adds to it
data = requests.get("https://codeforces.com/api/problemset.problems").json()
# The problem records sit under result -> problems in the JSON payload.
problems_json = data["result"]["problems"]

# json_normalize turns the list of problem dicts into one row per problem.
problems = json_normalize(problems_json)
# Per-column count of missing values.
print(problems.isnull().sum())

contestId       0
index           0
name            0
type            0
rating        176
tags            0
points       2251
dtype: int64


As we can see, one of the columns is named `index`, which clashes with the DataFrame's own `index` attribute, so let's rename it to something else.

In [3]:
# Rename the "index" column to "ID" so it no longer shares a name with the
# DataFrame's own `.index` attribute. Renaming by label (instead of by
# position 1) keeps this correct even if the API returns the columns in a
# different order.
problems = problems.rename(columns={"index": "ID"})
column_list = list(problems.columns)
# Per-column count of missing values, now under the new column name.
print(problems.isnull().sum())

contestId       0
ID              0
name            0
type            0
rating        186
tags            0
points       2195
dtype: int64


### Get Data From Users

Time to get the user dataset. We'll collect the data of all users who have participated in at least one contest.

In [5]:
# Show the problems frame; pandas abbreviates the middle rows in the printed output.
print(problems)

      contestId ID                    name         type  points  rating  \
0          1602  B            Divine Array  PROGRAMMING  1000.0  1100.0   
1          1602  A        Two Subsequences  PROGRAMMING   500.0   800.0   
2          1601  F               Two Sorts  PROGRAMMING  3000.0  3400.0   
3          1601  E          Phys Ed Online  PROGRAMMING  2250.0  2900.0   
4          1601  D      Difficult Mountain  PROGRAMMING  2250.0  2700.0   
...         ... ..                     ...          ...     ...     ...   
7320          2  B     The least round way  PROGRAMMING     NaN  2000.0   
7321          2  A                  Winner  PROGRAMMING     NaN  1500.0   
7322          1  C  Ancient Berland Circus  PROGRAMMING     NaN  2100.0   
7323          1  B             Spreadsheet  PROGRAMMING     NaN  1600.0   
7324          1  A          Theatre Square  PROGRAMMING     NaN  1000.0   

                                                   tags  
0             [constructive algorithms, i

In [4]:
def get_users():
    """Fetch the Codeforces rated-user list and return it as a DataFrame.

    Calls the ``user.ratedList`` API endpoint, takes the ``"result"`` entry of
    the JSON response and flattens it with ``json_normalize``.
    """
    response = requests.get("https://codeforces.com/api/user.ratedList")
    rated_users = response.json()["result"]
    return json_normalize(rated_users)

def get_users_from_csv():
    """Read the saved user table from ``data/df_user.csv`` (UTF-8) into a DataFrame."""
    return pd.read_csv("data/df_user.csv", encoding='utf-8')

The below code gets the submissions of one particular user.

In [5]:
def get_user_submissions(handle, start=1, count=999):
    """Fetch a user's submissions from the Codeforces ``user.status`` endpoint.

    Parameters
    ----------
    handle : str
        Codeforces handle of the user.
    start : int, optional
        Value sent as the API's ``from`` parameter (index of the first
        submission to return). Defaults to 1, the previously hard-coded value.
    count : int, optional
        Value sent as the API's ``count`` parameter (number of submissions to
        return). Defaults to 999, the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The ``"result"`` list of the JSON response flattened with
        ``json_normalize``.

    Raises
    ------
    KeyError
        If the JSON response has no ``"result"`` entry.
    """
    # Passing the query through `params` lets requests URL-encode the handle
    # instead of interpolating it into the URL string unescaped.
    response = requests.get(
        "https://codeforces.com/api/user.status",
        params={"handle": handle, "from": start, "count": count},
    )
    submissions = response.json()["result"]
    return json_normalize(submissions)

#df_submission = get_user_submissions("infnite_coder")

Now let's remove the users who meet either of the following conditions:
 - have a rating less than or equal to 0
 - have been inactive for more than 1 year.

 ### Note: Run the cell below only if there is no csv file present; otherwise skip to the next one.

In [None]:
from datetime import datetime, date, timedelta

# Download the rated-user list, then keep only positively rated users and the
# columns used below. `.copy()` yields an independent frame so the column
# assignment further down does not operate on a slice of the original.
df_user = get_users()
df_user = df_user.loc[
    df_user['rating'] > 0,
    ['handle', 'country', 'rank', 'rating', 'maxRating', 'lastOnlineTimeSeconds'],
].copy()

now = datetime.now()
try:
    lastYear = now.replace(year=now.year-1)
except ValueError:
    # Today is 29 February and the previous year has no such date;
    # fall back to 28 February of the previous year.
    lastYear = now.replace(year=now.year-1, day=28)

# Convert the epoch-seconds value into a local datetime for comparison.
df_user["lastOnline"] = df_user['lastOnlineTimeSeconds'].map(lambda x: datetime.fromtimestamp(x))

# Keep users seen after the cutoff and drop the raw timestamp column.
df_user = df_user.loc[df_user['lastOnline'] > lastYear].drop(columns=['lastOnlineTimeSeconds'])

df_user.to_csv("data/df_user.csv", index=False, encoding='utf-8')

Run this cell to load the values from the csv into a dataframe.
### Note: Run this cell if you already have a df_user.csv with values stored.

In [6]:
# Load the saved user table via get_users_from_csv and preview its first rows.
df_user_total = get_users_from_csv()
print(df_user_total.head())

              handle        country                   rank  rating  maxRating  \
0            tourist        Belarus  legendary grandmaster    3707       3822   
1  Retired_MiFaFaOvO          Samoa  legendary grandmaster    3681       3681   
2               Benq  United States  legendary grandmaster    3672       3797   
3          Radewoosh         Poland  legendary grandmaster    3627       3720   
4             ksun48         Canada  legendary grandmaster    3547       3654   

            lastOnline  
0  2021-10-04 16:53:18  
1  2021-10-04 09:58:05  
2  2021-10-04 17:16:26  
3  2021-10-04 00:38:52  
4  2021-10-04 09:39:55  


In [7]:
def get_incorrect_user_submission(handle):
    """Return the problem names of a user's leading non-accepted submissions.

    Fetches the user's submissions, keeps the rows whose ``verdict`` is not
    ``"OK"`` and returns the ``problem.name`` column of the first five of them
    (``Series.head()`` default).

    A user with no submissions yields an empty Series instead of raising
    ``KeyError`` on the missing ``verdict`` column.

    NOTE(review): ``.head()`` caps the result at five rows, so any per-problem
    counts built from this function cover at most five submissions. Confirm
    this cap is intentional.
    """
    user_submission = get_user_submissions(handle)
    if user_submission.empty:
        # An empty API result is flattened to a frame without a "verdict" column.
        return pd.Series(name="problem.name", dtype=object)
    not_accepted = user_submission["verdict"] != "OK"
    return user_submission.loc[not_accepted, "problem.name"].head()

def get_user_interactions(handle):
    """Yield ``[handle, problem_name, 1]`` for each distinct problem the user
    has at least one non-"OK" submission on.

    Yields nothing when the user has no submissions at all.
    """
    submissions = get_user_submissions(handle)
    if submissions.index.size > 0:
        not_accepted = submissions["verdict"] != "OK"
        for problem_name in submissions[not_accepted]["problem.name"].unique():
            yield [handle, problem_name, 1]

def list_of_user_interactions():
    """Collect the interaction rows of every handle in the global ``df_user``.

    Returns a flat list containing, in handle order, everything yielded by
    ``get_user_interactions`` for each handle.
    """
    return [
        interaction
        for user_handle in df_user['handle']
        for interaction in get_user_interactions(user_handle)
    ]

In [16]:
cols=['handle', 'problem_name', 'wrong_submission']

# Collect plain rows first and build the DataFrame once at the end;
# concatenating a one-row frame per iteration copies the whole table each
# time and is quadratic in the number of rows.
interaction_rows = []

for user in df_user['handle']:
    incorrect_user_submission = get_incorrect_user_submission(user)

    # Count how many times each problem name appears for this user,
    # keeping first-occurrence order.
    problem_freq = dict()
    for item in incorrect_user_submission:
        problem_freq[item] = problem_freq.get(item, 0) + 1

    interaction_rows.extend([user, item, freq] for item, freq in problem_freq.items())

# A freshly built frame already has a clean 0..n-1 index.
user_problem_interaction = pd.DataFrame(interaction_rows, columns=cols)
user_problem_interaction.to_csv("data/user_problem_interaction.csv", index=False, encoding='utf-8')

### Cosine Similarity
Now we'll begin the project with content-based filtering, using cosine similarity.

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def combine_features(row):
    """Return the row's ``type`` followed by its ``tags`` joined with single spaces."""
    tag_text = " ".join(row["tags"])
    return row["type"] + " " + tag_text

# Build one combined text string per problem (row-wise apply).
problems["combined_features"] = problems.apply(combine_features, axis=1)

# Token counts for every problem's combined text, then the cosine similarity
# between every pair of rows of that count matrix.
count_matrix = CountVectorizer().fit_transform(problems["combined_features"])
cosine_sim = cosine_similarity(count_matrix)

In [None]:
def index_from_name(name):
    """Return the index labels of every row in ``problems`` whose ``name`` equals ``name``."""
    name_matches = problems["name"] == name
    return problems.loc[name_matches].index

problem_name = "Armchairs"
# index_from_name returns index *labels*; take the first match.
index_of_prob = index_from_name(problem_name).values[0]
# Look the row up by label with .loc. Positional .iloc only gives the same
# row while the frame still has its default 0..n-1 index.
tgs = problems.loc[index_of_prob].tags

# print(problems[problems["tags"]])

Neural Network

In [8]:
# Rebind the global df_user to the user table stored in data/df_user_random.csv.
df_user = pd.read_csv("data/df_user_random.csv", encoding='utf-8')

def batch_list_of_user_interactions(start=999, stop=1000):
    """Collect interaction rows for one positional slice of the global ``df_user``.

    Parameters
    ----------
    start, stop : int, optional
        Positional bounds of the ``df_user`` rows to process, used as
        ``df_user.iloc[start:stop]``. The defaults (999, 1000) reproduce the
        previously hard-coded single-user batch.

    Returns
    -------
    list
        Everything yielded by ``get_user_interactions`` for each handle in
        the slice, in order.
    """
    return [
        interaction
        for user in df_user.iloc[start:stop]['handle']
        for interaction in get_user_interactions(user)
    ]

In [9]:
# df_user_interaction = pd.DataFrame(list_of_user_interactions(), columns=['handle', 'problem', 'rating'])
# print(len(df_user_interaction))
# Gather interactions for the batch of users selected by batch_list_of_user_interactions.
combined_list = batch_list_of_user_interactions()

In [None]:
# Turn the collected rows into a DataFrame with handle / problem / rating columns.
interaction_list = pd.DataFrame(combined_list, columns=['handle', 'problem', 'rating'])

# START FROM HERE
### NCF Model

In [39]:
import pandas as pd

# Read the saved interaction table.
interaction_list = pd.read_csv("data/df_user_interaction-02.csv", encoding='utf-8')

# pd.factorize returns (codes, uniques); element [0] is the array of integer
# codes, one per row.
num_handle = pd.factorize(interaction_list['handle'])[0]
num_problem = pd.factorize(interaction_list['problem'])[0]

# Swap the original handle / problem values for their integer codes.
interaction_list = interaction_list.assign(handle=num_handle, problem=num_problem)

### Skip this one

In [None]:
import numpy as np

# Small demonstration of negative sampling on rows 1..9 of interaction_list.
all_problem_ids = interaction_list['problem'].unique()
user_item_set = set(zip(interaction_list['handle'][1:10], interaction_list['problem'][1:10]))

# The sampled problem ids are kept in `sampled_problems` rather than
# `problems`, so this cell does not rebind the global `problems` name used
# for the problem DataFrame.
handles, sampled_problems, ratings = [], [], []

num_negatives = 4

for (user, problem) in user_item_set:
    # Observed interaction: label 1.
    handles.append(user)
    sampled_problems.append(problem)
    ratings.append(1)
    for _ in range(num_negatives):
        # Redraw until the (user, problem) pair is not an observed interaction.
        negative_problem = np.random.choice(all_problem_ids)
        while (user, negative_problem) in user_item_set:
            negative_problem = np.random.choice(all_problem_ids)
        handles.append(user)
        sampled_problems.append(negative_problem)
        ratings.append(0)

print(handles)
print(sampled_problems)
print(ratings)

In [40]:
import torch
from torch.utils.data import Dataset

class recommender_data_set(Dataset):
    """Training set of (handle, problem, rating) triples with sampled negatives.

    Every distinct (handle, problem) pair in ``interaction_list`` becomes a
    positive example (rating 1) and is followed by ``num_negatives`` randomly
    drawn problems that the same handle has no interaction with (rating 0).

    Parameters
    ----------
    interaction_list : pandas.DataFrame
        Must provide ``'handle'`` and ``'problem'`` columns of integer ids.
    all_problem_ids : array-like
        Pool of problem ids that negatives are drawn from.
    num_negatives : int, optional
        Negatives generated per positive pair. Defaults to 4.
    """

    def __init__(self, interaction_list, all_problem_ids, num_negatives=4):
        self.handles, self.problems, self.rating = self.get_dataset(
            interaction_list, all_problem_ids, num_negatives
        )

    def __len__(self):
        """Number of examples (positives plus sampled negatives)."""
        return len(self.handles)

    def __getitem__(self, idx):
        """Return the ``(handle, problem, rating)`` tensors at position ``idx``."""
        return self.handles[idx], self.problems[idx], self.rating[idx]

    def get_dataset(self, interaction_list, all_problem_ids, num_negatives=4):
        """Build the handle, problem and rating tensors described on the class.

        Returns a 3-tuple of 1-D tensors of equal length.
        """
        # Imported locally so the class works even when no earlier cell has
        # imported numpy into the notebook namespace.
        import numpy as np

        handles, problems, rating = [], [], []
        user_item_set = set(zip(interaction_list['handle'], interaction_list['problem']))

        for (user, item) in user_item_set:
            # Observed interaction: label 1.
            handles.append(user)
            problems.append(item)
            rating.append(1)
            for _ in range(num_negatives):
                # Redraw until the pair is not an observed interaction.
                # NOTE: this never terminates for a user who has interacted
                # with every id in all_problem_ids.
                negative_problem = np.random.choice(all_problem_ids)
                while (user, negative_problem) in user_item_set:
                    negative_problem = np.random.choice(all_problem_ids)
                handles.append(user)
                problems.append(negative_problem)
                rating.append(0)

        return torch.tensor(handles), torch.tensor(problems), torch.tensor(rating)

In [41]:
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class NCF(pl.LightningModule):
    """Neural collaborative filtering model over (handle, problem) id pairs.

    Each id is looked up in an 8-dimensional embedding. The two embeddings
    are concatenated and passed through two ReLU layers (64 and 32 units)
    and a single sigmoid output unit.

    Parameters
    ----------
    num_handles : int
        Size of the handle embedding table.
    num_problems : int
        Size of the problem embedding table.
    interaction_list
        Passed to ``recommender_data_set`` when building the training loader.
    all_problem_ids
        Passed to ``recommender_data_set`` when building the training loader.
    """

    def __init__(self, num_handles, num_problems, interaction_list, all_problem_ids):
        super().__init__()
        embedding_dim = 8
        self.handle_embedding = nn.Embedding(num_embeddings=num_handles, embedding_dim=embedding_dim)
        self.problem_embedding = nn.Embedding(num_embeddings=num_problems, embedding_dim=embedding_dim)

        # The first dense layer consumes the two concatenated embeddings.
        self.fc1 = nn.Linear(in_features=2 * embedding_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)

        # Kept so train_dataloader can rebuild the dataset.
        self.interaction_list = interaction_list
        self.all_problem_ids = all_problem_ids

    def forward(self, handle_input, problem_input):
        """Return the sigmoid score for each (handle, problem) id pair."""
        embedded_pair = torch.cat(
            [self.handle_embedding(handle_input), self.problem_embedding(problem_input)],
            dim=-1,
        )
        hidden = torch.relu(self.fc1(embedded_pair))
        hidden = torch.relu(self.fc2(hidden))
        return torch.sigmoid(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy between predictions and the batch ratings."""
        handle_input, problem_input, rating = batch
        predicted_labels = self(handle_input, problem_input)
        # Reshape ratings to a float column so they match the prediction shape.
        target = rating.view(-1, 1).float()
        return nn.functional.binary_cross_entropy(predicted_labels, target)

    def configure_optimizers(self):
        """Adam over all model parameters with default settings."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new ``recommender_data_set`` and wrap it in a DataLoader."""
        training_data = recommender_data_set(self.interaction_list, self.all_problem_ids)
        return DataLoader(training_data, batch_size=512, num_workers=4)

In [None]:
# Embedding table sizes: one slot for every id from 0 up to the largest id present.
num_handles = interaction_list['handle'].max()+1
num_problems = interaction_list['problem'].max()+1
# Pool of problem ids passed to the model (used when its dataset is built).
all_problem_ids = interaction_list['problem'].unique()

model = NCF(num_handles, num_problems, interaction_list, all_problem_ids)

# NOTE(review): gpus, reload_dataloaders_every_epoch, progress_bar_refresh_rate and
# checkpoint_callback appear to be version-specific Trainer keywords — confirm they
# are accepted by the installed pytorch_lightning release.
trainer = pl.Trainer(max_epochs=5, gpus=0, reload_dataloaders_every_epoch=True, progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)

# The model supplies its own training data through train_dataloader().
trainer.fit(model)