# CP Recommendation System


## Content Based filtering

First, let's start with content-based filtering.
Here, we'll concatenate each problem's tags into a single string and run cosine similarity on that text to measure how similar a particular problem is to the other problems.


In [33]:
#importing stuff
import requests
import json
from pandas import json_normalize
import pandas as pd

#load data
tag_set = set()

# Fetch the Codeforces problem set. The timeout keeps the cell from hanging
# indefinitely, and raise_for_status() surfaces an HTTP failure here instead of
# as a confusing JSON-decoding or KeyError further down.
response = requests.get("https://codeforces.com/api/problemset.problems", timeout=30)
response.raise_for_status()
data = response.json()
problems_json = data["result"]["problems"]

# Flatten the list of problem dicts into one DataFrame row per problem.
problems = json_normalize(problems_json)
# Per-column count of missing values.
print(problems.isnull().sum())

contestId       0
index           0
name            0
type            0
rating        166
tags            0
points       2099
dtype: int64


As we can see, one of the columns is named `index`, which clashes with the DataFrame's own `index` attribute, so let's rename it to "ID".

In [34]:
# Rename the "index" column to "ID" by name rather than by position, so the
# rename still hits the right column if the column order ever differs.
problems = problems.rename(columns={"index": "ID"})
# Kept so the notebook still exposes the list of (renamed) column names.
column_list = list(problems.columns)
print(problems.isnull().sum())

contestId       0
ID              0
name            0
type            0
rating        166
tags            0
points       2099
dtype: int64


### Get Data From Users

Time to get the user dataset. We'll collect the data of all users who have participated in at least one contest.

In [36]:
def get_users():
    """Download the Codeforces rated-user list and return it as a DataFrame.

    Calls the ``user.ratedList`` API endpoint and flattens the ``"result"``
    entry of the JSON response with ``json_normalize``.
    """
    payload = requests.get("https://codeforces.com/api/user.ratedList").json()
    return json_normalize(payload["result"])

def get_users_from_csv(path="data/df_user_sliced.csv"):
    """Load a previously saved user table from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``"data/df_user_sliced.csv"``.
        Note that the filtering cell in this notebook writes its result to
        ``"data/df_user.csv"``; pass that path to load that file instead.

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV file, decoded as UTF-8.
    """
    return pd.read_csv(path, encoding='utf-8')


The below code gets the submissions of one particular user.

In [35]:
def get_user_submissions(handle):
    """Fetch a user's submissions from the Codeforces API as a DataFrame.

    Requests ``user.status`` with ``from=1`` and ``count=999`` and flattens the
    ``"result"`` list with ``json_normalize``. As a side effect, prints the
    names of the problems whose verdict is ``"OK"``.

    Parameters
    ----------
    handle : str
        Codeforces handle of the user.

    Returns
    -------
    pandas.DataFrame
        One row per returned submission; empty if the user has none.

    Raises
    ------
    requests.HTTPError
        If the API responds with an HTTP error status.
    """
    start, count = (1, 999)

    # Passing the query via `params` lets requests URL-encode the handle.
    response = requests.get(
        "https://codeforces.com/api/user.status",
        params={"handle": handle, "from": start, "count": count},
        timeout=30,
    )
    response.raise_for_status()
    user_data = response.json()

    submissions = user_data["result"]
    df = json_normalize(submissions)
    # An empty result has no columns at all, so guard before filtering on them.
    if "verdict" in df.columns and "problem.name" in df.columns:
        print(df.loc[df["verdict"] == "OK", "problem.name"])
    return df

# df_submission = get_user_submissions("infnite_coder")

Now let's remove the users who meet either of the following conditions:
 - have rating less than or equal to 0
 - have stayed inactive for more than 1 year.

### Note: Run the cell below only if there is no CSV file present; otherwise skip to the next one.

In [40]:
from datetime import datetime, date, timedelta

# Download the rated-user list from the API.
df_user = get_users()

# Keep only the columns used below and drop users without a positive rating.
# A single .loc selection plus .copy() yields an independent frame, so adding a
# column afterwards does not raise SettingWithCopyWarning.
df_user = df_user.loc[
    df_user['rating'] > 0,
    ['handle', 'country', 'rank', 'rating', 'maxRating', 'lastOnlineTimeSeconds'],
].copy()

# Activity cut-off: one year before now. A timedelta is used because
# now.replace(year=now.year-1) raises ValueError when run on 29 February.
now = datetime.now()
lastYear = now - timedelta(days=365)

# Convert the epoch-seconds column to local-time datetimes.
df_user["lastOnline"] = df_user['lastOnlineTimeSeconds'].map(lambda x: datetime.fromtimestamp(x))

# Keep users seen after the cut-off; the raw timestamp column is no longer needed.
df_user = df_user.loc[df_user['lastOnline'] > lastYear].drop(columns=['lastOnlineTimeSeconds'])

df_user.to_csv("data/df_user.csv", index=False, encoding='utf-8')

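Run this cell to load values into a DataFrame from a CSV file.
### Note: Run this cell if you already have a CSV file with stored values. `get_users_from_csv()` reads `data/df_user_sliced.csv`, whereas the cell above writes `data/df_user.csv`.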

In [38]:
# Load the saved user table from disk and preview its first five rows.
df_user = get_users_from_csv()
print(df_user.head(n=5))

              handle        country                   rank  rating  maxRating  \
0            tourist        Belarus  legendary grandmaster    3707       3822   
1  Retired_MiFaFaOvO          Samoa  legendary grandmaster    3681       3681   
2               Benq  United States  legendary grandmaster    3672       3797   
3          Radewoosh         Poland  legendary grandmaster    3655       3720   
4             ksun48         Canada  legendary grandmaster    3547       3654   

            lastOnline  
0  2021-09-23 14:19:54  
1  2021-09-23 00:06:49  
2  2021-09-23 08:59:18  
3  2021-09-18 22:49:25  
4  2021-09-23 08:32:43  


### Cosine Similarity
Now we'll start the project with content-based filtering using cosine similarity.

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def combine_features(row):
    """Return the row's ``type`` followed by its ``tags`` as one space-separated string."""
    tag_text = " ".join(row["tags"])
    return " ".join((row["type"], tag_text))

# One text document per problem, built from its type and tags.
problems["combined_features"] = problems.apply(func=combine_features, axis="columns")

# Token counts per problem, then the pairwise cosine similarity between all problems.
count_matrix = CountVectorizer().fit_transform(raw_documents=problems["combined_features"])
cosine_sim = cosine_similarity(X=count_matrix)

In [None]:
def index_from_name(name):
    """Return the index labels of every row in ``problems`` whose name equals ``name``."""
    name_matches = problems["name"] == name
    return problems.loc[name_matches].index

problem_name = "Armchairs"

# index_from_name returns index labels; fail with a clear message when the
# name is not present instead of an opaque out-of-bounds error.
matching_labels = index_from_name(problem_name)
if len(matching_labels) == 0:
    raise IndexError("no problem named {!r} in the problem set".format(problem_name))

# Several problems may share a name; the first match is used.
index_of_prob = matching_labels.values[0]
# The value is an index label, so look it up with .loc rather than positional .iloc.
tgs = problems.loc[index_of_prob].tags

# print(problems[problems["tags"]])