# Imports

In [64]:
import pandas as pd

# Reading files

In [65]:
scraped_data = pd.read_json("../data/neetcode_data.jsonl", lines=True)
original_data = pd.read_json("../data/codecomplex_data.jsonl", lines=True)

# Renaming columns

In [66]:
scraped_data.rename(columns={'time_complexity': 'complexity'}, inplace=True)
original_data.rename(columns={'src': 'code'}, inplace=True)

# Dropping columns

In [67]:
original_data = original_data.iloc[:, :2]

In [68]:
original_data['complexity'].value_counts()

complexity
O(n)        853
O(nlogn)    796
O(1)        791
O(logn)     669
O(n ^ 2)    657
O(n ^ 3)    606
np          528
Name: count, dtype: int64

**Dropping *other* time complexity classes and *linear* class, to have the distribution of classes as close to normal as possible.**

In [69]:
index = scraped_data[scraped_data.loc[:, 'complexity'].apply(lambda row: row in ['other', 'linear'])].index
scraped_data.drop(index, inplace=True)

In [70]:
scraped_data['complexity'].value_counts()

complexity
O(n)        142
O(n ^ 2)     49
O(nlogn)     35
O(logn)      17
O(1)         15
O(2 ^ n)     11
O(n!)         6
O(n ^ 3)      4
Name: count, dtype: int64

# Merging the two datasets

**Final dataset:**

In [71]:
merged = pd.concat([original_data, scraped_data])
merged.shape

(5179, 2)

# Checking nan entries

In [72]:
merged.isna().sum().sum()

np.int64(0)

# Checking duplicates

In [73]:
num_of_dups = merged['code'].duplicated().sum()
print(f"There are {num_of_dups} of duplicates.")

There are 240 of duplicates.


In [74]:
merged[merged['code'].duplicated()]

Unnamed: 0,code,complexity
17,import math\n\ndef getdt():\n return map(in...,O(1)
18,import math\n\ndef getdt():\n return map(in...,O(1)
29,n = int(input())\nprint(3*n//2),O(1)
30,n = int(input())\nprint(int(3 * n / 2)),O(1)
77,n=int(input())\nif(n<3):\n print(n)\nelse:\...,O(1)
...,...,...
4842,"import math\ndef f(n,s):\n d=[-n,-n];d[s]=0...",np
4843,"def f(n,s):\n d=[-n,-n];\n d[s]=0;\n ...",np
4856,import sys\ninput = sys.stdin.readline\n\ndef ...,np
4895,"import sys\ninput = sys.stdin.readline\n\nn, k...",np


## Dropping duplicates

In [75]:
merged = merged.drop_duplicates()
merged.duplicated().sum()

np.int64(0)

# Final class distribution

In [76]:
merged['complexity'].value_counts()

complexity
O(n)        972
O(nlogn)    807
O(1)        778
O(n ^ 2)    689
O(logn)     668
O(n ^ 3)    570
np          497
O(2 ^ n)     11
O(n!)         6
Name: count, dtype: int64

# Saving as .csv

In [78]:
merged.to_csv('../codecomplex_neetcode_merged.csv', index=False)