# Imports

In [1]:
import pandas as pd

# Reading files

In [55]:
messy_data_clear_labels = pd.read_csv("../data/leetcode-parsed/clear_labels_messy_leetcode_data.csv")

In [57]:
messy_data_clear_labels['complexity'].value_counts()

complexity
O(n)        117
O(nlogn)     31
O(n ^ 2)     25
O(1)         21
O(logn)       9
O(n ^ 3)      7
O(2 ^ n)      3
Name: count, dtype: int64

In [36]:
messy_data = pd.read_csv("../data/leetcode-parsed/messy_leetcode_data.csv")
messy_data['label'].value_counts()[:15]

label
# Time:  O(n)                                                                          117
# Time:  O(nlogn)                                                                       33
# Time:  O(n^2)                                                                         25
# Time:  O(1)                                                                           21
# Time:  O(m * n)                                                                       20
# Time:  O(n * k)                                                                       10
# Time:  O(logn)                                                                         9
# Time:  O(n^3)                                                                          7
# Time:  O(n + m)                                                                        6
# Time:  O(nlogr), r = max(nums)                                                         6
# Time:  O(logn) = O(1)                                                             

In [38]:
messy_data['label']

0                        # Time:  O(1) ~ O(n), n is 10^3
1                               # Time:  O(n), n is 10^3
2                        # Time:  O(1) ~ O(n), n is 10^3
3                    # Time:  O(1) ~ O(nlogn), n is 10^3
4                                      # Time:  O(n + l)
                             ...                        
606                                     # Time:  O(logn)
607                                     # Time:  O(logn)
608    # Time:  O(a + d), a is the number of grids co...
609    # Time:  O(a + d), a is the number of grids co...
610    # Time:  O(m * n * 4 * 3^(h - 1)) ~= O(m * n *...
Name: label, Length: 611, dtype: object

In [10]:
merged_data = pd.read_csv("../data/codecomplex_neetcode_merged_data.csv")
ai_relabeled_clean_leetcode = pd.read_csv("../data/leetcode-parsed/label_updated_clean_leetcode_data.csv")

In [12]:
merged_data['complexity'].value_counts()

complexity
O(n)        972
O(nlogn)    807
O(1)        778
O(n ^ 2)    689
O(logn)     668
O(n ^ 3)    570
np          497
O(2 ^ n)     11
O(n!)         6
Name: count, dtype: int64

In [13]:
ai_relabeled_clean_leetcode['complexity'].value_counts()

complexity
O(n)        1421
O(nlogn)     321
O(n ^ 2)     193
O(1)         173
O(logn)      101
O(n ^ 3)      26
O(n!)          5
Name: count, dtype: int64

In [None]:
scraped_data['complexity'].value_counts()

# Renaming columns

In [66]:
scraped_data.rename(columns={'time_complexity': 'complexity'}, inplace=True)
original_data.rename(columns={'src': 'code'}, inplace=True)

# Dropping columns

In [67]:
original_data = original_data.iloc[:, :2]

In [68]:
original_data['complexity'].value_counts()

complexity
O(n)        853
O(nlogn)    796
O(1)        791
O(logn)     669
O(n ^ 2)    657
O(n ^ 3)    606
np          528
Name: count, dtype: int64

### Dropping *other* time complexity classes and *linear* class, to have the distribution of classes as close to uniform as possible.

In [25]:
index = ai_relabeled_clean_leetcode[ai_relabeled_clean_leetcode.loc[:, 'complexity'].apply(lambda row: row in ['other', 'linear'])].index
ai_relabeled_clean_leetcode.drop(index, inplace=True)

In [26]:
ai_relabeled_clean_leetcode['complexity'].value_counts()

complexity
O(n)        1421
O(nlogn)     321
O(n ^ 2)     193
O(1)         173
O(logn)      101
O(n ^ 3)      26
O(n!)          5
Name: count, dtype: int64

# Merging the two datasets

**Final dataset:**

In [71]:
merged = pd.concat([original_data, scraped_data])
merged.shape

(5179, 2)

# Checking nan entries

In [72]:
merged.isna().sum().sum()

np.int64(0)

# Checking duplicates

In [73]:
num_of_dups = merged['code'].duplicated().sum()
print(f"There are {num_of_dups} of duplicates.")

There are 240 of duplicates.


In [74]:
merged[merged['code'].duplicated()]

Unnamed: 0,code,complexity
17,import math\n\ndef getdt():\n return map(in...,O(1)
18,import math\n\ndef getdt():\n return map(in...,O(1)
29,n = int(input())\nprint(3*n//2),O(1)
30,n = int(input())\nprint(int(3 * n / 2)),O(1)
77,n=int(input())\nif(n<3):\n print(n)\nelse:\...,O(1)
...,...,...
4842,"import math\ndef f(n,s):\n d=[-n,-n];d[s]=0...",np
4843,"def f(n,s):\n d=[-n,-n];\n d[s]=0;\n ...",np
4856,import sys\ninput = sys.stdin.readline\n\ndef ...,np
4895,"import sys\ninput = sys.stdin.readline\n\nn, k...",np


## Dropping duplicates

In [75]:
merged = merged.drop_duplicates()
merged.duplicated().sum()

np.int64(0)

# Final class distribution

In [76]:
merged['complexity'].value_counts()

complexity
O(n)        972
O(nlogn)    807
O(1)        778
O(n ^ 2)    689
O(logn)     668
O(n ^ 3)    570
np          497
O(2 ^ n)     11
O(n!)         6
Name: count, dtype: int64

# Saving as .csv

In [32]:
ai_relabeled_clean_leetcode.to_csv("../data/label_updated_clean_leetcode_data.csv", index=False)

In [78]:
merged.to_csv('../codecomplex_neetcode_merged.csv', index=False)

In [52]:
messy_data_clear_labels.to_csv('../data/leetcode-parsed/clear_labels_messy_leetcode_data.csv', index=False)