In [3]:
from datasets import load_dataset

# Load the "python" configuration of the CodeSearchNet dataset.
# trust_remote_code=True allows the dataset's own loading script to execute.
dataset = load_dataset(
    path="code_search_net",
    name="python",
    trust_remote_code=True,
)

# Print the split overview (column names and row counts per split)
print(dataset)


python.zip:   8%|7         | 73.4M/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})


In [5]:
import pandas as pd

# Materialise every split as its own pandas DataFrame, in the order
# train -> validation -> test.
train_df, validation_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

# Preview the first rows of the training frame
print("Training DataFrame Head:")
print(train_df.head())

Training DataFrame Head:
  repository_name  func_path_in_repository                    func_name  \
0  proycon/pynlpl  pynlpl/formats/folia.py  AbstractElement.addidsuffix   
1  proycon/pynlpl  pynlpl/formats/folia.py   AbstractElement.setparents   
2  proycon/pynlpl  pynlpl/formats/folia.py       AbstractElement.setdoc   
3  proycon/pynlpl  pynlpl/formats/folia.py      AbstractElement.hastext   
4  proycon/pynlpl  pynlpl/formats/folia.py      AbstractElement.hasphon   

                                   whole_func_string language  \
0  def addidsuffix(self, idsuffix, recursive = Tr...   python   
1  def setparents(self):\n        """Correct all ...   python   
2  def setdoc(self,newdoc):\n        """Set a dif...   python   
3  def hastext(self,cls='current',strict=True, co...   python   
4  def hasphon(self,cls='current',strict=True,cor...   python   

                                    func_code_string  \
0  def addidsuffix(self, idsuffix, recursive = Tr...   
1  def setparents(sel

In [6]:
# Remove duplicates based on 'func_code_string'.
# The first occurrence of each distinct code string is kept (pandas default).
# The result is rebound to `train_df` rather than mutated with inplace=True,
# matching how the language filter further down rebinds the frame.
initial_count = len(train_df)
train_df = train_df.drop_duplicates(subset=['func_code_string'])
final_count = len(train_df)
print(f"Removed {initial_count - final_count} duplicate code snippets.")


Removed 0 duplicate code snippets.


In [7]:
# Remove rows with missing documentation.
# A docstring is treated as missing when it is null OR blank (empty or
# whitespace-only). Checking for nulls alone is not enough: an empty string
# survives dropna(), yet it is written to CSV as an empty field, which
# pandas.read_csv parses back as NaN by default.
initial_count = len(train_df)
has_documentation = (
    train_df['func_documentation_string'].notna()
    # astype(str) keeps .str usable even if a non-string value slips in;
    # null rows are already excluded by the notna() term above.
    & train_df['func_documentation_string'].astype(str).str.strip().ne('')
)
train_df = train_df[has_documentation]
final_count = len(train_df)
print(f"Removed {initial_count - final_count} samples with missing documentation.")

Removed 0 samples with missing documentation.


In [8]:
# Count the rows whose 'language' value is anything other than 'python'
non_python = train_df.loc[train_df['language'].ne('python')]
print(f"Number of non-Python samples: {len(non_python)}")

# Keep only the Python rows from here on
train_df = train_df.loc[train_df['language'].eq('python')]

Number of non-Python samples: 0


In [9]:
# Draw a fixed-size development subset; the fixed seed makes the draw repeatable
sample_size = 10_000
train_sample = train_df.sample(n=sample_size, random_state=42)
train_sample = train_sample.reset_index(drop=True)  # renumber rows from 0
print(f"Sampled {len(train_sample)} examples for development.")

Sampled 10000 examples for development.


In [10]:
# Persist the development sample as CSV in the working directory, without the index.
# NOTE(review): the token columns (e.g. 'func_code_tokens') are presumably
# list-valued; CSV would store them as their string repr — confirm that
# downstream readers parse them accordingly.
train_sample.to_csv(path_or_buf='cleaned_train_sample.csv', index=False)