In [None]:
# Install required libraries
!pip install datasets transformers accelerate -q

from datasets import load_dataset
import pandas as pd
import re
from tqdm import tqdm

In [None]:
from itertools import islice

print("Loading dataset...")

# Load dataset in streaming mode
dataset = load_dataset("mlfoundations-dev/github-issues", split="train", streaming=True)

# Upper bound on the number of samples pulled from the stream
num_samples = 500000

print(f"Loading {num_samples:,} samples...")
# islice caps the stream at exactly num_samples items (including the
# num_samples == 0 case) without a manual counter and break.
samples = list(tqdm(islice(dataset, num_samples), total=num_samples))

df = pd.DataFrame(samples)
print(f"\nLoaded {len(df):,} samples")
if len(df) < num_samples:
    # The stream can run out before the requested count is reached; surface
    # that instead of leaving only a half-finished progress bar as evidence.
    print(f"Note: stream ended before the requested {num_samples:,} samples were read")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Loading dataset (this will take a few minutes)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/799 [00:00<?, ?B/s]

Loading 500,000 samples...


 60%|██████    | 300816/500000 [01:50<01:13, 2711.71it/s]



Loaded 300,816 samples
Memory usage: 918.66 MB


In [None]:
print("Cleaning and filtering data...")

# NOTE(review): clean_text and count_words are not defined in any cell visible
# in this notebook. They must already exist in the kernel, otherwise this cell
# raises NameError on a fresh "Restart & Run All" — confirm where they live.

# Apply cleaning
df['title_clean'] = df['title'].apply(clean_text)
df['body_clean'] = df['body'].apply(clean_text)

# Count words
df['title_words'] = df['title_clean'].apply(count_words)
df['body_words'] = df['body_clean'].apply(count_words)

# Initial stats
print("\nBefore filtering:")
print(f"Total samples: {len(df):,}")

# Row-wise mask of samples worth keeping
keep_mask = (
    (df['title_words'] > 0) &           # Has title
    (df['title_words'] <= 30) &         # Title not too long
    (df['body_words'] >= 20) &          # Body has substance
    (df['body_words'] <= 1000) &        # Body not too long
    (df['title_clean'].str.len() > 0) & # Non-empty after cleaning
    (df['body_clean'].str.len() > 0)    # Non-empty after cleaning
)
# .copy() so later cells work on an independent frame, not a view of df
df_filtered = df[keep_mask].copy()

print("\nAfter filtering:")
print(f"Total samples: {len(df_filtered):,}")
# Guard the ratio so a frame with zero rows reports 0.0% instead of raising
retention_pct = len(df_filtered)/len(df)*100 if len(df) else 0.0
print(f"Retention rate: {retention_pct:.1f}%")


def _word_count_stats(word_counts):
    """Return [count, mean, median, min, max] for a Series of word counts.

    The count is an int; the remaining entries are pre-formatted strings
    (mean with one decimal, the others with none) so the table prints cleanly.
    """
    return [
        len(word_counts),
        f"{word_counts.mean():.1f}",
        f"{word_counts.median():.0f}",
        f"{word_counts.min():.0f}",
        f"{word_counts.max():.0f}",
    ]


# Show statistics
stats_data = {
    'Metric': ['Count', 'Mean', 'Median', 'Min', 'Max'],
    'Title Length': _word_count_stats(df_filtered['title_words']),
    'Body Length': _word_count_stats(df_filtered['body_words']),
}

stats_df = pd.DataFrame(stats_data)
print("\n\n" + "="*60)
print("FILTERED DATASET STATISTICS")
print("="*60)
print(stats_df.to_string(index=False))

Cleaning and filtering data...

Before filtering:
Total samples: 300,816

After filtering:
Total samples: 239,635
Retention rate: 79.7%


FILTERED DATASET STATISTICS
Metric Title Length Body Length
 Count       239635      239635
  Mean          7.2       103.3
Median            7          73
   Min            1          20
   Max           30         999


In [None]:
from sklearn.model_selection import train_test_split

print("Splitting dataset...")

# Stage 1: hold out 20% of the filtered rows; the other 80% is the training set
train_df, temp_df = train_test_split(df_filtered, test_size=0.2, random_state=42)

# Stage 2: halve the held-out rows into validation and test (10% of the total each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Build the size table: one row per split, followed by a totals row
named_splits = [('Training', train_df), ('Validation', val_df), ('Test', test_df)]
total_rows = len(train_df) + len(val_df) + len(test_df)
filtered_rows = len(df_filtered)

split_data = {
    'Split': [label for label, _ in named_splits] + ['Total'],
    'Samples': [f"{len(part):,}" for _, part in named_splits] + [f"{total_rows:,}"],
    # Percentages are relative to the filtered dataset, not the raw load
    'Percentage': [
        f"{len(part)/filtered_rows*100:.1f}%" for _, part in named_splits
    ] + ["100.0%"],
}

split_df = pd.DataFrame(split_data)
banner = "="*60
print("\n" + banner)
print("DATASET SPLIT")
print(banner)
print(split_df.to_string(index=False))

Splitting dataset...

DATASET SPLIT
     Split Samples Percentage
  Training 191,708      80.0%
Validation  23,963      10.0%
      Test  23,964      10.0%
     Total 239,635     100.0%


In [None]:
# Create final dataset format
def create_training_data(df):
    """Convert a cleaned DataFrame into a list of input/target training pairs.

    Args:
        df: DataFrame with a ``body_clean`` column (used as the input text)
            and a ``title_clean`` column (used as the target text).

    Returns:
        A list with one dict per row, in row order, each of the form
        ``{'input_text': <body_clean>, 'target_text': <title_clean>}``.

    Raises:
        KeyError: If ``body_clean`` or ``title_clean`` is missing from ``df``.
    """
    # Walk the two columns directly instead of DataFrame.iterrows(): iterrows
    # builds a full Series for every row, which is needlessly slow when the
    # frame has hundreds of thousands of rows.
    return [
        {'input_text': body, 'target_text': title}
        for body, title in zip(df['body_clean'], df['title_clean'])
    ]

print("Formatting data for training...")

# Convert each split, in train/val/test order, into its list-of-dicts form
train_data, val_data, test_data = (
    create_training_data(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

print(f"\nTrain samples: {len(train_data):,}")
print(f"Validation samples: {len(val_data):,}")
print(f"Test samples: {len(test_data):,}")

# Preview the first training pair; the body is cut to its first 200 characters
divider = "="*60
print("\n" + divider)
print("EXAMPLE TRAINING PAIR")
print(divider)
print(f"INPUT (Body):\n{train_data[0]['input_text'][:200]}...")
print(f"\nTARGET (Title):\n{train_data[0]['target_text']}")

Formatting data for training...

Train samples: 191,708
Validation samples: 23,963
Test samples: 23,964

EXAMPLE TRAINING PAIR
INPUT (Body):
Hi, I have a question about the language models one can download by calling the stanza.download method. More specifically, I have downloaded the Lithuanian and North Sami models for a project I am cur...

TARGET (Title):
Model performance discrepancies


In [None]:
import pickle

print("Saving processed datasets...")

# Write each split to its own pickle file in the working directory
for pickle_path, payload in (
    ('train_data.pkl', train_data),
    ('val_data.pkl', val_data),
    ('test_data.pkl', test_data),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

print("\nDatasets saved:")
for saved_line in ("- train_data.pkl", "- val_data.pkl", "- test_data.pkl"):
    print(saved_line)

# Headline numbers describing this preprocessing run
loaded_count = len(df)
kept_count = len(df_filtered)
summary = {
    'total_samples_loaded': loaded_count,
    'filtered_samples': kept_count,
    'retention_rate': kept_count/loaded_count,
    'train_size': len(train_data),
    'val_size': len(val_data),
    'test_size': len(test_data),
    'title_median_words': float(df_filtered['title_words'].median()),
    'body_median_words': float(df_filtered['body_words'].median())
}

with open('data_summary.pkl', 'wb') as f:
    pickle.dump(summary, f)

print("\nSummary saved to data_summary.pkl")
rule = "="*60
print("\n" + rule)
print("DATA PREPROCESSING COMPLETE!")
print(rule)

# Print final summary: floats with two decimals, ints with thousands
# separators, anything else via its default string form
print("\nFinal Dataset Summary:")
for key, value in summary.items():
    if isinstance(value, float):
        line = f"  {key}: {value:.2f}"
    elif isinstance(value, int):
        line = f"  {key}: {value:,}"
    else:
        line = f"  {key}: {value}"
    print(line)

Saving processed datasets...

Datasets saved:
- train_data.pkl
- val_data.pkl
- test_data.pkl

Summary saved to data_summary.pkl

DATA PREPROCESSING COMPLETE!

Final Dataset Summary:
  total_samples_loaded: 300,816
  filtered_samples: 239,635
  retention_rate: 0.80
  train_size: 191,708
  val_size: 23,963
  test_size: 23,964
  title_median_words: 7.00
  body_median_words: 73.00


In [None]:
from google.colab import files

# Download each pickle file in turn: the three dataset splits, then the summary
for artifact in ('train_data.pkl', 'val_data.pkl', 'test_data.pkl', 'data_summary.pkl'):
    files.download(artifact)

print("Files downloaded to your computer")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Files downloaded to your computer
