In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the cells below read and write files under
# /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Input file that the extraction step below reads.
file_path = "/content/drive/MyDrive/pageviews-20241104-user"
# Destination for the lines kept by the extraction step (overwritten on each run).
output_file = "/content/drive/MyDrive/en_pageviews.txt"

def extract_and_save_en_lines(input_path, output_path):
    """Copy the lines whose first whitespace-separated field starts with 'en'.

    Reads ``input_path`` line by line and writes every matching line, unchanged,
    to ``output_path`` (which is created or overwritten). Blank lines are skipped.

    Args:
        input_path: Path of the text file to filter.
        output_path: Path of the file that receives the matching lines.

    Returns:
        None. Progress and errors are reported with ``print``; file-system
        errors are caught here and not propagated to the caller.
    """
    try:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        # NOTE(review): assumes the input is UTF-8 encoded - confirm for other sources.
        with open(input_path, 'r', encoding='utf-8') as input_file, \
                open(output_path, 'w', encoding='utf-8') as output_file:
            for line in input_file:
                # Only the first token matters, so stop splitting after it.
                parts = line.split(None, 1)
                if parts and parts[0].startswith('en'):
                    output_file.write(line)
        print(f"Extracted 'en' lines have been saved to {output_path}")
    except FileNotFoundError as exc:
        # Either open() call can raise this (missing input file, or missing
        # directory for the output file), so report the path that actually failed.
        missing_path = exc.filename if exc.filename is not None else input_path
        print(f"Error: The file '{missing_path}' was not found.")
    except OSError:
        print("Error: Unable to read or write the file.")

# Run the filter: keep only the lines of file_path whose first field starts with 'en'.
extract_and_save_en_lines(file_path, output_file)

Extracted 'en' lines have been saved to /content/drive/MyDrive/en_pageviews.txt


In [1]:
from collections import defaultdict

# Sample categorizer: maps a page title to a coarse category label
def categorize_page(page_title):
    """Return 'Main' if the title contains 'Main_Page', otherwise 'Other'."""
    return 'Main' if 'Main_Page' in page_title else 'Other'

# Time-bucket helper (placeholder implementation)
def get_time_bucket(timestamp):
    """Return the time bucket for a view.

    The ``timestamp`` argument is currently ignored: every view is assigned the
    same fixed bucket. Replace the body with real timestamp parsing when needed.
    """
    fixed_bucket = '2024-11-26'  # demonstration value, not derived from the input
    return fixed_bucket

# Partition views into groups that share the same generalized attributes
def group_by_attributes(views):
    """Group views by (project_type, page_category, access_type, time_bucket).

    Returns a view over the groups (lists of the original view dicts), in the
    order in which each distinct attribute combination was first seen.
    """
    key_fields = ('project_type', 'page_category', 'access_type', 'time_bucket')
    buckets = {}
    for record in views:
        signature = tuple(record[field] for field in key_fields)
        buckets.setdefault(signature, []).append(record)
    return buckets.values()

# Coarsen every view of a group that is smaller than k
def further_generalize(group):
    """Return new view dicts with project type and page category blanked out.

    Each view keeps its 'access_type' and 'time_bucket'; 'project_type' becomes
    'other' and 'page_category' becomes 'general'. The input dicts are not modified.
    """
    return [
        {
            'project_type': 'other',
            'page_category': 'general',
            'access_type': member['access_type'],
            'time_bucket': member['time_bucket'],
        }
        for member in group
    ]

# Main function to apply k-anonymity
def apply_k_anonymity(pageviews, k=5):
    """Generalize page views and coarsen the groups that have fewer than k members.

    Each view is first reduced to four attributes: project type, page category,
    access type ('mobile' or 'desktop') and time bucket. Views are then grouped
    by those attributes; groups with at least ``k`` members are kept as they are
    and smaller groups are passed through ``further_generalize``.

    Args:
        pageviews: Iterable of dicts with the keys 'project', 'page_title',
            'access_method' and 'timestamp'.
        k: Minimum group size that is emitted without further generalization.

    Returns:
        A list of generalized view dicts, ordered group by group (not in input order).

    Note:
        Groups smaller than ``k`` are generalized once and emitted without
        re-checking their size, so the result is not guaranteed to contain only
        classes of at least ``k`` identical records.
    """
    generalized_views = []
    for view in pageviews:
        # 'en.wikibooks' -> 'wikibooks'. A project code without a '.' has no second
        # component; label it 'unknown' instead of failing with IndexError.
        project_parts = view['project'].split('.')
        project_type = project_parts[1] if len(project_parts) > 1 else 'unknown'
        generalized_views.append({
            'project_type': project_type,
            'page_category': categorize_page(view['page_title']),
            'access_type': 'mobile' if 'mobile' in view['access_method'] else 'desktop',
            'time_bucket': get_time_bucket(view['timestamp'])
        })

    anonymized_views = []
    for group in group_by_attributes(generalized_views):
        if len(group) >= k:
            anonymized_views.extend(group)
        else:
            # Too few identical records: hide project type and page category.
            anonymized_views.extend(further_generalize(group))

    return anonymized_views

# Function to read from a file and apply k-anonymity in batches
def process_file(input_filename, output_filename, k=5, batch_size=1000):
    """Anonymize a space-separated page-view file batch by batch.

    Each input line is split on single spaces into at most five fields; lines
    with fewer than five fields are skipped. The first four fields are taken as
    project, page title, access method and timestamp, and the remainder of the
    line is ignored. Parsed records are collected into batches of ``batch_size``
    and each batch is anonymized independently with ``apply_k_anonymity``, so
    group sizes are counted per batch, not over the whole file.

    Every anonymized record is written to ``output_filename`` as one line:
    ``project_type page_category access_type time_bucket``.

    Args:
        input_filename: Path of the file to read.
        output_filename: Path of the file to create or overwrite.
        k: Group-size threshold forwarded to ``apply_k_anonymity``.
        batch_size: Number of parsed records per batch.

    NOTE(review): if the input is a Wikimedia "pageview complete" daily dump,
    the third and fourth columns are presumably page id and access type, not
    access method and timestamp - confirm the column order against the data.
    """
    batch = []
    # Explicit UTF-8 so the result does not depend on the platform locale.
    # NOTE(review): assumes the input is UTF-8 encoded - confirm.
    with open(input_filename, 'r', encoding='utf-8') as infile, \
            open(output_filename, 'w', encoding='utf-8') as outfile:
        for line in infile:
            parts = line.strip().split(' ', 4)
            if len(parts) < 5:
                continue  # not enough fields to build a record
            project, page_title, access_method, timestamp, _ = parts
            batch.append({
                'project': project,
                'page_title': page_title,
                'access_method': access_method,
                'timestamp': timestamp
            })

            # Flush as soon as a batch is full to keep memory use bounded.
            if len(batch) >= batch_size:
                _write_anonymized_batch(batch, outfile, k)
                batch.clear()

        # Process any remaining data in the last, partially filled batch.
        if batch:
            _write_anonymized_batch(batch, outfile, k)


def _write_anonymized_batch(batch, outfile, k):
    """Anonymize ``batch`` with apply_k_anonymity and write one line per record."""
    for view in apply_k_anonymity(batch, k):
        outfile.write(f"{view['project_type']} {view['page_category']} {view['access_type']} {view['time_bucket']}\n")

# Example usage: the input below is the same path the extraction cell writes its 'en' lines to.
input_filename = '/content/drive/MyDrive/en_pageviews.txt'  # Replace with your actual input file name
output_filename = '/content/drive/MyDrive/outputPageViewKanomity.txt'  # Replace with your desired output file name

# Process the file and create the anonymized output, 1000 parsed records per batch with k=5
process_file(input_filename, output_filename, k=5, batch_size=1000)

print(f"Anonymized data written to {output_filename}")


Anonymized data written to /content/drive/MyDrive/outputPageViewKanomity.txt
