In [1]:
import pickle
import pandas as pd
import numpy as np
import collections
from collections import defaultdict
import os
import functools
import gc

print("Import success!")

@functools.lru_cache(maxsize=1)
def reviveEngine(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def process_year(year):
    engine = reviveEngine(f'/app/Data/waybackCollectedData/usenix{year}.pickle')
    cookie_data = []
    for site in engine.getAllSitesWithTrackers():
        for tracker, types in engine.sitesToTrackersToTypes[site].items():
            if 'C' in types:
                cookie_data.append({
                    'year': year,
                    'domain': site,
                    'tracker': tracker,
                    'types': ','.join(types)
                })
    del engine
    gc.collect()
    return cookie_data

def process_all_years():
    all_data = []
    for year in range(1996, 2017):
        year = str(year)
        year_data = process_year(year)
        all_data.extend(year_data)
        print(f"Processed data for {year}")
        
        # Save intermediate results to prevent data loss
        temp_df = pd.DataFrame(all_data)
        temp_df.to_csv(f'cookie_tracking_data_up_to_{year}.csv', index=False)
        
        # Clear memory
        del year_data
        del temp_df
        all_data = []
        gc.collect()
    
    # Combine all intermediate files
    final_df = pd.concat([pd.read_csv(f) for f in os.listdir('.') if f.startswith('cookie_tracking_data_up_to_')])
    final_df.to_csv('cookie_tracking_data_final.csv', index=False)
    print("Final data saved to cookie_tracking_data_final.csv")
    
    return final_df

# Main execution
print("Starting data processing...")
final_df = process_all_years()
print(final_df.head())
print(final_df.shape)

# Optional: Remove intermediate files
for f in os.listdir('.'):
    if f.startswith('cookie_tracking_data_up_to_'):
        os.remove(f)
print("Intermediate files removed")


Import success!
Starting data processing...
Processed data for 1996
Processed data for 1997
Processed data for 1998
Processed data for 1999
Processed data for 2000
Processed data for 2001
Processed data for 2002
Processed data for 2003
Processed data for 2004
Processed data for 2005
Processed data for 2006
Processed data for 2007
Processed data for 2008
Processed data for 2009
Processed data for 2010
Processed data for 2011
Processed data for 2012
Processed data for 2013
Processed data for 2014
Processed data for 2015
Processed data for 2016


EmptyDataError: No columns to parse from file

In [3]:
def process_year(year, engine):
    cookie_data = []
    for site in engine.getAllSitesWithTrackers():
        for tracker, types in engine.sitesToTrackersToTypes[site].items():
            if 'C' in types:
                cookie_data.append({
                    'year': year,
                    'domain': site,
                    'tracker': tracker,
                    'types': ','.join(types)
                })
    return cookie_data

def process_all_years(engines):
    all_data = []
    for year, engine in engines.items():
        year_data = process_year(year, engine)
        all_data.extend(year_data)
        print(f"Processed data for {year}")
        
        # Save intermediate results to prevent data loss
        temp_df = pd.DataFrame(all_data)
        temp_df.to_csv(f'cookie_tracking_data_up_to_{year}.csv', index=False)
        
        # Clear memory
        del year_data
        all_data = []
    
    # Combine all intermediate files
    final_df = pd.concat([pd.read_csv(f) for f in os.listdir('.') if f.startswith('cookie_tracking_data_up_to_')])
    final_df.to_csv('cookie_tracking_data_final.csv', index=False)
    print("Final data saved to cookie_tracking_data_final.csv")

process_all_years(engines)


NameError: name 'engines' is not defined

In [None]:
def extract_cookie_data(engines):
    all_cookie_data = []
    for year, engine in engines.items():
        for site in engine.getAllSitesWithTrackers():
            for tracker, types in engine.sitesToTrackersToTypes[site].items():
                if 'C' in types:  # 'C' represents cookie-based tracking
                    all_cookie_data.append({
                        'year': year,
                        'domain': site,
                        'tracker': tracker,
                        'types': ','.join(types)
                    })
        print(f"Processed data for {year}")
    return all_cookie_data

cookie_data = extract_cookie_data(engines)
df = pd.DataFrame(cookie_data)
print(df.head())
print(df.shape)


In [14]:
def load_pickle(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)


In [15]:
def extract_cookie_data(engine, year):
    cookie_data = []
    for site in engine.getAllSitesWithTrackers():
        for tracker, types in engine.sitesToTrackersToTypes[site].items():
            if 'C' in types:  # 'C' represents cookie-based tracking
                cookie_data.append({
                    'year': year,
                    'domain': site,
                    'tracker': tracker
                })
    return cookie_data


In [16]:
pickle_dir = '/app/Data/waybackCollectedData/'  # Replace with your actual path
years = range(1996, 2017)


In [17]:
def process_all_years(pickle_dir, years):
    all_cookie_data = []
    for year in years:
        file_path = os.path.join(pickle_dir, f'usenix{year}.pickle')
        try:
            engine = load_pickle(file_path)
            year_data = extract_cookie_data(engine, year)
            all_cookie_data.extend(year_data)
            print(f"Processed data for {year}")
        except Exception as e:
            print(f"Error processing {year}: {e}")
    return all_cookie_data


In [18]:
all_cookie_data = process_all_years(pickle_dir, years)
df = pd.DataFrame(all_cookie_data)
print(df.head())
print(df.shape)


Processed data for 1996
Processed data for 1997
Processed data for 1998
Processed data for 1999
Processed data for 2000
Processed data for 2001
Processed data for 2002
Processed data for 2003
Processed data for 2004
Processed data for 2005
Processed data for 2006
Processed data for 2007
Processed data for 2008
Processed data for 2009
Processed data for 2010
Processed data for 2011
Processed data for 2012
Processed data for 2013
Processed data for 2014
Processed data for 2015
Processed data for 2016
   year         domain                tracker
0  2004    gaoshou.net              baidu.com
1  2004    99lover.com              17777.com
2  2004  61.152.97.111          61.152.97.111
3  2004       3tom.com              baidu.com
4  2006        alt.com  adultfriendfinder.com
(8, 3)
