# Data Retrieve

## Events





### Events list



The event list was retrieved through Octopus; the scraping process can be reproduced with the project file: Event_list.otd

### Events details

In [None]:
import requests
from lxml import html
import pandas as pd
import json
import time
import random
import os

In [None]:
# Load the event list CSV; the file is decoded as ISO-8859-1 (Latin-1).
# NOTE(review): presumably this is the export of the Event_list.otd project — confirm.
df = pd.read_csv("./data/Event_list.csv", encoding='ISO-8859-1')

# Pool of browser User-Agent headers; one is drawn per request
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
]


def get_random_user_agent():
    """Return one entry of ``user_agents`` chosen uniformly at random."""
    picked = random.choice(user_agents)
    return picked

In [None]:
def process_batch(urls, batch_index, timeout=30):
    """Scrape the embedded JSON of each page in ``urls`` and save it to disk.

    Every page is requested with a randomly chosen User-Agent. The text of
    the first ``<script>`` element directly under ``<body>`` is parsed as
    JSON and stored under the page URL. Pages that fail (network error,
    HTTP error status, missing script, invalid JSON) are recorded with the
    error message instead. A random 1-5 second pause follows every URL.

    Results are written to
    ``./data/temp/events_details/<batch_index>/results.json`` and
    ``.../errors.json``.

    Args:
        urls: Iterable of page URLs to fetch.
        batch_index: Name of the sub-directory the batch is written to.
        timeout: Seconds to wait for each HTTP response before giving up.
    """
    # Dictionaries to store the events details and errors
    events = {}
    errors = {}

    for url in urls:
        try:
            headers = {'User-Agent': get_random_user_agent()}

            # A timeout prevents a single stalled connection from hanging the batch
            response = requests.get(url, headers=headers, timeout=timeout)
            # Record 4xx/5xx answers as errors instead of parsing the error page
            response.raise_for_status()

            tree = html.fromstring(response.content)
            element_content = tree.xpath('/html/body/script[1]/text()')

            if element_content:
                events[url] = json.loads(element_content[0])
            else:
                errors[url] = "No content found"

        # Best effort: any failure is logged against the URL and the batch continues
        except Exception as e:
            errors[url] = str(e)
        time.sleep(random.uniform(1, 5))

    # Save results and errors
    batch_dir = f'./data/temp/events_details/{batch_index}'
    os.makedirs(batch_dir, exist_ok=True)
    with open(f'{batch_dir}/results.json', 'w') as f:
        json.dump(events, f, indent=2)
    with open(f'{batch_dir}/errors.json', 'w') as f:
        json.dump(errors, f, indent=2)

In [None]:
# Chunk the event URLs into groups of 100 and scrape the groups one by one
batch_size = 100
urls = df['URL'].tolist()
url_batches = [
    urls[start:start + batch_size]
    for start in range(0, len(urls), batch_size)
]

# Each batch writes its own results/errors files, so progress survives a crash
for index, batch in enumerate(url_batches):
    process_batch(batch, index)
    print(f"Batch {index} processed.")

In [None]:
# Fold the per-batch result and error files into two combined dictionaries
final_results, final_errors = {}, {}

for batch_no, _ in enumerate(url_batches):
    batch_dir = f'./data/temp/events_details/{batch_no}'
    with open(f'{batch_dir}/results.json') as f:
        batch_results = json.load(f)
    final_results.update(batch_results)
    with open(f'{batch_dir}/errors.json') as f:
        batch_errors = json.load(f)
    final_errors.update(batch_errors)

# Persist the combined results and errors
with open('./data/events_details.json', 'w') as f:
    json.dump(final_results, f, indent=2)
# NOTE(review): this is written under ./temp/, not ./data/temp/ like the batch files — confirm the intended directory.
with open('./temp/events_details_errors.json', 'w') as f:
    json.dump(final_errors, f, indent=2)

print("All batches processed.")

All batches processed.


### Event Categories - Gamma API

In [None]:
# Download the category list from the Gamma API and store it as JSON
url = "https://gamma-api.polymarket.com/categories"

# Send a GET request; the timeout keeps the cell from waiting forever on a
# connection that never answers (same 10 s budget as the other API calls here)
response = requests.get(url, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    # Parse JSON response
    data = response.json()

    # Save the data to a JSON file
    with open('./data/Categories.json', 'w') as file:
        json.dump(data, file, indent=2)
    print("Done.")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

Done.


## Market & Participants - Subgraph

In [None]:
import requests
import json
import pandas as pd

# GraphQL endpoint that the request cells of this section POST their queries to
url = "https://api.studio.thegraph.com/query/63814/pm_analysis/v0.0.1" #Subgraph API
# Path of the text file that send_request appends failure messages to
log_file = './temp/market_user_request_errors.log'

### Retrieve all User overview data

In [None]:
def send_request(url, query, last_timestamp, log_file, max_retries=3):
    """POST a GraphQL query and return the decoded response, or None on failure.

    Transport and HTTP errors are retried up to ``max_retries`` times; after
    the last failed attempt the error and ``last_timestamp`` are appended to
    ``log_file``. A response that carries an ``errors`` entry is logged and
    returns None immediately, without a retry.

    Args:
        url: GraphQL endpoint.
        query: Query text sent as the ``query`` field of the JSON body.
        last_timestamp: Pagination cursor, used only in log messages.
        log_file: Path of the file failures are appended to.
        max_retries: Maximum number of attempts.
    """
    body = {'query': query}
    attempt = 0
    while attempt < max_retries:
        try:
            response = requests.post(url, json=body, timeout=10)
            response.raise_for_status()
            data = json.loads(response.text)
            if 'errors' not in data:
                return data
            # The server answered but rejected the query: log it and give up
            print(f"Error in response: {data['errors']}")
            with open(log_file, 'a') as log:
                log.write(f"Error with timestamp {last_timestamp}, response errors: {data['errors']}\n")
            return None
        except requests.exceptions.RequestException as e:
            print(f"Error, retry {attempt+1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                # Out of attempts: remember where the crawl stopped
                with open(log_file, 'a') as log:
                    log.write(f"Failed to fetch data after timestamp: {last_timestamp}, error: {e}\n")
                return None
        attempt += 1
    return None

In [None]:
# Page through every account of the subgraph, ordered by creation time.
#
# The cursor uses `creationTimestamp_gte` together with an id-based
# de-duplication. With a strict `_gt` cursor, accounts created at the same
# second as the last row of a full page (but not returned on that page) were
# never fetched, and accounts created exactly at the starting timestamp were
# skipped as well.
markets_user_data_all = []
last_timestamp = 1599166595  # Minimum known timestamp
max_timestamp = 1708985858  # Maximum known timestamp
max_data = False  # Flag to indicate when to stop the loop
page_size = 1000  # 1000 is the max number of accounts per request
seen_account_ids = set()  # ids already collected, to drop the overlap between pages

while not max_data and last_timestamp <= max_timestamp:
    query = """
    {{
        accounts(
            where: {{creationTimestamp_gte: {}}}
            first: {}
            orderBy: creationTimestamp
            orderDirection: asc
        ) {{
            id
            creationTimestamp
            lastSeenTimestamp
            numTrades
            scaledCollateralVolume
            lastTradedTimestamp
            scaledProfit
        }}
    }}
    """.format(last_timestamp, page_size)

    data = send_request(url, query, last_timestamp, log_file)  # Send the request and get the response data
    accounts_data = (data.get('data') or {}).get('accounts') if data is not None else None
    if not accounts_data:
        max_data = True  # Request failed or no more data
        continue

    new_accounts = [account for account in accounts_data if account['id'] not in seen_account_ids]
    seen_account_ids.update(account['id'] for account in new_accounts)
    markets_user_data_all.extend(new_accounts)
    print(f"Retrieved {len(markets_user_data_all)} accounts")

    page_last_timestamp = int(accounts_data[-1]['creationTimestamp'])
    if new_accounts:
        last_timestamp = page_last_timestamp  # Update last_timestamp
    elif len(accounts_data) >= page_size:
        # A full page of already-seen rows: more rows share this timestamp than
        # fit on one page, so step past it to avoid requesting the same page forever
        last_timestamp = page_last_timestamp + 1
    else:
        max_data = True  # Short page with nothing new: the end was reached

In [None]:
#Save all market user list
# Write the collected account overview rows to disk as UTF-8 JSON
with open('./data/markets_user_data_all.json', 'w', encoding='utf-8') as file:
    serialized = json.dumps(markets_user_data_all, ensure_ascii=False, indent=2)
    file.write(serialized)

### Retrieve User list in each politics market

In [None]:
def send_request(url, query, market_id, clobTokenId, skip, log_file, max_retries=3):
    """POST a GraphQL query for one market page; return the response or None.

    Transport and HTTP errors are retried up to ``max_retries`` times. A
    response carrying a GraphQL ``errors`` entry is treated as a failure too
    (it has no usable ``data``), matching the timestamp-based
    ``send_request`` used for the account overview. Every final failure is
    appended to ``log_file`` in the
    ``market_id: ..., clobTokenId: ..., skip: ..., error: ...`` format that
    the retry cell parses.

    Args:
        url: GraphQL endpoint.
        query: Query text sent as the ``query`` field of the JSON body.
        market_id: Market identifier, used only in log messages.
        clobTokenId: Token identifier, used only in log messages.
        skip: Pagination offset, used only in log messages.
        log_file: Path of the file failures are appended to.
        max_retries: Maximum number of attempts.
    """
    for attempt in range(max_retries):
        try:
            response = requests.post(url, json={'query': query}, timeout=10)
            response.raise_for_status()
            data = json.loads(response.text)
        except requests.exceptions.RequestException as e:
            print(f"Error, retry {attempt+1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                # Record errors
                with open(log_file, 'a') as log:
                    log.write(f"Failed to fetch data for market_id: {market_id}, clobTokenId: {clobTokenId}, skip: {skip}, error: {e}\n")
                return None
            continue

        if 'errors' in data:
            # The server rejected the query; callers index data['data'] and would crash
            print(f"Error in response: {data['errors']}")
            with open(log_file, 'a') as log:
                log.write(f"Failed to fetch data for market_id: {market_id}, clobTokenId: {clobTokenId}, skip: {skip}, error: {data['errors']}\n")
            return None
        return data
    return None

In [None]:
with open('./data/politics_markets.json', 'r', encoding='utf-8') as file:
    politics_markets_data = json.load(file)

# Extract user information for each market: for every market that has CLOB
# token ids, page through the accounts holding a position in its first token.
markets_user_data = []

for event in politics_markets_data.values():
    event_data = event['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']

    for market in event_data.get('markets', []):
        market_id = market.get('id')
        condition_id = market.get('conditionId')
        clobTokenIds = market.get('clobTokenIds', [])

        # Markets without token ids are skipped entirely
        if not clobTokenIds:
            continue

        # NOTE(review): assumes clobTokenIds is a list; if the source stores it
        # as a JSON-encoded string, [0] would be a single character — confirm.
        clobTokenId = clobTokenIds[0]
        accounts = []
        skip = 0

        while True:
            # GraphQL
            query = """
                {{
                    accounts(
                        where: {{marketPositions_: {{market: "{}"}}}}
                        skip: {}
                    ) {{
                        id
                    }}
                }}
                """.format(clobTokenId, skip)

            data = send_request(url, query, market_id, clobTokenId, skip, log_file)
            if data is None:
                break  # If failed, next clobTokenId

            # A GraphQL error response has no usable 'data'; stop this market
            # instead of raising and losing everything collected so far
            accounts_batch = (data.get('data') or {}).get('accounts')
            if not accounts_batch:
                break

            accounts.extend([account['id'] for account in accounts_batch])
            skip += 100  # the query sets no `first`, so the step matches the page size the API applies

        # Add users data
        markets_user_data.append({
            "id": market_id,
            "conditionId": condition_id,
            "accounts": accounts
        })

In [None]:
#Save political market user list
# Persist the per-market account lists gathered from the subgraph
with open('./data/temp/politics_markets_user_data.json', 'w', encoding='utf-8') as file:
    serialized = json.dumps(markets_user_data, ensure_ascii=False, indent=2)
    file.write(serialized)

### Retrieve failed market

In [None]:
with open('./data/temp/politics_markets_user_data.json', 'r', encoding='utf-8') as file:
    markets_user_data = json.load(file)

# The handle is deliberately not called `log_file`: that name holds the log
# path passed to send_request and must not be replaced by a closed file object
with open('./data/temp/Politics_market_user_request_errors.log', 'r') as error_log:
    failed_requests = error_log.readlines()

# First entry per market id, so a retry result can be attached without rescanning the list
markets_by_id = {}
for market in markets_user_data:
    markets_by_id.setdefault(market['id'], market)

# Retrieve again: the original crawl stopped paging a market at its first
# failed request, so resume from the logged offset and keep paging until the
# market is exhausted.
for failed_request in failed_requests:
    try:
        parts = failed_request.strip().split(", ", 3)
        market_id = parts[0].split(": ")[1]
        clobTokenId = parts[1].split(": ")[1]
        skip = int(parts[2].split(": ")[1])
    except (IndexError, ValueError):
        # Blank line or a log entry in another format
        print(f"Skipping unparseable log line: {failed_request.strip()}")
        continue

    while True:
        query = """
    {{
        accounts(
            where: {{marketPositions_: {{market: "{}"}}}}
            skip: {}
        ) {{
            id
        }}
    }}
    """.format(clobTokenId, skip)

        try:
            response = requests.post(url, json={'query': query}, timeout=30)  # More timeout
            response.raise_for_status()
            data = json.loads(response.text)
        except requests.exceptions.RequestException as e:
            print(f"Retry failed: {e}")
            break

        accounts_batch = (data.get('data') or {}).get('accounts')
        if not accounts_batch:
            break  # this market is done; continue with the next failed request

        # update markets_user_data
        market = markets_by_id.get(market_id)
        if market is not None:
            market['accounts'].extend(account['id'] for account in accounts_batch)
        skip += 100

# Save update Json
with open('./data/politics_markets_user_data_updated.json', 'w', encoding='utf-8') as file:
    json.dump(markets_user_data, file, ensure_ascii=False, indent=2)

### Data not included in the Subgraph

In [None]:
import datetime
import requests
from lxml import html
import pandas as pd
import json
import time
import random
import os
import csv

#### Obtain markets with empty accounts

In [None]:
# From subgraph and Web retrieve
# Per-market account lists (subgraph crawl plus retries)
with open('./data/politics_markets_user_data_updated.json', 'r', encoding='utf-8') as file:
    markets_user_data_updated = json.load(file)
# Event pages retrieved from the web. The path is relative like every other
# data path in this notebook ('/data/...' pointed at the filesystem root).
with open('./data/politics_markets.json', 'r', encoding='utf-8') as file:
    politics_markets_data = json.load(file)

In [None]:
# Find the subgraph missed markets id
empty_accounts_ids = [item['id'] for item in markets_user_data_updated if not item['accounts']]

matched_markets = {}

# Obtain related data
for event_url, event_data in politics_markets_data.items():
    markets = event_data.get('props', {}).get('pageProps', {}).get('dehydratedState', {}).get('queries', [])
    for query in markets:
        for market in query.get('state', {}).get('data', {}).get('markets', []):
            if market['id'] in empty_accounts_ids:
                if event_url not in matched_markets:
                    matched_markets[event_url] = {'markets': []}
                matched_markets[event_url]['markets'].append({
                    'id': market['id'],
                    'question': market['question'],
                    'conditionId': market['conditionId'],
                    'slug': market['slug'],
                    'clobTokenIds': market['clobTokenIds']
                })

In [None]:
# Save the temp json
with open('./data/temp/politics_markets_user_data_subgraphmissed.json', 'w', encoding='utf-8') as file:
    json.dump(matched_markets, file, ensure_ascii=False, indent=2)

# Save the temp csv
df = pd.DataFrame(matched_markets)
df.to_csv('./data/temp/politics_markets_user_data_subgraphmissed.csv', index=False)

# Save the temp market url csv
with open('./data/temp/politics_markets_user_data_subgraphmissed.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for key in matched_markets.keys():
        csvwriter.writerow([key])

#### Retrieve the missing data through Octopus and integrate it

You can use the Octopus app with the project file `Subgraphmissed.otd` to obtain the missing data, `Subgraphmissed.csv`.

In [None]:
# Load the supplement data
subgraph_missed = pd.read_csv('./data/temp/Subgraphmissed.csv')
subgraph_missed['URL'] = subgraph_missed['URL'].apply(lambda x: x.lstrip('\ufeff'))

In [None]:
# Preprocessing
# An event is excluded when only some of its markets came back empty from the
# subgraph: it has fewer account-less markets than markets in total
urls_to_exclude = []
for url, data in matched_markets.items():
    event_markets = politics_markets_data[url]['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['markets']
    matched_markets_length, politics_markets_length = len(data['markets']), len(event_markets)
    if politics_markets_length > matched_markets_length:
        urls_to_exclude.append(url)

# Drop rows that carry no market name
subgraph_missed.dropna(subset=['Market'], inplace=True)

# Trim surrounding whitespace from the market names
trimmed_market_names = subgraph_missed['Market'].str.strip()
subgraph_missed['Market'] = trimmed_market_names

# Special processing: per-event rewrites of the scraped market names so they
# can later be found as substrings of the market questions.
special_url1 = 'https://polymarket.com/event/electoral-college-margin-of-victory-in-the-2024-presidential-election'
special_url2 = 'https://polymarket.com/event/biden-disapproval-on-dec-29'
special_url3 = 'https://polymarket.com/event/el-salvador-presidential-election-winner'
special_url4 = 'https://polymarket.com/event/republican-nominee-2024'
special_url5 = 'https://polymarket.com/event/biden-approval-on-jan-7'
special_url6 = 'https://polymarket.com/event/democratic-nominee-2024'
special_url7 = 'https://polymarket.com/event/finland-presidential-election-who-will-win'
special_url8 = 'https://polymarket.com/event/iowa-caucus-2nd-place'
special_url9 = 'https://polymarket.com/event/new-hampshire-democratic-primary-winner'
special_url10 = 'https://polymarket.com/event/presidential-election-popular-vote-winner-2024'
special_url11 = 'https://polymarket.com/event/presidential-election-winner-2024'
special_url12 = 'https://polymarket.com/event/trump-margin-of-victory-in-iowa-caucus'

# (event URL, ordered [(pattern, replacement, is_regex), ...]).
# `regex` is always passed explicitly: the default of Series.str.replace
# differs between pandas versions, and literal patterns such as
# 'Joe Biden (Write-in)' or 'Robert F. Kennedy' contain regex metacharacters
# that silently change the match when interpreted as a regular expression.
market_name_fixes = [
    (special_url1, [('GOP/Dems ', '', False), ('Dems ', 'Democrats win ', False), ('GOP ', 'GOP wins ', False)]),
    (special_url2, [('>', 'greater than ', False), ('<', 'less than ', False), (' - ', ' and ', False)]),
    (special_url3, [('Sánchez', 'Sanchez', False), ('Other', 'someone else', False)]),
    (special_url4, [(r'Donald Trump(?!\sJr\.)', 'Donald J. Trump', True), ('Other', 'someone else', False)]),
    (special_url5, [('>', 'greater than ', False), ('<', 'less than ', False)]),
    (special_url6, [('Other', 'someone else', False)]),
    (special_url7, [('Väyrynen', 'Vayrynen', False)]),
    (special_url8, [('Donald Trump', 'Trump', False), ('Nikki Haley', 'Haley', False), ('Other', 'someone else', False),
                    ('Ron DeSantis', 'DeSantis', False), ('Vivek Ramaswamy', 'Ramaswamy', False)]),
    (special_url9, [('Joe Biden (Write-in)', 'Biden', False), ('Other', 'another', False)]),
    (special_url10, [('Robert F. Kennedy', 'RFK', False)]),
    (special_url11, [('Democrat ', 'Democratic ', False)]),
    (special_url12, [('0-20%', '<20%', False), (' Loses', ' lose', False)]),
]

for special_url, replacements in market_name_fixes:
    event_rows = subgraph_missed['URL'] == special_url
    if not event_rows.any():
        continue
    fixed_names = subgraph_missed.loc[event_rows, 'Market']
    # Replacements are applied in order; later patterns see earlier results
    for pattern, replacement, is_regex in replacements:
        fixed_names = fixed_names.str.replace(pattern, replacement, regex=is_regex)
    subgraph_missed.loc[event_rows, 'Market'] = fixed_names


subgraph_missed_list = subgraph_missed.to_dict('records')
# Messages for rows that could not be attached to any market
unmatched_urls_log = []

for missed in subgraph_missed_list:
    url = missed['URL']
    # Events flagged for exclusion are not processed at all
    if url in urls_to_exclude:
        continue

    market_name = str(missed['Market']).lower().strip()
    address = missed['Address']

    # Unknown event URL: nothing to match against
    if url not in matched_markets:
        unmatched_urls_log.append(f"URL not found, URL: {url}")
        continue

    market_found = False
    for market in matched_markets[url]['markets']:
        # A market matches when the scraped name occurs inside its question
        if market_name not in str(market['question']).lower():
            continue
        market_id = market['id']
        market_found = True
        # Add the address to the first user-data entry with this market id
        target = next((entry for entry in markets_user_data_updated if entry['id'] == market_id), None)
        if target is not None and address not in target['accounts']:
            target['accounts'].append(address)

    # The URL is known but no question contained the market name
    if not market_found:
        unmatched_urls_log.append(f"Name not match, URL: {url}, Market Name: {missed['Market']}")

In [None]:
# Log
unmatched_urls_log = list(set(unmatched_urls_log))
with open('./data/temp/unmatched_urls_log.txt', 'w', encoding='utf-8') as log_file:
    for url in sorted(unmatched_urls_log):
        log_file.write(url + '\n')

In [None]:
# Overwrite the per-market user file with the supplemented account lists.
# This updated file feeds the EDA participant counts and every later
# user retrieval for the politics markets.
with open('./data/politics_markets_user_data_updated.json', 'w', encoding='utf-8') as file:
    serialized = json.dumps(markets_user_data_updated, ensure_ascii=False, indent=2)
    file.write(serialized)

## Participants and Transactions

### Obtain the participants list

In [None]:
import json
import csv
import requests
from datetime import datetime
from collections import defaultdict
import os
from urllib.parse import urlparse

In [None]:
# Per-market accounts from the subgraph, the web retrieval and the supplementary data
with open('./data/politics_markets_user_data_updated.json', 'r', encoding='utf-8') as file:
    markets_user_data_updated = json.loads(file.read())

# Every distinct account that appears in at least one market
unique_accounts = {
    account
    for entry in markets_user_data_updated
    for account in entry["accounts"]
}

In [None]:
# Profile page URL of every account, for the web-based retrieval
profile_prefix = "https://polymarket.com/profile/"
prefixed_accounts = [profile_prefix + account for account in unique_accounts]

# One URL per row, no header
with open('./data/temp/politics_accounts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows([profile_url] for profile_url in prefixed_accounts)

### API - UserData

In [None]:
# Obtain the unique political markets participants' profiles and activities
def _fetch_json(request_url, expected_type, timeout=10):
    """GET ``request_url`` and return its JSON body if it is an ``expected_type``.

    Any request failure, undecodable body, or body of another type yields an
    empty ``expected_type()`` so that one bad response never stops the crawl.
    ``ValueError`` is caught as well because older ``requests`` releases raise
    a plain JSON decode error that is not a ``RequestException``.
    """
    try:
        payload = requests.get(request_url, timeout=timeout).json()
    except (requests.exceptions.RequestException, ValueError):
        return expected_type()
    return payload if isinstance(payload, expected_type) else expected_type()


# Keys dropped from the API payloads before they are stored
profile_excluded = ('displayUsernamePublic', 'profileImage', 'profileImageOptimized', 'users', '__typename')
subgraph_excluded = ('vid', 'block_range', 'id', '_gs_chain', '_gs_gid')
position_excluded = ('proxyWallet', 'icon')
activity_excluded = ('proxyWallet', 'icon', 'name', 'pseudonym', 'bio', 'profileImage', 'profileImageOptimized')

# Sorted so that batch membership is the same on every run; iterating the set
# directly yields a different order (and different batches) per session
unique_accounts = sorted(unique_accounts, key=str)
batch_size = 100

for i in range(0, len(unique_accounts), batch_size):
    batch_addresses = unique_accounts[i:i+batch_size]
    results = {}
    for address in batch_addresses:
        # Single-object endpoints
        profile_data = _fetch_json(f"https://polymarket.com/api/profile/userData?address={address}", dict)
        profit_data = _fetch_json(f"https://polymarket.com/api/profile/profit?address={address}", dict)
        markets_traded_data = _fetch_json(f"https://polymarket.com/api/profile/marketsTraded?address={address}", dict)
        subgraph_account_data = _fetch_json(f"https://polymarket.com/api/profile/subgraphAccountData?address={address}", dict)
        volume_traded_data = _fetch_json(f"https://polymarket.com/api/profile/volume?range=all&address={address}", dict)

        # Positions
        positions_response = _fetch_json(f"https://polymarket.com/api/profile/positions?user={address}", list)

        # Activities, 100 per page until an empty (or failed) page.
        # NOTE(review): a failed page ends the pagination silently, so the
        # activity list may be truncated for that address.
        activities = []
        offset = 0
        while True:
            activity_url = f"https://polymarket.com/api/profile/activity?user={address}&limit=100&offset={offset}"
            activity_response = _fetch_json(activity_url, list)
            if not activity_response:
                break  # No more data
            activities.extend(
                {k: v for k, v in activity.items() if k not in activity_excluded}
                for activity in activity_response
            )
            offset += 100

        combined_data = {
            "profile": {k: v for k, v in profile_data.items() if k not in profile_excluded},
            "profit": {k: profit_data.get(k, None) for k in ['realized', 'unrealized', 'total']},
            # `or 0` also covers an explicit null count, which int() cannot convert
            "marketsTraded": int(markets_traded_data.get('count') or 0),
            "volumeTraded": volume_traded_data.get('amount', 0),
            "subgraph": {k: v for k, v in subgraph_account_data.items() if k not in subgraph_excluded},
            "positions": [
                {k: v for k, v in position.items() if k not in position_excluded}
                for position in positions_response
            ],
            "activities": activities,
        }

        # Add the combined data to the results dictionary
        key = 'https://polymarket.com/profile/' + address
        results[key] = combined_data

    # Save batch json
    batch_filename = f'./data/temp/politics_users_profile_api_batch_{i//batch_size + 1}.json'
    with open(batch_filename, 'w', encoding='utf-8') as file:
        json.dump(results, file, ensure_ascii=False, indent=2)

    # update log (handle not named `log_file`, which holds the request log path)
    with open('./data/temp/log.txt', 'a') as progress_log:
        for address in batch_addresses:
            progress_log.write(f'{address} \n')

    print(f'Batch {i//batch_size + 1} processed')

In [None]:
# Combine Json and preprocess
folder_path = './data/temp/'
# Only the profile batch files are merged. The folder also holds other JSON
# files written by this notebook (market/user lists, subgraph-missed markets)
# whose structure is different and must not be folded into the profiles.
batch_prefix = 'politics_users_profile_api_batch_'
combined_data = defaultdict(dict)

for filename in sorted(os.listdir(folder_path)):
    if not (filename.startswith(batch_prefix) and filename.endswith('.json')):
        continue
    file_path = os.path.join(folder_path, filename)
    # The batches were written as UTF-8 with ensure_ascii=False
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    for address, info in data.items():
        combined_data[address].update(info)

for address, info in combined_data.items():
    # calculate "activities": keep one activity per transaction hash (the last one seen)
    unique_activities = {each['transactionHash']: each for each in info.get('activities', [])}.values()
    info['activities'] = list(unique_activities)
    info['activityNum'] = len(unique_activities)

    # Update "profile" - "createdAt" from the subgraph creation timestamp
    if not info['profile'].get('createdAt'):
        creation_timestamp = info['subgraph'].get('creation_timestamp')
        if creation_timestamp:
            # Transfer timestamp to 'YYYY-MM-DDTHH:MM:SS.SSSZ'
            creation_datetime = datetime.utcfromtimestamp(int(creation_timestamp)).strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
            info['profile']['createdAt'] = creation_datetime
        else:
            info['profile']['createdAt'] = None

    # Update "profile" - "proxyWallet": last path segment of the profile URL key
    if not info['profile'].get('proxyWallet'):
        proxy_wallet = urlparse(address).path.split('/')[-1]
        info['profile']['proxyWallet'] = proxy_wallet

    # Update "profit" - "total"; stays None when the subgraph has no value
    # either (float(None) would raise TypeError)
    if info['profit'].get('total') is None:
        scaled_profit = info['subgraph'].get('scaled_profit')
        info['profit']['total'] = float(scaled_profit) if scaled_profit is not None else None

    # Update "marketsTraded": fall back to the distinct condition ids seen in
    # positions and activities when the API reported 0
    if info.get('marketsTraded') == 0:
        condition_ids = set()
        for position in info.get('positions', []):
            condition_ids.add(position.get('conditionId'))
        for activity in info.get('activities', []):
            condition_ids.add(activity.get('conditionId'))
        info['marketsTraded'] = len(condition_ids)

# Save the combined Json
output_file_path = './data/politics_users_profile_api.json'
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(combined_data, file, ensure_ascii=False, indent=2)

### Polygon API

In [None]:
import json
import requests
import pickle as pkl
from google.colab import userdata #change the API for your own
import time
import concurrent.futures
from collections import defaultdict
import queue
import logging
import threading
# Read the six API keys stored as secrets 'polygon1' ... 'polygon6'
api_keys = [userdata.get(f'polygon{slot}') for slot in range(1, 7)]
# Individual names kept for code that refers to a single key
api_key, api_key2, api_key3, api_key4, api_key5, api_key6 = api_keys

In [None]:
# Load the merged user profiles (profile URL -> profile data)
with open('./data/politics_users_profile_api.json', 'r', encoding='utf-8') as file:
    politics_users_profile = json.loads(file.read())

In [None]:
# For every user keep only REDEEM activities whose conditionId has no other
# activity type recorded for that user. Users left with no such activity are
# dropped; users without an "activities" key are kept untouched.
# The profile dictionaries are modified in place.
filtered_data = []
for user, user_data in politics_users_profile.items():
    if "activities" in user_data:
        activities = user_data["activities"]
        redeem_condition_ids = {act["conditionId"] for act in activities if act["type"] == "REDEEM"}
        non_redeem_condition_ids = {act["conditionId"] for act in activities if act["type"] != "REDEEM"}
        common_condition_ids = redeem_condition_ids & non_redeem_condition_ids

        user_data["activities"] = [
            act for act in activities
            if act["type"] == "REDEEM" and act["conditionId"] not in common_condition_ids
        ]
        if not user_data["activities"]:
            continue

    filtered_data.append({user: user_data})

with open('./data/temp/filtered_politics_users_profile_temp.json', 'w', encoding='utf-8') as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=2)

# Proxy wallet of every user that survived the filter
filtered_proxy_wallets = [
    data["profile"]["proxyWallet"]
    for user_entry in filtered_data
    for data in user_entry.values()
    if "profile" in data and "proxyWallet" in data["profile"]
]

In [None]:
# proxy wallet -> transaction hashes already present in the user's activities,
# restricted to the wallets that passed the filter above
# Set copy: the membership test runs once per user, so a list scan is quadratic
filtered_wallet_set = set(filtered_proxy_wallets)

exist_hash = {}
for user_url, user_data in politics_users_profile.items():
    if "profile" not in user_data or "proxyWallet" not in user_data["profile"]:
        continue
    proxy_wallet = user_data["profile"]["proxyWallet"]
    if proxy_wallet not in filtered_wallet_set:
        continue
    # Users without an "activities" key get an empty hash list
    exist_hash[proxy_wallet] = [
        activity["transactionHash"] for activity in user_data.get("activities", [])
    ]

In [None]:
api_type = "https://api.polygonscan.com/api?module=logs&action=getLogs"
asset_address = "0x4d97dcd97ec945f40cf65f87097ace5ea0476045"  # The smart contract of the betting
topic_0 = "0xc3d58168c5ae7397731d063d5bbf3d657854427343f4c083240f7aacaa2d0f62" # ERC1155 TransferSingle event
start_block = 9000000  # 2021
end_block = 54000000    # 2024-02-26
RATE_LIMIT = 5  # 5 requests per second

def get_transaction_hashes(user_address, from_block, to_block, api_key, timeout=30):
    """Return the distinct hashes of logged transfers that involve ``user_address``.

    Queries the ``getLogs`` endpoint for ``topic_0`` events of
    ``asset_address`` in which the zero-padded user address appears as
    topic 2 or topic 3, and drops the logs whose topic 1 is the user address
    itself. This is best effort: any failure (network, HTTP status,
    unexpected payload) yields an empty list, as does a response with
    status ``"0"``.

    Args:
        user_address: ``0x``-prefixed address; the prefix is stripped and the
            rest is left-padded to a 32-byte topic.
        from_block: First block of the searched range.
        to_block: Last block of the searched range.
        api_key: API key appended to the request.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        List of unique transaction hashes, in no particular order.
    """
    padded_user = f"0x000000000000000000000000{user_address[2:]}"
    url = (
        f"{api_type}&fromBlock={from_block}&toBlock={to_block}&address={asset_address}"
        f"&topic0={topic_0}&topic0_2_opr=and&topic2={padded_user}"
        f"&topic2_3_opr=or&topic3={padded_user}&apikey={api_key}"
    )

    try:
        # Throttle before every call to stay under RATE_LIMIT requests per second
        time.sleep(1 / RATE_LIMIT)
        # The timeout keeps a stalled connection from blocking the caller forever
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if isinstance(data, str):  # If data is a string, there was an error
            return []

        # NOTE(review): status "0" is returned for "no records" but presumably
        # also for API-side errors such as rate limiting; both end up as [].
        if data["status"] == "0":
            return []

        own_topic = padded_user.lower()
        tx_hashes = {
            log["transactionHash"]
            for log in data["result"]
            if log["topics"][1].lower() != own_topic
        }
        return list(tx_hashes)
    except Exception as e:
        # Only the exception type is reported: request error messages can
        # contain the full URL, including the API key
        logging.warning("get_transaction_hashes failed for %s: %s", user_address, type(e).__name__)
        return []

def get_block_timestamp(block_number, api_key):
    """Look up the timestamp of a block via PolygonScan ``getblockreward``.

    Args:
        block_number: Block number (decimal integer).
        api_key: PolygonScan API key.

    Returns:
        The block's ``timeStamp`` as an ``int``, or ``None`` when the API
        returns an empty result or the lookup still fails after a bounded
        number of attempts.
    """
    url = f"https://api.polygonscan.com/api?module=block&action=getblockreward&blockno={block_number}&apikey={api_key}"
    max_attempts = 5

    for _ in range(max_attempts):
        try:
            time.sleep(1 / RATE_LIMIT)
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            payload = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error for get timestamp {e}")
            time.sleep(1)
            continue

        data = payload.get("result") if isinstance(payload, dict) else None
        if not data:
            return None
        if isinstance(data, dict) and "timeStamp" in data:
            return int(data["timeStamp"])

        # A non-dict result is an error message rather than block data
        # (presumably a rate-limit notice), so wait and try again.
        print(f"Error for get timestamp {data}")
        time.sleep(1)

    return None

def get_transaction_data(user_address, tx_hash, block_timestamp_cache, api_key):
    """Build one trade record for a user from a transaction receipt.

    Fetches the receipt through PolygonScan's ``eth_getTransactionReceipt``
    proxy, skips receipts that contain no ``topic_0`` log, resolves the block
    timestamp (using ``block_timestamp_cache``), and derives side, size,
    USDC size, and asset id from the receipt's logs.

    Args:
        user_address: ``0x``-prefixed wallet address of the user.
        tx_hash: Transaction hash to inspect.
        block_timestamp_cache: Dict mapping block number to timestamp; it is
            updated in place with successful lookups.
        api_key: PolygonScan API key.

    Returns:
        A dict with ``timestamp``, ``side``, ``size``, ``usdcSize``,
        ``transactionHash``, ``price``, ``asset`` and, when found,
        ``outcomeIndex``. Returns ``None`` when the receipt is missing,
        irrelevant, or incomplete, or when any error occurs (errors are
        logged, not raised).
    """
    url = f"https://api.polygonscan.com/api?module=proxy&action=eth_getTransactionReceipt&txhash={tx_hash}&apikey={api_key}"
    padded_user = f"0x000000000000000000000000{user_address[2:].lower()}"
    erc20_transfer_topic = "0xddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef"  # ERC20 Transfer event
    erc1155_single_topic = "0xc3d58168c5ae7397731d063d5bbf3d657854427343f4c083240f7aacaa2d0f62"  # ERC1155 TransferSingle event
    outcome_index_topic = "0x4f62630f51608fc8a7603a9391a5101e58bd7c276139366fc107dc3b67c3dcf8"  # Custom event for outcome index

    try:
        time.sleep(1 / RATE_LIMIT)
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        if not data:
            return None

        if "result" not in data:
            print(f"No 'result' field in response for tx hash {tx_hash}")
            return None

        tx_data = data["result"]
        if not isinstance(tx_data, dict):
            # A null or string result means there is no receipt to parse.
            return None

        logs = tx_data["logs"]

        # Skip split/merge tx: they carry no target-topic log.
        if not any(log["topics"] and log["topics"][0].lower() == topic_0 for log in logs):
            return None

        block_number = int(tx_data["blockNumber"], 16)  # Reduce the use of API2
        timestamp = block_timestamp_cache.get(block_number)
        if timestamp is None:
            timestamp = get_block_timestamp(block_number, api_key)
            if timestamp is not None:
                # Cache only successful lookups so a transient failure does not
                # strip the timestamp from every later tx in the same block.
                block_timestamp_cache[block_number] = timestamp

        activity_type = None
        size = None
        usdc_size = None
        token_id = None
        outcome_index = None

        for log in logs:
            topics = [topic.lower() for topic in log["topics"]]
            log_data = log["data"]
            if not topics:
                continue
            event = topics[0]

            if event == erc20_transfer_topic:
                # ERC20 Transfer has exactly three topics; the same signature
                # with four topics (ERC721) has no amount in its data field.
                if len(topics) != 3:
                    continue
                # NOTE(review): the user *sending* the ERC20 token is labelled
                # "Sell" here, although paying usually means buying. A later
                # TransferSingle log overrides the label; confirm the intent.
                if topics[1] == padded_user:  # User is sender
                    activity_type = "Sell"
                elif topics[2] == padded_user:  # User is recipient
                    activity_type = "Buy"
                usdc_size = int(log_data, 16) / 10**6  # USDC has 6 decimal places

            elif event == erc1155_single_topic:
                if len(topics) < 4:
                    continue
                if topics[2] == padded_user:  # User is sender
                    activity_type = "Sell"
                elif topics[3] == padded_user:  # User is recipient
                    activity_type = "Buy"
                # data = "0x" + 64 hex chars (token id) + 64 hex chars (amount)
                token_id = int(log_data[:66], 16)
                size = int(log_data[66:], 16) / 10**6  # 6 decimal places for size

            elif event == outcome_index_topic and len(topics) >= 3:
                outcome_index = int(topics[2], 16)

        if activity_type is None or size is None or usdc_size is None:
            return None

        price = usdc_size / size if size != 0 else 0

        activity = {
            "timestamp": timestamp,
            "side": activity_type,
            "size": size,
            "usdcSize": usdc_size,
            "transactionHash": str(tx_hash),
            "price": price,
            "asset": str(token_id)
        }

        if outcome_index is not None:
            activity["outcomeIndex"] = outcome_index

        return activity

    except Exception as e:
        # Best effort: one bad transaction must not stop the whole crawl.
        logging.error(f"Error for get tx {tx_hash} for {user_address}: {str(e)}")
        time.sleep(1)
        return None

def process_user_transactions(user_address, block_step=20000000, block_timestamp_cache=None, api_key=None):
    """Collect the trades of one user that are not already known.

    Walks the ``start_block``..``end_block`` range in chunks of
    ``block_step`` blocks, fetches the user's transaction hashes for each
    chunk, skips hashes already listed in ``exist_hash`` for this user, and
    resolves every remaining hash with ``get_transaction_data``.

    Args:
        user_address: ``0x``-prefixed wallet address.
        block_step: Number of blocks per ``getLogs`` query.
        block_timestamp_cache: Optional dict used to cache block timestamps;
            a fresh dict is created when omitted.
        api_key: PolygonScan API key.

    Returns:
        A list of activity dicts. If an unexpected exception occurs, the
        error is logged and an empty list is returned, so the result is
        all-or-nothing.
    """
    if block_timestamp_cache is None:
        block_timestamp_cache = {}

    # The list is local to this call, so no lock is needed around it.
    all_activities = []
    local_existing_hashes = set(exist_hash.get(user_address, []))

    num_intervals = (end_block - start_block) // block_step + 1
    try:
        for i in range(num_intervals):
            from_block = start_block + i * block_step
            to_block = min(from_block + block_step - 1, end_block)

            tx_hashes = get_transaction_hashes(user_address, from_block, to_block, api_key)
            for tx_hash in set(tx_hashes) - local_existing_hashes:
                activity = get_transaction_data(user_address, tx_hash, block_timestamp_cache, api_key)
                if activity:
                    all_activities.append(activity)
    except Exception as e:
        logging.error(f"Error processing {user_address}: {str(e)}")
        return []

    print(f"{len(all_activities)} new tx for {user_address}")
    return all_activities

def process_user_transactions_parallel(user_addresses, max_workers=6):
    """Run ``process_user_transactions`` for many users on a thread pool.

    API keys from ``api_keys`` are assigned to users round-robin, and every
    user gets its own block-timestamp cache.

    Args:
        user_addresses: Sequence of ``0x``-prefixed wallet addresses.
        max_workers: Size of the thread pool.

    Returns:
        A list of ``{"proxyWallet": address, "activities": [...]}`` dicts in
        completion order. Users with no new activity are omitted. A user
        whose worker raised is logged and skipped instead of aborting the
        whole run.
    """
    add_tx = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for index, user_address in enumerate(user_addresses):
            # Cycle through the API keys.
            # NOTE(review): the per-request sleep is per thread, so workers
            # sharing a key can together exceed RATE_LIMIT.
            api_key = api_keys[index % len(api_keys)]
            future = executor.submit(
                process_user_transactions,
                user_address,
                block_timestamp_cache={},
                api_key=api_key,
            )
            futures[future] = user_address

        total_count = len(futures)
        completed = concurrent.futures.as_completed(futures)
        for completed_count, future in enumerate(completed, start=1):
            user_address = futures[future]
            try:
                user_transactions = future.result()
            except Exception as e:
                # Results already collected for other users must survive one failure.
                logging.error(f"Error processing {user_address}: {str(e)}")
                user_transactions = []

            if user_transactions:
                add_tx.append({
                    "proxyWallet": user_address,
                    "activities": user_transactions
                })

            progress = completed_count / total_count * 100
            print(f"Progress: {completed_count}/{total_count} ({progress:.2f}%)")

    return add_tx

In [None]:
# Fetch the missing on-chain trades for every wallet tracked in exist_hash,
# then cache the raw result on disk as JSON.
add_tx = process_user_transactions_parallel(list(exist_hash))
with open('./data/temp/add_tx.json', 'w', encoding='utf-8') as out_file:
    json.dump(add_tx, out_file, ensure_ascii=False, indent=2)

In [None]:
# Reload the cached trades so the crawl does not have to be repeated.
with open('./data/temp/add_tx.json', encoding='utf-8') as in_file:
    add_tx = json.load(in_file)

In [None]:
# Lookup tables keyed by asset id, built once for fast matching.
asset_to_position = {}
asset_to_activity = {}

for profile_data in politics_users_profile.values():
    for position in profile_data['positions']:
        # Market-level fields shared by an asset and its opposite asset.
        market_info = {
            'conditionId': position['conditionId'],
            'title': position['title'],
            'slug': position['slug'],
            'eventSlug': position['eventSlug'],
        }
        asset_to_position[position['asset']] = {
            **market_info,
            'outcome': position['outcome'],
            'outcomeIndex': position['outcomeIndex'],
        }
        if 'oppositeAsset' in position:
            # The opposite asset gets the flipped outcome and index.
            # NOTE(review): this assumes binary Yes/No markets; confirm.
            asset_to_position[position['oppositeAsset']] = {
                **market_info,
                'outcome': 'Yes' if position['outcome'] == 'No' else 'No',
                'outcomeIndex': 0 if position['outcomeIndex'] == 1 else 1,
            }
    for activity in profile_data['activities']:
        asset_to_activity[activity['asset']] = activity

# Fields copied from a previously known activity onto a new one.
market_fields = ('conditionId', 'title', 'slug', 'eventSlug', 'outcome', 'outcomeIndex')

# Enrich each newly fetched activity with market metadata.
for wallet in add_tx:
    for activity in wallet['activities']:
        activity['type'] = 'TRADE'
        asset = activity['asset']

        # Normalise "side" to uppercase.
        activity['side'] = activity['side'].upper()

        if asset in asset_to_position:
            # Prefer the metadata taken from positions.
            activity.update(asset_to_position[asset])
        elif asset in asset_to_activity:
            # Otherwise fall back to a known activity on the same asset.
            matched_activity = asset_to_activity[asset]
            activity.update({field: matched_activity[field] for field in market_fields})

    # Keep only activities that were matched to a market and have a timestamp.
    wallet['activities'] = [
        entry for entry in wallet['activities']
        if 'conditionId' in entry and entry['timestamp'] is not None
    ]

# Drop wallets that ended up with no usable activity.
add_tx = [wallet for wallet in add_tx if wallet['activities']]

print(f"Total number of added proxyWallets: {len(add_tx)}")

Total number of added proxyWallets: 5547


In [None]:
# Append the enriched on-chain trades to the matching scraped profile.
for wallet in add_tx:
    proxy_wallet = wallet['proxyWallet']
    profile_url = f"https://polymarket.com/profile/{proxy_wallet}"
    target_profile = politics_users_profile.get(profile_url)
    if target_profile is not None:
        # Wallets without a scraped profile are ignored.
        target_profile['activities'].extend(wallet['activities'])

# Order every profile's activities chronologically, in place.
for profile_data in politics_users_profile.values():
    activities = profile_data['activities']
    activities.sort(key=lambda entry: entry['timestamp'])

# Persist the merged profiles.
with open('./data/politics_users_profile_api2.json', 'w', encoding='utf-8') as out_file:
    json.dump(politics_users_profile, out_file, ensure_ascii=False, indent=2)

### Subgraph

In [None]:
# GraphQL endpoint that the Subgraph queries below are POSTed to.
url = "https://api.studio.thegraph.com/query/63814/pm_analysis/v0.0.1"
# send_request appends one line per failed account to these files:
# log_file1 for the account/membership queries, log_file2 for the transaction queries.
log_file1 = './data/temp/account_query_errors.log'
log_file2 = './data/temp/tx_query_errors.log'

In [None]:
def send_request(url, query, account_id, skip, log_file, max_retries=3):
    """POST a GraphQL query and return the decoded response, with retries.

    A response counts as successful only when ``data.account`` is present
    and non-empty. Transport errors, HTTP errors, undecodable bodies, and
    responses without account data are all retried.

    Args:
        url: GraphQL endpoint.
        query: GraphQL query string.
        account_id: Account the query is about (used in log messages).
        skip: Pagination offset of the query; accepted for interface
            compatibility but not used here.
        log_file: Path of the log file the final error is appended to.
        max_retries: Maximum number of attempts.

    Returns:
        The decoded response dict, whose ``['data']['account']`` is
        guaranteed to be truthy. Returns ``None`` after ``max_retries``
        failed attempts, in which case one line describing the last error
        is appended to ``log_file``.
    """
    last_error = ''  # Record the last error
    for _ in range(max_retries):
        try:
            response = requests.post(url, json={'query': query}, timeout=10)
            response.raise_for_status()
            data = json.loads(response.text)
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON. Whitespace is
            # collapsed so each log entry stays on one line.
            detail = ' '.join(str(e).split())
            last_error = f"Failed to fetch: {account_id}, error: {detail}"
            continue

        # A GraphQL error reply can carry "data": null or omit "data" entirely.
        payload = data.get('data') if isinstance(data, dict) else None
        if not isinstance(payload, dict) or not payload.get('account'):
            last_error = f"No data retrieved for: {account_id}"
            continue

        return data  # Success

    # All attempts failed: record the last error.
    with open(log_file, 'a') as log:
        log.write(last_error + "\n")
    return None

In [None]:
# Retrieve account statistics and liquidity-pool memberships for every account.
results = []

for account_id in unique_accounts:
    skip = 0  # pagination offset into fpmmPoolMemberships
    all_fpmm_memberships = []  # The liquidities where the address provided, corresponding to the marketMakerAddress
    # Reset per account: without this a failed first request would reuse the
    # previous account's dict and overwrite its memberships.
    account_data = None

    while True:
        query = """
        {{
          account(id: "{account_id}") {{
            id
            creationTimestamp
            lastSeenTimestamp
            lastTradedTimestamp
            numTrades
            scaledProfit
            scaledCollateralVolume
            fpmmPoolMemberships(skip: {skip}) {{
              id
              amount
              pool {{
                    scaledFeeVolume
                    scaledLiquidityParameter
                    liquidityAddQuantity
              }}
            }}
          }}
        }}
        """.format(account_id=account_id, skip=skip)

        data = send_request(url, query, account_id, skip, log_file1)
        if data is None or not data['data'].get('account'):
            break  # Request failed or the account has no data

        account_data = data['data']['account']
        fpmm_memberships = account_data.get('fpmmPoolMemberships') or []

        # Strip the account id out of each membership id.
        for membership in fpmm_memberships:
            membership['id'] = membership['id'].replace(account_id, '')

        all_fpmm_memberships.extend(fpmm_memberships)
        if not fpmm_memberships:
            break  # No more fpmmPoolMemberships data

        # Advance by what was actually returned rather than an assumed page size.
        skip += len(fpmm_memberships)

    if account_data is None:
        continue  # Nothing retrieved; send_request has logged the failure

    account_data['fpmmPoolMemberships'] = all_fpmm_memberships
    results.append(account_data)

# Save JSON
with open('./data/temp/politics_users_profile_infor.json', 'w', encoding='utf-8') as file:
    json.dump(results, file, ensure_ascii=False, indent=2)

In [None]:
# Re-retrieve: load what was saved and index it by account id.
with open('./data/temp/politics_users_profile_infor.json', 'r', encoding='utf-8') as file:
    existing_results = json.load(file)
existing_results_map = {item['id']: item for item in existing_results}

# Collect the failed account ids from log_file1. The log has two line formats:
#   "No data retrieved for: <id>"
#   "Failed to fetch: <id>, error: <message>"
# so take the text after the first ": " and cut it at the first comma.
with open(log_file1, 'r') as file:
    failed_accounts_fpmm = {
        line.split(": ", 1)[1].split(",", 1)[0].strip()
        for line in file
        if ": " in line
    }

In [None]:
# Re-retrieve account data for the accounts that failed, then merge it
# into the previously saved results.
results = []
for account_id in failed_accounts_fpmm:

    skip = 0
    all_fpmm_memberships = []
    # Reset per account: without this a failed first request would reuse the
    # previous account's dict and overwrite its memberships.
    account_data = None

    while True:
        query = """
        {{
          account(id: "{account_id}") {{
            id
            creationTimestamp
            lastSeenTimestamp
            lastTradedTimestamp
            numTrades
            scaledProfit
            scaledCollateralVolume
            fpmmPoolMemberships(skip: {skip}) {{
              id
              amount
              pool {{
                    scaledFeeVolume
                    scaledLiquidityParameter
                    liquidityAddQuantity
              }}
            }}
          }}
        }}
        """.format(account_id=account_id, skip=skip)

        data = send_request(url, query, account_id, skip, log_file1)
        if data is None or not data['data'].get('account'):
            break  # Request failed or the account has no data

        account_data = data['data']['account']
        fpmm_memberships = account_data.get('fpmmPoolMemberships') or []

        # Strip the account id out of each membership id.
        for membership in fpmm_memberships:
            membership['id'] = membership['id'].replace(account_id, '')

        all_fpmm_memberships.extend(fpmm_memberships)
        if not fpmm_memberships:
            break  # No more fpmmPoolMemberships data

        # Advance by what was actually returned rather than an assumed page size.
        skip += len(fpmm_memberships)

    if account_data is None:
        continue  # Still failing; keep whatever was saved before

    account_data['fpmmPoolMemberships'] = all_fpmm_memberships
    results.append(account_data)

# Freshly retrieved accounts replace (or add to) the saved entries.
for account_data in results:
    account_id = account_data['id']
    existing_results_map[account_id] = account_data

updated_results = list(existing_results_map.values())

with open('./data/temp/updated_politics_users_profile_infor.json', 'w', encoding='utf-8') as file:
    json.dump(updated_results, file, ensure_ascii=False, indent=2)

In [None]:
# Page through every account's transactions on the subgraph.
results = []

# Query template; the doubled braces are literal braces for str.format.
tx_query_template = """
        {{
          account(id: "{account_id}") {{
            id
            transactions(skip: {skip}) {{
              id
              type
              tradeAmount
              timestamp
              outcomeTokensAmount
              outcomeIndex
              market {{
                id
              }}
            }}
          }}
        }}
        """

for account_id in unique_accounts:
    skip = 0  # pagination offset
    all_transactions = []
    account_basic_info = None  # becomes {'id': ...} once the first page arrives

    while True:
        query = tx_query_template.format(account_id=account_id, skip=skip)
        data = send_request(url, query, account_id, skip, log_file2)
        if data is None or not data['data'].get('account'):
            break  # request failed or nothing returned

        account_data = data['data']['account']
        if skip == 0:
            # First page: remember the account id, or give up if it is missing.
            if not account_data.get('id'):
                break
            account_basic_info = {'id': account_data['id']}

        transactions = account_data.get('transactions', [])
        if not transactions:
            break  # past the last page

        all_transactions += transactions
        skip += len(transactions)

    if not account_basic_info:
        continue
    account_basic_info['transactions'] = all_transactions
    results.append(account_basic_info)

# Write the collected transactions as JSON.
with open('./data/temp/politics_users_profile_tx.json', 'w', encoding='utf-8') as out_file:
    json.dump(results, out_file, ensure_ascii=False, indent=2)

In [None]:
# Re-retrieve tx: load what was saved and index it by account id.
with open('./data/temp/politics_users_profile_tx.json', 'r', encoding='utf-8') as file:
    existing_tx_results = json.load(file)
existing_tx_map = {item['id']: item for item in existing_tx_results}

# Collect the failed account ids from log_file2. The log has two line formats:
#   "No data retrieved for: <id>"
#   "Failed to fetch: <id>, error: <message>"
# so take the text after the first ": " and cut it at the first comma.
with open(log_file2, 'r') as file:
    failed_accounts_tx = {
        line.split(": ", 1)[1].split(",", 1)[0].strip()
        for line in file
        if ": " in line
    }

In [None]:
# Re-retrieve transactions for the accounts that failed, then merge them
# into the previously saved results.
for account_id in failed_accounts_tx:
    skip = 0
    all_transactions = []
    while True:
        query = """
        {{
          account(id: "{account_id}") {{
            id
            transactions(skip: {skip}) {{
              id
              type
              tradeAmount
              timestamp
              outcomeTokensAmount
              outcomeIndex
              market {{
                id
              }}
            }}
          }}
        }}
        """.format(account_id=account_id, skip=skip)

        data = send_request(url, query, account_id, skip, log_file2)
        if data is None or not data['data'].get('account'):
            break

        account_data = data['data']['account']
        transactions = account_data.get('transactions', [])
        if not transactions:
            break

        all_transactions.extend(transactions)
        skip += len(transactions)

    if account_id in existing_tx_map:
        # The re-fetch restarts at skip=0, so an account that failed halfway
        # already has its first pages saved. Add only unseen transaction ids.
        known_transactions = existing_tx_map[account_id]['transactions']
        seen_ids = {tx['id'] for tx in known_transactions}
        for tx in all_transactions:
            if tx['id'] not in seen_ids:
                seen_ids.add(tx['id'])
                known_transactions.append(tx)
    else:
        existing_tx_map[account_id] = {'id': account_id, 'transactions': all_transactions}

updated_tx_results = list(existing_tx_map.values())

# Save JSON
with open('./data/temp/updated_politics_users_profile_tx.json', 'w', encoding='utf-8') as file:
    json.dump(updated_tx_results, file, ensure_ascii=False, indent=2)

In [None]:
# Combine the account-level data with the per-account transactions.
with open('./data/temp/updated_politics_users_profile_infor.json', encoding='utf-8') as info_file:
    profile_data = json.load(info_file)
with open('./data/temp/updated_politics_users_profile_tx.json', encoding='utf-8') as tx_file:
    transactions_data = json.load(tx_file)

# Map each account id to its transaction record.
transactions_map = {record['id']: record for record in transactions_data}

# Attach the transactions, or an empty list when the account has none.
for item in profile_data:
    account_id = item['id']
    item['transactions'] = (
        transactions_map[account_id]['transactions']
        if account_id in transactions_map
        else []
    )

# Save final JSON, to combine with data obtained from Polygon API
with open('./data/temp/politics_users_profile.json', 'w', encoding='utf-8') as out_file:
    json.dump(profile_data, out_file, ensure_ascii=False, indent=2)

### Preprocessing for outcomes

In [None]:
import json
import csv
# Load the scraped event pages; the next cell iterates them as url -> page data.
with open('./data/events_details.json', encoding='utf-8') as events_file:
    data = json.load(events_file)

In [None]:
# Reduce every event page to the timing and resolution fields of its markets,
# keyed by market slug.
result = {}

# Iterate values only: a loop variable named `url` would overwrite the
# Subgraph endpoint `url` defined earlier in the notebook.
for event_data in data.values():
    try:
        markets = event_data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['markets']
    except (KeyError, IndexError, TypeError):
        # Page without the expected structure (missing key, empty query list,
        # or a None somewhere along the path): skip it.
        continue

    for market in markets:
        # 'resolutionData' may be present but null, so fall back to {}.
        resolution_data = market.get('resolutionData') or {}

        result[market.get('slug', None)] = {
            'endDate': market.get('endDate', None),
            'startDate': market.get('startDate', None),
            'createdAt': market.get('createdAt', None),
            'closed': market.get('closed', None),
            'closedTime': market.get('closedTime', None),
            'resolutionPrice': resolution_data.get('price', None)
        }

In [None]:
# Persist the per-market summary as JSON.
with open('./data/events_details_clip.json', 'w', encoding='utf-8') as out_file:
    json.dump(result, out_file, ensure_ascii=False, indent=2)

In [None]:
# Select the markets whose resolution price is neither of the two listed
# values and export their URLs for a follow-up scrape.
# NOTE(review): "1000000000000000000" and "0" presumably encode a resolved
# price of 1 and 0; confirm.
filtered_data = {}

# The loop variable is deliberately not named `data`, so the events dict
# loaded earlier in the notebook is not overwritten.
for slug, market_info in result.items():
    resolution_price = market_info['resolutionPrice']
    if resolution_price not in ["1000000000000000000", "0"]:
        filtered_data[slug] = market_info

urls = ["https://polymarket.com/market/" + slug for slug in filtered_data]

with open('./data/temp/output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['URL'])
    writer.writerows([[url] for url in urls])

In [None]:
with open('./data/events_details_clip.json', 'r', encoding='utf-8') as file:
    result = json.load(file)