In [2]:
# M Science Analysis - Part 2C: Technographic & Digital Footprint Intelligence
# Free Alternative Data Sources for Software Adoption & Usage Pattern Analysis
# Tracking technology adoption, web traffic, and digital footprint signals

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import dns.resolver
import socket
import ssl
import re
from datetime import datetime, timedelta
import json
import time
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8')
plt.rcParams['figure.figsize'] = (14, 8)

print("🔍 TECHNOGRAPHIC & DIGITAL FOOTPRINT INTELLIGENCE")
print("=" * 60)
print("Tracking software adoption, web traffic, and digital signals")
print("Free alternative data approach for technology usage analysis")
print()

🔍 TECHNOGRAPHIC & DIGITAL FOOTPRINT INTELLIGENCE
Tracking software adoption, web traffic, and digital signals
Free alternative data approach for technology usage analysis



In [3]:
# =============================================================================
# 1. COMPANY DIGITAL FOOTPRINT MAPPING
# =============================================================================

# Charles Rogers' coverage companies - digital footprint tracking
COMPANIES_DIGITAL_MAPPING = {
    'MDB': {
        'name': 'MongoDB',
        'ticker': 'MDB',
        'main_domains': ['mongodb.com', 'mongodb.org'],
        'cloud_services': ['cloud.mongodb.com', 'atlas.mongodb.com'],
        'developer_platforms': ['docs.mongodb.com', 'university.mongodb.com'],
        'api_endpoints': ['cloud.mongodb.com/api', 'realm.mongodb.com'],
        'cdn_usage': 'cloudflare',
        'category': 'Database/Cloud'
    },
    'NET': {
        'name': 'Cloudflare',
        'ticker': 'NET',
        'main_domains': ['cloudflare.com', 'cloudflare.net'],
        'cloud_services': ['workers.cloudflare.com', 'pages.cloudflare.com'],
        'developer_platforms': ['developers.cloudflare.com', 'api.cloudflare.com'],
        'api_endpoints': ['api.cloudflare.com/client/v4'],
        'cdn_usage': 'self',
        'category': 'CDN/Security'
    },
    'GTLB': {
        'name': 'GitLab',
        'ticker': 'GTLB',
        'main_domains': ['gitlab.com', 'gitlab.io'],
        'cloud_services': ['gitlab.com', 'about.gitlab.com'],
        'developer_platforms': ['docs.gitlab.com', 'gitlab.com/api'],
        'api_endpoints': ['gitlab.com/api/v4'],
        'cdn_usage': 'cloudflare',
        'category': 'DevOps'
    },
    'DOCN': {
        'name': 'DigitalOcean',
        'ticker': 'DOCN',
        'main_domains': ['digitalocean.com'],
        'cloud_services': ['cloud.digitalocean.com', 'marketplace.digitalocean.com'],
        'developer_platforms': ['docs.digitalocean.com', 'www.digitalocean.com/community'],
        'api_endpoints': ['api.digitalocean.com/v2'],
        'cdn_usage': 'fastly',
        'category': 'Cloud Platform'
    },
    'TEAM': {
        'name': 'Atlassian',
        'ticker': 'TEAM',
        'main_domains': ['atlassian.com', 'atlassian.net'],
        'cloud_services': ['atlassian.cloud', 'bitbucket.org'],
        'developer_platforms': ['developer.atlassian.com', 'community.atlassian.com'],
        'api_endpoints': ['api.atlassian.com', 'bitbucket.org/api'],
        'cdn_usage': 'akamai',
        'category': 'DevTools'
    },
    'AKAM': {
        'name': 'Akamai',
        'ticker': 'AKAM',
        'main_domains': ['akamai.com', 'akamaized.net'],
        'cloud_services': ['control.akamai.com', 'luna.akamaiapis.net'],
        'developer_platforms': ['developer.akamai.com', 'learn.akamai.com'],
        'api_endpoints': ['akzure.akamaized.net', 'luna.akamaiapis.net'],
        'cdn_usage': 'self',
        'category': 'CDN'
    },
    'FSLY': {
        'name': 'Fastly',
        'ticker': 'FSLY',
        'main_domains': ['fastly.com', 'fastlylb.net'],
        'cloud_services': ['manage.fastly.com', 'compute.fastly.com'],
        'developer_platforms': ['developer.fastly.com', 'docs.fastly.com'],
        'api_endpoints': ['api.fastly.com'],
        'cdn_usage': 'self',
        'category': 'CDN/Edge'
    }
}

print("🏢 Mapped digital footprints for 7 companies across 4 categories")
print("📊 Tracking domains, cloud services, developer platforms, and API endpoints")

🏢 Mapped digital footprints for 7 companies across 4 categories
📊 Tracking domains, cloud services, developer platforms, and API endpoints


In [4]:
# =============================================================================
# 2. DNS & INFRASTRUCTURE INTELLIGENCE
# =============================================================================

def analyze_dns_infrastructure(domain):
    """
    Analyze DNS infrastructure to understand technology adoption
    """
    infrastructure_data = {
        'domain': domain,
        'ip_addresses': [],
        'nameservers': [],
        'mx_records': [],
        'txt_records': [],
        'cdn_detection': 'unknown',
        'cloud_provider': 'unknown',
        'security_headers': {},
        'ssl_info': {}
    }
    
    try:
        # Get A records (IP addresses)
        a_records = dns.resolver.resolve(domain, 'A')
        infrastructure_data['ip_addresses'] = [str(record) for record in a_records]
        
        # Get nameservers
        try:
            ns_records = dns.resolver.resolve(domain, 'NS')
            infrastructure_data['nameservers'] = [str(record).rstrip('.') for record in ns_records]
        except:
            pass
        
        # Get MX records
        try:
            mx_records = dns.resolver.resolve(domain, 'MX')
            infrastructure_data['mx_records'] = [str(record) for record in mx_records]
        except:
            pass
        
        # Get TXT records (often contain technology info)
        try:
            txt_records = dns.resolver.resolve(domain, 'TXT')
            infrastructure_data['txt_records'] = [str(record) for record in txt_records]
        except:
            pass
        
        # Detect CDN and cloud provider based on IP/nameservers
        if infrastructure_data['nameservers']:
            ns_string = ' '.join(infrastructure_data['nameservers']).lower()
            if 'cloudflare' in ns_string:
                infrastructure_data['cdn_detection'] = 'cloudflare'
            elif 'fastly' in ns_string:
                infrastructure_data['cdn_detection'] = 'fastly'
            elif 'akamai' in ns_string:
                infrastructure_data['cdn_detection'] = 'akamai'
            elif 'amazonaws' in ns_string:
                infrastructure_data['cloud_provider'] = 'aws'
            elif 'azure' in ns_string or 'microsoft' in ns_string:
                infrastructure_data['cloud_provider'] = 'azure'
            elif 'googledomains' in ns_string or 'google' in ns_string:
                infrastructure_data['cloud_provider'] = 'gcp'
        
        # Check for common CDN patterns in TXT records
        if infrastructure_data['txt_records']:
            txt_string = ' '.join(infrastructure_data['txt_records']).lower()
            if 'cloudflare' in txt_string:
                infrastructure_data['cdn_detection'] = 'cloudflare'
            elif 'fastly' in txt_string:
                infrastructure_data['cdn_detection'] = 'fastly'
        
    except Exception as e:
        print(f"   ⚠️  DNS analysis failed for {domain}: {e}")
    
    return infrastructure_data

def check_ssl_certificate(domain):
    """
    Check SSL certificate information for security and infrastructure insights
    """
    ssl_info = {
        'has_ssl': False,
        'issuer': 'unknown',
        'subject': 'unknown',
        'valid_from': None,
        'valid_to': None,
        'san_domains': []
    }
    
    try:
        context = ssl.create_default_context()
        with socket.create_connection((domain, 443), timeout=10) as sock:
            with context.wrap_socket(sock, server_hostname=domain) as ssock:
                cert = ssock.getpeercert()
                ssl_info['has_ssl'] = True
                ssl_info['issuer'] = dict(x[0] for x in cert.get('issuer', []))
                ssl_info['subject'] = dict(x[0] for x in cert.get('subject', []))
                ssl_info['valid_from'] = cert.get('notBefore')
                ssl_info['valid_to'] = cert.get('notAfter')
                
                # Get Subject Alternative Names (indicates infrastructure scale)
                san = cert.get('subjectAltName', [])
                ssl_info['san_domains'] = [name[1] for name in san if name[0] == 'DNS']
                
    except Exception as e:
        print(f"   ⚠️  SSL check failed for {domain}: {e}")
    
    return ssl_info

print("\n" + "="*60)
print("1️⃣  DNS & INFRASTRUCTURE INTELLIGENCE COLLECTION")
print("="*60)

# Collect infrastructure data for all companies
infrastructure_intelligence = {}

for ticker, company_data in COMPANIES_DIGITAL_MAPPING.items():
    print(f"\n🔍 {company_data['name']} ({ticker})")
    
    company_infrastructure = {
        'company_info': company_data,
        'domain_analysis': {},
        'ssl_certificates': {},
        'infrastructure_summary': {}
    }
    
    # Analyze main domains
    for domain in company_data['main_domains']:
        print(f"   📡 Analyzing DNS for {domain}...")
        dns_data = analyze_dns_infrastructure(domain)
        ssl_data = check_ssl_certificate(domain)
        
        company_infrastructure['domain_analysis'][domain] = dns_data
        company_infrastructure['ssl_certificates'][domain] = ssl_data
        
        # Small delay to be respectful
        time.sleep(0.5)
    
    infrastructure_intelligence[ticker] = company_infrastructure

print(f"\n✅ Collected infrastructure intelligence for {len(infrastructure_intelligence)} companies")



1️⃣  DNS & INFRASTRUCTURE INTELLIGENCE COLLECTION

🔍 MongoDB (MDB)
   📡 Analyzing DNS for mongodb.com...
   📡 Analyzing DNS for mongodb.org...

🔍 Cloudflare (NET)
   📡 Analyzing DNS for cloudflare.com...
   📡 Analyzing DNS for cloudflare.net...

🔍 GitLab (GTLB)
   📡 Analyzing DNS for gitlab.com...
   📡 Analyzing DNS for gitlab.io...

🔍 DigitalOcean (DOCN)
   📡 Analyzing DNS for digitalocean.com...

🔍 Atlassian (TEAM)
   📡 Analyzing DNS for atlassian.com...
   📡 Analyzing DNS for atlassian.net...

🔍 Akamai (AKAM)
   📡 Analyzing DNS for akamai.com...
   📡 Analyzing DNS for akamaized.net...
   ⚠️  DNS analysis failed for akamaized.net: The DNS response does not contain an answer to the question: akamaized.net. IN A
   ⚠️  SSL check failed for akamaized.net: [Errno 8] nodename nor servname provided, or not known

🔍 Fastly (FSLY)
   📡 Analyzing DNS for fastly.com...
   📡 Analyzing DNS for fastlylb.net...
   ⚠️  DNS analysis failed for fastlylb.net: The DNS response does not contain an answ

In [None]:
# =============================================================================
# 3. TECHNOLOGY ADOPTION PATTERN ANALYSIS
# =============================================================================

print("\n" + "="*60)
print("2️⃣  TECHNOLOGY ADOPTION PATTERN ANALYSIS")
print("="*60)

def analyze_technology_patterns():
    """
    Analyze technology adoption patterns across companies
    """
    adoption_data = []
    
    for ticker, infra_data in infrastructure_intelligence.items():
        company_info = infra_data['company_info']
        
        # Analyze CDN adoption
        detected_cdns = []
        for domain, dns_data in infra_data['domain_analysis'].items():
            if dns_data['cdn_detection'] != 'unknown':
                detected_cdns.append(dns_data['cdn_detection'])
        
        # Analyze cloud provider signals
        detected_clouds = []
        for domain, dns_data in infra_data['domain_analysis'].items():
            if dns_data['cloud_provider'] != 'unknown':
                detected_clouds.append(dns_data['cloud_provider'])
        
        # Count total subdomains/endpoints (proxy for service scale)
        total_endpoints = len(company_info['main_domains']) + len(company_info['cloud_services']) + \
                         len(company_info['developer_platforms']) + len(company_info['api_endpoints'])
        
        # SSL certificate analysis
        ssl_complexity = 0
        for domain, ssl_data in infra_data['ssl_certificates'].items():
            if ssl_data['has_ssl']:
                ssl_complexity += len(ssl_data.get('san_domains', []))
        
        adoption_data.append({
            'ticker': ticker,
            'company': company_info['name'],
            'category': company_info['category'],
            'cdn_adoption': list(set(detected_cdns)),
            'cloud_signals': list(set(detected_clouds)),
            'endpoint_complexity': total_endpoints,
            'ssl_complexity': ssl_complexity,
            'developer_focus': len(company_info['developer_platforms']),
            'api_endpoints': len(company_info['api_endpoints']),
            'infrastructure_sophistication': len(set(detected_cdns + detected_clouds))
        })
    
    return pd.DataFrame(adoption_data)

# Analyze technology adoption patterns
df_tech_adoption = analyze_technology_patterns()

print("🔧 Technology Adoption Analysis:")
display_cols = ['company', 'ticker', 'cdn_adoption', 'endpoint_complexity', 'developer_focus', 'category']
for _, row in df_tech_adoption.iterrows():
    print(f"   • {row['company']} ({row['ticker']}): {row['cdn_adoption']} CDN, {row['endpoint_complexity']} endpoints")
