In [3]:
# Author: Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause

import datetime

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
try:
    from matplotlib.finance import quotes_historical_yahoo
except ImportError:
    from matplotlib.finance import quotes_historical_yahoo_ochl as quotes_historical_yahoo
from matplotlib.collections import LineCollection

from sklearn import cluster, covariance, manifold

In [4]:
###############################################################################
# Date range for the historical quotes

# The window is meant to be a reasonably calm period: recent enough that the
# high-tech firms are present, and ending before the 2008 crash.
d1 = datetime.datetime(year=2003, month=1, day=1)  # start of the window
d2 = datetime.datetime(year=2008, month=1, day=1)  # end of the window

In [6]:
# Show the selected date range. The parenthesised single-argument form prints
# the same text under Python 2 and is also valid syntax under Python 3.
print(d1)
print(d2)

2003-01-01 00:00:00
2008-01-01 00:00:00


In [5]:
# Mapping from ticker symbol to a human-readable company name. The keys are
# the symbols later passed to the Yahoo quote download; the values are only
# display names.
# The Kraft Foods symbol changed from KFT to MDLZ on Yahoo, hence 'MDLZ' below.
# NOTE(review): a few display names look misspelled (e.g. 'Lookheed Martin',
# 'General Electrics'); they are runtime strings and are left untouched here --
# confirm before correcting them.
symbol_dict = {
    'TOT': 'Total',
    'XOM': 'Exxon',
    'CVX': 'Chevron',
    'COP': 'ConocoPhillips',
    'VLO': 'Valero Energy',
    'MSFT': 'Microsoft',
    'IBM': 'IBM',
    'TWX': 'Time Warner',
    'CMCSA': 'Comcast',
    'CVC': 'Cablevision',
    'YHOO': 'Yahoo',
    'DELL': 'Dell',
    'HPQ': 'HP',
    'AMZN': 'Amazon',
    'TM': 'Toyota',
    'CAJ': 'Canon',
    'MTU': 'Mitsubishi',
    'SNE': 'Sony',
    'F': 'Ford',
    'HMC': 'Honda',
    'NAV': 'Navistar',
    'NOC': 'Northrop Grumman',
    'BA': 'Boeing',
    'KO': 'Coca Cola',
    'MMM': '3M',
    'MCD': 'Mc Donalds',
    'PEP': 'Pepsi',
    'MDLZ': 'Kraft Foods',
    'K': 'Kellogg',
    'UN': 'Unilever',
    'MAR': 'Marriott',
    'PG': 'Procter Gamble',
    'CL': 'Colgate-Palmolive',
    'GE': 'General Electrics',
    'WFC': 'Wells Fargo',
    'JPM': 'JPMorgan Chase',
    'AIG': 'AIG',
    'AXP': 'American express',
    'BAC': 'Bank of America',
    'GS': 'Goldman Sachs',
    'AAPL': 'Apple',
    'SAP': 'SAP',
    'CSCO': 'Cisco',
    'TXN': 'Texas instruments',
    'XRX': 'Xerox',
    'LMT': 'Lookheed Martin',
    'WMT': 'Wal-Mart',
    'WBA': 'Walgreen',
    'HD': 'Home Depot',
    'GSK': 'GlaxoSmithKline',
    'PFE': 'Pfizer',
    'SNY': 'Sanofi-Aventis',
    'NVS': 'Novartis',
    'KMB': 'Kimberly-Clark',
    'R': 'Ryder',
    'GD': 'General Dynamics',
    'RTN': 'Raytheon',
    'CVS': 'CVS',
    'CAT': 'Caterpillar',
    'DD': 'DuPont de Nemours'}

In [7]:
# Dump the symbol -> name mapping. The call form of print gives the same
# output on Python 2 and is also valid syntax on Python 3.
print(symbol_dict)

{'COP': 'ConocoPhillips', 'AXP': 'American express', 'RTN': 'Raytheon', 'BA': 'Boeing', 'AAPL': 'Apple', 'PEP': 'Pepsi', 'NAV': 'Navistar', 'GSK': 'GlaxoSmithKline', 'MSFT': 'Microsoft', 'KMB': 'Kimberly-Clark', 'R': 'Ryder', 'SAP': 'SAP', 'GS': 'Goldman Sachs', 'CL': 'Colgate-Palmolive', 'WMT': 'Wal-Mart', 'GE': 'General Electrics', 'SNE': 'Sony', 'PFE': 'Pfizer', 'AMZN': 'Amazon', 'MAR': 'Marriott', 'NVS': 'Novartis', 'KO': 'Coca Cola', 'MMM': '3M', 'CMCSA': 'Comcast', 'SNY': 'Sanofi-Aventis', 'IBM': 'IBM', 'CVX': 'Chevron', 'WFC': 'Wells Fargo', 'DD': 'DuPont de Nemours', 'CVS': 'CVS', 'TOT': 'Total', 'CAT': 'Caterpillar', 'CAJ': 'Canon', 'BAC': 'Bank of America', 'WBA': 'Walgreen', 'AIG': 'AIG', 'TWX': 'Time Warner', 'HD': 'Home Depot', 'TXN': 'Texas instruments', 'VLO': 'Valero Energy', 'F': 'Ford', 'CVC': 'Cablevision', 'TM': 'Toyota', 'PG': 'Procter Gamble', 'LMT': 'Lookheed Martin', 'K': 'Kellogg', 'HMC': 'Honda', 'GD': 'General Dynamics', 'HPQ': 'HP', 'DELL': 'Dell', 'MTU': 'M

In [8]:
# Turn the (symbol, name) pairs of symbol_dict into a 2-column array, then
# transpose it so it unpacks into two parallel arrays: symbols[i] is the
# ticker that goes with names[i].
symbols, names = np.transpose(np.array(list(symbol_dict.items())))

In [9]:
# Show the ticker symbols array. The call form of print gives the same output
# on Python 2 and is also valid syntax on Python 3.
print(symbols)

['COP' 'AXP' 'RTN' 'BA' 'AAPL' 'PEP' 'NAV' 'GSK' 'MSFT' 'KMB' 'R' 'SAP'
 'GS' 'CL' 'WMT' 'GE' 'SNE' 'PFE' 'AMZN' 'MAR' 'NVS' 'KO' 'MMM' 'CMCSA'
 'SNY' 'IBM' 'CVX' 'WFC' 'DD' 'CVS' 'TOT' 'CAT' 'CAJ' 'BAC' 'WBA' 'AIG'
 'TWX' 'HD' 'TXN' 'VLO' 'F' 'CVC' 'TM' 'PG' 'LMT' 'K' 'HMC' 'GD' 'HPQ'
 'DELL' 'MTU' 'XRX' 'YHOO' 'XOM' 'JPM' 'MCD' 'CSCO' 'NOC' 'MDLZ' 'UN']


In [10]:
# Show the company names array (same order as `symbols`). The call form of
# print gives the same output on Python 2 and is also valid on Python 3.
print(names)

['ConocoPhillips' 'American express' 'Raytheon' 'Boeing' 'Apple' 'Pepsi'
 'Navistar' 'GlaxoSmithKline' 'Microsoft' 'Kimberly-Clark' 'Ryder' 'SAP'
 'Goldman Sachs' 'Colgate-Palmolive' 'Wal-Mart' 'General Electrics' 'Sony'
 'Pfizer' 'Amazon' 'Marriott' 'Novartis' 'Coca Cola' '3M' 'Comcast'
 'Sanofi-Aventis' 'IBM' 'Chevron' 'Wells Fargo' 'DuPont de Nemours' 'CVS'
 'Total' 'Caterpillar' 'Canon' 'Bank of America' 'Walgreen' 'AIG'
 'Time Warner' 'Home Depot' 'Texas instruments' 'Valero Energy' 'Ford'
 'Cablevision' 'Toyota' 'Procter Gamble' 'Lookheed Martin' 'Kellogg'
 'Honda' 'General Dynamics' 'HP' 'Dell' 'Mitsubishi' 'Xerox' 'Yahoo'
 'Exxon' 'JPMorgan Chase' 'Mc Donalds' 'Cisco' 'Northrop Grumman'
 'Kraft Foods' 'Unilever']


In [11]:
# Fetch the historical quotes between d1 and d2 for every ticker, one call per
# symbol, keeping the results in the same order as `symbols`.
# asobject=True is forwarded unchanged; the saved output of this notebook
# shows each result as a record array with named fields.
quotes = []
for symbol in symbols:
    quotes.append(quotes_historical_yahoo(symbol, d1, d2, asobject=True))

In [12]:
# Dump the downloaded quotes (one entry per symbol). The call form of print
# gives the same output on Python 2 and is also valid syntax on Python 3.
print(quotes)

[rec.array([ (datetime.date(2003, 1, 2), 2003, 1, 2, 731217.0, 11.910844140430907, 12.040208, 12.057292512531216, 11.827861594800904, 5370700.0, 12.040208),
 (datetime.date(2003, 1, 3), 2003, 1, 3, 731218.0, 12.089022985618877, 12.057293, 12.135397223260023, 12.020684932731147, 3661400.0, 12.057293),
 (datetime.date(2003, 1, 6), 2003, 1, 6, 731221.0, 12.081696448954661, 12.284284, 12.325771366300632, 12.020684065566018, 6230400.0, 12.284284),
 ...,
 (datetime.date(2007, 12, 27), 2007, 12, 27, 733037.0, 48.9562767809853, 48.807624, 49.165491806075735, 48.692010125437456, 10633100.0, 48.807624),
 (datetime.date(2007, 12, 28), 2007, 12, 28, 733038.0, 49.0003196413489, 49.071891, 49.490332059740425, 48.80762379499629, 9596200.0, 49.071891),
 (datetime.date(2007, 12, 31), 2007, 12, 31, 733041.0, 49.055382788219795, 48.614929, 49.07189150549771, 48.28458948468394, 8596000.0, 48.614929)], 
          dtype=[('date', 'O'), ('year', '<i2'), ('month', 'i1'), ('day', 'i1'), ('d', '<f8'), ('open', 