In [1]:
from IPython.display import Image
Image(url='http://python.org/images/python-logo.gif')


# IPython notebook


In [2]:
Image(url='http://ipython.org/_static/IPy_header.png')

In [4]:
Image(url='http://jupyter.org/assets/main-logo.svg', width=300, height=300)


# Jupyter
IPython will continue to exist as a Python kernel for Jupyter, but the notebook and other language-agnostic parts of IPython will move to new projects under the Jupyter name. IPython 3.0 will be the last monolithic release of IPython.

- Let's continue to call this IPython for now

# IPython
- interactive shell
- browser-based notebook (this)
- 'Kernel'
- great support for visualization library (eg. matplotlib)
- built on pyzmq, tornado

## IPython notebook
### Notebook == browser-based REPL
IPython Notebook is a web-based interactive computational environment for creating IPython notebooks. An IPython notebook is a JSON document containing an ordered list of input/output cells which can contain code, text, mathematics, plots and rich media.

## matplotlib
matplotlib tries to make easy things easy and hard things possible. You can generate plots, histograms, power spectra, bar charts, errorcharts, scatterplots, etc, with just a few lines of code, with familiar MATLAB APIs.

```py
plt.barh(y_pos, performance, xerr=error, align='center', alpha=0.4)
plt.yticks(y_pos, people)
plt.xlabel('Performance')
plt.title('How fast do you want to go today?')
plt.show()
```

## PySpark
Spark on Python, this serves as the Kernel, integrating with IPython
- Each notebook spins up a new instance of the Kernal (ie. PySpark running as the Spark Driver)

## Environment

- CentOS 6.5
- CDH 5.3.0 cluster
- Spark
- PySpark [(YARN client mode)](http://spark.apache.org/docs/latest/running-on-yarn.html)
- matplotlib and other packages installed


# Maxwell's Equations
\begin{align}
\nabla \times \vec{\mathbf{B}} -\, \frac1c\, \frac{\partial\vec{\mathbf{E}}}{\partial t} & = \frac{4\pi}{c}\vec{\mathbf{j}} \\   \nabla \cdot \vec{\mathbf{E}} & = 4 \pi \rho \\
\nabla \times \vec{\mathbf{E}}\, +\, \frac1c\, \frac{\partial\vec{\mathbf{B}}}{\partial t} & = \vec{\mathbf{0}} \\
\nabla \cdot \vec{\mathbf{B}} & = 0 
\end{align}


```Python
# Markdown code block
if not full:
    print 'eat more!'
```

In [5]:
import matplotlib
matplotlib.__version__

'1.4.3'

# Spark

In [9]:
import sys
print (sys.version)
print (sc.version)


3.5.0 |Anaconda 2.4.0 (64-bit)| (default, Oct 19 2015, 21:57:25) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
1.5.1


In [10]:
lines = sc.parallelize(['Its fun to have fun,','but you have to know how.']) 
wordcounts = lines.map( lambda x: x.replace(',',' ').replace('.',' ').replace('-',' ').lower()) \
        .flatMap(lambda x: x.split()) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(lambda x,y:x+y) \
        .map(lambda x:(x[1],x[0])) \
        .sortByKey(False) 
wordcounts.take(10)


[(2, 'fun'),
 (2, 'to'),
 (2, 'have'),
 (1, 'but'),
 (1, 'know'),
 (1, 'its'),
 (1, 'you'),
 (1, 'how')]

In [14]:
pagecounts = sc.textFile('pagecounts') # Out on HDFS under my home direcrtory.
pagecounts.take(10)

['20090505-000000 aa Main_Page 2 9980',
 '20090505-000000 ab %D0%90%D0%B8%D0%BD%D1%82%D0%B5%D1%80%D0%BD%D0%B5%D1%82 1 465',
 '20090505-000000 ab %D0%98%D1%85%D0%B0%D0%B4%D0%BE%D1%83_%D0%B0%D0%B4%D0%B0%D2%9F%D1%8C%D0%B0 1 16086',
 '20090505-000000 af.b Tuisblad 1 36236',
 '20090505-000000 af.d Tuisblad 4 189738',
 '20090505-000000 af.q Tuisblad 2 56143',
 '20090505-000000 af Afrika 1 46833',
 '20090505-000000 af Afrikaans 2 53577',
 '20090505-000000 af Australi%C3%AB 1 132432',
 '20090505-000000 af Barack_Obama 1 23368']

In [15]:
enPages = pagecounts.filter(lambda x: x.split(" ")[1] == "en")
enPages.map(lambda x: x.split(" ")).map(lambda x: (x[2], int(x[3]))).reduceByKey(lambda x, y: x + y, 40).filter(lambda x: x[1] > 200000).map(lambda x: (x[1], x[0])).collect()
# This runs in the cluster

[(468159, 'Special:Search'), (1066734, '404_error/'), (451126, 'Main_Page')]

# To be or not to be

In [None]:
!hadoop fs -put /home/users/jsparks/notebooks/spark-notebook-examples/data/hamlet.txt hamlet.txt

In [16]:
words = sc.textFile('/user/jsparks/hamlet.txt')
words.take(5)

['', '1604', '', '', 'THE TRAGEDY OF HAMLET, PRINCE OF DENMARK']

In [17]:
import re
hamlet = words.flatMap(lambda line: re.split('\W+', line.lower().strip()))
hamlet.take(5)

['', '1604', '', '', 'the']

In [19]:
tmp = hamlet.filter(lambda x: len(x) > 2 )
print(tmp.take(5))

['1604', 'the', 'tragedy', 'hamlet', 'prince']


In [20]:
tmp = tmp.map(lambda word: (word, 1))
tmp.take(5)

[('1604', 1), ('the', 1), ('tragedy', 1), ('hamlet', 1), ('prince', 1)]

In [21]:
tmp = tmp.reduceByKey(lambda a, b: a + b)
tmp.take(5)
 

[('seein', 1),
 ('contents', 1),
 ('comart', 1),
 ('pleas', 2),
 ('incontinency', 1)]

In [22]:
tmp = tmp.map(lambda x: (x[1], x[0])).sortByKey(False)
tmp.take(20)

[(1091, 'the'),
 (969, 'and'),
 (558, 'you'),
 (405, 'that'),
 (358, 'ham'),
 (315, 'not'),
 (304, 'his'),
 (300, 'this'),
 (278, 'with'),
 (274, 'but'),
 (252, 'for'),
 (242, 'your'),
 (226, 'lord'),
 (219, 'what'),
 (203, 'king'),
 (197, 'him'),
 (183, 'have'),
 (173, 'will'),
 (132, 'are'),
 (125, 'all')]

In [23]:
tmp = tmp.map(lambda x: (x[1], x[0]))
tmp.take(20)

[('the', 1091),
 ('and', 969),
 ('you', 558),
 ('that', 405),
 ('ham', 358),
 ('not', 315),
 ('his', 304),
 ('this', 300),
 ('with', 278),
 ('but', 274),
 ('for', 252),
 ('your', 242),
 ('lord', 226),
 ('what', 219),
 ('king', 203),
 ('him', 197),
 ('have', 183),
 ('will', 173),
 ('are', 132),
 ('all', 125)]

In [24]:
%matplotlib inline
import matplotlib.pyplot as plt

def plot(words):
    values = map(lambda x: x[1], words)
    labels = map(lambda x: x[0], words)
    plt.barh(range(len(values)), values, color='grey')
    plt.yticks(range(len(values)), labels)
    plt.show()

In [25]:
plot(tmp.take(15))

TypeError: object of type 'map' has no len()

# Word vector
Word2Vec computes distributed vector representation of words. Distributed vector representation is showed to be useful in many natural language processing applications such as named entity recognition, disambiguation, parsing, tagging and machine translation.
https://code.google.com/p/word2vec/

Spark implements the Skip-gram approach. With Skip-gram we want to predict a window of words given a single word.

It was recently shown that the word vectors capture many linguistic regularities, for example vector operations vector('Paris') - vector('France') + vector('Italy') results in a vector that is very close to vector('Rome'), and vector('king') - vector('man') + vector('woman') is close to vector('queen') [3, 1].


## Data set
Wikipedia dump http://mattmahoney.net/dc/textdata  
`grep -o -E '\w+(\W+\w+){0,15}' text8 > text8_lines`  
then randomly sampled to ~200k lines



In [27]:
!hadoop fs -put /home/users/jsparks/notebooks/spark-notebook-examples/data/text8_linessmall text8_linessmall

In [28]:
from pyspark.mllib.feature import Word2Vec

textpath = '/user/jsparks/text8_linessmall'
inp = sc.textFile(textpath).map(lambda row: row.split(" "))

word2vec = Word2Vec()
model = word2vec.fit(inp)

# This takes a while....

In [31]:
synonyms = model.findSynonyms('car', 40)

for word, cosine_distance in synonyms:
  print("{}: {}".format(word, cosine_distance))


driver: 1.0747402043666905
super: 0.9298379095446133
chelsea: 0.8870288699650138
marlin: 0.887006929106233
race: 0.8864043432698806
passenger: 0.8651924290900989
manager: 0.8614759450410525
club: 0.8579558110111344
pilot: 0.8558232934371292
geoff: 0.851116591824512
sabre: 0.8483126311249742
boy: 0.8413431340173244
oakland: 0.841218990265561
raf: 0.8348503785176464
boxer: 0.8337644424069316
cable: 0.8308231128583504
soldier: 0.8303334608291026
racing: 0.8249907782156056
player: 0.8147892911257824
team: 0.813609003993094
jockey: 0.812658276522972
yu: 0.8122575982263345
truck: 0.8090596523261673
defensive: 0.8083024428059771
pittsburgh: 0.8055121372409465
jumper: 0.8049751310669593
professional: 0.8025658645888358
logo: 0.8022691907323706
arlene: 0.8017016910473784
samurai: 0.7977291519644112
runner: 0.7954678341761656
xerox: 0.7922306555930824
mitsuda: 0.789607969294256
stout: 0.7860371062566447
stadium: 0.7853272721993177
yacht: 0.7849792519447497
winning: 0.7767089278588613
winer: 0.77

In [32]:
values = map(lambda x: x[1], synonyms)
labels = map(lambda x: x[0], synonyms)
plt.barh(range(len(values)), values, color='blue')
plt.yticks(range(len(values)), labels)
plt.show()


TypeError: object of type 'map' has no len()

In [34]:
from wordcloud import WordCloud, STOPWORDS

words = " ".join([x[0] for x in synonyms for times in range(0, int(x[1]*10))])
 
wordcloud = WordCloud(font_path='/home/fcheung/CabinSketch-Bold.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=1800,
                      height=1400
                     ).generate(words)
 
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

ValueError: zero-size array to reduction operation maximum which has no identity

#### wordcloud package uses PIL/Image
