In [1]:
# Load Spark

In [17]:
import os
from pyspark import *

In [5]:
# Connect to the default Spark cluster; set the master explicitly if you need control over which cluster is used

In [4]:
# Reuse the active SparkContext if there is one, otherwise create it.
# Calling SparkContext() directly fails when this cell is re-run, because
# only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate()

In [60]:
from pyspark.sql import *
from pyspark.sql.types import *

In [8]:
# Obtain the SparkSession used for DataFrame work; getOrCreate() returns the
# existing session when there is one instead of building a second one.
session = SparkSession.builder.getOrCreate()

Create a Python2Vec model using the PySpark source code as the input. We read it with sc.wholeTextFiles so that we get access to the contents on a per-file basis.

In [22]:
# _find_spark_home is an underscore-prefixed (private) helper, so it may change
# between PySpark releases.
from pyspark.find_spark_home import _find_spark_home
# presumably the root directory of the Spark installation — verify on your setup
spark_home = _find_spark_home()

In [32]:
# wholeTextFiles yields one (path, contents) pair per file found at spark_home.
spark_files_rdd = sc.wholeTextFiles(spark_home)
# Keep only the Python sources, judged by the ".py" suffix of the path.
pyspark_files_rdd = spark_files_rdd.filter(
    lambda path_and_contents: path_and_contents[0].endswith(".py")
)

Now we want to tokenize the pyspark_files_rdd - we could simply split on spaces, but Python is our friend: its standard library ships a tokenizer for Python source!

In [51]:
def tokenize_pyspark_file(file_contents):
    """Tokenize Python source text into (token_type, token_string) pairs.

    For now this is targeted at word2vec, but more could be built on top of
    it, such as looking at the comments.

    Args:
        file_contents: Source text of a single Python file, as one string.

    Returns:
        A list of (token_type, token_string) tuples in source order, where
        token_type is the integer code assigned by the ``tokenize`` module.

    Raises:
        Any error ``tokenize.generate_tokens`` raises for text it cannot
        tokenize; nothing is caught here.
    """
    # Local imports keep the function self-contained.
    import tokenize
    try:
        # Python 2: the in-memory text buffer lives in the StringIO module.
        from StringIO import StringIO
    except ImportError:
        # Python 3: the StringIO module no longer exists; use io.StringIO.
        from io import StringIO
    raw_tokens = tokenize.generate_tokens(StringIO(file_contents).readline)
    # Keep only the token type and text, dropping the positional information.
    # Build a real list (not a lazy iterator) since the result gets pickled.
    return [(token[0], token[1]) for token in raw_tokens]

In [85]:
# Tokenize the contents (second element) of every (path, contents) pair and
# drop any file whose token list came back empty.
tokenized_files = (
    pyspark_files_rdd
    .map(lambda path_and_source: tokenize_pyspark_file(path_and_source[1]))
    .filter(lambda file_tokens: bool(file_tokens))
)

In [86]:
# Peek at the first tokenized file to sanity-check the (token_type, token_string) pairs.
tokenized_files.take(1)

[[(53, u'#'),
  (54, u'\n'),
  (53,
   u'# Licensed to the Apache Software Foundation (ASF) under one or more'),
  (54, u'\n'),
  (53,
   u'# contributor license agreements.  See the NOTICE file distributed with'),
  (54, u'\n'),
  (53,
   u'# this work for additional information regarding copyright ownership.'),
  (54, u'\n'),
  (53,
   u'# The ASF licenses this file to You under the Apache License, Version 2.0'),
  (54, u'\n'),
  (53,
   u'# (the "License"); you may not use this file except in compliance with'),
  (54, u'\n'),
  (53, u'# the License.  You may obtain a copy of the License at'),
  (54, u'\n'),
  (53, u'#'),
  (54, u'\n'),
  (53, u'#    http://www.apache.org/licenses/LICENSE-2.0'),
  (54, u'\n'),
  (53, u'#'),
  (54, u'\n'),
  (53,
   u'# Unless required by applicable law or agreed to in writing, software'),
  (54, u'\n'),
  (53, u'# distributed under the License is distributed on an "AS IS" BASIS,'),
  (54, u'\n'),
  (53,
   u'# WITHOUT WARRANTIES OR CONDITIONS OF ANY 

In [54]:
# Shove it back into strings and convert it to a DataFrame

In [84]:
# Render each file's token list as a single string and wrap it in a one-column
# DataFrame; the "tokens" column is declared as a non-nullable string.
tokenized_df = (
    tokenized_files
    .map(lambda file_tokens: Row(tokens=str(file_tokens)))
    .toDF(schema=StructType([StructField("tokens", StringType(), False)]))
)

In [87]:
# Show the first row to confirm the token list was stored as one string.
tokenized_df.take(1)

[Row(tokens=u'[(53, u\'#\'), (54, u\'\\n\'), (53, u\'# Licensed to the Apache Software Foundation (ASF) under one or more\'), (54, u\'\\n\'), (53, u\'# contributor license agreements.  See the NOTICE file distributed with\'), (54, u\'\\n\'), (53, u\'# this work for additional information regarding copyright ownership.\'), (54, u\'\\n\'), (53, u\'# The ASF licenses this file to You under the Apache License, Version 2.0\'), (54, u\'\\n\'), (53, u\'# (the "License"); you may not use this file except in compliance with\'), (54, u\'\\n\'), (53, u\'# the License.  You may obtain a copy of the License at\'), (54, u\'\\n\'), (53, u\'#\'), (54, u\'\\n\'), (53, u\'#    http://www.apache.org/licenses/LICENSE-2.0\'), (54, u\'\\n\'), (53, u\'#\'), (54, u\'\\n\'), (53, u\'# Unless required by applicable law or agreed to in writing, software\'), (54, u\'\\n\'), (53, u\'# distributed under the License is distributed on an "AS IS" BASIS,\'), (54, u\'\\n\'), (53, u\'# WITHOUT WARRANTIES OR CONDITIONS OF

Now we start creating a word2vec (or py2vec :p) model :) - shout out to Lab41, who did something similar while trying to recommend source code.

In [88]:
# Word2Vec is defined in pyspark.ml.feature; the top-level pyspark.ml package
# does not export it.
from pyspark.ml.feature import Word2Vec
# NOTE(review): Word2Vec expects an array-of-strings input column, while
# "tokens" is built as a StringType column — confirm before calling fit().
word2vec_estimator = Word2Vec().setInputCol("tokens")

ImportError: No module named param