# W261 Final Project ETL for Development Sample

### Notebook Set-Up

In [5]:
# imports
import time
import numpy as np
import pandas as pd
from pyspark.sql import Row
from pyspark.ml.feature import CountVectorizer
#mllib.linalg library 
from pyspark.sql import DataFrame

In [6]:
%reload_ext autoreload
%autoreload 2

In [7]:
# store path to notebook
#PWD = !pwd
#PWD = PWD[0]

In [8]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "w261FinalProject"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

__`REMINDER:`__ If you are running this notebook on the course docker container, you can monitor the progress of your jobs using the Spark UI at: http://localhost:4040/jobs/

## Load Data

In [9]:
original_trainRDD = sc.textFile('data/train.txt')
original_testRDD = sc.textFile('data/test.txt')

## Transformation

### Sample

In [6]:
#change the seed for a different sample
sampleRDD1, sampleRDD2 = original_trainRDD.randomSplit([0.9999,0.0001], seed = 1)
sampleRDD2.cache()

PythonRDD[4] at RDD at PythonRDD.scala:49

In [7]:
sample_testRDD1, sample_testRDD2 = original_trainRDD.randomSplit([0.9999,0.000025], seed = 1)
sample_testRDD2
sample_testRDD2.count()

1098

In [8]:
ex = sampleRDD2.take(2)
ex

['0\t\t4\t50\t18\t3339\t20\t26\t17\t133\t\t2\t\t18\t09ca0b81\t09e68b86\t86c4b829\te3d0459f\t25c83c98\t\t7227c706\t0b153874\ta73ee510\t305a0646\t9625b211\t997a695a\tdccbd94b\t07d13a8f\t36721ddc\tc0b906bb\te5ba7672\t5aed7436\t21ddcdc9\ta458ea53\t0cbbcc92\t\t32c7478e\t0174dd24\t3d2bedd7\td8ecbc17',
 '0\t\t12\t20\t18\t30445\t82\t0\t18\t53\t\t0\t\t18\tbe589b51\t8e465f4d\t35d889dd\t5e5e218f\t25c83c98\t6f6d9be8\t5732a3f8\t0b153874\ta73ee510\ta1680317\td70e2491\t575bb5c9\t2b9f0754\t07d13a8f\te815112f\t85a05c1a\td4bb7bd8\tf2becb37\t\t\tfe89e74a\t\t32c7478e\tbaf42944\t\t']

In [8]:
ncol = len(sampleRDD2.take(1)[0].split('\t'))
nrow = sampleRDD2.count()
print("This sample contains", str(nrow), "rows.")

This sample contains 4478 rows.


In [9]:
print("This sample contains", str(ncol), "columns.")

This sample contains 40 columns.


In [10]:
def avgFeatures(line):
    
    count = 0
    feats = line.split('\t')[1:]
    
    for feat in feats:
        if feat != '':
            count += 1

    return count

print("There is an average of", str(round(sampleRDD2.map(avgFeatures).mean(),2)), "populated features per observation.")
sampleRDD2.map(avgFeatures).mean()

There is an average of 33.53 populated features per observation.


33.5288075033497

# Put in wide, sparse feature format

In [8]:
def parseCV(line):
    """
    Map record_csv_string --> (features, label)
    """

    # start of categorical features
    col_start = 14
    
    raw_values = line.split('\t')
    label = int(raw_values[0])  ## y variable 
    
    # ignore numerics to start
    #numerical_values = list(pd.Series(raw_values[1:14]).apply(pd.to_numeric))
    numericals = []
    for idx, value in enumerate(raw_values[1:col_start]):
        if value != '':
            numericals.append('n' + str(idx) + '_' + str(value))
            
    
    categories = []
    for idx, value in enumerate(raw_values[col_start:]):
        if value != '':
            categories.append('c'+ str(idx) + '_' + str(value))

    return Row(label=label, raw=numericals + categories)


def vectorizeCV(DF):
    
    vectorizer = CountVectorizer()
    cv = CountVectorizer(inputCol="raw", outputCol="features")
    
    model = cv.fit(DF)
    result = model.transform(DF)
    
    return result
parsedDF = sampleRDD2.map(parseCV).toDF().cache()
vectorizedDF = vectorizeCV(parsedDF)

In [29]:
vectorizedDF[2].show()

TypeError: 'Column' object is not callable

In [56]:
## Numericals ategories output
parsedDF = sampleRDD2.map(parseCV).toDF().cache()
parsedDF

DataFrame[_1: string, _2: string, _3: string, _4: string, _5: string, _6: string, _7: string, _8: string, _9: string, _10: string]

In [58]:
## Categories output
parsedDF = sampleRDD2.map(parseCV).toDF().cache()
parsedDF

DataFrame[_1: string, _2: string, _3: string, _4: string, _5: string, _6: string, _7: string, _8: string, _9: string, _10: string, _11: string, _12: string, _13: string, _14: string, _15: string, _16: string, _17: string, _18: string, _19: string, _20: string, _21: string, _22: string, _23: string, _24: string]

In [61]:
## parse output
parsedDF = sampleRDD2.map(parseCV).toDF().cache()
parsedDF

DataFrame[label: bigint, raw: array<string>]

In [62]:
parsedDF.head()

Row(label=0, raw=['n1_4', 'n2_50', 'n3_18', 'n4_3339', 'n5_20', 'n6_26', 'n7_17', 'n8_133', 'n10_2', 'n12_18', 'c0_09ca0b81', 'c1_09e68b86', 'c2_86c4b829', 'c3_e3d0459f', 'c4_25c83c98', 'c6_7227c706', 'c7_0b153874', 'c8_a73ee510', 'c9_305a0646', 'c10_9625b211', 'c11_997a695a', 'c12_dccbd94b', 'c13_07d13a8f', 'c14_36721ddc', 'c15_c0b906bb', 'c16_e5ba7672', 'c17_5aed7436', 'c18_21ddcdc9', 'c19_a458ea53', 'c20_0cbbcc92', 'c22_32c7478e', 'c23_0174dd24', 'c24_3d2bedd7', 'c25_d8ecbc17'])

In [9]:
#not sure why its 30,946 in the first column...comes out of countvectorizor on spark
#https://spark.apache.org/docs/latest/ml-features.html#countvectorizer
vectorizedDF = vectorizeCV(parsedDF)
vectorizedDF.show()

+-----+--------------------+--------------------+
|label|                 raw|            features|
+-----+--------------------+--------------------+
|    0|[n1_4, n2_50, n3_...|(30946,[0,1,2,4,5...|
|    0|[n1_12, n2_20, n3...|(30946,[0,1,2,5,1...|
|    1|[n1_1, n2_1, n4_9...|(30946,[0,1,6,7,1...|
|    0|[n0_8, n1_17, n3_...|(30946,[0,1,4,12,...|
|    1|[n0_6, n1_1, n2_7...|(30946,[0,1,2,4,1...|
|    1|[n1_99, n2_1, n3_...|(30946,[1,2,4,10,...|
|    0|[n0_3, n1_21, n2_...|(30946,[0,1,4,8,1...|
|    0|[n1_2, n2_20, n3_...|(30946,[0,1,3,5,8...|
|    0|[n0_0, n1_144, n4...|(30946,[0,2,3,4,5...|
|    0|[n1_0, n2_5, n4_3...|(30946,[0,2,3,6,1...|
|    0|[n0_0, n1_1, n2_4...|(30946,[0,1,2,3,5...|
|    0|[n0_9, n1_5, n2_1...|(30946,[0,2,3,6,9...|
|    0|[n1_323, n2_2, n3...|(30946,[1,2,14,16...|
|    0|[n0_0, n1_424, n3...|(30946,[0,1,2,4,6...|
|    0|[n0_0, n1_13, n2_...|(30946,[0,1,2,5,6...|
|    0|[n1_180, n2_6, n3...|(30946,[1,2,8,14,...|
|    0|[n1_126, n2_2, n3...|(30946,[0,2,4,6,8...|


In [24]:
vectorizedDF.show(truncate=False)

+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|raw                                                                                                                                                                                                                                         

In [16]:
lst = [1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]

In [17]:
sum(lst)

34.0

In [19]:
24+14

38

In [21]:
lst2 = [0,1,2,4,5,7,10,20,32,122,155,173,214,363,369,495,504,630,634,869,1887,2124,2261,2298,2776,6219,10115,13231,14000,18536,20030,23934,25540,26249]
len(lst2)

34

In [23]:
lst3 = ['n1_4', 'n2_50', 'n3_18', 'n4_3339', 'n5_20', 'n6_26', 'n7_17', 'n8_133', 'n10_2', 'n12_18', 'c0_09ca0b81', 'c1_09e68b86', 'c2_86c4b829', 'c3_e3d0459f', 'c4_25c83c98', 'c6_7227c706', 'c7_0b153874', 'c8_a73ee510', 'c9_305a0646', 'c10_9625b211', 'c11_997a695a', 'c12_dccbd94b', 'c13_07d13a8f', 'c14_36721ddc', 'c15_c0b906bb','c16_e5ba7672', 'c17_5aed7436', 'c18_21ddcdc9', 'c19_a458ea53', 'c20_0cbbcc92', 'c22_32c7478e', 'c23_0174dd24', 'c24_3d2bedd7', 'c25_d8ecbc17']                              
len(lst3)

34

In [None]:
n11, n13, c5, c21

In [27]:
len([0,1,2,5,10,16,17,42,120,133,219,283,363,364,369,583,1694,2174,2746,4907,5102,6720,8641,8698,13777,17186,18774,22920,23234,23616,26609])

31

In [141]:
sc.addPyFile("fm_parallel_sgd.py")
import fm_parallel_sgd as fm

#import fm_parallel_sgd as fm

In [157]:
vectorizedDF.to_csv(r'c:\data\test_sample.txt', header=None, index=None, sep=' ', mode='a')

AttributeError: 'DataFrame' object has no attribute 'to_csv'

In [156]:
pd.to_csv(r'c:\data\test_sample.txt', header=None, index=None, sep=' ', mode='a')

AttributeError: 'DataFrame' object has no attribute 'to_csv'

In [159]:
np.savetxt(r'test_sample.txt', vectorizedDF.values(), fmt='%d')

AttributeError: 'DataFrame' object has no attribute 'values'

In [158]:
np.savetxt(r'test_sample.txt', vectorizedDF.values, fmt='%d')

AttributeError: 'DataFrame' object has no attribute 'values'

In [160]:
vectorizedDF.to_csv(sep=' ', index=False, header=False)

AttributeError: 'DataFrame' object has no attribute 'to_csv'

In [None]:
 sc.textFile('data/train.txt')

In [172]:
vectorizedDF.rdd.take(1)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 94.0 failed 1 times, most recent failure: Lost task 0.0 in stage 94.0 (TID 11057, localhost, executor driver): org.apache.spark.SparkException: File /tmp/spark-a194cc3a-675c-4381-a2a2-ecd85134dd1a/userFiles-902295f0-b4c9-438d-b805-521175bab9d2/fm_parallel_sgd.py exists and does not match contents of file:/media/notebooks/Assignments/FinalProject/w261_final_project/fm_parallel_sgd.py
	at org.apache.spark.util.Utils$.copyFile(Utils.scala:585)
	at org.apache.spark.util.Utils$.doFetchFile(Utils.scala:691)
	at org.apache.spark.util.Utils$.fetchFile(Utils.scala:488)
	at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDependencies$3.apply(Executor.scala:743)
	at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDependencies$3.apply(Executor.scala:740)
	at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
	at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99)
	at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99)
	at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230)
	at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
	at scala.collection.mutable.HashMap.foreach(HashMap.scala:99)
	at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
	at org.apache.spark.executor.Executor.org$apache$spark$executor$Executor$$updateDependencies(Executor.scala:740)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:312)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:149)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor135.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: File /tmp/spark-a194cc3a-675c-4381-a2a2-ecd85134dd1a/userFiles-902295f0-b4c9-438d-b805-521175bab9d2/fm_parallel_sgd.py exists and does not match contents of file:/media/notebooks/Assignments/FinalProject/w261_final_project/fm_parallel_sgd.py
	at org.apache.spark.util.Utils$.copyFile(Utils.scala:585)
	at org.apache.spark.util.Utils$.doFetchFile(Utils.scala:691)
	at org.apache.spark.util.Utils$.fetchFile(Utils.scala:488)
	at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDependencies$3.apply(Executor.scala:743)
	at org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDependencies$3.apply(Executor.scala:740)
	at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
	at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99)
	at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99)
	at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230)
	at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
	at scala.collection.mutable.HashMap.foreach(HashMap.scala:99)
	at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
	at org.apache.spark.executor.Executor.org$apache$spark$executor$Executor$$updateDependencies(Executor.scala:740)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:312)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [10]:
parsedDF = sample_testRDD2.map(parseCV).toDF().cache()
vectorizedTest = vectorizeCV(parsedDF)
vectorizedTest = vectorizeCV(parsedDF)
vectorizedTest.show()

+-----+--------------------+--------------------+
|label|                 raw|            features|
+-----+--------------------+--------------------+
|    0|[n1_4, n2_50, n3_...|(10968,[0,1,2,4,5...|
|    0|[n1_12, n2_20, n3...|(10968,[0,1,2,5,8...|
|    0|[n1_0, n2_5, n4_3...|(10968,[0,2,3,6,1...|
|    0|[n1_323, n2_2, n3...|(10968,[1,2,13,15...|
|    0|[n0_0, n1_13, n2_...|(10968,[0,1,2,5,6...|
|    1|[n1_4, n2_1, n3_2...|(10968,[0,6,7,16,...|
|    1|[n0_2, n1_0, n2_1...|(10968,[0,1,2,4,1...|
|    0|[n0_0, n1_23, n2_...|(10968,[0,1,3,7,1...|
|    0|[n0_0, n1_15, n2_...|(10968,[0,2,3,4,1...|
|    0|[n1_54, n2_1, n3_...|(10968,[0,1,2,3,4...|
|    0|[n0_0, n1_-1, n2_...|(10968,[0,1,4,5,7...|
|    0|[n1_3, n2_2, n4_4...|(10968,[0,1,2,3,5...|
|    0|[n0_2, n1_180, n2...|(10968,[0,1,2,3,4...|
|    0|[n0_0, n1_1, n2_2...|(10968,[0,2,3,4,5...|
|    0|[n1_3, n2_8, n3_8...|(10968,[5,8,13,15...|
|    1|[n0_8, n1_-1, n3_...|(10968,[0,1,2,3,4...|
|    0|[n1_4, n4_8012, n...|(10968,[0,1,13,15...|


In [11]:
sc.addPyFile("fm_parallel_sgd.py")
import fm_parallel_sgd as fm

In [4]:
print (vectorizedDF.rdd.count())
print (vectorizedTest.rdd.count())
print (vectorizedDF.rdd.first())

NameError: name 'vectorizedDF' is not defined

In [None]:
temp = time.time()
model = fm.trainFM_parallel_sgd (sc, vectorizedDF.rdd, iterations=1, iter_sgd= 1, alpha=0.01, regParam=0.01, factorLength=4,\
                      verbose=True, savingFilename = None, evalTraining=None)
print ('time :', time.time()-temp)

iter 	time 	train_logl 	val_logl
0 	0 	0.696258 	0.696293


In [None]:
print (evaluate(vectorizedTest, model))