In [None]:
import numpy as np
from time import time
import code.profiles.generation as gen
from code.profiles.definitions import Profile
from code.profiles.vcrDetection import detectVCRProperty, detectCRProperty, detectVRProperty
from code.utils import getNumpyColumns

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, FloatType

# Reuse the already-running Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# SparkContext behind the session, needed for the RDD API used below
# (alternative noted by the author: sc = SparkContext.getOrCreate()).
sc = spark.sparkContext
# Last expression of the cell: shows the session summary as the cell output.
spark

In [21]:
def run(C: int=3, V:int =3, saveMode: str="append"):
    """Generate every profile for C candidates and V voters, keep the interesting ones and persist them.

    Pipeline:
      1. Generate the profiles with ``gen.parallelProfileGeneration`` and distribute them as an RDD.
      2. Keep the profiles for which ``detectVCRProperty`` returns a result whose first element is 2
         (presumably the "property holds" status - confirm against detectVCRProperty).
      3. From those, drop every profile for which ``detectCRProperty`` or ``detectVRProperty`` is truthy
         and wrap the remaining ones with ``Profile.fromILPRes``.
      4. Write the numpy encoding of the remaining profiles as CSV to ``resources/{C}C{V}``.
      5. Write the two counts as CSV to ``resources/{C}C{V}-stats.csv``.

    Parameters
    ----------
    C : int
        Number of candidates; must be between 0 and 7 (only ids "A".."G" are available).
    V : int
        Number of voters; must be between 0 and 7 (only ids "0".."6" are available).
    saveMode : str
        Spark save mode used for both CSV outputs. The default "append" keeps the historical
        behaviour, which means re-running with the same C and V adds the same rows a second time;
        pass "overwrite" to replace earlier results instead.

    Returns
    -------
    tuple
        ``(statistics, vcrNCOPNumpyRows)`` where ``statistics`` is a dict with the keys "VCR" and
        "NCOPVCR" and ``vcrNCOPNumpyRows`` is the (cached) RDD of Rows that was written to disk.

    Raises
    ------
    ValueError
        If C or V is outside the supported range.
    """
    allCandidatesIds = ["A", "B", "C", "D", "E", "F", "G"]
    allVotersIds = ["0", "1", "2", "3", "4", "5", "6"]

    # Slicing would silently hand out fewer (or the wrong) ids for out-of-range sizes, so fail early.
    if not 0 <= C <= len(allCandidatesIds):
        raise ValueError("C must be between 0 and {}, got {}".format(len(allCandidatesIds), C))
    if not 0 <= V <= len(allVotersIds):
        raise ValueError("V must be between 0 and {}, got {}".format(len(allVotersIds), V))

    statistics = {}
    candidatesIds = allCandidatesIds[:C]
    votersIds = allVotersIds[:V]

    profilesRDD = sc.parallelize(gen.parallelProfileGeneration(cpu=16, candidatesNumber=C, voterNumber=V))

    # Elements are (profile, detectionResult) pairs; only pairs whose result starts with 2 are kept.
    vcrProfilesRDD = (
        profilesRDD
        .map(lambda p: (p, detectVCRProperty(A=p, C=candidatesIds, V=votersIds)))
        .filter(lambda pRes: pRes[1][0] == 2)
    )

    vcrNCOPProfilesRDD = (
        vcrProfilesRDD
        .filter(lambda pRes: not detectCRProperty(A=pRes[0], C=candidatesIds, V=votersIds))
        .filter(lambda pRes: not detectVRProperty(A=pRes[0], C=candidatesIds, V=votersIds))
        .map(lambda pRes: Profile.fromILPRes(pRes[0], pRes[1][1], candidatesIds, votersIds))
    )

    # Materialise the column names once so the Row factory and the schema are guaranteed to agree.
    columns = list(getNumpyColumns(C, V))
    NPRow = Row(*columns)
    schema = StructType([StructField(n, FloatType(), False) for n in columns])

    # Cached because it feeds two actions (the CSV write and the NCOPVCR count) and is returned to the
    # caller; without the cache each action would re-run the whole detection pipeline.
    vcrNCOPNumpyRows = (
        vcrNCOPProfilesRDD
        .map(lambda profile: profile.asNumpy().tolist())
        .map(lambda a: NPRow(*tuple(a)))
        .cache()
    )

    (
        spark.createDataFrame(vcrNCOPNumpyRows, schema)
        .write.format("com.databricks.spark.csv")
        .option("header", "true")
        .save("resources/{}C{}V".format(C, V), mode=saveMode)
    )

    print("After save")

    statistics["VCR"] = vcrProfilesRDD.count()
    print(statistics["VCR"])

    print("After VCR Count")

    # The row RDD is a one-to-one mapping of vcrNCOPProfilesRDD, so its count is identical and is
    # served from the cache filled by the save above.
    statistics["NCOPVCR"] = vcrNCOPNumpyRows.count()
    print(statistics["NCOPVCR"])

    # repartition(1) so the statistics end up in a single part file.
    (
        spark.createDataFrame(list(statistics.items()), ["key", "value"])
        .repartition(1)
        .write.format("com.databricks.spark.csv")
        .option("header", "true")
        .save("resources/{}C{}V-stats.csv".format(C, V), mode=saveMode)
    )

    print("After Stats save")

    return statistics, vcrNCOPNumpyRows

In [25]:
# Measure the wall-clock duration of a full run for 4 candidates and 4 voters.
startTime = time()
# stats: dict of counts; vcrNCOPProfiles: the second value returned by run (RDD of numpy-encoded rows).
stats, vcrNCOPProfiles = run(C=4, V=4)
endTime = time()
print(endTime - startTime)  # elapsed time in seconds
print(stats)


After save
57832
After VCR Count
96
After Stats save
114.01347422599792
{'VCR': 57832, 'NCOPVCR': 96}


In [16]:
def loadProfiles(C:int=3, V:int=3):
    """Load the profiles stored under ``resources/{C}C{V}`` and rebuild them as Profile objects.

    The CSV files are read with a header row and inferred column types; every row is converted to a
    float64 numpy array and passed to ``Profile.fromNumpy``.

    Parameters
    ----------
    C : int
        Number of candidates of the stored configuration.
    V : int
        Number of voters of the stored configuration.

    Returns
    -------
    RDD
        One ``Profile.fromNumpy`` result per stored row.
    """
    vcrNCOPProfilesDF = (
        spark.read.format("csv")
        .option("inferSchema", "true")
        .option("header", "true")
        .load("resources/{}C{}V".format(C, V))
    )

    # np.float64 replaces the np.float alias, which was removed in NumPy 1.24; np.float was just the
    # builtin float, so the resulting dtype is unchanged.
    return (
        vcrNCOPProfilesDF.rdd
        .map(lambda r: np.array(r, dtype=np.float64))
        .map(lambda npProf: Profile.fromNumpy(npProf))
    )

def loadStatistics(C:int=3, V:int=3):
    """Read the statistics CSV stored for the given number of candidates and voters.

    The file is read with a header row and inferred column types.

    Parameters
    ----------
    C : int
        Number of candidates of the stored configuration.
    V : int
        Number of voters of the stored configuration.

    Returns
    -------
    DataFrame
        The contents of ``resources/{C}C{V}-stats.csv`` as a Spark DataFrame.
    """
    statsPath = "resources/{}C{}V-stats.csv".format(C,V)

    # Configure the reader step by step, then load.
    csvReader = spark.read.format("csv")
    csvReader = csvReader.option("inferSchema", "true")
    csvReader = csvReader.option("header", "true")
    return csvReader.load(statsPath)



[Row(key='VCR', value=16),
 Row(key='NCOPVCR', value=0),
 Row(key='VCR', value=16),
 Row(key='NCOPVCR', value=0)]