In [1]:
from pyspark import *
from pyspark.sql import *

In [2]:
spark = SparkSession.builder.appName('h516eveRetriever').getOrCreate()
sqlContext = SQLContext(spark)

In [3]:
from pyspark.sql.types import *

In [4]:
headers = StructType([\
            StructField('victimCharacterId',IntegerType(), True),\
            StructField('victimShipId',IntegerType(), True),\
            StructField('victimShipName',StringType(), True),\
            StructField('victimShipCategoryId',IntegerType(), True),\
            StructField('victimShipCategoryName',StringType(), True),\
            StructField('attackerCharacterId',IntegerType(), True),\
            StructField('attackerShipId',IntegerType(), True),\
            StructField('attackerShipName',StringType(), True),\
            StructField('attackerShipCategoryId',IntegerType(), True),\
            StructField('attackerShipCategoryName',StringType(), True),\
            StructField('killId',IntegerType(), True),\
            StructField('date',DateType(), True)])
df = spark.read.csv('/home/rlemke/eveRetrieveResults.csv',header=False,schema=headers)
df.createOrReplaceTempView('rawEveData')

In [5]:
selectStatement =  "SELECT 0 AS victory, victimCharacterId AS pilotOne, victimShipId AS pilotOneShip, attackerCharacterId AS pilotTwo, attackerShipId AS pilotTwoShip FROM rawEveData WHERE attackerCharacterId > victimCharacterId"
selectStatement += " UNION "
selectStatement += "SELECT 1 AS victory, attackerCharacterId AS pilotOne, attackerShipId AS pilotOneShip, victimCharacterId AS pilotTwo, victimShipId AS pilotTwoShip FROM rawEveData WHERE victimCharacterId > attackerCharacterId"
preparedDf = spark.sql(selectStatement)
preparedDf.createOrReplaceTempView('preparedEveData')

In [6]:
from pyspark.ml.feature import RFormula

In [8]:
formula = RFormula(formula="victory ~ pilotOneShip:pilotTwoShip")

In [9]:
fittedDf = formula.fit(preparedDf)
transformedDf = fittedDf.transform(preparedDf)

In [10]:
from pyspark.ml.classification import LogisticRegression

In [11]:
regression = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

train, test = transformedDf.randomSplit([0.7, 0.3])
lrModel = regression.fit(train)

In [12]:
predictions = lrModel.transform(test)

In [13]:
predictions.createOrReplaceTempView('evePredictions')

In [14]:
wrongGuesses = spark.sql("SELECT -1 FROM evePredictions WHERE CAST(prediction AS INTEGER) != victory").count()

In [15]:
totalRows = spark.sql("SELECT -1 FROM evePredictions").count()

In [16]:
print("percent correct: " + str((totalRows - wrongGuesses)/totalRows))

percent correct: 0.6581725918018163


In [17]:
print(totalRows)
print(wrongGuesses)

273524
93498
