In [1]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session for this notebook, registered under
# the application name 'aws'.
spark = (
    SparkSession.builder
    .appName('aws')
    .getOrCreate()
)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3,application_1647157037580_0005,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
import pandas as pd
import pyspark.sql.functions as f
from pyspark.ml.fpm import FPGrowth

# Remove pandas' column/row display caps so frames are printed in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Loading the data

In [20]:
# Read the event table from S3 and expose it to Spark SQL as the view 'data'.
path = 's3://bdcc-final-project-bucket/war_ukr_rus_w_internal.parquet'
data = spark.read.format('parquet').load(path)
data.createOrReplaceTempView('data')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Creating the database

In [21]:
# Build the transaction table: one row per (Year, SQLDATE, Actor1Name), with
# `itemset` holding the distinct strings of the form
# "<Actor1Name>_1-<Actor2Name>_2-<Actor2CountryCode>-<EVENTDESCRIPTION>"
# found in that group. The "_1"/"_2" suffixes tag which actor slot a name
# came from.
# NOTE(review): Spark SQL CONCAT returns NULL when any argument is NULL and
# COLLECT_SET skips NULLs, so events with a missing Actor2Name,
# Actor2CountryCode or EVENTDESCRIPTION appear to be dropped here - confirm
# this is intended.
trans_db = spark.sql(
    """
    SELECT
        Year,
        SQLDATE,
        Actor1Name,
        COLLECT_SET(CONCAT(
            Actor1Name, "_1", "-",
            Actor2Name, "_2", "-",
            Actor2CountryCode, "-",
            EVENTDESCRIPTION
        )) AS itemset
    FROM data
    GROUP BY Year, SQLDATE, Actor1Name
    """
)

# Preview ten rows of the transaction table.
trans_db.limit(10).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+--------+--------------+--------------------+
|Year| SQLDATE|    Actor1Name|             itemset|
+----+--------+--------------+--------------------+
|2013|20130114|         CHINA|[CHINA_1-RUSSIA_2...|
|2013|20130129|        RUSSIA|[RUSSIA_1-US GOVE...|
|2013|20130227|       RUSSIAN|[RUSSIAN_1-UNITED...|
|2013|20130303|       UKRAINE|[UKRAINE_1-UNITED...|
|2013|20130309| UNITED STATES|[UNITED STATES_1-...|
|2013|20130311| UNITED STATES|[UNITED STATES_1-...|
|2013|20130315|     UKRAINIAN|[UKRAINIAN_1-RUSS...|
|2013|20130322|        RUSSIA|[RUSSIA_1-BRITISH...|
|2013|20130402|         CHINA|[CHINA_1-RUSSIA_2...|
|2013|20130402|VLADIMIR PUTIN|[VLADIMIR PUTIN_1...|
+----+--------+--------------+--------------------+

In [22]:
def fim_years(itemsCol, years, minSupport=0.001, minConfidence=0.3, rules=100):
    """Mine association rules for the given years and print the top ones.

    Fits an FP-Growth model on the notebook-level ``trans_db`` dataframe,
    restricted to rows whose ``Year`` is in ``years``, then prints the
    resulting association rules ordered by descending support and, within
    equal support, descending confidence.

    Parameters
    -----
    itemsCol: str
        Name of the ``trans_db`` column holding the itemsets. The column is
        expected to be an array of "-"-delimited strings: element 0 is
        taken and split on "-" to form the transaction.

    years: list
        Years to include in the mining.

    minSupport: float
        Required minimum support for the FIM.

    minConfidence: float
        Required minimum confidence for the FIM.

    rules: integer
        Maximum number of rules to print.

    Output
    -----
    None
        The rules are printed via ``DataFrame.show``; nothing is returned.
    """
    # NOTE(review): only element 0 of each row's array is used. The array is
    # built with COLLECT_SET, whose element order is not guaranteed, so the
    # chosen event may differ between runs - confirm this is intended.
    # NOTE(review): the split on "-" also breaks apart any actor name or event
    # description that itself contains a hyphen - verify against the data.
    first_event_items = f.split(trans_db[itemsCol][0], "-")

    transactions = (
        trans_db
        .filter(trans_db.Year.isin(years))
        .select(
            trans_db.SQLDATE,
            # FPGrowth rejects transactions with repeated items, so
            # de-duplicate the tokens. The alias must match the `itemsCol`
            # handed to FPGrowth below, otherwise the fit cannot find it.
            f.array_distinct(first_event_items).alias(itemsCol),
        )
    )

    fpg = FPGrowth(itemsCol=itemsCol,
                   minSupport=minSupport,
                   minConfidence=minConfidence)
    fpg_trained = fpg.fit(transactions)

    # Sort on the unrounded metrics first; rounding is for display only.
    top_rules = (
        fpg_trained.associationRules
        .orderBy(['support', 'confidence'], ascending=[False, False])
        .select('antecedent', 'consequent',
                f.round('support', 3).alias('sup'),
                f.round('confidence', 3).alias('conf'))
    )
    top_rules.show(rules, truncate=False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# 2013 Events

In [23]:
fim_years(itemsCol='itemset', years=[2013])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------------------------------------------------------+-----------------+-----+-----+
|antecedent                                                      |consequent       |sup  |conf |
+----------------------------------------------------------------+-----------------+-----+-----+
|[RUSSIA_2]                                                      |[RUS]            |0.468|1.0  |
|[RUS]                                                           |[RUSSIA_2]       |0.468|0.618|
|[RUSSIAN_2]                                                     |[RUS]            |0.168|1.0  |
|[Consult, not specified below]                                  |[RUS]            |0.083|0.789|
|[Engage in negotiation]                                         |[RUS]            |0.073|0.842|
|[MOSCOW_2]                                                      |[RUS]            |0.064|1.0  |
|[UKRAINE_2]                                                     |[UKR]            |0.054|1.0  |
|[UKR]                        

# 2014 Events

In [24]:
fim_years(itemsCol='itemset', years=[2014])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------------------------------------------------+-----------------+-----+-----+
|antecedent                                                  |consequent       |sup  |conf |
+------------------------------------------------------------+-----------------+-----+-----+
|[RUSSIA_2]                                                  |[RUS]            |0.369|1.0  |
|[RUS]                                                       |[RUSSIA_2]       |0.369|0.624|
|[UKRAINE_2]                                                 |[UKR]            |0.192|1.0  |
|[UKR]                                                       |[UKRAINE_2]      |0.192|0.603|
|[RUSSIAN_2]                                                 |[RUS]            |0.143|1.0  |
|[UKRAINIAN_2]                                               |[UKR]            |0.057|1.0  |
|[MOSCOW_2]                                                  |[RUS]            |0.046|1.0  |
|[Engage in negotiation]                                     |[RUS]   

# 2015-2021 Events

In [25]:
# 2015 through 2021 inclusive (range's upper bound is exclusive).
fim_years(itemsCol='itemset', years=list(range(2015, 2022)))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------------------------------------------------------+-----------------+-----+-----+
|antecedent                                                      |consequent       |sup  |conf |
+----------------------------------------------------------------+-----------------+-----+-----+
|[RUSSIA_2]                                                      |[RUS]            |0.399|1.0  |
|[RUS]                                                           |[RUSSIA_2]       |0.399|0.599|
|[RUSSIAN_2]                                                     |[RUS]            |0.167|1.0  |
|[UKRAINE_2]                                                     |[UKR]            |0.145|1.0  |
|[UKR]                                                           |[UKRAINE_2]      |0.145|0.64 |
|[MOSCOW_2]                                                      |[RUS]            |0.05 |1.0  |
|[Consult, not specified below]                                  |[RUS]            |0.05 |0.62 |
|[Make statement, not specifie

# 2022 Events

In [26]:
fim_years(itemsCol='itemset', years=[2022])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------------------------------------------+-----------------+-----+-----+
|antecedent                                            |consequent       |sup  |conf |
+------------------------------------------------------+-----------------+-----+-----+
|[RUSSIA_2]                                            |[RUS]            |0.288|1.0  |
|[RUS]                                                 |[RUSSIA_2]       |0.288|0.545|
|[UKRAINE_2]                                           |[UKR]            |0.286|1.0  |
|[UKR]                                                 |[UKRAINE_2]      |0.286|0.7  |
|[RUSSIAN_2]                                           |[RUS]            |0.181|1.0  |
|[RUS]                                                 |[RUSSIAN_2]      |0.181|0.343|
|[UKRAINIAN_2]                                         |[UKR]            |0.078|1.0  |
|[Praise or endorse]                                   |[UKR]            |0.037|0.64 |
|[Appeal, not specified below]             