In [1]:
# Importing necessary libraries to extract the data properly from Postgres using PySpark
import json
import os
import time

import numpy as np
import pandas as pd
import psycopg2 as pg2
import pyspark
import pyspark.sql.functions as psf
import pyspark.sql.types as pst
import requests
from pandas.io.json import json_normalize
from pyspark.sql import SparkSession
from scipy.stats import pointbiserialr

from API_Key import username, password

In [2]:
# Location of the PostgreSQL JDBC driver jar. Set the POSTGRES_JDBC_JAR
# environment variable to point at the jar on machines where it does not live
# at the default path below.
POSTGRES_JDBC_JAR = os.environ.get(
    "POSTGRES_JDBC_JAR",
    "/Users/hiowatah/downloads/postgresql-42.2.18.jar",
)

# Local Spark session with the JDBC driver on the classpath so that the
# Postgres tables can be read through spark.read.format("jdbc").
spark = (
    SparkSession.builder
    .appName("TFT Analysis")
    .config("spark.jars", POSTGRES_JDBC_JAR)
    .master("local[10]")
    .getOrCreate()
)

In [21]:
# Read the `matches` table from the local `TFT` Postgres database over JDBC.
# `username` and `password` are supplied by the local API_Key module.
df = (
    spark.read.format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/TFT",
        dbtable="matches",
        user=username,
        password=password,
        driver="org.postgresql.Driver",
    )
    .load()
)

In [22]:
# Inspect the schema as loaded from Postgres; `traits` and `units` arrive as
# plain strings at this point.
df.printSchema()

root
 |-- match: string (nullable = true)
 |-- game_datetime: long (nullable = true)
 |-- game_length: double (nullable = true)
 |-- game_version: string (nullable = true)
 |-- gold_left: integer (nullable = true)
 |-- last_round: integer (nullable = true)
 |-- level: integer (nullable = true)
 |-- placement: integer (nullable = true)
 |-- puuid: string (nullable = true)
 |-- time_eliminated: double (nullable = true)
 |-- total_damage: integer (nullable = true)
 |-- traits: string (nullable = true)
 |-- units: string (nullable = true)



## Define a helper that parses the JSON string columns into arrays to make data processing easier

In [6]:
def jsonColParser(df, *cols, sanitize=True):
    """Replace JSON-encoded string columns of ``df`` with parsed Spark columns.

    For every column named in ``cols`` the JSON schema is inferred from the
    column's own data with ``spark.read.json`` and the column is then
    overwritten with the result of ``from_json`` using that schema.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame whose ``cols`` hold JSON text.
    *cols : str
        Names of the columns to parse; they are processed in the order given.
    sanitize : bool, default True
        When true, each value is wrapped as ``{"data": <value>}`` before
        parsing, so every document has an object at the top level, and the
        ``data`` field is extracted again afterwards. The column therefore
        ends up holding the parsed value itself, not the wrapper struct.

    Returns
    -------
    pyspark.sql.DataFrame
        A new frame with the same column names, where each of ``cols`` has
        been replaced by its parsed form. ``df`` itself is not modified.

    Notes
    -----
    Uses the notebook-level ``spark`` session for schema inference, which
    reads the column's data once per parsed column.
    """
    parsed = df
    for col_name in cols:
        if sanitize:
            parsed = parsed.withColumn(
                col_name,
                psf.concat(psf.lit('{"data": '), psf.col(col_name), psf.lit('}')),
            )

        # Infer the schema from this column only. Projecting before dropping to
        # the RDD avoids converting every other column of each row to Python.
        json_strings = parsed.select(col_name).rdd.map(lambda row: row[0])
        schema = spark.read.json(json_strings).schema
        parsed = parsed.withColumn(col_name, psf.from_json(psf.col(col_name), schema))

        if sanitize:
            # Unwrap the {"data": ...} envelope added above.
            parsed = parsed.withColumn(col_name, psf.col(col_name).getField("data"))

    return parsed

In [34]:
# Parse the `units` and `traits` JSON strings into structured columns.
# This rebinds `df`, so run it once per fresh load of the table.
df = jsonColParser(df, "units", "traits")

In [9]:
# Total number of rows before any game_version filtering.
df.count()

140816

In [10]:
# Pearson correlation (the method DataFrame.corr uses) between the `placement`
# and `total_damage` columns across all rows.
df.corr("placement", "total_damage")

-0.7804422281556648

In [11]:
# Keep only rows whose game_version string contains "Version 11.2".
# NOTE(review): as a substring match this would also keep 11.20-11.29 builds if
# any are present in the table - confirm the stored version format and tighten
# the pattern if needed.
df = df.where("game_version LIKE '%Version 11.2%'")

In [12]:
# Rows remaining after the game_version filter.
df.count()

52416

In [13]:
# Lift each field of the parsed `units` and `traits` struct arrays into its own
# top-level array column, then drop the nested originals.
nested_fields = [
    # (new column name, source column, struct field)
    ("unit_names", "units", "character_id"),
    ("units_tier", "units", "tier"),
    ("chosen", "units", "chosen"),
    ("items", "units", "items"),
    ("trait_name", "traits", "name"),
    ("num_units_trait", "traits", "num_units"),
    ("trait_tier", "traits", "tier_current"),
    ("style", "traits", "style"),
    ("trait_tier_total", "traits", "tier_total"),
]
df = df.select(
    "*",
    *[
        psf.col(source_col)[field].alias(new_col)
        for new_col, source_col, field in nested_fields
    ],
).drop("units", "traits")

In [14]:
# Sanity check: the previous step only adds and drops columns, so the row count
# should be unchanged.
df.count()

52416

In [17]:
# Optional one-off export of the current frame to Excel, kept disabled.
# Note that toPandas() collects every row onto the driver before writing.
# df.toPandas().to_excel("matches.xlsx")

In [20]:
# Preview rows whose first listed unit is Maokai.
# The value is derived from `unit_names` directly, so this cell works when the
# notebook is run top to bottom; the Champion_N columns are only created in a
# later cell. An equality test is used because `_` is a single-character
# wildcard in SQL LIKE patterns, so LIKE 'TFT4_Maokai' is not an exact match.
(
    df.select(psf.col("unit_names").getItem(0).alias("Champion_1"))
    .where(psf.col("Champion_1") == "TFT4_Maokai")
    .show()
)

+-----------+
| Champion_1|
+-----------+
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
|TFT4_Maokai|
+-----------+
only showing top 20 rows



In [16]:
# Spread the per-unit arrays into fixed, 1-based "slot" columns:
#   Champion_<k>, Champion_<k>_tier, Champion_<k>_chosen, Champion_<k>_items
# Slots past the end of a row's array come back as null.
N_UNIT_SLOTS = 10

# (source array column, suffix appended to "Champion_<k>")
unit_arrays = [
    ("unit_names", ""),
    ("units_tier", "_tier"),
    ("chosen", "_chosen"),
    ("items", "_items"),
]

# Build all slot columns up front and add them in a single select, which avoids
# stacking one projection per withColumn call. Column order matches the
# original: all names, then all tiers, then chosen flags, then items.
slot_columns = [
    psf.col(source_col).getItem(slot).alias(f"Champion_{slot + 1}{suffix}")
    for source_col, suffix in unit_arrays
    for slot in range(N_UNIT_SLOTS)
]

df = df.select("*", *slot_columns).drop(
    "unit_names", "units_tier", "chosen", "items"
)
    


In [34]:
# Show one row whose `style` array contains the value 2, exposing the
# membership test as a boolean `gold` column. The boolean column is used as the
# filter directly, with no string comparison against 'true'.
(
    df.withColumn("gold", psf.array_contains(psf.col("style"), 2))
    .where(psf.col("gold"))
    .show(1)
)

+--------------+-------------+----------------+--------------------+---------+----------+-----+---------+--------------------+------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+
|         match|game_datetime|     game_length|        game_version|gold_left|last_round|level|placement|               puuid|   time_eliminated|total_damage|          unit_names|          units_tier|              chosen|               items|          trait_name|     num_units_trait|          trait_tier|               style|    trait_tier_total|gold|
+--------------+-------------+----------------+--------------------+---------+----------+-----+---------+--------------------+------------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------

In [27]:
# Inspect the schema after flattening: the per-unit and per-trait fields are
# now top-level array columns.
df.printSchema()

root
 |-- match: string (nullable = true)
 |-- game_datetime: long (nullable = true)
 |-- game_length: double (nullable = true)
 |-- game_version: string (nullable = true)
 |-- gold_left: integer (nullable = true)
 |-- last_round: integer (nullable = true)
 |-- level: integer (nullable = true)
 |-- placement: integer (nullable = true)
 |-- puuid: string (nullable = true)
 |-- time_eliminated: double (nullable = true)
 |-- total_damage: integer (nullable = true)
 |-- unit_names: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- units_tier: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- chosen: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- items: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: long (containsNull = true)
 |-- trait_name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- num_units_trait: array (nullable = true)
 |  