# Library Initialization

In [160]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import seaborn as sns
import datetime as dt
from scipy import stats
import os
import glob
sns.set()
pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline

## Cleaning functions

In [161]:
def item_header_clean(df):
    """Realign item column labels that sit one position to the left of their data.

    reset_index() turns the index into a regular column (labelled "index" when the
    index is unnamed); the first four labels are then renamed in one step:
    index -> League, League -> Date, Date -> Id, Id -> Type.
    Returns a new DataFrame.
    """
    shifted_labels = ["index", "League", "Date", "Id"]
    correct_labels = ["League", "Date", "Id", "Type"]
    return df.reset_index().rename(columns = dict(zip(shifted_labels, correct_labels)))

def item_info_fix(df):
    """Drop the "Links" and "Variant" columns and fill gaps in BaseType.

    Rows whose BaseType is null receive the value of their own Type column.
    Returns a new DataFrame.
    """
    trimmed = df.drop(columns = ["Links", "Variant"])
    trimmed["BaseType"] = trimmed["BaseType"].fillna(trimmed["Type"])
    return trimmed

def item_top_15_filter(df):
    """Keep only the rows belonging to the 15 item names with the highest median Value.

    The median of "Value" is computed per "Name", sorted in descending order, and
    the first 15 names are used to filter the rows of ``df``.
    """
    ranked_medians = df.groupby("Name")["Value"].median().sort_values(ascending = False)
    keep_names = ranked_medians.iloc[:15].index
    return df.loc[df["Name"].isin(keep_names)]

def add_relative_date(df):
    """Replace the "Date" column with "RelativeDate", the time elapsed since the earliest date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Date" column holding dates or date strings that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without "Date" and with a timedelta "RelativeDate" column
        appended. The caller's frame is not modified.
    """
    # pd.to_datetime replaces astype('datetime64[D]'): pandas stores datetimes only in
    # s/ms/us/ns resolution, so the day-resolution cast is rejected by pandas 2.x.
    # normalize() keeps the original intent of truncating every value to whole days.
    dates = pd.to_datetime(df["Date"]).dt.normalize()
    # drop() returns a new frame, so the column is added without touching the input.
    df_edited = df.drop("Date", axis = 1)
    df_edited["RelativeDate"] = dates - dates.min()
    return df_edited

def league_lifespan(row):
    """Label a row by its "RelativeDate": "Early" up to 14 days, "Mid" up to 60 days, else "End".

    Both thresholds are inclusive. ``row`` only needs to support ``row["RelativeDate"]``.
    """
    phase_limits = (
        ("Early", dt.timedelta(days = 14)),
        ("Mid", dt.timedelta(days = 60)),
    )
    for phase, limit in phase_limits:
        if row["RelativeDate"] <= limit:
            return phase
    return "End"

def item_file_clean(df):
    """Run the item cleaning steps in order and tag each row with its league phase.

    Steps: item_header_clean -> item_info_fix -> item_top_15_filter ->
    add_relative_date, then a "League Lifespan" column is computed row by row
    with league_lifespan.
    """
    cleaned = (
        df.pipe(item_header_clean)
          .pipe(item_info_fix)
          .pipe(item_top_15_filter)
          .pipe(add_relative_date)
    )
    cleaned["League Lifespan"] = cleaned.apply(league_lifespan, axis = 1)
    return cleaned

def currency_info_fix(df):
    """Remove rows whose "Get" or "Pay" currency is on the exclusion list.

    A row is kept only when neither column holds one of the listed currencies.
    """
    excluded = [
        "Portal Scroll", "Scroll of Wisdom", "Armourer's Scrap", "Perandus Coin",
        "Orb of Transmutation", "Blacksmith's Whetstone", "Orb of Augmentation",
        "Orb of Alteration", "Splinter of Tul", "Chromatic Orb", "Splinter of Esh",
        "Splinter of Xoph", "Orb of Chance", "Glassblower's Bauble",
        "Splinter of Uul-Netol", "Silver Coin",
    ]
    keep_rows = ~df["Get"].isin(excluded) & ~df["Pay"].isin(excluded)
    return df.loc[keep_rows]

def currency_file_clean(df):
    """Filter out excluded currencies, add "RelativeDate", and tag each row with its league phase.

    Applies currency_info_fix, then add_relative_date, then computes a
    "League Lifespan" column row by row with league_lifespan.
    """
    cleaned = add_relative_date(currency_info_fix(df))
    cleaned["League Lifespan"] = cleaned.apply(league_lifespan, axis = 1)
    return cleaned

## Data Import

In [162]:
# Load the two semicolon-delimited CSV files: currency exchange rates and item prices.
breach_currency = pd.read_csv("1_currency.csv", sep = ";", index_col = None)
breach_items = pd.read_csv("1_items.csv", sep = ";", index_col = None)
# Preview the first rows of the currency frame.
breach_currency.head()

Unnamed: 0,League,Date,Get,Pay,Value,Confidence
0,Breach,2016-12-02,Silver Coin,Chaos Orb,0.96667,High
1,Breach,2016-12-03,Silver Coin,Chaos Orb,0.21937,High
2,Breach,2016-12-04,Silver Coin,Chaos Orb,0.18339,High
3,Breach,2016-12-05,Silver Coin,Chaos Orb,0.23571,High
4,Breach,2016-12-06,Silver Coin,Chaos Orb,0.26395,High


In [163]:
# Turn the index into a regular column. The preview shows that this "index" column
# holds the league name, i.e. the column labels are shifted one position to the left.
breach_items_indexreset = breach_items.reset_index()
breach_items_indexreset.head()

Unnamed: 0,index,League,Date,Id,Name,BaseType,Variant,Links,Value,Confidence
0,Breach,2016-12-02,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.788,High
1,Breach,2016-12-03,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.29912,High
2,Breach,2016-12-04,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.15429,High
3,Breach,2016-12-05,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.18333,High
4,Breach,2016-12-06,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.20815,High


In [164]:
# Map each shifted label to the label that matches the data stored in that column.
column_map = {
    "index" : "League",
    "League": "Date",
    "Date": "Id",
    "Id" : "Type"
}
# Rebinds breach_items to the relabelled frame; later cells read it under this name.
breach_items = breach_items_indexreset.rename(columns = column_map)
breach_items.head()

Unnamed: 0,League,Date,Id,Type,Name,BaseType,Variant,Links,Value,Confidence
0,Breach,2016-12-02,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.788,High
1,Breach,2016-12-03,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.29912,High
2,Breach,2016-12-04,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.15429,High
3,Breach,2016-12-05,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.18333,High
4,Breach,2016-12-06,1079,UniqueArmour,Sadima's Touch,Wool Gloves,,1-4 links,0.20815,High


In [165]:
# Summarise column dtypes and non-null counts of the currency frame.
breach_currency.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9932 entries, 0 to 9931
Data columns (total 6 columns):
League        9932 non-null object
Date          9932 non-null object
Get           9932 non-null object
Pay           9932 non-null object
Value         9932 non-null float64
Confidence    9932 non-null object
dtypes: float64(1), object(5)
memory usage: 465.6+ KB


# Wrangling the item dataset

In [166]:
# Summarise column dtypes and non-null counts of the relabelled item frame.
breach_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133166 entries, 0 to 133165
Data columns (total 10 columns):
League        133166 non-null object
Date          133166 non-null object
Id            133166 non-null int64
Type          133166 non-null object
Name          133166 non-null object
BaseType      100461 non-null object
Variant       18075 non-null object
Links         54672 non-null object
Value         133166 non-null float64
Confidence    133166 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 10.2+ MB


In [167]:
# List the item column labels after the rename.
breach_items.columns

Index(['League', 'Date', 'Id', 'Type', 'Name', 'BaseType', 'Variant', 'Links',
       'Value', 'Confidence'],
      dtype='object')

## Removing Unnecessary Information and Filling in Prophecies and Divination Cards

In [168]:
# Drop the "Links" and "Variant" columns; drop() returns a new frame.
breach_items_edited1 = breach_items.drop(["Links", "Variant"], axis = 1)
breach_items_edited1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133166 entries, 0 to 133165
Data columns (total 8 columns):
League        133166 non-null object
Date          133166 non-null object
Id            133166 non-null int64
Type          133166 non-null object
Name          133166 non-null object
BaseType      100461 non-null object
Value         133166 non-null float64
Confidence    133166 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 8.1+ MB


In [169]:
# Which item types are the rows without a BaseType?
# A descriptive name is used instead of `_`, which IPython reserves for the last cell output.
missing_basetype = breach_items_edited1[breach_items_edited1.BaseType.isnull()]
missing_basetype.Type.value_counts()

Prophecy          17317
DivinationCard    15388
Name: Type, dtype: int64

In [170]:
# Work on an independent copy: a plain assignment would only create a second name for
# breach_items_edited1, so column assignments made through _2 would modify it as well.
_2 = breach_items_edited1.copy()
_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133166 entries, 0 to 133165
Data columns (total 8 columns):
League        133166 non-null object
Date          133166 non-null object
Id            133166 non-null int64
Type          133166 non-null object
Name          133166 non-null object
BaseType      100461 non-null object
Value         133166 non-null float64
Confidence    133166 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 8.1+ MB


In [171]:
# Fill each missing BaseType with the row's own Type value; afterwards info() reports
# no nulls in BaseType.
_2.BaseType = _2.BaseType.fillna(_2.Type)
_2.info()
breach_items_edited2 = _2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133166 entries, 0 to 133165
Data columns (total 8 columns):
League        133166 non-null object
Date          133166 non-null object
Id            133166 non-null int64
Type          133166 non-null object
Name          133166 non-null object
BaseType      133166 non-null object
Value         133166 non-null float64
Confidence    133166 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 8.1+ MB


In [172]:
# Median Value per item name, sorted from highest to lowest.
medianseries = breach_items_edited2.groupby("Name")["Value"].agg("median")
sorted_series = medianseries.sort_values(ascending = False)
# Names of the first 15 entries of the sorted medians.
top_15 = list(sorted_series[:15].index)
top_15

['Headhunter',
 'Trash to Treasure',
 'Eyes of the Greatwolf',
 'Skyforth',
 "Rigwald's Savagery",
 'Natural Hierarchy',
 "Demigod's Bounty",
 "Atziri's Acuity",
 "Emperor's Mastery",
 "Rigwald's Command",
 'House of Mirrors',
 'The Doctor',
 'Fated Connections',
 "Atziri's Disfavour",
 "Demigod's Dominance"]

In [173]:
# Keep only rows of the top-15 items. The explicit copy() detaches the result from
# breach_items_edited2, so the column assignments in later cells operate on an
# independent frame instead of a slice of the original.
breach_items_top_15 = breach_items_edited2.loc[breach_items_edited2["Name"].isin(top_15)].copy()
# breach_items now refers to the filtered frame (same object as breach_items_top_15).
breach_items = breach_items_top_15
breach_items_top_15.head()

Unnamed: 0,League,Date,Id,Type,Name,BaseType,Value,Confidence
68886,Breach,2016-12-03,1476,DivinationCard,The Doctor,DivinationCard,577.705,Low
68887,Breach,2016-12-04,1476,DivinationCard,The Doctor,DivinationCard,407.7814,High
68888,Breach,2016-12-05,1476,DivinationCard,The Doctor,DivinationCard,249.16214,High
68889,Breach,2016-12-06,1476,DivinationCard,The Doctor,DivinationCard,253.43036,High
68890,Breach,2016-12-07,1476,DivinationCard,The Doctor,DivinationCard,302.59298,High


## Adding Relative Date Column

In [174]:
# Parse the date strings with pd.to_datetime. astype('datetime64[D]') asks for a day
# resolution that pandas does not store (only s/ms/us/ns) and fails on pandas 2.x.
breach_items["Date"] = pd.to_datetime(breach_items["Date"])
breach_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1352 entries, 68886 to 132876
Data columns (total 8 columns):
League        1352 non-null object
Date          1352 non-null datetime64[ns]
Id            1352 non-null int64
Type          1352 non-null object
Name          1352 non-null object
BaseType      1352 non-null object
Value         1352 non-null float64
Confidence    1352 non-null object
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 95.1+ KB


In [175]:
# Earliest date present in the filtered item frame; used as day zero below.
startdate = breach_items["Date"].min()
startdate

Timestamp('2016-12-03 00:00:00')

In [176]:
# Whole days elapsed since startdate. Series.dt.days is used because the
# astype('timedelta64[D]') cast is not a supported resolution on pandas 2.x.
breach_items["RelativeDate"] = (breach_items["Date"] - startdate).dt.days

In [177]:
# Drop the absolute date; drop() returns a new frame that keeps RelativeDate.
breach_edited = breach_items.drop("Date", axis = 1)

In [178]:
# Convert the day counts to timedelta values (unit "D" = days) so they can be compared
# with the dt.timedelta thresholds used by league_lifespan.
breach_edited["RelativeDate"] = pd.to_timedelta(breach_edited["RelativeDate"], unit = "D")

In [179]:
# Preview the frame with the new RelativeDate column.
breach_edited.head()

Unnamed: 0,League,Id,Type,Name,BaseType,Value,Confidence,RelativeDate
68886,Breach,1476,DivinationCard,The Doctor,DivinationCard,577.705,Low,0 days
68887,Breach,1476,DivinationCard,The Doctor,DivinationCard,407.7814,High,1 days
68888,Breach,1476,DivinationCard,The Doctor,DivinationCard,249.16214,High,2 days
68889,Breach,1476,DivinationCard,The Doctor,DivinationCard,253.43036,High,3 days
68890,Breach,1476,DivinationCard,The Doctor,DivinationCard,302.59298,High,4 days


## Add column indicating league's lifespan

In [180]:
def league_lifespan(row):
    """Classify a row by its "RelativeDate" as "Early", "Mid" or "End".

    "Early" covers up to and including 14 days, "Mid" up to and including 60 days,
    everything else is "End".

    Note: a function with this name and the same thresholds is also defined in the
    "Cleaning functions" cell; running this cell rebinds the name to this definition.
    """
    elapsed = row["RelativeDate"]
    if elapsed <= dt.timedelta(days = 14):
        return "Early"
    if elapsed <= dt.timedelta(days = 60):
        return "Mid"
    return "End"

In [181]:
# Label every row by calling league_lifespan on it (axis = 1 passes one row at a time).
breach_edited["League Lifespan"] = breach_edited.apply(league_lifespan, axis = 1)
breach_edited.head()

Unnamed: 0,League,Id,Type,Name,BaseType,Value,Confidence,RelativeDate,League Lifespan
68886,Breach,1476,DivinationCard,The Doctor,DivinationCard,577.705,Low,0 days,Early
68887,Breach,1476,DivinationCard,The Doctor,DivinationCard,407.7814,High,1 days,Early
68888,Breach,1476,DivinationCard,The Doctor,DivinationCard,249.16214,High,2 days,Early
68889,Breach,1476,DivinationCard,The Doctor,DivinationCard,253.43036,High,3 days,Early
68890,Breach,1476,DivinationCard,The Doctor,DivinationCard,302.59298,High,4 days,Early


# Wrangling Currency Files

In [182]:
# Currencies to exclude from the analysis. The filtering functions in this notebook
# define their own local copy of this list rather than reading this variable.
currency_to_drop = ["Portal Scroll", "Scroll of Wisdom", "Armourer's Scrap", "Perandus Coin", "Orb of Transmutation", "Blacksmith's Whetstone", "Orb of Augmentation", "Orb of Alteration", "Splinter of Tul", "Chromatic Orb", "Splinter of Esh", "Splinter of Xoph", "Orb of Chance", "Glassblower's Bauble", "Splinter of Uul-Netol", "Silver Coin"]
# NOTE: plain assignment - currency_copy is a second name for the same DataFrame
# object as breach_currency, not an independent copy.
currency_copy = breach_currency
currency_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9932 entries, 0 to 9931
Data columns (total 6 columns):
League        9932 non-null object
Date          9932 non-null object
Get           9932 non-null object
Pay           9932 non-null object
Value         9932 non-null float64
Confidence    9932 non-null object
dtypes: float64(1), object(5)
memory usage: 465.6+ KB


In [183]:
def currency_info_remove(df):
    """Return the rows of ``df`` in which neither "Get" nor "Pay" is an excluded currency."""
    excluded = [
        "Portal Scroll", "Scroll of Wisdom", "Armourer's Scrap", "Perandus Coin",
        "Orb of Transmutation", "Blacksmith's Whetstone", "Orb of Augmentation",
        "Orb of Alteration", "Splinter of Tul", "Chromatic Orb", "Splinter of Esh",
        "Splinter of Xoph", "Orb of Chance", "Glassblower's Bauble",
        "Splinter of Uul-Netol", "Silver Coin",
    ]
    get_excluded = df["Get"].isin(excluded)
    pay_excluded = df["Pay"].isin(excluded)
    drop_rows = get_excluded | pay_excluded
    return df.loc[~drop_rows]

In [184]:
# Apply the exclusion filter to the currency frame.
currency_dropped = currency_info_remove(currency_copy)

In [185]:
# Check how many rows remain after the filter.
currency_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7149 entries, 356 to 9931
Data columns (total 6 columns):
League        7149 non-null object
Date          7149 non-null object
Get           7149 non-null object
Pay           7149 non-null object
Value         7149 non-null float64
Confidence    7149 non-null object
dtypes: float64(1), object(5)
memory usage: 391.0+ KB


In [186]:
# Run the complete currency cleaning function on the raw frame and inspect the result.
currency_cleaned = currency_file_clean(breach_currency)
currency_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7149 entries, 356 to 9931
Data columns (total 7 columns):
League             7149 non-null object
Get                7149 non-null object
Pay                7149 non-null object
Value              7149 non-null float64
Confidence         7149 non-null object
RelativeDate       7149 non-null timedelta64[ns]
League Lifespan    7149 non-null object
dtypes: float64(1), object(5), timedelta64[ns](1)
memory usage: 446.8+ KB


# Generalized File Import and Concatenate

## File Import

In [187]:
# Accumulator for one cleaned item frame per input file.
item_df_list = []

In [188]:
# Start from an empty list so that re-running this cell does not append duplicates,
# and sort the paths because glob.glob returns matches in arbitrary order while
# later cells index into the list by position.
item_df_list.clear()
for filename in sorted(glob.glob("*_items.csv")):
    df = pd.read_csv(filename, delimiter = ";", low_memory = False)
    df_edited = item_file_clean(df)
    item_df_list.append(df_edited)

In [189]:
# Spot-check the second cleaned item frame (position depends on the file order of the loop above).
item_df_list[1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2600 entries, 50551 to 246383
Data columns (total 9 columns):
League             2600 non-null object
Id                 2600 non-null int64
Type               2600 non-null object
Name               2600 non-null object
BaseType           2600 non-null object
Value              2600 non-null float64
Confidence         2600 non-null object
RelativeDate       2600 non-null timedelta64[ns]
League Lifespan    2600 non-null object
dtypes: float64(1), int64(1), object(6), timedelta64[ns](1)
memory usage: 203.1+ KB


In [190]:
# Accumulator for one cleaned currency frame per input file.
currency_df_list = []

In [191]:
# Start from an empty list so that re-running this cell does not append duplicates,
# and sort the paths because glob.glob returns matches in arbitrary order while
# later cells index into the list by position.
currency_df_list.clear()
for filename in sorted(glob.glob("*_currency.csv")):
    df = pd.read_csv(filename, delimiter = ";", low_memory = False)
    df_edited = currency_file_clean(df)
    currency_df_list.append(df_edited)

In [192]:
# Spot-check the third cleaned currency frame (position depends on the file order of the loop above).
currency_df_list[2].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12387 entries, 369 to 16162
Data columns (total 7 columns):
League             12387 non-null object
Get                12387 non-null object
Pay                12387 non-null object
Value              12387 non-null float64
Confidence         12387 non-null object
RelativeDate       12387 non-null timedelta64[ns]
League Lifespan    12387 non-null object
dtypes: float64(1), object(5), timedelta64[ns](1)
memory usage: 774.2+ KB


## Concatenation

In [193]:
# Stack the per-file item frames row-wise and renumber the rows 0..n-1.
item_df = pd.concat(item_df_list, axis = 0, ignore_index = True)
# Release the list of per-file frames; only the combined frame is used from here on.
del item_df_list

In [194]:
# Summarise the combined item frame.
item_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6164 entries, 0 to 6163
Data columns (total 9 columns):
League             6164 non-null object
Id                 6164 non-null int64
Type               6164 non-null object
Name               6164 non-null object
BaseType           6164 non-null object
Value              6164 non-null float64
Confidence         6164 non-null object
RelativeDate       6164 non-null timedelta64[ns]
League Lifespan    6164 non-null object
dtypes: float64(1), int64(1), object(6), timedelta64[ns](1)
memory usage: 433.5+ KB


In [200]:
# Preview the first rows of the combined item frame.
item_df.head()

Unnamed: 0,League,Id,Type,Name,BaseType,Value,Confidence,RelativeDate,League Lifespan
0,Breach,1476,DivinationCard,The Doctor,DivinationCard,577.705,Low,0 days,Early
1,Breach,1476,DivinationCard,The Doctor,DivinationCard,407.7814,High,1 days,Early
2,Breach,1476,DivinationCard,The Doctor,DivinationCard,249.16214,High,2 days,Early
3,Breach,1476,DivinationCard,The Doctor,DivinationCard,253.43036,High,3 days,Early
4,Breach,1476,DivinationCard,The Doctor,DivinationCard,302.59298,High,4 days,Early


In [196]:
# Stack the per-file currency frames row-wise and renumber the rows 0..n-1.
currency_df = pd.concat(currency_df_list, axis = 0, ignore_index = True)
# Release the list of per-file frames; only the combined frame is used from here on.
del currency_df_list

In [197]:
# Summarise the combined currency frame.
currency_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38885 entries, 0 to 38884
Data columns (total 7 columns):
League             38885 non-null object
Get                38885 non-null object
Pay                38885 non-null object
Value              38885 non-null float64
Confidence         38885 non-null object
RelativeDate       38885 non-null timedelta64[ns]
League Lifespan    38885 non-null object
dtypes: float64(1), object(5), timedelta64[ns](1)
memory usage: 2.1+ MB


In [198]:
# Preview the first rows of the combined currency frame.
currency_df.head()

Unnamed: 0,League,Get,Pay,Value,Confidence,RelativeDate,League Lifespan
0,Breach,Orb of Alchemy,Chaos Orb,0.68571,Medium,0 days,Early
1,Breach,Orb of Alchemy,Chaos Orb,0.23652,High,1 days,Early
2,Breach,Orb of Alchemy,Chaos Orb,0.16667,High,2 days,Early
3,Breach,Orb of Alchemy,Chaos Orb,0.19363,High,3 days,Early
4,Breach,Orb of Alchemy,Chaos Orb,0.21778,High,4 days,Early
