In [146]:
# from mp (preamble)

import pandas as pd
import numpy as np
import copy

from scipy import stats
from scipy.stats import chi2_contingency
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Preamble: load the demo rows and attach a knife-free loadout list to each row.
demos = pd.read_csv("demos.csv")

# "a, b, c" -> ["a", "b", "c"]
demos['player_loadout_array'] = demos['player_loadout'].str.split(', ')
unique_items = demos['player_loadout_array'].explode().unique().tolist()

# An item counts as a knife when its lower-cased name contains any of these.
knife_keywords = ("knife", "daggers", "bayonet", "karambit")
knives = [item for item in unique_items
          if any(keyword in item.lower() for keyword in knife_keywords)]


def _drop_knives(loadout):
    """Return the loadout list without any item listed in `knives`."""
    return [item for item in loadout if item not in knives]


demos['player_loadout_array'] = demos['player_loadout_array'].apply(_drop_knives)
demos['player_loadout_array'].head(10)


0              [Glock-18, C4 Explosive]
1    [Glock-18, Smoke Grenade, Molotov]
2                            [Glock-18]
3      [P250, Smoke Grenade, Flashbang]
4                               [USP-S]
5                [P2000, Smoke Grenade]
6                       [Dual Berettas]
7                               [USP-S]
8                    [USP-S, Flashbang]
9                            [Glock-18]
Name: player_loadout_array, dtype: object

In [147]:
import pandas as pd
# Fix: `import matplotlib as plt` bound the top-level package, which does not
# expose the plotting functions (plt.subplots, plt.plot, ...). The conventional
# `plt` alias refers to the pyplot module.
import matplotlib.pyplot as plt
import numpy as np
import adjustText as at
from adjustText import adjust_text

# Load the raw tables. `demos` is read from disk again here, so any columns
# added to an earlier `demos` frame are discarded and must be rebuilt below.
demos = pd.read_csv("demos.csv")
matches = pd.read_csv("matches.csv")
players = pd.read_csv("players.csv")
teams = pd.read_csv("teams.csv")

### Team Names

If we read the list of team names in the `players.csv`, `teams.csv` and `demos.csv` files, they differ from each other:

In [148]:
# Sorted unique team names from each of the three files.
teams_team_names, demos_team_names, players_team_names = (
    sorted(column.unique())
    for column in (teams['team_name'], demos['team_name'], players['team'])
)

# Side-by-side view; the three lists must have equal length for this to build.
pd.DataFrame(dict(
    teams=teams_team_names,
    demos=demos_team_names,
    players=players_team_names,
))

Unnamed: 0,teams,demos,players
0,3DMAX,3DMAX,3DMAX
1,Aurora,Aurora,Aurora
2,BC.Game,BCG,BC.Game
3,FURIA,FURIA,FURIA
4,FaZe,FaZe Clan,FaZe
5,Falcons,G2 Esports,Falcons
6,G2,GamerLegion,G2
7,GamerLegion,HEROIC,GamerLegion
8,HEROIC,LEGACY,HEROIC
9,Legacy,Lynn Vision Gaming,Legacy


Both the `teams.csv` and `players.csv` team names are the same, but the ones in `demos.csv` are different.
We will replace the values in `players.csv` and `teams.csv` since they are smaller files, for performance reasons.

First, we get the differences:

In [149]:
# Names present in exactly one of teams.csv / demos.csv:
# everything in the union that is not also in the intersection.
all_team_names = np.union1d(teams_team_names, demos_team_names)
shared_team_names = np.intersect1d(teams_team_names, demos_team_names)

union_ser = pd.Series(all_team_names)
intersection_ser = pd.Series(shared_team_names)

in_both_files = union_ser.isin(intersection_ser)
differences = union_ser[~in_both_files]

differences

2                BC.Game
3                    BCG
5                   FaZe
6              FaZe Clan
7                Falcons
8                     G2
9             G2 Esports
12                LEGACY
13                Legacy
14                Liquid
15           Lynn Vision
16    Lynn Vision Gaming
19          Team Falcons
20           Team Liquid
21         Team Vitality
23              Vitality
dtype: object

From here, we can see the following that need to change:

- `BC.Game` -> `BCG`
- `FaZe` -> `FaZe Clan`
- `Falcons` -> `Team Falcons`
- `G2` -> `G2 Esports`
- `Legacy` -> `LEGACY`
- `Liquid` -> `Team Liquid`
- `Lynn Vision` -> `Lynn Vision Gaming`
- `Vitality` -> `Team Vitality`

Now we can fix this:

In [150]:
# Spelling used in teams.csv / players.csv -> spelling used in demos.csv.
TEAM_NAME_FIXES = {
    'BC.Game': 'BCG',
    'FaZe': 'FaZe Clan',
    'Falcons': 'Team Falcons',
    'G2': 'G2 Esports',
    'Legacy': 'LEGACY',
    'Liquid': 'Team Liquid',
    'Lynn Vision': 'Lynn Vision Gaming',
    'Vitality': 'Team Vitality',
}

# One exact-match mapping is applied to both tables, so the two rename lists
# can no longer drift apart. No replacement value is itself a key, so this is
# equivalent to applying the renames one after another.
teams['team_name'] = teams['team_name'].replace(TEAM_NAME_FIXES)
players['team'] = players['team'].replace(TEAM_NAME_FIXES)

Now, we check for differences:

In [151]:
teams_team_names = teams['team_name'].unique()
demos_team_names = demos['team_name'].unique()
players_team_names = players['team'].unique()

# Check BOTH renamed tables against demos.csv. Previously players_team_names
# was computed but never compared, so a leftover mismatch in players.csv
# would have gone unnoticed.
mismatches = []
for other_team_names in (teams_team_names, players_team_names):
    union_ser = pd.Series(np.union1d(other_team_names, demos_team_names))
    intersection_ser = pd.Series(np.intersect1d(other_team_names, demos_team_names))
    mismatches.append(union_ser[~union_ser.isin(intersection_ser)])

# Names that appear in only one side of either comparison.
differences = pd.concat(mismatches).drop_duplicates()

len(differences)

0

As you can see, there are no more differences.

### Player Names

If we read the list of player names in the `players.csv` and `demos.csv` files, they differ from each other:


In [152]:
demos_player_names = demos['player_name'].unique()
players_player_names = players['name'].unique()

# Names present in exactly one of players.csv / demos.csv:
# everything in the union that is not also in the intersection.
all_player_names = np.union1d(players_player_names, demos_player_names)
shared_player_names = np.intersect1d(players_player_names, demos_player_names)

union_ser = pd.Series(all_player_names)
intersection_ser = pd.Series(shared_player_names)

in_both_files = union_ser.isin(intersection_ser)
differences = union_ser[~in_both_files]

differences

0           910
1          910-
11     HeavyGoD
12     HeavyGod
22       Mzinho
29       Skullz
36       Techno
37     Techno4K
39    Westmelon
63       mzinho
74       skullz
78    westmelon
84         z4KR
85         z4kr
dtype: object

To determine which names are equivalent to which, we make a DataFrame:

In [153]:
# Unique names per file, as Series so they can be filtered with .isin().
demos_unique_names = pd.Series(demos['player_name'].unique())
players_unique_names = pd.Series(players['name'].unique())

# Keep only the names flagged as mismatched in `differences`.
demos_player_names = demos_unique_names[demos_unique_names.isin(differences)]
players_player_names = players_unique_names[players_unique_names.isin(differences)]

# Columns are aligned on each Series' own index, so rows do not pair up by name.
pd.DataFrame(dict(demos=demos_player_names, players=players_player_names))

Unnamed: 0,demos,players
7,Skullz,
17,,Techno
18,,mzinho
19,,910
33,,HeavyGod
50,HeavyGoD,
51,,skullz
52,z4KR,
56,westmelon,
62,,Westmelon


From here, we can see the following that need to change:

- `910` -> `910-`
- `HeavyGod` -> `HeavyGoD`
- `mzinho` -> `Mzinho`
- `skullz` -> `Skullz`
- `Techno` -> `Techno4K`
- `Westmelon` -> `westmelon`
- `z4kr` -> `z4KR`

Now we can fix this:

In [154]:
# Spelling used in players.csv -> spelling used in demos.csv.
PLAYER_NAME_FIXES = {
    '910': '910-',
    'HeavyGod': 'HeavyGoD',
    'mzinho': 'Mzinho',
    'skullz': 'Skullz',
    'Techno': 'Techno4K',
    'Westmelon': 'westmelon',
    'z4kr': 'z4KR',
}

# Single exact-match replacement instead of seven chained calls. No replacement
# value is itself a key, so the result matches applying them one after another.
players['name'] = players['name'].replace(PLAYER_NAME_FIXES)

In [155]:
demos_player_names = demos['player_name'].unique()
players_player_names = players['name'].unique()

# Recompute the names that appear in only one of the two files after renaming.
all_player_names = np.union1d(players_player_names, demos_player_names)
shared_player_names = np.intersect1d(players_player_names, demos_player_names)

union_ser = pd.Series(all_player_names)
intersection_ser = pd.Series(shared_player_names)

in_both_files = union_ser.isin(intersection_ser)
differences = union_ser[~in_both_files]

len(differences)

0

Now we can see there are no differences.

## Player loadouts

Player loadouts are encoded as comma-separated lists, so we turn it into an array. In addition, we need to remove knives,
as it is impossible to drop knives in the game, and different knife skins (cosmetics) are counted as unique items rather
than the same one, even if there is no gameplay effect.

First, we convert to list and get a list of knives:

In [156]:
# "a, b, c" -> ["a", "b", "c"]
demos['player_loadout_array'] = demos['player_loadout'].str.split(', ')
unique_items = demos['player_loadout_array'].explode().unique().tolist()

# An item counts as a knife when its lower-cased name contains any of these.
knife_keywords = ("knife", "daggers", "bayonet", "karambit")
knives = [item for item in unique_items
          if any(keyword in item.lower() for keyword in knife_keywords)]

knives

['Butterfly Knife',
 'Talon Knife',
 'M9 Bayonet',
 'Karambit',
 'Shadow Daggers',
 'Skeleton Knife',
 'Flip Knife',
 'Bayonet',
 'Survival Knife',
 'Nomad Knife',
 'Stiletto Knife',
 'knife_t',
 'knife',
 'Ursus Knife']

Now, we filter:

In [157]:
def _drop_knives(loadout):
    """Return the loadout list without any item listed in `knives`."""
    return [item for item in loadout if item not in knives]


demos['player_loadout_array'] = demos['player_loadout_array'].apply(_drop_knives)
demos['player_loadout_array'].head(10)

0              [Glock-18, C4 Explosive]
1    [Glock-18, Smoke Grenade, Molotov]
2                            [Glock-18]
3      [P250, Smoke Grenade, Flashbang]
4                               [USP-S]
5                [P2000, Smoke Grenade]
6                       [Dual Berettas]
7                               [USP-S]
8                    [USP-S, Flashbang]
9                            [Glock-18]
Name: player_loadout_array, dtype: object

## Removing problematic game

As mentioned before, match `25` game `60` was erroneously encoded by the tournament organizers, leading to potential problems with the data.

In [158]:
# Keep every row except those belonging to map_id 60.
not_map_60 = demos['map_id'].ne(60)
demos = demos[not_map_60]

## Removing problematic rounds

Because of the nature of Source 2 demos, there are some rounds where the round_bomb_plant_time is less than or equal to 0, which is not possible in a real game. We will remove these rounds from our data.



In [159]:
# Keep rows whose bomb plant time is strictly positive or missing (NaN).
plant_time = demos["round_bomb_plant_time"]
valid_plant_time = (plant_time > 0) | plant_time.isna()

# .copy() makes `demos` an independent frame rather than a filtered slice, so
# assigning columns to it later does not raise SettingWithCopyWarning.
demos = demos[valid_plant_time].copy()

## Imputing hit sites

In the data set there were instances where the first site hit was not recorded but a bomb plant was recorded. In these cases, we impute the first site hit from the bomb plant site. A limitation of this approach is that it may misattribute rounds with split pushes or a rotation to the other site, but it is a good approximation for the data we have.

In [160]:
def impute_round_first_site_hit(row: pd.Series):
    """Return the row's first site hit, falling back to its bomb plant site.

    Parameters
    ----------
    row : pd.Series
        A row providing the keys "round_first_site_hit" and
        "round_bomb_plant_site".

    Returns
    -------
    The value of "round_first_site_hit" unless it is missing (per `pd.isna`),
    in which case the value of "round_bomb_plant_site" is returned as-is.
    """
    first_site = row["round_first_site_hit"]
    return row["round_bomb_plant_site"] if pd.isna(first_site) else first_site


# Fill missing first-site-hit values from the bomb plant site of the same row.
# This gives the same result as applying impute_round_first_site_hit row by
# row, but is vectorized. Using .assign() builds a new frame instead of writing
# a column into a filtered slice, which avoids SettingWithCopyWarning.
demos = demos.assign(
    round_first_site_hit=demos["round_first_site_hit"].fillna(demos["round_bomb_plant_site"])
)

pd.set_option("display.max_columns", None)

# show sample imputed row
demos[demos["round_id"] == 8]

Unnamed: 0,match_id,map_id,round_id,team_name,map_name,round_number,round_ct_team,round_first_site_hit,round_site_hit_time,round_bomb_plant_site,player_planted_bomb,round_bomb_plant_time,round_bomb_defuser,bomb_defuse_time,round_length,round_result,round_timeout_called_before,player_name,player_flashes_used,player_smokes_used,player_grenades_used,player_molotovs_used,player_incendiaries_used,player_kills,player_died,player_spent_amount,player_loadout,player_damage,round_first_killer,round_first_death,player_headshots,player_upperbodyshots,player_stomachshots,player_legshots,player_loadout_array
80,0,0,8,Team Liquid,de_ancient,9,FaZe Clan,B,,B,True,112.21875,False,,134.60938,T,,NertZ,0,2,1,1,0,2,False,3800,"Butterfly Knife, Glock-18, Galil AR, Smoke Gre...",195,False,False,1,1,1,0,"[Glock-18, Galil AR, Smoke Grenade, Molotov, H..."
81,0,0,8,Team Liquid,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,NAF,2,1,0,0,0,1,True,4400,"Talon Knife, Glock-18, C4 Explosive, AK-47, Sm...",163,False,False,0,4,2,0,"[Glock-18, C4 Explosive, AK-47, Smoke Grenade,..."
82,0,0,8,Team Liquid,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,ultimate,1,1,0,1,0,0,True,6650,"M9 Bayonet, Glock-18, AWP, Smoke Grenade, Flas...",6,False,False,0,0,0,0,"[Glock-18, AWP, Smoke Grenade, Flashbang]"
83,0,0,8,Team Liquid,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,Twistzz,0,0,1,1,0,2,False,4700,"Karambit, Glock-18, AK-47, Molotov, High Explo...",104,False,False,1,1,0,0,"[Glock-18, AK-47, Molotov, High Explosive Gren..."
84,0,0,8,FaZe Clan,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,frozen,1,0,0,0,0,0,True,800,"Karambit, USP-S, AK-47, High Explosive Grenade...",0,False,False,0,0,0,0,"[USP-S, AK-47, High Explosive Grenade, Flashba..."
85,0,0,8,FaZe Clan,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,EliGE,1,1,2,0,2,0,True,4200,"Shadow Daggers, P2000, M4A4, Flashbang, Smoke ...",12,False,False,0,0,0,1,"[P2000, M4A4, Flashbang, Smoke Grenade, High E..."
86,0,0,8,FaZe Clan,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,s1mple,1,1,1,0,0,2,True,2500,"Butterfly Knife, USP-S, AWP, Smoke Grenade, Fl...",200,False,False,1,1,0,0,"[USP-S, AWP, Smoke Grenade, Flashbang, Incendi..."
87,0,0,8,FaZe Clan,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,Skullz,1,1,1,0,1,1,True,2300,"Talon Knife, USP-S, AK-47, High Explosive Gren...",70,True,False,1,1,0,0,"[USP-S, AK-47, High Explosive Grenade, Incendi..."
88,0,0,8,FaZe Clan,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,karrigan,2,1,1,0,2,0,True,1300,"Karambit, USP-S, AK-47, Smoke Grenade, High Ex...",100,False,False,0,2,0,0,"[USP-S, AK-47, Smoke Grenade, High Explosive G..."
89,0,0,8,Team Liquid,de_ancient,9,FaZe Clan,B,,B,False,112.21875,False,,134.60938,T,,siuhy,2,1,0,1,0,0,True,3900,"Karambit, Glock-18, Galil AR, Smoke Grenade, M...",32,False,True,0,2,0,0,"[Glock-18, Galil AR, Smoke Grenade, Molotov, F..."


===================================

### Determining if Mean Round Lengths are Different in Rounds With and Without Timeout

For this part, we will use an unpaired t-test to determine if there is a difference in mean round lengths in rounds with and without timeout

Our null hypothesis $H_0$ will be there is no difference in the average round length in rounds with versus without timeouts

Our alternate  hypothesis $H_a$ will be there is a difference in the average round length in rounds with versus without timeouts

These tests will be performed with $\alpha=0.05$

First, we obtain our two groups from our dataset:

In [161]:
# Rows whose round_timeout_called_before is set, reduced to one row per round_id.
timeout_called = demos['round_timeout_called_before'].notna()
timeout_rounds = demos.loc[timeout_called].drop_duplicates(subset=['round_id'])

rounds_with_timeouts = timeout_rounds['round_length']
rounds_with_timeouts.head(5)

60     66.093750
190    97.437500
220    42.312500
290    60.062500
340    85.171875
Name: round_length, dtype: float64

In [162]:
# Rows whose round_timeout_called_before is missing, reduced to one row per round_id.
no_timeout_called = demos['round_timeout_called_before'].isna()
no_timeout_rounds = demos.loc[no_timeout_called].drop_duplicates(subset=['round_id'])

rounds_without_timeouts = no_timeout_rounds['round_length']
rounds_without_timeouts.head(5)

0      57.937500
10     65.125000
20    113.546875
30    115.312500
40     48.343750
Name: round_length, dtype: float64

Next, we perform the t-test:

In [163]:
# Unpaired two-sample t-test on round length; equal_var=True is SciPy's
# default (pooled-variance Student's t-test) and is spelled out for clarity.
stats.ttest_ind(rounds_with_timeouts, rounds_without_timeouts, equal_var=True)

TtestResult(statistic=np.float64(2.7686405664351414), pvalue=np.float64(0.005704278101797772), df=np.float64(1382.0))

Since we obtained $p=0.0057 < \alpha=0.05$, we reject $H_0$

Therefore, there is evidence that the mean round length differs between rounds with a timeout called beforehand and rounds without one.

### Determining if Site Hit Time is Different where T versus CT win

For this part, we will use an unpaired t-test to determine if there is a difference in mean site hit time if T wins versus if CT wins

Our null hypothesis $H_0$ will be there is no difference in the average site hit time in rounds where T wins versus if CT wins

Our alternate  hypothesis $H_a$ will be there is a difference in the average site hit time in rounds where T wins versus if CT wins

These tests will be performed with $\alpha=0.05$

First, we obtain our two groups from our dataset:

In [None]:
# One row per round_id among rows with round_result 'T', then the recorded site hit times.
t_won = demos['round_result'].eq('T')
t_won_rounds = demos.loc[t_won].drop_duplicates(subset='round_id')

site_hit_times_t = t_won_rounds['round_site_hit_time'].dropna()
site_hit_times_t.head(5)

20     93.000000
30     45.093750
40     37.640625
50     33.328125
110    15.296875
Name: round_site_hit_time, dtype: float64

In [167]:
# One row per round_id among rows with round_result 'CT', then the recorded site hit times.
ct_won = demos['round_result'].eq('CT')
ct_won_rounds = demos.loc[ct_won].drop_duplicates(subset='round_id')

site_hit_times_ct = ct_won_rounds['round_site_hit_time'].dropna()
site_hit_times_ct.head(5)

0       23.078125
60      64.765625
70     100.890625
200     76.843750
210     25.046875
Name: round_site_hit_time, dtype: float64

Next, we perform the t-test:

In [168]:
# Unpaired two-sample t-test on site hit time; equal_var=True is SciPy's
# default (pooled-variance Student's t-test) and is spelled out for clarity.
stats.ttest_ind(site_hit_times_t, site_hit_times_ct, equal_var=True)

TtestResult(statistic=np.float64(-0.657191973893648), pvalue=np.float64(0.5112467435502706), df=np.float64(799.0))

Since we obtained $p=0.5112 > \alpha=0.05$, we do not reject $H_0$

Therefore, there is no evidence to suggest that the site hit time is different between rounds that were won by T versus rounds won by CT.

However, a limitation of this test is that `round_site_hit_time` is `NaN` for a significant number of rounds, which casts doubt on the result.

### Determining if Bomb Plant Time is Different where T versus CT Win

For this part, we will use an unpaired t-test to determine if there is a difference in mean bomb plant time if T wins versus if CT wins

Our null hypothesis $H_0$ will be there is no difference in the average bomb plant time in rounds where T wins versus if CT wins

Our alternate  hypothesis $H_a$ will be there is a difference in the average bomb plant time in rounds where T wins versus if CT wins

These tests will be performed with $\alpha=0.05$

First, we obtain our two groups from our dataset:

In [173]:
# One row per round_id among rows with round_result 'T', then the recorded bomb plant times.
t_won = demos['round_result'].eq('T')
t_won_rounds = demos.loc[t_won].drop_duplicates(subset='round_id')

round_bomb_plant_times_t = t_won_rounds['round_bomb_plant_time'].dropna()
round_bomb_plant_times_t.head(5)

20    103.500000
30     80.546875
50     40.890625
80    112.218750
90     94.953125
Name: round_bomb_plant_time, dtype: float64

In [174]:
# One row per round_id among rows with round_result 'CT', then the recorded bomb plant times.
ct_won = demos['round_result'].eq('CT')
ct_won_rounds = demos.loc[ct_won].drop_duplicates(subset='round_id')

round_bomb_plant_times_ct = ct_won_rounds['round_bomb_plant_time'].dropna()
round_bomb_plant_times_ct.head(5)

70     113.093750
120     46.437500
210     43.281250
240    113.234375
400    100.984375
Name: round_bomb_plant_time, dtype: float64

Next, we perform the t-test:

In [175]:
# Fix: this section tests bomb plant times, but the call compared the site hit
# time series from the previous section (copy-paste error). Compare the two
# bomb plant time groups built in the cells directly above.
stats.ttest_ind(round_bomb_plant_times_t, round_bomb_plant_times_ct)

TtestResult(statistic=np.float64(7.867161263480484), pvalue=np.float64(1.0444211216279626e-14), df=np.float64(894.0))

Since we obtained $p \approx 1.04 \times 10^{-14} < \alpha=0.05$, we reject $H_0$

Therefore, there is evidence that the mean bomb plant time differs between rounds won by T and rounds won by CT.

However, a limitation of this test is that `round_bomb_plant_time` is `NaN` for a significant number of rounds, which casts doubt on the result. Even so, the difference could reflect things like the success rate of a retake or whether a round took longer, among other things. One potential reason for this difference is that if a round drags on for longer, players' HP values may be lower than in shorter rounds, affecting how rounds are played.