In [1]:
#library imports
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix  # evaluation metrics

from sklearn.datasets import fetch_openml 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw match dataset. The path is relative, so it is resolved against
# the kernel's working directory.
file_path = 'CS2_HLTV_MATCH_DATA.csv'
data = pd.read_csv(file_path)

# Print the schema summary on its own: DataFrame.info() writes to stdout and
# returns None, so putting it in the cell's final expression only adds a
# stray `None` and forces the preview into a plain-text tuple repr.
data.info()

# Keep the preview as the last expression so it renders as a rich table.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1202 entries, 0 to 1201
Columns: 158 entries, matchID to T2_player4_Rating 1.0
dtypes: float64(140), int64(2), object(16)
memory usage: 1.4+ MB


(   matchID  team1   team2       map  team1_win T1_mapwinrate T2_mapwinrate  \
 0  2368924   Case    Hype   Ancient          1         76.9%          0.0%   
 1  2368925    AJF  Evolve   Vertigo          1         66.7%          0.0%   
 2  2368916  Space    IKLA    Anubis          1         27.3%         33.3%   
 3  2368916  Space    IKLA  Overpass          0         20.0%         28.6%   
 4  2368916  Space    IKLA    Mirage          0         41.7%         00.0%   
 
    T1_player0_Total kills T1_player0_Headshot %  T1_player0_Total deaths  ...  \
 0                   234.0                 26.9%                    171.0  ...   
 1                    42.0                 23.8%                     46.0  ...   
 2                   159.0                 59.1%                    177.0  ...   
 3                    67.0                 55.2%                     88.0  ...   
 4                   166.0                 57.8%                    207.0  ...   
 
   T1_player0_Rating 2.0  T2_p

In [3]:
# Find columns that are empty or almost empty and remove them.
# A column is treated as faulty when more than 90% of its values are missing.

# Share of missing values per column, expressed as a percentage
missing_percentage = data.isna().mean().mul(100)

# Names of the columns whose missing share exceeds 90%
faulty_columns = list(missing_percentage.index[missing_percentage > 90])

# Build a new frame without the faulty columns; `data` itself is not modified
cleaned_data = data.drop(columns=faulty_columns)

# Keep the removed column names and the resulting dimensions for display
dropped_columns = faulty_columns
cleaned_data_shape = cleaned_data.shape

dropped_columns, cleaned_data_shape

(['T1_player0_Rating 1.0',
  'T1_player1_Rating 1.0',
  'T2_player0_Rating 1.0',
  'T2_player1_Rating 1.0',
  'Unnamed: 152',
  'T1_player2_Rating 1.0',
  'T1_player3_Rating 1.0',
  'T1_player4_Rating 1.0',
  'T2_player2_Rating 1.0',
  'T2_player3_Rating 1.0',
  'T2_player4_Rating 1.0'],
 (1202, 147))

In [4]:
# Drop identifier columns (team names, map name, match ID) that are not used
# as features.
columns_to_drop = ['team1', 'team2', 'map','matchID']

# errors='ignore' keeps this cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
cleaned_data = cleaned_data.drop(columns=columns_to_drop, errors='ignore')


def _percent_to_fraction(column):
    """Convert a Series of percentage strings such as '76.9%' to float fractions.

    The trailing '%' is stripped, the text is parsed as float and divided by
    100 (so '76.9%' becomes 0.769). A column that is already numeric is
    returned unchanged, so re-running the cell neither divides by 100 a
    second time nor fails on the `.str` accessor.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return column.str.rstrip('%').astype('float') / 100


# Convert the map win rates from percentage strings to numeric fractions.
for winrate_column in ('T1_mapwinrate', 'T2_mapwinrate'):
    cleaned_data[winrate_column] = _percent_to_fraction(cleaned_data[winrate_column])

# NOTE(review): the '*_Headshot %' columns also appear to hold percentage
# strings (e.g. '26.9%' in the data preview) and are not converted here;
# confirm whether a later step handles them before modelling.

# Show the resulting shape and the first 10 remaining columns as a sample.
remaining_columns = cleaned_data.columns.tolist()
cleaned_data.shape, remaining_columns[:10]


((1202, 143),
 ['team1_win',
  'T1_mapwinrate',
  'T2_mapwinrate',
  'T1_player0_Total kills',
  'T1_player0_Headshot %',
  'T1_player0_Total deaths',
  'T1_player0_K/D Ratio',
  'T1_player0_Damage / Round',
  'T1_player0_Grenade dmg / Round',
  'T1_player0_Maps played'])

In [6]:

# Persist the cleaned frame as a CSV file; the row index is not written
export_file_path = 'cleaned_preprocessed.csv'
cleaned_data.to_csv(path_or_buf=export_file_path, index=False)

# Echo the output location as the cell's result
export_file_path


'cleaned_preprocessed.csv'