In [1]:
import pandas as pd
from datetime import datetime
import os

**Searching for each .csv file in the 'raw_data' folder**

In [2]:
def find_csv_files(data_path):
    """Return the path of every .csv file found under ``data_path``.

    The folder is walked recursively with ``os.walk``. Only files whose
    extension is exactly ``.csv`` are kept. Paths are built with
    ``os.path.join`` so the result is valid on every operating system.

    Parameters
    ----------
    data_path : str
        Folder to search. A missing folder yields an empty list because
        ``os.walk`` ignores listing errors by default.

    Returns
    -------
    list of str
        One path per matching file, in the order ``os.walk`` reports them.
    """
    csv_paths = []
    for root, _dirs, files in os.walk(data_path):
        for filename in files:
            _stem, file_extension = os.path.splitext(filename)
            if file_extension == '.csv':
                csv_paths.append(os.path.join(root, filename))
    return csv_paths


data_path = '../data/raw_data'
lst_df = []

# Print the header once, not once per file regardless of its extension.
print('.csv file found:\n')
for file_path in find_csv_files(data_path):
    print(os.path.basename(file_path))
    lst_df.append(pd.read_csv(file_path))

.csv file found:

21-04-24 16h14m31s.csv


**Drop NA**

In [3]:
# Stack every loaded file into a single frame.
final_df = pd.concat(lst_df)

# Report the size and the per-column missing-value counts before cleaning.
shape_before = final_df.shape
nan_per_column = final_df.isna().sum()
print('Dataframe shape:', shape_before)
print('Total nan: \n\n', nan_per_column)

# Discard every row that has at least one missing value, then report the new size.
final_df.dropna(inplace=True)
print('\nDataframe shape:', final_df.shape)

Dataframe shape: (46000, 9)
Total nan: 

 Unnamed: 0      0
match_id        0
radiant_win     0
avg_mmr         0
duration        0
lobby_type      0
game_mode       0
radiant_team    0
dire_team       0
dtype: int64

Dataframe shape: (46000, 9)


**Remove duplicated rows**

In [4]:
# drop_duplicates returns a new frame unless inplace=True is given; without it
# the result is thrown away and the duplicated rows stay in final_df.
# inplace=True matches how this notebook already calls dropna/drop.
# Only the first row seen for each match_id is kept (pandas default keep='first').
final_df.drop_duplicates(subset=['match_id'], inplace=True)
print('\nDataframe shape:', final_df.shape)


Dataframe shape: (46000, 9)


**Adding a column for each hero on the radiant and dire teams**

In [5]:
# Target column names, one per slot: 'Radiant 1'..'Radiant 5' and 'Dire 1'..'Dire 5'.
radiant_columns = [f'Radiant {slot}' for slot in range(1, 6)]
dire_columns = [f'Dire {slot}' for slot in range(1, 6)]

# Split each comma-separated team string into one column per piece.
# The assignment needs the split to yield as many columns as there are names.
radiant_parts = final_df['radiant_team'].str.split(',', expand=True)
final_df[radiant_columns] = radiant_parts

dire_parts = final_df['dire_team'].str.split(',', expand=True)
final_df[dire_columns] = dire_parts

print('\nDataframe shape:', final_df.shape)


Dataframe shape: (46000, 19)


**Removing the 'Unnamed: 0', dire_team, radiant_team and match_id columns**

In [6]:
# Remove the columns that are not kept as features: 'Unnamed: 0' (presumably an
# index column saved with the raw CSV; confirm), the two raw team strings that
# were expanded into per-slot columns, and the match identifier.
final_df = final_df.drop(
    labels=['Unnamed: 0', 'dire_team', 'radiant_team', 'match_id'],
    axis='columns',
)
print('\nDataframe shape:', final_df.shape)


Dataframe shape: (46000, 15)


**Converting string to numerical**

In [7]:
# Cast the radiant_win flag to int first, then run pd.to_numeric over every column
# so the string pieces produced by the split become numbers.
final_df = (
    final_df
    .assign(radiant_win=lambda frame: frame['radiant_win'].astype(int))
    .apply(pd.to_numeric)
)

In [9]:
# Show the converted frame for a visual check; the notebook display elides the middle rows.
final_df

Unnamed: 0,radiant_win,avg_mmr,duration,lobby_type,game_mode,Radiant 1,Radiant 2,Radiant 3,Radiant 4,Radiant 5,Dire 1,Dire 2,Dire 3,Dire 4,Dire 5
0,1,3439,1649,7,22,121,119,2,13,70,129,83,80,63,30
1,1,3774,1848,0,22,109,10,108,128,84,123,69,67,73,71
2,1,3311,1951,7,22,129,81,35,88,5,32,135,37,121,126
3,1,3408,995,0,22,83,56,17,62,9,26,32,51,6,135
4,0,4621,1818,7,22,11,100,26,44,2,135,42,25,30,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45995,0,3414,2507,7,22,62,70,35,33,26,84,109,99,81,36
45996,1,3349,2156,7,22,26,121,22,102,70,43,8,30,38,75
45997,0,3448,2534,7,22,113,26,76,121,99,41,7,9,17,47
45998,1,3702,1650,7,3,123,11,57,18,88,109,112,37,97,96


**Saving the data frame to the 'working_data' folder**

In [10]:
working_data_path = '../data/working_data/'

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(working_data_path, exist_ok=True)

# The file name is prefixed with today's date, e.g. '2021-04-24_working_data.csv'.
start_file = datetime.now().strftime("%Y-%m-%d")
output_file = os.path.join(working_data_path, start_file + '_working_data.csv')

# index=False keeps the row index out of the file, so reloading it does not
# add an extra 'Unnamed: 0' column.
final_df.to_csv(output_file, index=False)