In [1]:
import pandas as pd
from datetime import datetime
import os

**Searching for each .csv file in the 'raw_data' folder**

In [2]:
data_path = '../data/raw_data'
lst_df = []
for root, dirs, files in os.walk(data_path):
    for filename in files:
        xlsx_file, file_extension = os.path.splitext(filename)
        print('.csv file found:\n')
        if(file_extension == '.csv'):
            print(filename)
            file_path = root +'\\' + filename
            lst_df.append(pd.read_csv(file_path))  

.csv file found:

data.csv
.csv file found:

.csv file found:

.csv file found:



**Drop NA**

In [3]:
final_df = pd.concat(lst_df)

print('Dataframe shape:', final_df.shape)
print('Total nan: \n\n', final_df.isna().sum())

final_df.dropna(inplace=True)
print('\nDataframe shape:', final_df.shape)

Dataframe shape: (113400, 9)
Total nan: 

 Unnamed: 0      0
match_id        0
radiant_win     0
avg_mmr         0
duration        0
lobby_type      0
game_mode       0
radiant_team    0
dire_team       0
dtype: int64

Dataframe shape: (113400, 9)


**Remove duplicated rows**

In [4]:
final_df.drop_duplicates(subset=['match_id'])
print('\nDataframe shape:', final_df.shape)


Dataframe shape: (113400, 9)


**Adding column for each hero on radiant and dire team**

In [5]:
radiant_columns = ['Radiant 1','Radiant 2', 'Radiant 3', 'Radiant 4', 'Radiant 5']
dire_columns = ['Dire 1','Dire 2', 'Dire 3', 'Dire 4', 'Dire 5']

final_df[radiant_columns] = final_df.radiant_team.str.split(",",expand=True,)
final_df[dire_columns] = final_df.dire_team.str.split(",",expand=True,)
print('\nDataframe shape:', final_df.shape)


Dataframe shape: (113400, 19)


**Removing Unnamed and dire_team and radiant_team column**

In [6]:
final_df.drop(columns=['Unnamed: 0', 'dire_team', 'radiant_team', 'match_id'], inplace=True)
print('\nDataframe shape:', final_df.shape)


Dataframe shape: (113400, 15)


**Converting string to numerical**

In [7]:
final_df['radiant_win'] = final_df['radiant_win'].astype(int)
final_df = final_df.apply(pd.to_numeric)

In [8]:
final_df

Unnamed: 0,radiant_win,avg_mmr,duration,lobby_type,game_mode,Radiant 1,Radiant 2,Radiant 3,Radiant 4,Radiant 5,Dire 1,Dire 2,Dire 3,Dire 4,Dire 5
0,0,4104,2147,7,22,94,67,112,5,16.0,14,90,52,1,121
1,0,6421,1526,7,22,72,9,58,89,16.0,79,25,12,119,120
2,1,5062,1323,7,22,111,48,53,101,26.0,31,41,10,88,62
3,1,3943,1956,7,22,41,83,74,123,114.0,109,22,98,107,30
4,0,3515,2087,7,3,59,112,4,41,31.0,88,11,96,110,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113395,0,3719,1979,7,3,97,101,62,1,110.0,112,17,70,16,119
113396,0,3976,822,0,22,7,63,22,64,10.0,86,11,87,3,54
113397,0,3699,2124,7,22,108,32,112,7,74.0,48,71,20,10,111
113398,0,3757,1670,7,22,14,27,28,1,113.0,3,73,9,87,129


**Saving data frame on 'working data' folder**

In [9]:
working_data_path = '../data/working_data/'
start_file = datetime.now().strftime("%Y-%m-%d")
output_file = working_data_path + start_file + '_working_data.csv'

final_df.to_csv(output_file, index=False)