# Predict Covid Spread - Data Cleaning

In [150]:
import warnings
from os.path import exists
from pathlib import Path

import pandas as pd

# Silence every warning for the rest of the session (kept from the original cell).
warnings.filterwarnings('ignore')

# Inclusive bounds of the test window; every earlier date goes to the train set.
TEST_START = pd.Timestamp('2020-03-12')
TEST_END = pd.Timestamp('2020-04-23')

# Read the raw file, drop the columns that are not used and shorten the
# remaining headers. Chained so that re-running the cell never mutates a
# frame left over from a previous run.
data = (
    pd.read_csv('archive/covid_19_clean_complete.csv', parse_dates=['Date'])
    .drop(columns=['Lat', 'Long', 'WHO Region'])
    .rename(columns={'Province/State': 'State',
                     'Country/Region': 'Country'})
)

# Collapse the per-state rows into one row per (Country, Date).
# numeric_only=True keeps the textual 'State' column out of the sum on every
# pandas version; without it, recent pandas no longer drops that column silently.
data = (
    data.groupby(['Country', 'Date'], as_index=False)
    .sum(numeric_only=True)
    .sort_values(by=['Date', 'Country'])
    .reset_index(drop=True)
)

# Date-based train/test split. Masks select the same rows as positional
# slicing on the date-sorted frame, but do not raise an IndexError when an
# exact boundary date is missing from the data.
is_train = data['Date'] < TEST_START
is_test = data['Date'].between(TEST_START, TEST_END)

train = data.loc[is_train].reset_index(drop=True)
test = data.loc[is_test].reset_index(drop=True)

assert not test.empty, 'no rows fall inside the test window'

train

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active
0,Afghanistan,2020-01-22,0,0,0,0
1,Albania,2020-01-22,0,0,0,0
2,Algeria,2020-01-22,0,0,0,0
3,Andorra,2020-01-22,0,0,0,0
4,Angola,2020-01-22,0,0,0,0
...,...,...,...,...,...,...
9345,West Bank and Gaza,2020-03-11,30,0,0,30
9346,Western Sahara,2020-03-11,0,0,0,0
9347,Yemen,2020-03-11,0,0,0,0
9348,Zambia,2020-03-11,0,0,0,0


In [151]:
# Display the test split built in the previous cell.
test

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active
0,Afghanistan,2020-03-12,7,0,0,7
1,Albania,2020-03-12,23,1,0,22
2,Algeria,2020-03-12,24,1,8,15
3,Andorra,2020-03-12,1,0,1,0
4,Angola,2020-03-12,0,0,0,0
...,...,...,...,...,...,...
8036,West Bank and Gaza,2020-04-23,336,2,74,260
8037,Western Sahara,2020-04-23,6,0,5,1
8038,Yemen,2020-04-23,1,0,0,1
8039,Zambia,2020-04-23,76,3,37,36


In [152]:
# Export the train and test splits as CSV files.
file_path = '[Cleaned] Week 1 Forecasting Data/train.csv'
file_path2 = '[Cleaned] Week 1 Forecasting Data/test.csv'

for frame, target in ((train, file_path), (test, file_path2)):
    # to_csv does not create missing parent folders, so make sure the
    # output directory exists before writing.
    Path(target).parent.mkdir(parents=True, exist_ok=True)

    # An existing export is left untouched; delete the file by hand to
    # regenerate it after changing the cleaning steps.
    if not exists(target):
        frame.to_csv(target)