# Анализ данных по COVID-19

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [18]:
# Load the raw COVID-19 observations from the CSV file; the relative path is
# resolved against the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="covid_19_data.csv")
# Preview the first ten rows to sanity-check the parsed columns.
data.head(n=10)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
5,6,01/22/2020,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0
6,7,01/22/2020,Guangxi,Mainland China,1/22/2020 17:00,2.0,0.0,0.0
7,8,01/22/2020,Guizhou,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
8,9,01/22/2020,Hainan,Mainland China,1/22/2020 17:00,4.0,0.0,0.0
9,10,01/22/2020,Hebei,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


In [5]:
# Size of the loaded frame as a (rows, columns) tuple.
data.shape

(156292, 8)

In [9]:
# Per-column dtype and non-null count; a non-null count below the number of
# rows marks a column with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156292 entries, 0 to 156291
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   SNo              156292 non-null  int64  
 1   ObservationDate  156292 non-null  object 
 2   Province/State   111979 non-null  object 
 3   Country/Region   156292 non-null  object 
 4   Last Update      156292 non-null  object 
 5   Confirmed        156292 non-null  float64
 6   Deaths           156292 non-null  float64
 7   Recovered        156292 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 9.5+ MB


In [10]:
# `columns` is a pandas Index holding the column labels; inspect it to spot
# naming artifacts.
data.columns

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
      dtype='object')

In [19]:
# Drop the columns that are not used in the analysis.
# `columns=` is the explicit spelling of `axis=1` (the default axis=0 would
# look the labels up among the rows instead). errors='ignore' keeps the cell
# idempotent: re-running it after the columns are already gone is a no-op
# instead of a KeyError.
# Note: never combine the assignment with inplace=True -- an in-place drop
# returns None, so `data = data.drop(..., inplace=True)` would replace the
# frame with None.
data = data.drop(columns=['SNo', 'Last Update'], errors='ignore')
data.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,0.0,0.0,0.0


In [15]:
# Preview the first rows of the current frame.
data.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,0.0,0.0,0.0


In [21]:
# Duplicate check: count the rows whose (date, province, country) key repeats
# an earlier row.
# TODO: list the affected rows with a boolean mask, e.g.
#   data[data.duplicated(subset=[...], keep=False)]
data.duplicated(
    subset=['ObservationDate', 'Province/State', 'Country/Region']
).sum()

4

In [22]:
# Remove duplicate rows, keeping the first occurrence of every
# (date, province, country) key.
data = data.drop_duplicates(
    subset=['ObservationDate', 'Province/State', 'Country/Region'],
    keep='first',
)

In [23]:
# Re-run the duplicate count on the same key columns; 0 means no repeated
# (date, province, country) keys remain.
data.duplicated(['ObservationDate', 'Province/State', 'Country/Region']).sum()

0

In [26]:
# Explore the distinct country/region names present in the data.
country_list = data['Country/Region'].unique()

# Print the names in sorted order, one per line.
# NOTE(review): the printed list shows artifacts such as ' Azerbaijan' (leading
# space) and "('St. Martin',)", plus spelling variants like 'Bahamas' and
# 'Bahamas, The' -- consider normalizing the names before grouping by country.
for country in np.sort(country_list):
    print(country)

 Azerbaijan
('St. Martin',)
Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas
Bahamas, The
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Brunei
Bulgaria
Burkina Faso
Burma
Burundi
Cabo Verde
Cambodia
Cameroon
Canada
Cape Verde
Cayman Islands
Central African Republic
Chad
Channel Islands
Chile
Colombia
Comoros
Congo (Brazzaville)
Congo (Kinshasa)
Costa Rica
Croatia
Cuba
Curacao
Cyprus
Czech Republic
Denmark
Diamond Princess
Djibouti
Dominica
Dominican Republic
East Timor
Ecuador
Egypt
El Salvador
Equatorial Guinea
Eritrea
Estonia
Eswatini
Ethiopia
Faroe Islands
Fiji
Finland
France
French Guiana
Gabon
Gambia
Gambia, The
Georgia
Germany
Ghana
Gibraltar
Greece
Greenland
Grenada
Guadeloupe
Guam
Guatemala
Guernsey
Guinea
Guinea-Bissau
Guyana
Haiti
Holy See
Honduras
Hong Kong
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Ivory Coast
Jamaic

In [27]:
# Inspect the rows filed under the 'Others' region using boolean masking:
# the comparison yields a True/False Series and .loc keeps the True rows.
data.loc[data['Country/Region'].eq('Others')]

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
933,02/07/2020,Cruise Ship,Others,61.0,0.0,0.0
1005,02/08/2020,Cruise Ship,Others,61.0,0.0,0.0
1077,02/09/2020,Diamond Princess cruise ship,Others,64.0,0.0,0.0
1143,02/10/2020,Diamond Princess cruise ship,Others,135.0,0.0,0.0
1215,02/11/2020,Diamond Princess cruise ship,Others,135.0,0.0,0.0
1286,02/12/2020,Diamond Princess cruise ship,Others,175.0,0.0,0.0
1359,02/13/2020,Diamond Princess cruise ship,Others,175.0,0.0,0.0
1433,02/14/2020,Diamond Princess cruise ship,Others,218.0,0.0,0.0
1506,02/15/2020,Diamond Princess cruise ship,Others,285.0,0.0,0.0
1578,02/16/2020,Diamond Princess cruise ship,Others,355.0,0.0,0.0


In [28]:
# Column labels of the current frame.
data.columns

Index(['ObservationDate', 'Province/State', 'Country/Region', 'Confirmed',
       'Deaths', 'Recovered'],
      dtype='object')

In [29]:
# Look at the observation-date column; the dtype in the output shows how the
# dates are currently stored.
data['ObservationDate']

0         01/22/2020
1         01/22/2020
2         01/22/2020
3         01/22/2020
4         01/22/2020
             ...    
156287    11/15/2020
156288    11/15/2020
156289    11/15/2020
156290    11/15/2020
156291    11/15/2020
Name: ObservationDate, Length: 156288, dtype: object

In [31]:
# Scratch check: parse a single month-first date string and keep only the
# calendar date. Restored from commented-out code so that the cell actually
# produces the output shown beneath it.
date_str = '11/15/2020'
pd.to_datetime(date_str).date()

datetime.date(2020, 11, 15)

In [32]:
# Convert the observation dates from strings to datetime64 values.
# The source strings are month-first (e.g. '11/15/2020'), so the format is
# spelled out: parsing is unambiguous and does not rely on format inference,
# and a string in any other layout raises instead of being guessed at.
data['ObservationDate'] = pd.to_datetime(data['ObservationDate'], format='%m/%d/%Y')
data['ObservationDate']

0        2020-01-22
1        2020-01-22
2        2020-01-22
3        2020-01-22
4        2020-01-22
            ...    
156287   2020-11-15
156288   2020-11-15
156289   2020-11-15
156290   2020-11-15
156291   2020-11-15
Name: ObservationDate, Length: 156288, dtype: datetime64[ns]

In [34]:
# Peek at the first converted value. `.iloc[0]` is positional; a bare `[0]`
# on a Series is a label lookup, which only works while a row labelled 0 is
# still present in the index.
data['ObservationDate'].iloc[0]

Timestamp('2020-01-22 00:00:00')

In [39]:
# Derive a calendar-date column (datetime.date objects) from the timestamps.
# The vectorized `.dt.date` accessor replaces the per-element
# `.apply(lambda element: element.date())` and yields the same values.
data['Date'] = data['ObservationDate'].dt.date

Timestamp('2020-01-22 00:00:00')

In [40]:
# Preview the first rows of the frame, including the derived Date column.
data.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered,Date
0,2020-01-22,Anhui,Mainland China,1.0,0.0,0.0,2020-01-22
1,2020-01-22,Beijing,Mainland China,14.0,0.0,0.0,2020-01-22
2,2020-01-22,Chongqing,Mainland China,6.0,0.0,0.0,2020-01-22
3,2020-01-22,Fujian,Mainland China,1.0,0.0,0.0,2020-01-22
4,2020-01-22,Gansu,Mainland China,0.0,0.0,0.0,2020-01-22


In [45]:
# Group by date and sum the counters to get one worldwide total per day.
# Only the three numeric counters are aggregated: a bare `.sum()` would also
# try to sum the datetime ObservationDate column and the string columns, which
# raises a TypeError on pandas versions where numeric_only defaults to False.
df_by_date = (
    data.groupby(['Date'])[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
)
df_by_date

Unnamed: 0,Date,Confirmed,Deaths,Recovered
0,2020-01-22,555.0,17.0,28.0
1,2020-01-23,653.0,18.0,30.0
2,2020-01-24,941.0,26.0,36.0
3,2020-01-25,1438.0,42.0,39.0
4,2020-01-26,2118.0,56.0,52.0
...,...,...,...,...
294,2020-11-11,52126793.0,1284472.0,33926351.0
295,2020-11-12,52740580.0,1293954.0,34149223.0
296,2020-11-13,53365400.0,1302556.0,34450332.0
297,2020-11-14,53927158.0,1311192.0,34726418.0


In [59]:
# Per-country statistics.
# The counters are cumulative (the worldwide daily totals only grow from day to
# day), so adding them up across all dates would count every case once per
# reporting day. Instead:
#   1. sum over provinces to get one total per country and date;
#   2. keep the most recent date's total for each country (groupby sorts the
#      keys, so `.last()` picks the latest date).
df_by_country = (
    data.groupby(['Country/Region', 'Date'])[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .groupby(level='Country/Region')
    .last()
)
# 'Country/Region' is the index of the result, not a column, so a single
# country is selected by label, e.g. df_by_country.loc['Russia']