In [None]:
import random
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from scipy.stats import bernoulli,norm,geom,poisson,expon,binom
from warnings import filterwarnings
from sklearn import preprocessing

In [None]:
def _two_decimals(value):
    """Format a float with exactly two decimals (no scientific notation)."""
    return '%.2f' % value


# Silence library warnings so they do not clutter the cell outputs.
filterwarnings('ignore')
# Show floats in fixed-point form rather than scientific notation.
pd.set_option('display.float_format', _two_decimals)
# Font scale for every seaborn/matplotlib figure drawn afterwards.
sns.set(font_scale=1.5)
# Font used so that the Chinese titles on the charts can be rendered.
plt.rcParams.update({'font.sans-serif': ['SimHei']})
# Default canvas size.
plt.style.use({'figure.figsize': (24, 8)})

# 读取样本

In [None]:
# Load the national, per-state and per-county tables from their CSV files.
df_us, df_state, df_county = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/covid/us.csv',
        './dataset/covid/us-states.csv',
        './dataset/covid/us-counties.csv',
    )
)

# 日期类型转换并按日期排序

In [None]:
def _normalize_dates(frame):
    """Return a copy of ``frame`` with ``date`` as 'YYYY-MM-DD' text, sorted by date.

    The column is parsed with ``pd.to_datetime`` and rendered back to text with
    the vectorised ``.dt.strftime`` accessor instead of a per-row Python lambda.
    The original row labels are kept (the index is NOT reset).

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that has a ``date`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame, ascending by ``date``; the input frame is left untouched.
    """
    iso_dates = pd.to_datetime(frame['date']).dt.strftime('%Y-%m-%d')
    return frame.assign(date=iso_dates).sort_values('date', ascending=True)


# Re-binding the names (instead of mutating in place) keeps this cell idempotent.
df_us = _normalize_dates(df_us)
df_state = _normalize_dates(df_state)
df_county = _normalize_dates(df_county)

# 查看数据结构

In [None]:
# For each table: headline with DataFrame.size (rows x columns, i.e. the number
# of cells), the covered date range, then the column/dtype overview from .info().
for headline, frame in (
    ("国家汇总数据(%d)", df_us),
    ("州汇总数据(%d):", df_state),
    ("县汇总数据(%d):", df_county),
):
    print(headline % frame.size)
    dates = frame['date']
    print("时间范围:", dates.min(), dates.max())
    frame.info()

# 样本质量检查

In [None]:
# Null counts per column for each of the three tables.
for caption, frame in (
    ("国家数据空值检查", df_us),
    ("州数据空值检查", df_state),
    ("县数据空值检查", df_county),
):
    print(caption)
    print(frame.isnull().sum())

# Dates actually present in the national table, as 'YYYY-MM-DD' strings.
sample_ts = pd.to_datetime(df_us['date']).dt.strftime('%Y-%m-%d')
# Every calendar day between the first and the last observed date.
# DatetimeIndex.strftime replaces the deprecated Index.format(formatter=...).
idx = (
    pd.date_range(df_us.date.min(), df_us.date.max(), freq='D')
    .strftime('%Y-%m-%d')
    .tolist()
)
# Build the lookup once; the previous code rebuilt a list on every iteration
# and scanned it linearly, which is quadratic in the number of days.
observed_days = set(sample_ts)
for dt in idx:
    if dt not in observed_days:
        print("缺失日期：", dt)

# 检查数据是否正确 
分别统计国家、州、县数据的每天感染人数和死亡人数，然后再判断数据是否一致

In [None]:
def _mismatched_dates(left, right):
    """Return the rows (dates) on which two daily-total tables disagree.

    ``DataFrame.compare`` only accepts identically labelled frames, so both
    sides are first aligned on the union of their dates. A date missing from
    one side becomes NaN there and is reported as a difference instead of
    raising ``ValueError``.

    Parameters
    ----------
    left, right : pandas.DataFrame
        Tables indexed by date with the same columns.

    Returns
    -------
    pandas.DataFrame
        Output of ``left.compare(right)``: one row per date that differs.
    """
    left, right = left.align(right, join='outer', axis=0)
    return left.compare(right, align_axis=1, keep_shape=False, keep_equal=False)


# Daily totals of cases and deaths at each aggregation level. A single grouped
# sum replaces the row-by-row DataFrame.append loop: append was removed in
# pandas 2.0 and growing a frame one row at a time is quadratic.
gf_us = df_us.groupby('date')
test_us = gf_us[['cases', 'deaths']].sum().astype('int64')
gf_state = df_state.groupby('date')
test_state = gf_state[['cases', 'deaths']].sum().astype('int64')
gf_county = df_county.groupby('date')
test_county = gf_county[['cases', 'deaths']].sum().astype('int64')

# Report the number of dates whose totals differ. len() counts rows, whereas
# compare.values.size counted every cell of the self/other layout (NaN
# placeholders included) and therefore overstated the number of samples.
compare = _mismatched_dates(test_us, test_state)
print("国家数据VS州数据：不同的样本数 ", len(compare))
compare = _mismatched_dates(test_us, test_county)
print("国家数据VS县数据：不同的样本数 ", len(compare))

# 熟悉数据分布范围

In [None]:
# Summary statistics (count / mean / std / quantiles) for each table.
for caption, frame in (
    ("国家数据统计", df_us),
    ("州数据统计", df_state),
    ("县数据统计", df_county),
):
    print(caption)
    print(frame.describe())

# 数据集成

In [None]:
# --- National summary: cumulative totals plus day-over-day increments -------
df_summary = df_us.copy()
df_summary['case_increase'] = df_summary['cases'].diff()
df_summary['death_increase'] = df_summary['deaths'].diff()

# Number of distinct states reporting on each date (a Series indexed by date).
gf_state = df_state.groupby('date')
state_count = gf_state['state'].nunique(dropna=False)
# Attach the count by matching on the date itself. The previous
# pd.Series(list) assignment aligned on the positional row label and was only
# correct while both tables happened to cover exactly the same dates.
df_summary['state_count'] = df_summary['date'].map(state_count)
df_summary = df_summary.set_index('date')

# --- Per-state table with increments ----------------------------------------
# df_state holds all states interleaved by date, so a plain .diff() would
# subtract one state's total from another's. The difference must be taken
# within each state.
per_state = df_state.groupby('state')
df_total = pd.DataFrame({
    'date': df_state['date'],
    'cases': df_state['cases'],
    'deaths': df_state['deaths'],
    'case_increase': per_state['cases'].diff(),
    'death_increase': per_state['deaths'].diff(),
    'state': df_state['state'],
    'state_code': df_state['fips'],
})

# 单变量分析

In [None]:
# Day-over-day increments, stored as extra columns on the national table.
for total_col, delta_col in (('cases', 'case_increase'), ('deaths', 'death_increase')):
    df_us[delta_col] = df_us[total_col].diff()

# Font used so that the Chinese titles can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']

# 2x2 grid: totals on the top row, increments on the bottom row;
# cases in the left column, deaths in the right column.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
(ax_cases, ax_deaths), (ax_case_delta, ax_death_delta) = axes
row_labels = df_us.index  # x axis is the frame's row index, not the date column

ax = sns.lineplot(data=df_us, x=row_labels, y="cases", ax=ax_cases)
ax.set_title('感染人数趋势图')
ax.ticklabel_format(style='plain')  # plain tick labels instead of scientific notation

ax = sns.lineplot(data=df_us, x=row_labels, y="case_increase", ax=ax_case_delta)
ax.set_title('感染人数增长趋势图')

ax = sns.lineplot(data=df_us, x=row_labels, y="deaths", ax=ax_deaths)
ax.set_title('死亡人数趋势图')

ax = sns.lineplot(data=df_us, x=row_labels, y="death_increase", ax=ax_death_delta)
ax.set_title('死亡人数增长趋势图')

# 多变量分析

In [None]:
# Calendar month (1-12) of every per-state record.
# NOTE(review): if the data spans more than one year, the same month of
# different years is pooled into one column - confirm this is intended.
df_state['month'] = pd.to_datetime(df_state['date']).dt.month

# Mean case count per state and month. The string 'mean' is used because
# passing the np.mean callable emits a FutureWarning on pandas >= 2.1, where
# the callable's handling is scheduled to change.
pt = df_state.pivot_table(index='state', columns='month', values='cases', aggfunc='mean')

plt.style.use({'figure.figsize': (16, 6)})  # canvas size for the heatmap
cmap = sns.cubehelix_palette(start=1, rot=30, gamma=0.8, as_cmap=True)
sns.heatmap(pt, cmap=cmap, linewidths=0.01, annot=False)

In [None]:
# Pairwise correlation of the numeric columns only. Since pandas 2.0, corr()
# no longer drops non-numeric columns silently, so the text 'date' column
# would make a bare df_us.corr() raise. select_dtypes works on every version.
df_us.select_dtypes(include='number').corr()