# Template notebook

It's good to start with an introduction, to set the scene and introduce your audience to the data, and the problem you're solving as a team.

<br>

## Libraries
As always, we'll start by importing the necessary libraries.

In [6]:
# Import Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Question / Task 1**

Insert context about question / task 1 here.

In [7]:
# Read in the data for the hackathon

df = pd.read_csv('data/corona_tested_individuals_ver_006.english.csv')

#df = pd.read_csv("data/rental_data.csv")

In [8]:
df.head()

In [9]:
#define a function to summarise columns for a given dataframe:

def summarise_columns(input_df):
    """Build a per-column quality summary of ``input_df``.

    Returns a DataFrame indexed by the column names of ``input_df`` with
    four columns:

    * ``null_count``      - number of missing values in the column
    * ``null_proportion`` - ``null_count`` divided by the number of rows
    * ``count_unique``    - result of ``DataFrame.nunique`` for the column
    * ``data_type``       - dtype of the column
    """
    n_rows = len(input_df)
    return (
        input_df.isnull().sum()
        .to_frame('null_count')
        .assign(
            null_proportion=lambda summary: summary['null_count'] / n_rows,
            count_unique=input_df.nunique(),
            data_type=input_df.dtypes,
        )
    )

#define a function to show different values for each column

def value_counts_summary(input_df, factor_name, sort_index=False, max_levels=1000, dropna=True):
    """Tabulate the levels of one column with counts and proportions.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the column to summarise.
    factor_name : str
        Name of the column whose levels are counted.
    sort_index : bool, default False
        If True, rows are ordered by level value; otherwise they keep the
        order returned by ``Series.value_counts``.
    max_levels : int, default 1000
        Maximum number of rows returned (applied after the cumulative
        proportion has been computed).
    dropna : bool, default True
        Passed to ``Series.value_counts``. Set to False to report missing
        values as a level of their own.

    Returns
    -------
    pandas.DataFrame
        Columns ``[factor_name, 'count', 'proportion', 'cumul_proportion']``.
        ``proportion`` is ``count / len(input_df)``, i.e. a share of *all*
        rows, so with ``dropna=True`` the proportions sum to less than 1
        when the column contains missing values.
    """
    level_counts = input_df[factor_name].value_counts(dropna=dropna)
    if sort_index:
        level_counts = level_counts.sort_index()

    # Name the series and its index explicitly: older pandas versions leave the
    # value_counts index unnamed, which would surface as an 'index' column.
    summary_df = (
        level_counts
        .rename('count')
        .rename_axis(factor_name)
        .to_frame()
    )
    summary_df['proportion'] = summary_df['count'] / len(input_df)
    summary_df['cumul_proportion'] = summary_df['proportion'].cumsum()

    return summary_df.reset_index().head(max_levels)


In [10]:
value_counts_summary(df,'age_60_and_above')

In [11]:
summarise_columns(df)

In [12]:
#look at time period for test_date
#noticed that this spans a longer time period than in the paper (22nd March to 7th April)

value_counts_summary(df,'test_date', sort_index = True)

In [13]:
#convert test date to datetime

df['test_date'] = pd.to_datetime(df['test_date'])

In [14]:
list_columns = df.columns.to_list()

In [15]:
# For every column: its name, the level summary ordered by level value,
# then a dashed rule to separate it from the next column.
for column in list_columns:
    print(
        column,
        value_counts_summary(df, column, sort_index=True),
        '-' * 20,
        sep='\n',
    )

In [16]:
# Slice df by test_date into four consecutive periods around the date ranges
# attributed to the paper. inclusive='right' drops the lower bound and keeps
# the upper one:
#   train/validation: 2020-03-21 < test_date <= 2020-03-31
#   test:             2020-03-31 < test_date <= 2020-04-07
df_paper_pre_train = df.loc[df['test_date'] < '2020-03-22']
df_paper_train = df.loc[df['test_date'].between('2020-03-21', '2020-03-31', inclusive='right')]
df_paper_test = df.loc[df['test_date'].between('2020-03-31', '2020-04-07', inclusive='right')]
df_paper_post_test = df.loc[df['test_date'] > '2020-04-07']


In [17]:
# Print the row count first: a notebook only renders the value of the last
# expression in a cell, so the summary table must come last to be displayed.
print(len(df_paper_test))

value_counts_summary(df_paper_test, 'test_date', sort_index=True)

In [18]:
# Record counts quoted in the paper for the same date ranges.
paper_train_records = 51831
paper_test_records = 47401

print(f' shape of train-validation date range used in paper {df_paper_train.shape}')
print(f' shape of test date range used in paper {df_paper_test.shape}')

print(f'records in train-validation quoted in paper: {paper_train_records}')
print(f'records in test quoted in paper: {paper_test_records}')

# Report the measured gap instead of asserting that it is small.
print(f'difference vs paper (train-validation): {len(df_paper_train) - paper_train_records}')
print(f'difference vs paper (test): {len(df_paper_test) - paper_test_records}')
print('any difference could be due to a later revision of the dataset')

In [19]:
#summarise_columns

df_paper_pre_train['test_indication']

#summarise_columns(df_paper_pre_train)


In [20]:
summarise_columns(df_paper_train)



In [21]:
summarise_columns(df_paper_test)

In [22]:
summarise_columns(df_paper_post_test)

In [23]:
df['corona_result'].value_counts(normalize = True)

In [24]:
list_test_periods = [
df_paper_pre_train, 
df_paper_train, 
df_paper_test, 
df_paper_post_test
]

In [25]:
# For each period: its earliest test date, the normalised distribution of
# corona_result, then a dashed rule.
for period in list_test_periods:
    print(
        f'period from: {period.test_date.min()}',
        period['corona_result'].value_counts(normalize=True),
        '-' * 20,
        sep='\n',
    )

In [31]:
#define a function that splits data as per the paper (default)
#alternatively split the data between train and test at a single point
#the end dates are the last dates that are included in the split
#include data only from 22nd March

def get_data(df, source='paper', train_end='2020-03-31', test_end='2020-04-30'):
    """Split ``df`` into a train/validation frame and a test frame by ``test_date``.

    With ``source == 'paper'`` the boundaries are fixed and ``train_end`` /
    ``test_end`` are ignored:

    * train/validation: ``2020-03-21 < test_date <= 2020-03-31``
    * test:             ``2020-03-31 < test_date <= 2020-04-07``

    Any other ``source`` value uses the caller's ``train_end`` and
    ``test_end`` with the same ``2020-03-21`` (exclusive) lower bound:

    * train/validation: ``2020-03-21 < test_date <= train_end``
    * test:             ``train_end < test_date <= test_end``

    Returns the tuple ``(df_trainvalidation, df_test)``.
    """
    if source == 'paper':
        train_end, test_end = '2020-03-31', '2020-04-07'

    test_dates = df['test_date']
    in_train = test_dates.between('2020-03-21', train_end, inclusive='right')
    in_test = test_dates.between(train_end, test_end, inclusive='right')
    return df[in_train], df[in_test]

    


In [32]:
#use function to split dataset as per paper

df_train,df_test = get_data(df,source = 'paper')

In [33]:
value_counts_summary(df_train,'test_date',sort_index = True)

In [34]:
value_counts_summary(df_test,'test_date',sort_index = True)


# Exploratory Data Analysis


In [42]:
# Week-commencing date: step each test_date back to the Monday of its week.
# dt.dayofweek is 0 for Monday ... 6 for Sunday, i.e. the ISO weekday minus 1.
df['test_date_wc'] = df['test_date'] - pd.to_timedelta(df['test_date'].dt.dayofweek, unit='D')

In [43]:
df[['test_date','test_date_wc']]

In [44]:
sns.histplot(data = df,x ='test_date_wc')

#seaborn.histplot(data=None, *, x=None, y=None, hue=None, weights=None, stat='count', bins='auto', binwidth=None, binrange=None, discrete=None, cumulative=False, common_bins=True, common_norm=True, multiple='layer', element='bars', fill=True, shrink=1, kde=False, kde_kws=None, line_kws=None, thresh=0, pthresh=None, pmax=None, cbar=False, cbar_ax=None, cbar_kws=None, palette=None, hue_order=None, hue_norm=None, color=None, log_scale=None, legend=True, ax=None, **kwargs)

In [56]:
list_features = df.drop(columns = ['test_date','corona_result']).columns.to_list()
#print(list_features)

In [57]:
#produce histogram plots 

sns.set_style("darkgrid")

# Two panels per row. The number of rows is the ceiling of
# len(list_features) / 2, so the figure holds exactly the panels drawn
# instead of leaving its bottom half empty.
n_cols = 2
n_rows = -(-len(list_features) // n_cols)

plt.figure(figsize=(14, n_rows * 3))
for idx, feature in enumerate(list_features, 1):
    plt.subplot(n_rows, n_cols, idx)
    # One histogram per feature, coloured by test result.
    sns.histplot(data=df, x=feature, kde=False, hue='corona_result')
    plt.title(f"{feature}")

plt.tight_layout()
plt.show()

In [50]:
sns.histplot(data = df,x = 'gender', kde=False,hue = 'corona_result')

In [59]:
df_positive = df[df['corona_result']=='positive']

In [60]:
#look at distributions only where positive covid test results.

sns.set_style("darkgrid")

# Two panels per row. The number of rows is the ceiling of
# len(list_features) / 2, so the figure holds exactly the panels drawn
# instead of leaving its bottom half empty.
n_cols = 2
n_rows = -(-len(list_features) // n_cols)

plt.figure(figsize=(14, n_rows * 3))
for idx, feature in enumerate(list_features, 1):
    plt.subplot(n_rows, n_cols, idx)
    # Same layout as the full-data grid, restricted to df_positive.
    sns.histplot(data=df_positive, x=feature, kde=False, hue='corona_result')
    plt.title(f"{feature}")

plt.tight_layout()
plt.show()

In [66]:
#replicating figures in paper re: biased reporting
#of those who report positive for a symptom, what proportion had covid.
#query whether appropriate, i.e. if positive is only confirmed on positive test, strong target leakage

corona_mask = df_paper_train.corona_result == 'positive'

# Rows of df_paper_train where each symptom indicator equals 1.0.
fever_mask = df_paper_train.fever == 1.0
cough_mask = df_paper_train.cough == 1.0
head_ache_mask = df_paper_train.head_ache == 1.0

# Share of positive results among the rows reporting each symptom. Each
# figure is labelled so the printed numbers can be told apart, and a symptom
# that nobody reported yields nan instead of raising ZeroDivisionError.
for symptom, symptom_mask in [
    ('fever', fever_mask),
    ('cough', cough_mask),
    ('head_ache', head_ache_mask),
]:
    n_reported = int(symptom_mask.sum())
    n_positive = int((corona_mask & symptom_mask).sum())
    share = n_positive / n_reported if n_reported else float('nan')
    print(f'{symptom}: {share}')
 

In [None]:
# Fever-only version of the symptom check: share of positive results among
# the rows of df_paper_train whose fever indicator equals 1.0.
# Rows whose corona_result is 'positive'.
corona_mask = df_paper_train.corona_result == 'positive'

# Rows where the fever indicator equals 1.0.
fever_mask =  df_paper_train.fever == 1.0

# (positive AND fever) row count divided by the fever row count; raises
# ZeroDivisionError if no row has fever == 1.0.
df_paper_train.loc[corona_mask & fever_mask].shape[0] / df_paper_train.loc[fever_mask].shape[0]

In [67]:
# class imbalance in the target

sns.histplot(data = df,x = 'corona_result', kde=False)

In [68]:
sns.histplot(data = df_paper_train,x = 'corona_result', kde=False)

In [70]:
#import datacompy

In [78]:
# Normalised distribution of corona_result in the train period as a two-column
# frame. The series and its index are named explicitly so the result does not
# depend on how the installed pandas version names value_counts output.
df_summary_test = (
    df_paper_train['corona_result']
    .value_counts(normalize=True)
    .rename('proportion')
    .rename_axis('corona_result')
    .reset_index()
)

In [79]:
df_summary_test 

In [108]:
def compare_features(df1, df2, feature_name):
    """Compare the level distribution of one column between two frames.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``feature_name``.
    feature_name : str
        Column whose normalised value counts are compared. Missing values
        are counted as a level (``dropna=False``).

    Returns
    -------
    pandas.DataFrame
        One row per level found in either frame (outer merge), with columns
        ``[feature_name, 'proportion_x', 'proportion_y']`` holding the share
        of rows at that level in ``df1`` and ``df2`` respectively. A level
        absent from one frame has NaN in that frame's column.
    """
    def _level_proportions(frame):
        # Name the series and its index explicitly so the layout does not rely
        # on the naming conventions of a particular pandas version.
        return (
            frame[feature_name]
            .value_counts(normalize=True, dropna=False)
            .rename('proportion')
            .rename_axis(feature_name)
            .reset_index()
        )

    return _level_proportions(df1).merge(
        _level_proportions(df2), on=feature_name, how='outer'
    )

In [109]:
compare_features(df_paper_train,df_paper_test,'corona_result')

In [98]:
print(compare_features(df_paper_train,df_paper_train,'gender'))

In [100]:
#compare proportions in train and test datasets

# For every column: its name, the df_train vs df_test proportion table,
# then a dashed rule.
for column in list_columns:
    print(
        column,
        compare_features(df_train, df_test, column),
        '-' * 40,
        sep='\n',
    )

In [101]:
# For every column: its name, the df_paper_train vs df_paper_post_test
# proportion table, then a dashed rule.
for column in list_columns:
    print(
        column,
        compare_features(df_paper_train, df_paper_post_test, column),
        '-' * 40,
        sep='\n',
    )