In [None]:
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

## Fixing columns

In [None]:
# Load the spreadsheet; the first column of the sheet is used as the row index.
DATASET_PATH = 'dataset.xlsx'
dataset = pd.read_excel(DATASET_PATH, index_col=0)

In [None]:
# Turn the 'Não Realizado' (Portuguese for "not performed") placeholder into NaN.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `dataset[col]` acts on a temporary Series under pandas copy-on-write and
# leaves `dataset` itself unchanged.
dataset['Urine - pH'] = dataset['Urine - pH'].replace('Não Realizado', np.nan)

In [None]:
# Store the 'Urine - pH' column as 64-bit floats; all other columns keep their dtype.
dataset = dataset.astype({'Urine - pH': 'float64'})

In [None]:
# Treat every 'not_done' marker, in any column, as a missing value.
dataset.replace(to_replace='not_done', value=np.nan, inplace=True)

In [None]:
# Store the '<1000' reading as '999' so the column no longer contains that
# non-numeric text.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `dataset[col]` acts on a temporary Series under pandas copy-on-write and
# leaves `dataset` itself unchanged.
dataset['Urine - Leukocytes'] = dataset['Urine - Leukocytes'].replace('<1000', '999')

In [None]:
# Store the 'Urine - Leukocytes' column as 64-bit floats; all other columns keep their dtype.
dataset = dataset.astype({'Urine - Leukocytes': 'float64'})

In [None]:
# Encode the target column: 'negative' -> 0, every other value -> 1.
# The encoding is applied only while the column is still non-numeric. Re-running
# the cell on already encoded data would otherwise turn every row into 1,
# because the integer 0 is not equal to the string 'negative'.
result_col = 'SARS-Cov-2 exam result'
if not pd.api.types.is_numeric_dtype(dataset[result_col]):
    dataset[result_col] = (dataset[result_col] != 'negative').astype('int64')

## Analysis of missing data

In [None]:
# Print the frame summary. `max_cols` is the column count above which pandas
# switches from the per-column listing to the truncated output.
MAX_INFO_COLUMNS = 111
dataset.info(max_cols=MAX_INFO_COLUMNS)

In [None]:
def plot_missing_data(missing_data, title):
    """Draw a bar chart of the missing-value share of each feature.

    Parameters
    ----------
    missing_data : pandas.DataFrame
        Frame whose index is plotted on the x axis and whose 'Percent'
        column is plotted on the y axis.
    title : str
        Title shown above the chart.

    Returns
    -------
    None
        The chart is drawn on a newly created 15x6 figure.
    """
    f, ax = plt.subplots(figsize=(15, 6))
    sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
    # The rotation is numeric (current matplotlib rejects the string '90') and is
    # set through tick_params after plotting, so it applies to the tick labels
    # that seaborn creates for the categorical axis.
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('Features', fontsize=15)
    ax.set_ylabel('Percent of missing values', fontsize=15)
    ax.set_title(title, fontsize=15)

In [None]:
# Missing values per feature over the whole dataset: absolute count ('Total')
# and fraction of rows ('Percent'), each sorted from most to least missing.
null_mask = dataset.isna()
total = null_mask.sum().sort_values(ascending=False)
percent = null_mask.sum().div(null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
plot_missing_data(missing_data, 'Percent missing data by feature')
missing_data.head(n=10)

In [None]:
# Features whose fraction of missing rows exceeds 0.998 (i.e. more than 99.8%).
columns_to_exclude = missing_data.index[missing_data['Percent'] > 0.998].tolist()
# errors='ignore' keeps this cell re-runnable: `missing_data` still lists these
# columns after they have been removed, so a second drop would otherwise raise
# KeyError.
dataset = dataset.drop(columns=columns_to_exclude, errors='ignore')

In [None]:
# Rows whose exam result equals 1.
dataset_positive = dataset.loc[dataset['SARS-Cov-2 exam result'].eq(1)]

# Missing values per feature within that subset: absolute count ('Total') and
# fraction of rows ('Percent'), each sorted from most to least missing.
null_mask_positive = dataset_positive.isna()
total_positive = null_mask_positive.sum().sort_values(ascending=False)
percent_positive = (
    null_mask_positive.sum().div(null_mask_positive.count()).sort_values(ascending=False)
)
missing_data_positive = pd.concat(
    [total_positive, percent_positive], axis=1, keys=['Total', 'Percent']
)

plot_missing_data(missing_data_positive, 'Percent positive missing data by feature')
missing_data_positive.head(n=10)

In [None]:
# Rows whose exam result equals 0.
dataset_negative = dataset.loc[dataset['SARS-Cov-2 exam result'].eq(0)]

# Missing values per feature within that subset: absolute count ('Total') and
# fraction of rows ('Percent'), each sorted from most to least missing.
null_mask_negative = dataset_negative.isna()
total_negative = null_mask_negative.sum().sort_values(ascending=False)
percent_negative = (
    null_mask_negative.sum().div(null_mask_negative.count()).sort_values(ascending=False)
)
missing_data_negative = pd.concat(
    [total_negative, percent_negative], axis=1, keys=['Total', 'Percent']
)

plot_missing_data(missing_data_negative, 'Percent negative missing data by feature')
missing_data_negative.head(n=10)

In [None]:
# Remove the 'Albumin' feature.
# NOTE(review): presumably motivated by the missing-data comparison above — confirm the rationale.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# drop would otherwise raise KeyError.
dataset = dataset.drop(columns=['Albumin'], errors='ignore')

## Correlation Matrix

In [None]:
# Absolute pairwise correlations between the numeric/boolean features.
# Non-numeric columns are excluded explicitly: since pandas 2.0 `DataFrame.corr()`
# no longer drops them silently and raises if any text column remains.
corrmat = dataset.select_dtypes(include=['number', 'bool']).corr().abs()

In [None]:
# Absolute correlation of every feature with the exam result
cor_target = corrmat["SARS-Cov-2 exam result"]
# Keep the features whose absolute correlation with the target exceeds 0.1
relevant_features = cor_target.index[cor_target > 0.1].tolist()

In [None]:
# One-row heatmap: absolute correlation of each relevant feature with the target.
target = "SARS-Cov-2 exam result"
f, ax = plt.subplots(figsize=(16, 8))
# The target row is selected by label. Taking the first row by position shows the
# target's correlations only when the target happens to be the first entry of
# `relevant_features`; otherwise a different feature's row would be plotted.
target_corr = dataset[relevant_features].corr().abs().loc[[target], :]
sns.heatmap(
    target_corr,
    yticklabels=[target],
    xticklabels=relevant_features,
    vmin=0.0,
    vmax=1.0,
    square=True,
    annot=True,
    cmap='RdPu',
    ax=ax,
)

## Negative and Positive Cases

In [None]:
# Number of rows in the positive and negative subsets.
nof_positive_cases = dataset_positive.shape[0]
nof_negative_cases = dataset_negative.shape[0]

In [None]:
# Pie chart comparing the number of positive and negative cases.
case_counts = [nof_positive_cases, nof_negative_cases]
case_labels = ['Positive cases', 'Negative cases']
wedge_colors = ['#c0ffd5', '#ffc0cb']

fig1, ax1 = plt.subplots()
ax1.pie(
    case_counts,
    labels=case_labels,
    colors=wedge_colors,
    autopct='%1.1f%%',
    startangle=90,
)
ax1.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle.