In [14]:
import pandas as pd

def dataset_exploration(dataset_path: str, timeseries=False):
    """Print an exploratory overview of a CSV dataset.

    The report contains, in order: ``DataFrame.info()``, summary statistics
    from ``describe()``, per-column missing-value counts, the dataset shape,
    the column dtypes, value counts for every text-like/categorical column
    and, optionally, the number of distinct time series.

    Args:
        dataset_path: Path of the CSV file; passed straight to ``pd.read_csv``.
        timeseries: Label of the column that identifies each time series.
            Any falsy value (the default ``False``, ``None``, ``""``) skips
            the time-series count.

    Returns:
        None. The report is printed (and, in a notebook, displayed).

    Raises:
        KeyError: If ``timeseries`` is given but is not a column of the
            dataset. Raised before any part of the report is printed.
    """
    # Load the dataset
    df = pd.read_csv(dataset_path)

    # Look the id column up before printing anything, so a mistyped column
    # name fails immediately instead of after the whole report has been
    # produced (which can take a while on large files).
    series_ids = None
    if timeseries:
        try:
            series_ids = df[timeseries]
        except KeyError as exc:
            raise KeyError(
                f"time series column {timeseries!r} not found in "
                f"{dataset_path!r}; available columns: {list(df.columns)}"
            ) from exc

    # `display` is not imported in this notebook; Jupyter/IPython makes it
    # available. Fall back to plain printing when it is not defined so the
    # function also runs in a regular Python interpreter.
    try:
        render = display
    except NameError:
        render = print

    print("\nBasic Information about the dataset:")
    df.info()

    print("\nSummary statistics of the dataset:")
    render(df.describe())

    print("\nMissing values in each column:")
    print(df.isnull().sum())

    print(f"\n\nThe dataset has {df.shape[0]} rows and {df.shape[1]} columns.\n")

    print("\nData types of each column:")
    print(df.dtypes)

    # Treat object, dedicated string and `category` columns as categorical.
    # Checking each dtype explicitly avoids relying on how
    # `select_dtypes(include=['object'])` handles non-object string dtypes.
    dtype_checks = pd.api.types
    categorical_cols = [
        col
        for col, dtype in df.dtypes.items()
        if dtype_checks.is_object_dtype(dtype)
        or dtype_checks.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    ]
    if categorical_cols:
        print("\nDistribution of categorical variables:")
        for col in categorical_cols:
            print(f"\nValue counts for {col}:")
            print(df[col].value_counts())
    else:
        print("\nNo categorical variables found.")

    if series_ids is not None:
        # Count the number of unique time series
        num_time_series = series_ids.nunique()
        print(f"\nNumber of unique time series in the dataset: {num_time_series}")

In [10]:
# Explore heart.csv; no time-series id column is passed, so the
# unique-series count at the end of the report is skipped.
dataset_exploration(dataset_path='heart.csv')


Basic Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB

Summary statistics of the dataset:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,4.734219,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,1.939706,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0



Missing values in each column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64


The dataset has 303 rows and 14 columns.


Data types of each column:
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca          float64
thal        float64
num           int64
dtype: object

No categorical variables found.


In [11]:
# Explore breast.csv; no time-series id column is passed, so the
# unique-series count at the end of the report is skipped.
dataset_exploration(dataset_path='breast.csv')


Basic Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   radius1             569 non-null    float64
 1   texture1            569 non-null    float64
 2   perimeter1          569 non-null    float64
 3   area1               569 non-null    float64
 4   smoothness1         569 non-null    float64
 5   compactness1        569 non-null    float64
 6   concavity1          569 non-null    float64
 7   concave_points1     569 non-null    float64
 8   symmetry1           569 non-null    float64
 9   fractal_dimension1  569 non-null    float64
 10  radius2             569 non-null    float64
 11  texture2            569 non-null    float64
 12  perimeter2          569 non-null    float64
 13  area2               569 non-null    float64
 14  smoothness2         569 non-null    float64
 15  compactness2       

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075



Missing values in each column:
radius1               0
texture1              0
perimeter1            0
area1                 0
smoothness1           0
compactness1          0
concavity1            0
concave_points1       0
symmetry1             0
fractal_dimension1    0
radius2               0
texture2              0
perimeter2            0
area2                 0
smoothness2           0
compactness2          0
concavity2            0
concave_points2       0
symmetry2             0
fractal_dimension2    0
radius3               0
texture3              0
perimeter3            0
area3                 0
smoothness3           0
compactness3          0
concavity3            0
concave_points3       0
symmetry3             0
fractal_dimension3    0
Diagnosis             0
dtype: int64


The dataset has 569 rows and 31 columns.


Data types of each column:
radius1               float64
texture1              float64
perimeter1            float64
area1                 float64
smoothness1        

In [16]:
# Explore diabetes.csv and count the distinct series, using `patient_id`
# as the time-series id column.
dataset_exploration(dataset_path='diabetes.csv', timeseries='patient_id')


Basic Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29042 entries, 0 to 29041
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   datetime    29042 non-null  object
 1   patient_id  29042 non-null  int64 
 2   code        29042 non-null  int64 
 3   value       29042 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 907.7+ KB

Summary statistics of the dataset:


Unnamed: 0,patient_id,code,value
count,29042.0,29042.0,29042.0
mean,36.516115,46.373872,0.503168
std,20.119425,13.324152,0.798276
min,1.0,33.0,0.0
25%,21.0,33.0,0.0
50%,34.0,48.0,0.0
75%,55.0,60.0,1.0
max,70.0,71.0,4.0



Missing values in each column:
datetime      0
patient_id    0
code          0
value         0
dtype: int64


The dataset has 29042 rows and 4 columns.


Data types of each column:
datetime      object
patient_id     int64
code           int64
value          int64
dtype: object

Distribution of categorical variables:

Value counts for datetime:
datetime
1989-04-07 08:00:00    15
1989-04-22 08:00:00    14
1989-05-06 08:00:00    14
1989-04-30 08:00:00    13
1989-04-29 08:00:00    13
                       ..
1991-03-21 19:30:00     1
1990-07-13 21:58:00     1
1991-03-21 19:27:00     1
1990-07-13 22:00:00     1
1991-04-24 06:14:00     1
Name: count, Length: 14619, dtype: int64

Number of unique time series in the dataset: 70


In [15]:
# Explore baseline.csv and count the distinct series, using `user_id`
# as the time-series id column.
dataset_exploration(dataset_path='baseline.csv', timeseries='user_id')


Basic Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1442450 entries, 0 to 1442449
Data columns (total 13 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   time           1442450 non-null  int64  
 1   user_id        1442450 non-null  int64  
 2   session_id     1442450 non-null  int64  
 3   acc_var        742594 non-null   float64
 4   hr             347706 non-null   float64
 5   rmssd          282464 non-null   float64
 6   sdnn           432048 non-null   float64
 7   temp           746809 non-null   float64
 8   eda            748572 non-null   float64
 9   eda_freq       731813 non-null   float64
 10  bm             731813 non-null   float64
 11  location       1442450 non-null  int64  
 12  concentration  68891 non-null    float64
dtypes: float64(9), int64(4)
memory usage: 143.1 MB

Summary statistics of the dataset:


Unnamed: 0,time,user_id,session_id,acc_var,hr,rmssd,sdnn,temp,eda,eda_freq,bm,location,concentration
count,1442450.0,1442450.0,1442450.0,742594.0,347706.0,282464.0,432048.0,746809.0,748572.0,731813.0,731813.0,1442450.0,68891.0
mean,1418794000.0,7.867871,194.791,1.129791,0.50639,0.146566,0.075309,-0.721254,16.396688,0.206268,0.389753,1.480742,3.147799
std,5167419.0,4.239626,128.2366,3.524345,2.676866,1.541201,0.035139,3.495024,111.044135,0.132331,0.100061,0.8194515,1.158715
min,1414658000.0,1.0,1.0,-2.265828,-6.633171,-6.760859,0.01119,-36.57754,-4.968889,0.0,0.0,0.0,1.0
25%,1415304000.0,4.0,69.0,-0.885101,-0.902526,-0.953537,0.05181,-1.23262,-0.740645,0.096774,0.322581,1.0,2.0
50%,1416017000.0,7.0,171.0,0.0,0.0,0.0,0.06837,0.0,0.0,0.193548,0.387097,2.0,3.0
75%,1425693000.0,12.0,320.0,1.338536,1.136445,1.054243,0.08974,0.880239,2.819276,0.290323,0.451613,2.0,4.0
max,1429295000.0,16.0,480.0,24.47868,31.47322,12.35198,0.4344,6.427273,5295.721,0.83871,0.806452,2.0,5.0



Missing values in each column:
time                   0
user_id                0
session_id             0
acc_var           699856
hr               1094744
rmssd            1159986
sdnn             1010402
temp              695641
eda               693878
eda_freq          710637
bm                710637
location               0
concentration    1373559
dtype: int64


The dataset has 1442450 rows and 13 columns.


Data types of each column:
time               int64
user_id            int64
session_id         int64
acc_var          float64
hr               float64
rmssd            float64
sdnn             float64
temp             float64
eda              float64
eda_freq         float64
bm               float64
location           int64
concentration    float64
dtype: object

No categorical variables found.

Number of unique time series in the dataset: 16
