In [62]:
from pyspark.sql import SparkSession

In [63]:
# Create (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("HeartDiseaseAnalysis")
    .getOrCreate()
)

In [21]:
try:
    # Path to the dataset (assumed to be a CSV file with a header row)
    file_path = "heart.csv"

    # Load the data into a PySpark DataFrame, letting Spark infer column types
    df = spark.read.csv(file_path, header=True, inferSchema=True)

    # Sanity check: display the first 5 rows
    df.show(5)

except Exception as e:
    print(f"Došlo je do pogreške prilikom učitavanja podataka: {e}")
    # Re-raise after reporting: every later cell needs `df`, so continuing
    # would either fail with a NameError or silently reuse a stale `df`
    # left in the kernel by an earlier run.
    raise

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 52|  1|  0|     125| 212|  0|      1|    168|    0|    1.0|    2|  2|   3|     0|
| 53|  1|  0|     140| 203|  1|      0|    155|    1|    3.1|    0|  0|   3|     0|
| 70|  1|  0|     145| 174|  0|      1|    125|    1|    2.6|    0|  0|   3|     0|
| 61|  1|  0|     148| 203|  0|      1|    161|    0|    0.0|    2|  1|   3|     0|
| 62|  0|  0|     138| 294|  1|      1|    106|    0|    1.9|    1|  3|   2|     0|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 5 rows



In [6]:
# Print the column names and the data types Spark inferred while reading the CSV
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [16]:
# Total number of rows in the loaded DataFrame
df.count()

1025

In [30]:
from pyspark.sql.functions import count, col

# Total number of rows, duplicates included
total_rows = df.count()

# Number of rows that remain once fully identical rows are collapsed
unique_rows = df.distinct().count()

# Every occurrence beyond the first one counts as a duplicate
duplicate_rows = total_rows - unique_rows
print(f"Broj duplikata: {duplicate_rows}")


Broj duplikata: 723


In [31]:
# Group on every column so identical rows fall into one group,
# then keep only the groups that occur more than once
duplicates = (
    df.groupBy(*df.columns)
    .count()
    .filter(col("count") > 1)
)
duplicates.show()


+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+-----+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|count|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+-----+
| 56|  1|  1|     130| 221|  0|      0|    163|    0|    0.0|    2|  0|   3|     1|    4|
| 60|  1|  0|     125| 258|  0|      0|    141|    1|    2.8|    1|  1|   3|     0|    3|
| 56|  1|  0|     130| 283|  1|      0|    103|    1|    1.6|    0|  0|   3|     0|    4|
| 54|  1|  1|     108| 309|  0|      1|    156|    0|    0.0|    2|  0|   3|     1|    4|
| 52|  1|  0|     108| 233|  1|      1|    147|    0|    0.1|    2|  3|   3|     1|    3|
| 43|  1|  0|     120| 177|  0|      0|    120|    1|    2.5|    1|  0|   3|     0|    4|
| 51|  0|  0|     130| 305|  0|      1|    142|    1|    1.2|    1|  0|   3|     0|    3|
| 55|  0|  1|     135| 250|  0|      0|    161|    0|    1.4|    1|  0|   2|     1|    3|
| 58|  0| 

In [32]:
# We deliberately keep the duplicate rows (de-duplication stays disabled below)
#df = df.dropDuplicates()
# Number of distinct row patterns that appear more than once
duplicates.count()

302

In [33]:
# Confirm the row count is unchanged, since no duplicates were dropped
df.count()

1025

In [36]:
from pyspark.sql.functions import col, sum

# Count the missing (null) values in each column.
# `sum` here is the Spark aggregate imported in this cell, not Python's built-in.
missing_counts = df.select(
    *(sum(col(name).isNull().cast("int")).alias(name) for name in df.columns)
)

missing_counts.show()


+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|  0|  0|  0|       0|   0|  0|      0|      0|    0|      0|    0|  0|   0|     0|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [43]:
from pyspark.sql.functions import col, expr
# Continuous columns that are screened for outliers
numerical_columns = ["age", "trestbps", "chol", "thalach", "oldpeak"]


def detect_outliers(df, column, iqr_multiplier=1.5, relative_error=0.01):
    """Find the rows of `df` whose `column` value lies outside the IQR fences.

    The fences are ``q1 - iqr_multiplier * iqr`` and ``q3 + iqr_multiplier * iqr``,
    where q1/q3 are the approximate 25th/75th percentiles of the column.

    Args:
        df: DataFrame to inspect.
        column: Name of the numerical column to analyse.
        iqr_multiplier: Width of the fences in IQR units (default 1.5).
        relative_error: Relative error passed to ``approxQuantile``
            (default 0.01).

    Returns:
        Tuple ``(outliers_df, lower_bound, upper_bound)`` where ``outliers_df``
        holds the rows strictly below the lower or strictly above the upper
        fence.

    Raises:
        ValueError: If no quantiles could be computed for the column.
    """
    quantiles = df.approxQuantile(column, [0.25, 0.75], relative_error)
    # Fail with a readable message instead of an opaque tuple-unpacking error
    # when the quantile computation yields nothing.
    if len(quantiles) != 2:
        raise ValueError(
            f"Nije moguće izračunati kvartile za kolonu '{column}': "
            "nema dostupnih vrijednosti."
        )
    q1, q3 = quantiles

    iqr = q3 - q1  # Interquartile range
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr

    # Keep only the rows outside the fences
    outliers_df = df.filter((col(column) < lower_bound) | (col(column) > upper_bound))
    return outliers_df, lower_bound, upper_bound

# Report the IQR fences and the outlier count for each numerical column
for column in numerical_columns:
    outliers_df, lower, upper = detect_outliers(df, column)
    print(
        f"Outlieri za '{column}':",
        f"  Donja granica: {lower}, Gornja granica: {upper}",
        f"  Broj outliera: {outliers_df.count()}\n",
        sep="\n",
    )

# Because this is heart-disease data, the outliers may be highly relevant:
# extreme values often belong to patients with more serious health
# conditions.

Outlieri za 'age':
  Donja granica: 28.5, Gornja granica: 80.5
  Broj outliera: 0

Outlieri za 'trestbps':
  Donja granica: 90.0, Gornja granica: 170.0
  Broj outliera: 30

Outlieri za 'chol':
  Donja granica: 116.5, Gornja granica: 368.5
  Broj outliera: 16

Outlieri za 'thalach':
  Donja granica: 82.5, Gornja granica: 214.5
  Broj outliera: 4

Outlieri za 'oldpeak':
  Donja granica: -2.4000000000000004, Gornja granica: 4.0
  Broj outliera: 17



In [45]:
# Rows whose age is negative or above 120
outliers_age = df.where(~col('age').between(0, 120))
outliers_age.show()

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [47]:
# List every distinct value present in the 'sex' column
df.select(col('sex')).distinct().show()

# Rows where 'sex' is anything other than 0 or 1
invalid_sex = df.where(~col('sex').isin(0, 1))
invalid_sex.show()


+---+
|sex|
+---+
|  1|
|  0|
+---+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [48]:
# List every distinct value present in the 'cp' column
df.select(col('cp')).distinct().show()

# Rows where 'cp' is anything other than 0, 1, 2 or 3
invalid_cp = df.where(~col('cp').isin(0, 1, 2, 3))
invalid_cp.show()


+---+
| cp|
+---+
|  1|
|  3|
|  2|
|  0|
+---+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [49]:
# Summary statistics for 'trestbps' (count, mean, stddev, min, max)
df.describe('trestbps').show()

# Rows where 'trestbps' lies outside the expected range (e.g. 50-250)
invalid_trestbps = df.where(~col('trestbps').between(50, 250))
invalid_trestbps.show()


+-------+------------------+
|summary|          trestbps|
+-------+------------------+
|  count|              1025|
|   mean|131.61170731707318|
| stddev|17.516718005376408|
|    min|                94|
|    max|               200|
+-------+------------------+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [50]:
# Summary statistics for 'chol' (count, mean, stddev, min, max)
df.describe('chol').show()

# Rows with a negative 'chol' value (only negatives are flagged here;
# no upper limit is applied by this check)
invalid_chol = df.where(col('chol') < 0)
invalid_chol.show()


+-------+-----------------+
|summary|             chol|
+-------+-----------------+
|  count|             1025|
|   mean|            246.0|
| stddev|51.59251020618203|
|    min|              126|
|    max|              564|
+-------+-----------------+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [51]:
# List every distinct value present in the 'fbs' column
df.select(col('fbs')).distinct().show()

# Rows where 'fbs' is anything other than 0 or 1
invalid_fbs = df.where(~col('fbs').isin(0, 1))
invalid_fbs.show()


+---+
|fbs|
+---+
|  1|
|  0|
+---+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [52]:
# List every distinct value present in the 'restecg' column
df.select(col('restecg')).distinct().show()

# Rows where 'restecg' is anything other than 0, 1 or 2
invalid_restecg = df.where(~col('restecg').isin(0, 1, 2))
invalid_restecg.show()


+-------+
|restecg|
+-------+
|      1|
|      2|
|      0|
+-------+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [53]:
# List the distinct values present in the 'thalach' column
df.select(col('thalach')).distinct().show()

# Rows where 'thalach' is below 70 or above 220
invalid_thalach = df.where(~col('thalach').between(70, 220))
invalid_thalach.show()


+-------+
|thalach|
+-------+
|    148|
|    137|
|    133|
|    155|
|    108|
|    115|
|    126|
|    192|
|    159|
|    103|
|    128|
|    122|
|    157|
|    190|
|    111|
|    140|
|    177|
|    152|
|    132|
|    185|
+-------+
only showing top 20 rows

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [54]:
# List every distinct value present in the 'exang' column
df.select(col('exang')).distinct().show()

# Rows where 'exang' is anything other than 0 or 1
invalid_exang = df.where(~col('exang').isin(0, 1))
invalid_exang.show()


+-----+
|exang|
+-----+
|    1|
|    0|
+-----+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [55]:
# List the distinct values present in the 'oldpeak' column
df.select(col('oldpeak')).distinct().show()

# Rows where 'oldpeak' is negative or above 7
invalid_oldpeak = df.where(~col('oldpeak').between(0, 7))
invalid_oldpeak.show()


+-------+
|oldpeak|
+-------+
|    2.4|
|    0.0|
|    3.5|
|    0.2|
|    2.9|
|    1.4|
|    0.7|
|    2.3|
|    0.1|
|    3.4|
|    2.5|
|    1.0|
|    0.6|
|    3.1|
|    0.8|
|    2.2|
|    2.8|
|    4.0|
|    1.9|
|    6.2|
+-------+
only showing top 20 rows

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [56]:
# List every distinct value present in the 'slope' column
df.select(col('slope')).distinct().show()

# Rows where 'slope' is anything other than 0, 1 or 2
invalid_slope = df.where(~col('slope').isin(0, 1, 2))
invalid_slope.show()


+-----+
|slope|
+-----+
|    1|
|    2|
|    0|
+-----+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [57]:
# List every distinct value present in the 'ca' column
df.select(col('ca')).distinct().show()

# Rows where 'ca' is not one of 0, 1, 2 or 3
invalid_ca = df.where(~col('ca').isin(0, 1, 2, 3))
invalid_ca.show()


+---+
| ca|
+---+
|  1|
|  3|
|  4|
|  2|
|  0|
+---+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 38|  1|  2|     138| 175|  0|      1|    173|    0|    0.0|    2|  4|   2|     1|
| 38|  1|  2|     138| 175|  0|      1|    173|    0|    0.0|    2|  4|   2|     1|
| 52|  1|  2|     138| 223|  0|      1|    169|    0|    0.0|    2|  4|   2|     1|
| 38|  1|  2|     138| 175|  0|      1|    173|    0|    0.0|    2|  4|   2|     1|
| 38|  1|  2|     138| 175|  0|      1|    173|    0|    0.0|    2|  4|   2|     1|
| 52|  1|  2|     138| 223|  0|      1|    169|    0|    0.0|    2|  4|   2|     1|
| 38|  1|  2|     138| 175|  0|      1|    173|    0|    0.0|    2|  4|   2|     1|
| 43|  1|  0|     132| 247|  1|      0|    143|    1|    0.1|    1|  4|   3|     0|
| 52|  1|  2|     138

In [58]:
# List every distinct value present in the 'thal' column
df.select(col('thal')).distinct().show()

# Rows where 'thal' is anything other than 0, 1 or 2
# NOTE(review): the distinct values include 3 and it appears in many rows;
# confirm against the dataset's encoding whether 3 is really invalid.
invalid_thal = df.where(~col('thal').isin(0, 1, 2))
invalid_thal.show()


+----+
|thal|
+----+
|   1|
|   3|
|   2|
|   0|
+----+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 52|  1|  0|     125| 212|  0|      1|    168|    0|    1.0|    2|  2|   3|     0|
| 53|  1|  0|     140| 203|  1|      0|    155|    1|    3.1|    0|  0|   3|     0|
| 70|  1|  0|     145| 174|  0|      1|    125|    1|    2.6|    0|  0|   3|     0|
| 61|  1|  0|     148| 203|  0|      1|    161|    0|    0.0|    2|  1|   3|     0|
| 55|  1|  0|     160| 289|  0|      0|    145|    1|    0.8|    1|  1|   3|     0|
| 46|  1|  0|     120| 249|  0|      0|    144|    0|    0.8|    2|  0|   3|     0|
| 43|  0|  0|     132| 341|  1|      0|    136|    1|    3.0|    1|  0|   3|     0|
| 51|  1|  0|     140| 298|  0|      1|    122|    1|    4.2|    1|  3|   3|     0|
| 54|  1|  0|     1

In [59]:
# List every distinct value present in the 'target' column
df.select(col('target')).distinct().show()

# Rows where 'target' is anything other than 0 or 1
invalid_target = df.where(~col('target').isin(0, 1))
invalid_target.show()


+------+
|target|
+------+
|     1|
|     0|
+------+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+



In [70]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType

# Create (or reuse) the Spark session
spark = (
    SparkSession.builder
    .appName("HeartDiseaseAnalysis")
    .getOrCreate()
)

# Input dataset and Parquet output locations
file_path = "heart.csv"
output_path = "output_parquet"

try:
    # Učitavanje podataka u PySpark DataFrame
    df = spark.read.csv(file_path, header=True, inferSchema=True)

    
    # Brojanje nedostajućih vrijednosti po stupcima
    missing_counts = df.select(
        [sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
    )
    print("Nedostajućih vrijednosti po stupcima")
    missing_counts.show()

    # Data type conversion
    def convert_column_types(df):
        """Return `df` with every known column cast to its expected Spark type.

        'oldpeak' is cast to DoubleType; all other listed columns are cast to
        IntegerType. If anything raises while the casts are being applied, the
        error is printed and the original, unconverted DataFrame is returned.
        """
        try:
            as_int, as_double = IntegerType(), DoubleType()
            # (column name, target type), applied in this order
            column_types = (
                ("age", as_int),
                ("sex", as_int),
                ("cp", as_int),
                ("trestbps", as_int),
                ("chol", as_int),
                ("fbs", as_int),
                ("restecg", as_int),
                ("thalach", as_int),
                ("exang", as_int),
                ("oldpeak", as_double),
                ("slope", as_int),
                ("ca", as_int),
                ("thal", as_int),
                ("target", as_int),
            )
            # Build on a separate name so a failure part-way through still
            # returns the untouched input below.
            converted = df
            for name, spark_type in column_types:
                converted = converted.withColumn(name, col(name).cast(spark_type))
            return converted
        except Exception as e:
            print(f"Došlo je do pogreške pri konverziji tipova: {e}")
            return df


    # Validate a column against its set of allowed values
    def check_column_values(df, column_name, valid_values):
        """Show a column's distinct values and count the rows that are not valid.

        Args:
            df: DataFrame to validate.
            column_name: Name of the column to check.
            valid_values: Iterable of the values considered valid.

        Returns:
            Number of rows whose value is null or not in `valid_values`.
            Returns 0 if the check itself raises; the error is printed.
        """
        try:
            print(f"Provjera vrijednosti u koloni '{column_name}':")
            # Show the distinct values found in the column
            df.select(column_name).distinct().show()

            # A null makes `isin` evaluate to null and filter() drops rows
            # with a null predicate, so nulls would silently pass the check.
            # Flag them explicitly as invalid.
            column = col(column_name)
            invalid_values = df.filter(column.isNull() | ~column.isin(*valid_values))
            invalid_values.show()
            return invalid_values.count()

        except Exception as e:
            print(f"Došlo je do pogreške pri provjeri kolone '{column_name}': {e}")
            return 0

    # Range check for continuous columns. Enumerating integers with isin()
    # cannot express a range on a double column: for 'oldpeak' every
    # non-integer value (e.g. 1.9) was reported as invalid.
    def _check_column_range(df, column_name, lower, upper):
        """Show a column's distinct values and count rows outside [lower, upper]."""
        print(f"Provjera vrijednosti u koloni '{column_name}':")
        df.select(column_name).distinct().show()

        out_of_range = df.filter(~col(column_name).between(lower, upper))
        out_of_range.show()
        return out_of_range.count()

    # Number of invalid rows found per column
    error_counts = {}

    # 'sex': valid values are 0 and 1
    error_counts['sex'] = check_column_values(df, 'sex', [0, 1])

    # 'cp': valid values are 0, 1, 2, 3
    error_counts['cp'] = check_column_values(df, 'cp', [0, 1, 2, 3])

    # 'trestbps': values must lie in the range 90-170 mm Hg
    error_counts['trestbps'] = _check_column_range(df, 'trestbps', 90, 170)

    # 'chol' (serum cholesterol): range 116.5 to 368.5 mg/dl
    error_counts['chol'] = _check_column_range(df, 'chol', 116.5, 368.5)

    # 'fbs': valid values are 0 and 1
    error_counts['fbs'] = check_column_values(df, 'fbs', [0, 1])

    # 'restecg': valid values are 0, 1, 2
    error_counts['restecg'] = check_column_values(df, 'restecg', [0, 1, 2])

    # 'thalach' (max heart rate): range 82.5 to 214.5
    error_counts['thalach'] = _check_column_range(df, 'thalach', 82.5, 214.5)

    # 'exang': valid values are 0 and 1
    error_counts['exang'] = check_column_values(df, 'exang', [0, 1])

    # 'oldpeak': range -2.4 to 4 (double column, so a true range check is required)
    error_counts['oldpeak'] = _check_column_range(df, 'oldpeak', -2.4, 4.0)

    # 'slope': valid values are 0, 1, 2
    error_counts['slope'] = check_column_values(df, 'slope', [0, 1, 2])

    # 'ca' (number of major vessels): 0-3
    error_counts['ca'] = check_column_values(df, 'ca', [0, 1, 2, 3])

    # 'thal': valid values are 0, 1, 2
    error_counts['thal'] = check_column_values(df, 'thal', [0, 1, 2])

    # 'target': valid values are 0 and 1
    error_counts['target'] = check_column_values(df, 'target', [0, 1])


    # Provodi konverziju tipova podataka
    df = convert_column_types(df)

    # Spremanje rezultata u Parquet format
    df.write.parquet(output_path)

except Exception as e:
    print(f"Došlo je do pogreške prilikom učitavanja podataka: {e}")


Nedostajućih vrijednosti po stupcima
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|  0|  0|  0|       0|   0|  0|      0|      0|    0|      0|    0|  0|   0|     0|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+

Provjera vrijednosti u koloni 'sex':
+---+
|sex|
+---+
|  1|
|  0|
+---+

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+

Provjera vrijednosti u koloni 'cp':
+---+
| cp|
+---+
|  1|
|  3|
|  2|
|  0|
+---+

+---+---+---+--------+----+---+-------+-------