In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the raw Nobel laureates table from the CSV file next to the notebook
data = pd.read_csv(filepath_or_buffer='SDnobel.csv')

Exploratory Data Analysis

In [5]:
# Overview of the raw dataset: preview, shape, column names, missing values,
# duplicated rows, dtypes, summary statistics and per-column cardinality.
section_separator = "----------------------------------------"

print("Nobel Prize Winners Dataset:")
print("\n\nHead about the dataset:")
print(data.head())
print(section_separator)
print("\n\nShape about the dataset:")
print(data.shape)
print(section_separator)
print("\n\nColumns about the dataset:")
print(data.columns)
# This separator was missing; every other section is closed with one.
print(section_separator)
print("\n\nChecking for missing values in each column:")
print(data.isnull().sum())
print(section_separator)
print("\n\nNumber of duplicated rows in the dataset:")
print(data.duplicated().sum())
print(section_separator)
print("\n\nInformation about the dataset:")
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() would emit a stray "None" line after the report.
data.info()
print(section_separator)
print("\n\nStatistical summary of the dataset:")
print(data.describe())
print(section_separator)
print("\n\nUnique values in each column:")
for column in data.columns:
    print(f"{column}: {data[column].nunique()}")


Nobel Prize Winners Dataset:


Head about the dataset:
   Unnamed: 0  year    category  \
0           0  1901   Chemistry   
1           1  1901  Literature   
2           2  1901    Medicine   
3           3  1901       Peace   
4           4  1901       Peace   

                                            prize  \
0               The Nobel Prize in Chemistry 1901   
1              The Nobel Prize in Literature 1901   
2  The Nobel Prize in Physiology or Medicine 1901   
3                      The Nobel Peace Prize 1901   
4                      The Nobel Peace Prize 1901   

                                          motivation prize_share  laureate_id  \
0  "in recognition of the extraordinary services ...         1/1          160   
1  "in special recognition of his poetic composit...         1/1          569   
2  "for his work on serum therapy, especially its...         1/1          293   
3                                                NaN         1/2          462   
4         

In [6]:
# Drop the leftover "Unnamed: 0" column that came in with the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# --- STEP 1: CALCULATIONS & FLAGS (Do this while NaNs still exist) ---
# Flag rows that have no recorded death date. This is derived from the raw
# death_date column, which this cell never overwrites.
# NOTE(review): any row without a death date is flagged True, presumably
# including organizations — confirm that is the intended meaning.
data["is_alive"] = data["death_date"].isnull()

# Optional: Only set age=0 for organizations to distinguish them from human winners
if 'laureate_type' in data.columns:
    data.loc[data["laureate_type"] == "Organization", "age"] = 0

# --- STEP 2: DISPLAY COLUMNS (Human Readable) ---

# Human-readable copy of birth_date; the original birth_date column is left
# untouched (its NaNs are preserved) so it can still be used for calculations.
data["birth_date_display"] = data["birth_date"].fillna("Not Applicable")

# --- STEP 3: CATEGORICAL/TEXT FILLING ---

# Placeholder text written into each text column wherever the value is missing.
text_placeholders = {
    "age_group": "Unknown",
    "motivation": "Not Available",
    "birth_city": "Unknown",
    "birth_country": "Unknown",
    "sex": "Organization",
    # Organization columns
    "organization_name": "Independent/None",
    "organization_city": "Not Mentioned",
    "organization_country": "Not Mentioned",
}
for column_name, placeholder in text_placeholders.items():
    data[column_name] = data[column_name].fillna(placeholder)

# --- STEP 4: DEATH COLUMNS ---

# Human-readable copy of death_date; the original column keeps its NaNs.
data["death_date_display"] = data["death_date"].fillna("Alive")

# A missing death place only means "Alive" when there is no death date.
# Rows that do have a death date but lack a city/country are marked "Unknown"
# instead, so they no longer contradict is_alive == False.
no_death_date = data["is_alive"]
for place_column in ("death_city", "death_country"):
    place_missing = data[place_column].isnull()
    data.loc[place_missing & no_death_date, place_column] = "Alive"
    data.loc[place_missing & ~no_death_date, place_column] = "Unknown"

In [8]:
# Verification: count the missing values left in each column after cleaning
data.isna().sum()

year                      0
category                  0
prize                     0
motivation                0
prize_share               0
laureate_id               0
laureate_type             0
full_name                 0
birth_date               28
birth_city                0
birth_country             0
sex                       0
organization_name         0
organization_city         0
organization_country      0
death_date              318
death_city                0
death_country             0
usa_born_winner           0
decade                    0
female                    0
age                       2
age_group                 0
is_alive                  0
birth_date_display        0
death_date_display        0
dtype: int64

In [None]:
# Fill the remaining missing ages with the median age.
# Organizations were given the sentinel age 0 earlier in the notebook, so they
# are excluded from the median; otherwise the zeros would pull the imputed
# value downward.
if "laureate_type" in data.columns:
    reference_ages = data.loc[data["laureate_type"] != "Organization", "age"]
else:
    # Without laureate_type there is no sentinel to exclude; use every row.
    reference_ages = data["age"]

data["age"] = data["age"].fillna(reference_ages.median())

In [10]:
# Verification: re-count missing values per column now that age has been filled
data.isna().sum()

year                      0
category                  0
prize                     0
motivation                0
prize_share               0
laureate_id               0
laureate_type             0
full_name                 0
birth_date               28
birth_city                0
birth_country             0
sex                       0
organization_name         0
organization_city         0
organization_country      0
death_date              318
death_city                0
death_country             0
usa_born_winner           0
decade                    0
female                    0
age                       0
age_group                 0
is_alive                  0
birth_date_display        0
death_date_display        0
dtype: int64

In [11]:
# Write the cleaned frame to a new CSV file; the row index is not written
data.to_csv(path_or_buf='Nobel_Prize_Winners_Cleaned.csv', index=False)