In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load cwurData.csv into a DataFrame. The path is relative, so the file must
# be in the notebook's working directory.
df = pd.read_csv("cwurData.csv")
# Last expression of the cell: displays the first five rows as the cell output.
df.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [3]:
# Per-column count of NaN cells, reported before anything is filled in.
# sep="\n" places the heading and the counts on separate lines, exactly as two
# consecutive print() calls would.
print("\nMissing values BEFORE imputation:", df.isna().sum(), sep="\n")


Missing values BEFORE imputation:
world_rank                0
institution               0
country                   0
national_rank             0
quality_of_education      0
alumni_employment         0
quality_of_faculty        0
publications              0
influence                 0
citations                 0
broad_impact            200
patents                   0
score                     0
year                      0
dtype: int64


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2200, 14)

In [6]:
# Names of every numeric column in df ("number" is the selector string pandas
# accepts for np.number); non-numeric columns are left out.
numeric_cols = df.select_dtypes(include="number").columns

# Function to impute missing values using median of non-outliers
def impute_with_non_outlier_median(series: pd.Series, iqr_multiplier: float = 1.5) -> pd.Series:
    """Fill missing values with the median of the non-outlier observations.

    Outliers are identified with IQR fences: values outside
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` are excluded
    before the median is taken. Outliers stay in the returned series; only
    NaN entries are replaced.

    Parameters
    ----------
    series : pd.Series
        Numeric series, possibly containing NaN.
    iqr_multiplier : float, default 1.5
        Fence width in IQR units. The default reproduces the original
        hard-coded 1.5 rule.

    Returns
    -------
    pd.Series
        A new series; the input is never modified. It equals the input when
        there is nothing to impute or when no median can be computed
        (for example, every value is NaN).
    """
    # Nothing to fill: skip the quantile work and hand back an untouched copy.
    if not series.isna().any():
        return series.copy()

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - iqr_multiplier * iqr
    upper = q3 + iqr_multiplier * iqr

    # keep only non-outliers (NaN compares False on both sides, so missing
    # entries are dropped by this mask as well)
    non_outliers = series[(series >= lower) & (series <= upper)]

    # median of non-outliers
    median_non_outliers = non_outliers.median()

    # No usable fill value (e.g. an all-NaN column): leave the gaps in place
    # rather than "filling" NaN with NaN.
    if pd.isna(median_non_outliers):
        return series.copy()

    # impute missing values
    return series.fillna(median_non_outliers)

In [7]:
# Overwrite every numeric column with its imputed version. The helper only
# replaces NaN entries, so columns without missing values keep their values.
for col in numeric_cols:
    df[col] = df[col].pipe(impute_with_non_outlier_median)

In [8]:
# Dictionary to store statistics: column name -> {statistic name -> value}
stats = {}

for col in numeric_cols:
    series = df[col]
    # Compute the mode once per column; it is needed both for the emptiness
    # check and for the reported value.
    modes = series.mode()
    stats[col] = {
        "mean": series.mean(),
        "median": series.median(),
        # Series.mode() returns all tied values sorted ascending, so the
        # smallest one is reported; None when the column has no values.
        "mode": modes.iloc[0] if not modes.empty else None,
        "range": series.max() - series.min(),
        "IQR": series.quantile(0.75) - series.quantile(0.25),
        # pandas defaults to ddof=1, i.e. sample standard deviation / variance.
        "std_dev": series.std(),
        "variance": series.var()
    }

# Print Stats
for col, st in stats.items():
    print(f"\n==== {col} ====")
    for k, v in st.items():
        print(f"{k}: {v}")


==== world_rank ====
mean: 459.59090909090907
median: 450.5
mode: 1
range: 999
IQR: 549.5
std_dev: 304.32036252280733
variance: 92610.88304601287

==== national_rank ====
mean: 40.27818181818182
median: 21.0
mode: 1
range: 228
IQR: 43.0
std_dev: 51.740870386587524
variance: 2677.1176683616495

==== quality_of_education ====
mean: 275.10045454545457
median: 355.0
mode: 355
range: 366
IQR: 191.25
std_dev: 121.9351000405299
variance: 14868.168621894034

==== alumni_employment ====
mean: 357.1168181818182
median: 450.5
mode: 478
range: 566
IQR: 302.25
std_dev: 186.77925165403906
variance: 34886.48884844285

==== quality_of_faculty ====
mean: 178.8881818181818
median: 210.0
mode: 210
range: 217
IQR: 42.25
std_dev: 64.05088534140887
variance: 4102.515913018306

==== publications ====
mean: 459.9086363636364
median: 450.5
mode: 101
range: 999
IQR: 549.25
std_dev: 303.7603518529839
variance: 92270.35135784859

==== influence ====
mean: 459.7977272727273
median: 450.5
mode: 101
range: 990
IQR:

In [17]:
# Spot-check: arithmetic mean of the world_rank column on its own.
df['world_rank'].mean()

np.float64(459.59090909090907)

In [18]:
# Spot-check: standard deviation of world_rank (pandas default ddof=1, i.e. the
# sample standard deviation).
df['world_rank'].std()

304.32036252280733