In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the combined dataset into `df`. The absolute "/content/..." path is hard-coded,
# so the CSV must already exist at exactly that location for this cell to run.
df = pd.read_csv("/content/final_combined_data_v3.csv")

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    Five sections are written to stdout, each preceded by a banner line:
    shape, column dtypes, the first ``head`` rows, the last ``head`` rows and
    the per-column null counts.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of rows shown in both the head and the tail sections.

    Returns:
        None. The function only prints.
    """
    # Each entry pairs a banner with a thunk, so every section is computed only
    # right before it is printed (same order of evaluation as sequential prints).
    sections = (
        ("##################### Shape #####################", lambda: dataframe.shape),
        ("##################### Types #####################", lambda: dataframe.dtypes),
        ("##################### Head #####################", lambda: dataframe.head(head)),
        ("##################### Tail #####################", lambda: dataframe.tail(head)),
        ("##################### NA #####################", lambda: dataframe.isnull().sum()),
    )
    for banner, compute in sections:
        print(banner)
        print(compute())

# Print the overview (shape, dtypes, head, tail, null counts) of the raw frame,
# using the default of 5 rows for the head and tail sections.
check_df(df)

##################### Shape #####################
(120384, 14)
##################### Types #####################
FIPS                   float64
Admin2                  object
Province_State          object
Country_Region          object
Last_Update             object
Lat                    float64
Long_                  float64
Confirmed                int64
Deaths                   int64
Recovered                int64
Active                   int64
Combined_Key            object
Incident_Rate          float64
Case_Fatality_Ratio    float64
dtype: object
##################### Head #####################
   FIPS Admin2 Province_State Country_Region          Last_Update       Lat  \
0   NaN    NaN            NaN    Afghanistan  2021-01-02 05:22:33  33.93911   
1   NaN    NaN            NaN        Albania  2021-01-02 05:22:33  41.15330   
2   NaN    NaN            NaN        Algeria  2021-01-02 05:22:33  28.03390   
3   NaN    NaN            NaN        Andorra  2021-01-02 05:22:33  42.5063

In [None]:
# Build the cleaned frame in one chain: first discard rows that lack a value in
# any of the four listed columns, then remove the three listed columns.
df_cleaned = (
    df
    .dropna(subset=["Lat", "Long_", "Incident_Rate", "Case_Fatality_Ratio"])
    .drop(columns=["FIPS", "Admin2", "Province_State"])
)
# Display the remaining column labels as the cell output.
df_cleaned.columns

Index(['Country_Region', 'Last_Update', 'Lat', 'Long_', 'Confirmed', 'Deaths',
       'Recovered', 'Active', 'Combined_Key', 'Incident_Rate',
       'Case_Fatality_Ratio'],
      dtype='object')

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numeric and high-cardinality groups.

    Classification rules:
      * object-dtype columns are categorical, unless they hold more than
        ``car_th`` distinct values, in which case they go to ``cat_but_car``;
      * non-object columns with fewer than ``cat_th`` distinct values are
        treated as categorical ("numeric but categorical");
      * every other non-object column is numeric.

    A short summary (row/column counts and the size of each group) is printed.

    Args:
        dataframe: The pandas DataFrame whose columns are classified.
        cat_th: Distinct-value threshold below which a non-object column is
            counted as categorical.
        car_th: Distinct-value threshold above which an object column is
            counted as high-cardinality.

    Returns:
        A tuple ``(cat_cols, num_cols, cat_but_car)`` of column-name lists.
        The "numeric but categorical" columns are included in ``cat_cols``
        and are not returned separately.
    """
    all_columns = list(dataframe.columns)

    # Evaluate dtype and distinct-value count once per column and reuse them below.
    is_object = {name: dataframe[name].dtypes == "O" for name in all_columns}
    distinct = {name: dataframe[name].nunique() for name in all_columns}

    num_but_cat = [
        name for name in all_columns
        if not is_object[name] and distinct[name] < cat_th
    ]
    cat_but_car = [
        name for name in all_columns
        if is_object[name] and distinct[name] > car_th
    ]

    # Low-cardinality object columns first, then the numeric-looking categoricals.
    cat_cols = [
        name for name in all_columns
        if is_object[name] and name not in cat_but_car
    ] + num_but_cat

    num_cols = [
        name for name in all_columns
        if not is_object[name] and name not in num_but_cat
    ]

    n_rows, n_vars = dataframe.shape
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_vars}")
    summary = (
        ("cat_cols", cat_cols),
        ("num_cols", num_cols),
        ("cat_but_car", cat_but_car),
        ("num_but_cat", num_but_cat),
    )
    for label, members in summary:
        print(f"{label}: {len(members)}")

    return cat_cols, num_cols, cat_but_car

# Classify the cleaned frame's columns with the default thresholds and unpack the
# three returned lists (categorical, numeric, high-cardinality categorical).
cat_cols, num_cols, cat_but_car = grab_col_names(df_cleaned)

Observations: 117469
Variables: 11
cat_cols: 0
num_cols: 8
cat_but_car: 3
num_but_cat: 0


In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print a table of missing-value counts and percentages per column.

    Only columns that contain at least one null appear in the table. Both the
    count column (``n_miss``) and the percentage column (``ratio``, rounded to
    two decimals) are sorted in descending order.

    Args:
        dataframe: The pandas DataFrame to inspect.
        na_name: When truthy, the names of the columns containing nulls are
            returned in addition to printing the table.

    Returns:
        The list of column names with at least one null when ``na_name`` is
        truthy, otherwise ``None``.
    """
    na_columns = []
    for column in dataframe.columns:
        if dataframe[column].isnull().sum() > 0:
            na_columns.append(column)

    # Count the nulls once and derive both the absolute and the relative view from it.
    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat(
        [n_miss, np.round(ratio, 2)],
        axis=1,
        keys=['n_miss', 'ratio'],
    )
    print(missing_df)

    return na_columns if na_name else None


# Passing True for na_name makes the call return the list of columns that still
# contain nulls, so it is shown as the cell output after the printed table.
missing_values_table(df_cleaned, True)

Empty DataFrame
Columns: [n_miss, ratio]
Index: []


[]

In [None]:
# Prepare the numeric data and apply the KMeans model.
# Alternative multi-feature input, currently disabled:
# kmeans_features = ["Confirmed","Deaths","Recovered","Active","Incident_Rate"]
kmeans_features = ["Incident_Rate"]
kmeans_data = df_cleaned[kmeans_features]

# Rescale every feature to the [0, 1] range before clustering.
scaler = MinMaxScaler()
kmeans_normalized = scaler.fit_transform(kmeans_data)

# KMeans
kmeans = KMeans(n_clusters=5, random_state=42).fit(kmeans_normalized)
clusters = kmeans.labels_

# KMeans numbers its clusters arbitrarily, so a raw label carries no ordering:
# using `label + 1` directly can give a low-rate row a higher level than a
# high-rate row. Rank the clusters by their centroid on the Incident_Rate axis
# so that a larger threat level always corresponds to a higher incident rate.
rate_axis = kmeans_features.index("Incident_Rate")
centroid_order = np.argsort(kmeans.cluster_centers_[:, rate_axis])
severity_rank = np.empty_like(centroid_order)
severity_rank[centroid_order] = np.arange(centroid_order.size)

# Level 0 is reserved for rows whose incident rate is exactly zero; every other
# row gets 1..n_clusters according to the rank of its cluster.
df_cleaned["Covid_Threat_Level"] = np.where(
    df_cleaned["Incident_Rate"].to_numpy() == 0,
    0,
    severity_rank[clusters] + 1,
)

# Check the results
df_cleaned.head(20)

Unnamed: 0,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio,Covid_Threat_Level
0,Afghanistan,2021-01-02 05:22:33,33.93911,67.709953,52513,2201,41727,8585,Afghanistan,134.896578,4.191343,3
1,Albania,2021-01-02 05:22:33,41.1533,20.1683,58316,1181,33634,23501,Albania,2026.409062,2.025173,3
2,Algeria,2021-01-02 05:22:33,28.0339,1.6596,99897,2762,67395,29740,Algeria,227.809861,2.764848,3
3,Andorra,2021-01-02 05:22:33,42.5063,1.5218,8117,84,7463,570,Andorra,10505.403482,1.034865,2
4,Angola,2021-01-02 05:22:33,-11.2027,17.8739,17568,405,11146,6017,Angola,53.452981,2.305328,3
5,Antigua and Barbuda,2021-01-02 05:22:33,17.0608,-61.7964,159,5,148,6,Antigua and Barbuda,162.364186,3.144654,3
6,Argentina,2021-01-02 05:22:33,-38.4161,-63.6167,1629594,43319,1426676,159599,Argentina,3605.633332,2.658269,1
7,Armenia,2021-01-02 05:22:33,40.0691,45.0382,159738,2828,143355,13555,Armenia,5390.664389,1.770399,1
8,Australia,2021-01-02 05:22:33,-35.4735,149.0124,118,3,114,1,"Australian Capital Territory, Australia",27.563653,2.542373,3
9,Australia,2021-01-02 05:22:33,-33.8688,151.2093,4947,54,0,4893,"New South Wales, Australia",60.938655,1.091571,3


In [None]:
# Export the cleaned frame, which carries the Covid_Threat_Level column. Writing
# the raw `df` here would only re-export the unmodified input data.
df_cleaned.to_csv('ornek_guncel.csv', index=False)

In [None]:
# Boolean mask (one entry per row) marking where the threat level equals 4.
df_cleaned["Covid_Threat_Level"].eq(4)

Unnamed: 0,Covid_Threat_Level
0,False
1,False
2,False
3,False
4,False
...,...
120378,False
120380,False
120381,False
120382,False
