Data Cleaning

In [2]:
import pandas as pd
import numpy as np

# Relative path of the source CSV for this notebook
csv_file_path = './archive/Albania_COVID19.csv'

# Read the CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Preview up to the first 70 rows (last expression is rendered as the cell output)
df.head(n=70)

Unnamed: 0,Date,State,Country,Cumulative_cases,Cumulative_death,Daily_cases,Daily_death,Latitude,Longitude,Temperature,...,Available Beds/1000,Confirmed Cases/1000,Lung Patients (F),Lung Patients (M),Life Expectancy (M),Life Expectancy (F),Total_tests_conducted,Out_Travels (mill.),In_travels(mill.),Domestic_Travels (mill.)
0,22-01-2020,,Albania,0,0,0,0,41.1533,20.1683,5.72,...,0.725,0.000000,7.02,17.04,76,81.6,1526,5415,5927,Not Reported
1,23-01-2020,,Albania,0,0,0,0,41.1533,20.1683,6.72,...,0.725,0.000000,7.02,17.04,76,81.6,1526,5415,5927,Not Reported
2,24-01-2020,,Albania,0,0,0,0,41.1533,20.1683,8.17,...,0.725,0.000000,7.02,17.04,76,81.6,1526,5415,5927,Not Reported
3,25-01-2020,,Albania,0,0,0,0,41.1533,20.1683,10.83,...,0.725,0.000000,7.02,17.04,76,81.6,1526,5415,5927,Not Reported
4,26-01-2020,,Albania,0,0,0,0,41.1533,20.1683,11.39,...,0.725,0.000000,7.02,17.04,76,81.6,1526,5415,5927,Not Reported
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,26-03-2020,,Albania,174,6,28,1,41.1533,20.1683,15.39,...,0.725,0.060463,7.02,17.04,76,81.6,1526,5415,5927,Not Reported
65,27-03-2020,,Albania,186,8,12,2,41.1533,20.1683,12.94,...,0.725,0.064633,7.02,17.04,76,81.6,1526,5415,5927,Not Reported
66,28-03-2020,,Albania,197,10,11,2,41.1533,20.1683,12.17,...,0.725,0.068455,7.02,17.04,76,81.6,1526,5415,5927,Not Reported
67,29-03-2020,,Albania,212,10,15,0,41.1533,20.1683,12.78,...,0.725,0.073667,7.02,17.04,76,81.6,1526,5415,5927,Not Reported


Handling Missing Values

In [3]:
# Parse the day-first date strings (format '%d-%m-%Y') into datetime values
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Fill missing State values with the placeholder 'Unknown'
df['State'] = df['State'].fillna('Unknown')

# Drop columns that are not kept in the cleaned output.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
columns_to_drop = ['Latitude', 'Longitude', 'Temperature']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Negative cumulative counts are invalid: floor them at zero.
# clip() is vectorised and, unlike max(0, x), leaves NaN as NaN so that any
# missing count is still reported by the missing-value check below.
df['Cumulative_cases'] = df['Cumulative_cases'].clip(lower=0)

# Force the outbound-travel column to numeric; unparseable entries become NaN
df['Out_Travels (mill.)'] = pd.to_numeric(df['Out_Travels (mill.)'], errors='coerce')

# Report the number of missing values per column after cleaning
missing_values = df.isnull().sum()
print("Missing Values:")
print(missing_values)

# Persist the cleaned DataFrame (without the index) for the following cells
df.to_csv('cleaned_data.csv', index=False)


Missing Values:
Date                        0
State                       0
Country                     0
Cumulative_cases            0
Cumulative_death            0
Daily_cases                 0
Daily_death                 0
Min_temperature             0
Max_temperature             0
Wind_speed                  0
Precipitation               0
Fog_Presence                0
Population                  0
Population Density/km       0
Median_Age                  0
Sex_Ratio                   0
Age%_65+                    0
Hospital Beds/1000          0
Available Beds/1000         0
Confirmed Cases/1000        0
Lung Patients (F)           0
Lung Patients (M)           0
Life Expectancy (M)         0
Life Expectancy (F)         0
Total_tests_conducted       0
Out_Travels (mill.)         0
In_travels(mill.)           0
Domestic_Travels (mill.)    0
dtype: int64


Displaying the DataFrame After Handling Missing Values

In [4]:
# Reload the cleaned data from the file the cleaning cell writes with
# df.to_csv('cleaned_data.csv', index=False). The path must match that call,
# otherwise this cell shows a different (possibly stale or missing) file.
csv_file_path = 'cleaned_data.csv'
df = pd.read_csv(csv_file_path)

# Preview the first five rows of the cleaned DataFrame
df.head()

Unnamed: 0,Date,State,Country,Cumulative_cases,Cumulative_death,Daily_cases,Daily_death,Min_temperature,Max_temperature,Wind_speed,...,Available Beds/1000,Confirmed Cases/1000,Lung Patients (F),Lung Patients (M),Life Expectancy (M),Life Expectancy (F),Total_tests_conducted,Out_Travels (mill.),In_travels(mill.),Domestic_Travels (mill.)
0,2020-01-22,Unknown,Albania,0,0,0,0,-0.39,13.22,0.2,...,0.725,0.0,7.02,17.04,76,81.6,1526,5415,5927,
1,2020-01-23,Unknown,Albania,0,0,0,0,1.22,15.22,0.2,...,0.725,0.0,7.02,17.04,76,81.6,1526,5415,5927,
2,2020-01-24,Unknown,Albania,0,0,0,0,0.0,17.22,0.8,...,0.725,0.0,7.02,17.04,76,81.6,1526,5415,5927,
3,2020-01-25,Unknown,Albania,0,0,0,0,5.78,15.0,0.7,...,0.725,0.0,7.02,17.04,76,81.6,1526,5415,5927,
4,2020-01-26,Unknown,Albania,0,0,0,0,9.78,13.22,0.5,...,0.725,0.0,7.02,17.04,76,81.6,1526,5415,5927,


Interpolate Null Values

In [1]:
# Fill null values by forward fill: each missing entry takes the last valid
# value above it in the same column.
# DataFrame.ffill() is used instead of interpolate(method='pad'), which is
# deprecated in recent pandas releases.
# NOTE: this cell needs `df` from the loading cells; running it on a fresh
# kernel before them raises NameError.
df_extreme_interpolated = df.ffill()

# Display the original and the forward-filled DataFrames
print("Original DataFrame:")
print(df)

print("\nDataFrame with Extreme Interpolation:")
print(df_extreme_interpolated)

NameError: name 'df' is not defined

Data Uniformity

In [6]:
import pandas as pd

# Read back the cleaned dataset from disk
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

# Normalise the headers in two steps: lowercase first, then spaces -> underscores
lowercased_names = df.columns.str.lower()
df.columns = lowercased_names.str.replace(' ', '_')

# Show the resulting column index
print("\nColumn Names After Uniformity:")
print(df.columns)



Column Names After Uniformity:
Index(['date', 'state', 'country', 'cumulative_cases', 'cumulative_death',
       'daily_cases', 'daily_death', 'min_temperature', 'max_temperature',
       'wind_speed', 'precipitation', 'fog_presence', 'population',
       'population_density/km', 'median_age', 'sex_ratio', 'age%_65+',
       'hospital_beds/1000', 'available_beds/1000', 'confirmed_cases/1000',
       'lung_patients_(f)', 'lung_patients_(m)', 'life_expectancy_(m)',
       'life_expectancy_(f)', 'total_tests_conducted', 'out_travels_(mill.)',
       'in_travels(mill.)', 'domestic_travels_(mill.)'],
      dtype='object')


Checking and Removing Duplicate Data

In [7]:
# Boolean mask: True for every row that repeats an earlier row
duplicate_rows = df.duplicated()

# Show the repeated rows, if there are any
print("Duplicate Rows:")
print(df.loc[duplicate_rows])

# Remove the repeated rows from df in place, keeping the first occurrence
df.drop_duplicates(keep='first', inplace=True)

# Show what remains
print("\nDataFrame After Dropping Duplicates:")
print(df)


Duplicate Rows:
Empty DataFrame
Columns: [date, state, country, cumulative_cases, cumulative_death, daily_cases, daily_death, min_temperature, max_temperature, wind_speed, precipitation, fog_presence, population, population_density/km, median_age, sex_ratio, age%_65+, hospital_beds/1000, available_beds/1000, confirmed_cases/1000, lung_patients_(f), lung_patients_(m), life_expectancy_(m), life_expectancy_(f), total_tests_conducted, out_travels_(mill.), in_travels(mill.), domestic_travels_(mill.)]
Index: []

[0 rows x 28 columns]

DataFrame After Dropping Duplicates:
          date    state  country  cumulative_cases  cumulative_death  \
0   2020-01-22  Unknown  Albania                 0                 0   
1   2020-01-23  Unknown  Albania                 0                 0   
2   2020-01-24  Unknown  Albania                 0                 0   
3   2020-01-25  Unknown  Albania                 0                 0   
4   2020-01-26  Unknown  Albania                 0                 0

Handling Class Imbalances

In [8]:
# Column whose value frequencies are inspected
target_column = 'cumulative_cases'

# Count how often each distinct value occurs in that column
target_values = df[target_column]
class_distribution = target_values.value_counts()

print("Class Distribution:")
print(class_distribution)


Class Distribution:
cumulative_cases
0      47
70      1
212     1
197     1
186     1
174     1
146     1
123     1
104     1
89      1
76      1
64      1
2       1
59      1
55      1
51      1
42      1
38      1
33      1
23      1
12      1
10      1
223     1
Name: count, dtype: int64


In [9]:
import numpy as np

# Binarise 'cumulative_cases': 1 where the value exceeds the threshold, else 0
threshold = 0.5
exceeds_threshold = df['cumulative_cases'] > threshold
df['binary_target'] = np.where(exceeds_threshold, 1, 0)

# Count how many rows fall into each of the two classes
binary_class_distribution = df['binary_target'].value_counts()
print(binary_class_distribution)


binary_target
0    47
1    22
Name: count, dtype: int64


Class Balancing using OverSampling

In [10]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Binary label: 1 where 'cumulative_cases' exceeds the threshold, else 0
threshold = 0.5
df['binary_target'] = df['cumulative_cases'].gt(threshold).astype(int)

# Features are every column except the label and the column it was derived from
y = df['binary_target']
X = df.drop(columns=['binary_target', 'cumulative_cases'])

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Balance the training split by randomly over-sampling it
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# Report the per-class counts of the resampled labels
resampled_class_distribution = pd.Series(y_resampled).value_counts()
print("Class distribution after resampling:\n", resampled_class_distribution)


Class distribution after resampling:
 binary_target
1    36
0    36
Name: count, dtype: int64


In [11]:
import numpy as np

# Binarise 'daily_cases': 1 where the value exceeds the threshold, else 0
threshold = 0.5
exceeds_threshold = df['daily_cases'] > threshold
df['binary_target'] = np.where(exceeds_threshold, 1, 0)

# Count how many rows fall into each of the two classes
binary_class_distribution = df['binary_target'].value_counts()
print(binary_class_distribution)

binary_target
0    47
1    22
Name: count, dtype: int64


Class Balancing using UnderSampling

In [12]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Binary label: 1 where 'cumulative_cases' exceeds the threshold, else 0
threshold = 0.5
df['binary_target'] = df['cumulative_cases'].gt(threshold).astype(int)

# Features are every column except the label and the column it was derived from
y = df['binary_target']
X = df.drop(columns=['binary_target', 'cumulative_cases'])

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Balance the training split by randomly under-sampling it
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

# Report the per-class counts of the resampled labels
resampled_class_distribution = pd.Series(y_resampled).value_counts()
print("Class distribution after resampling:\n", resampled_class_distribution)


Class distribution after resampling:
 binary_target
0    19
1    19
Name: count, dtype: int64


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime

# Turn the ISO date strings into integer Unix timestamps so the date can be used
# as a numeric feature, then drop the original column.
# The guard keeps the cell re-runnable: after the first run 'date' has already
# been replaced by 'Unix_Timestamp' and df['date'] would raise KeyError.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df['Unix_Timestamp'] = df['date'].apply(lambda x: int(x.timestamp()))
    df = df.drop('date', axis=1)

# Fit on the training rows only. Building the matrix from the whole df would
# also train on the rows held out in X_test / y_test, making any score computed
# on that test split meaningless. y_train still carries the row labels of the
# training split produced by train_test_split, so it is used to select the rows.
train_rows = df.loc[y_train.index]
X_train = train_rows.drop(['binary_target', 'cumulative_cases'], axis=1)
y_train = train_rows['binary_target']

# Identify columns with non-numeric values
non_numeric_columns = X_train.select_dtypes(exclude=['number']).columns

# Treat the 'Unknown' placeholder as missing in the non-numeric columns
X_train[non_numeric_columns] = X_train[non_numeric_columns].replace('Unknown', np.nan)

# One-hot encode the non-numeric columns; dummy_na=True adds an indicator
# column for missing values
X_train = pd.get_dummies(X_train, columns=non_numeric_columns, dummy_na=True)

# Replace any remaining NaN values with 0
X_train = X_train.fillna(0)

# Instantiate the Logistic Regression model with default settings
clf1 = LogisticRegression()

# Fit the model to the training data
clf1.fit(X_train, y_train)


In [14]:
# Predict on the matrix the model was fitted on and report the weighted F1 score
pre_t1 = clf1.predict(X_train)
f1_score(y_true=y_train, y_pred=pre_t1, average='weighted')

1.0

In [15]:
# Replace the 'date' strings in the test features with integer Unix timestamps;
# if the column is absent, only emit a warning and continue.
if 'date' not in X_test.columns:
    print("Warning: 'date' column not found in test data.")
else:
    X_test['date'] = pd.to_datetime(X_test['date'], format='%Y-%m-%d')
    X_test['Unix_Timestamp'] = X_test['date'].apply(lambda ts: int(ts.timestamp()))
    X_test = X_test.drop(columns='date')

# Columns whose dtype is not numeric
non_numeric_columns_test = X_test.select_dtypes(exclude=['number']).columns

# Treat the 'Unknown' placeholder as missing in those columns
X_test[non_numeric_columns_test] = X_test[non_numeric_columns_test].replace('Unknown', np.nan)

# One-hot encode them (with a NaN indicator column), then zero-fill remaining NaN.
# NOTE(review): the dummy columns are derived from the test rows alone - confirm
# they match the columns of the matrix the model was fitted on.
X_test = pd.get_dummies(
    X_test,
    columns=non_numeric_columns_test,
    dummy_na=True,
).fillna(0)


In [16]:
# Predict on the prepared test features and report the weighted F1 score
pre1 = clf1.predict(X_test)
f1_score(y_true=y_test, y_pred=pre1, average='weighted')

1.0