In [28]:
# import necessary libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

## Preliminary Data Inspection

This class is designed to streamline the initial exploration and quality assessment of a daraset. Providing methods like loading the datatset, inspecting the structure, previewing, identifying missing values and detecting duplicates.



In [26]:
class PreliminaryDataInspection():
    '''
    This class performs preliminary data inspection.
    '''

    def __init__(self, df=None, file_path=None, sep=","):
        '''
        Initializes the class by assigning a DataFrame or loading it from a file.

        Args:
            df (pd.DataFrame, optional): DataFrame to initialize the class with.
            file_path (str, optional): Path to the data file.
            sep (str, optional): Separator used in the file. Default is ",".
        '''
        if df is not None and isinstance(df, pd.DataFrame):
            self.df = df
            print("DataFrame provided directly.")
        elif file_path is not None:
            self.df = self.load_data(file_path, sep)
        else:
            self.df = None
            print("No valid DataFrame or file path provided.")

    def load_data(self, file_path, sep=","):
        '''
        Loads a data file and returns a DataFrame.

        Args:
            file_path (str): Path to the data file.
            sep (str): Separator used in the file. Default is ",".

        Returns:
            pd.DataFrame or None: The loaded DataFrame or None if file not found.
        '''
        try:
            df = pd.read_csv(file_path, sep=sep)
            print(f"Data successfully loaded from {file_path}.")
            return df
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def check_info(self):
        '''
        Displays information about the DataFrame.
        '''
        if self.df is not None:
            print("DataFrame Info:")
            print(self.df.info())
        else:
            print("No DataFrame to inspect.")

    def check_head(self, n=5):
        '''
        Displays the first few rows of the DataFrame.

        Args:
            n (int): Number of rows to display. Default is 5.
        '''
        if self.df is not None:
            print(f"First {n} rows of the DataFrame:")
            print(self.df.head(n))
        else:
            print("No DataFrame to inspect.")

    def check_missing_values(self):
        '''
        Checks for missing values in the DataFrame.
        '''
        if self.df is not None:
            print("Missing Values in DataFrame:")
            missing_values = self.df.isnull().sum()
            print(missing_values[missing_values > 0])
        else:
            print("No DataFrame to inspect.")

    def check_duplicates(self):
        '''
        Checks for duplicate rows in the DataFrame.
        '''
        if self.df is not None:
            duplicates = self.df.duplicated().sum()
            print(f"Number of duplicate rows: {duplicates}")
        else:
            print("No DataFrame to inspect.")


### Alarm Response (`Alarmresponses.txt`)

In [4]:
alarm_response = PreliminaryDataInspection('data/Alarmresponses.txt', sep='\t')

alarm_response.check_info()
alarm_response.check_head()
alarm_response.check_missing_values()
alarm_response.check_duplicates()

Data successfully loaded from data/Alarmresponses.txt.
DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4124815 entries, 0 to 4124814
Data columns (total 8 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   alarmid       int64 
 1   userid        int64 
 2   DeviceType    int64 
 3   devices       object
 4   Response      int64 
 5   ResponseDate  object
 6   latitude      object
 7   longitude     object
dtypes: int64(4), object(4)
memory usage: 251.8+ MB
None
First 5 rows of the DataFrame:
   alarmid               userid  DeviceType devices  Response ResponseDate  \
0    78100  2210044967363094894           4     Sms         0          NaN   
1    79629  2210044967363094894           4     Sms         0          NaN   
2    82395  2210044967363094894           4     Sms         0          NaN   
3    89313  2210044967363094894           4     Sms         0          NaN   
4    43918   738033312678202846           4     Sms         0          NaN   



### Data Summary:

Alarm reponses contains 4,124,815 entries with 8 columns (4214815 x 8 matrix), the columns are:

- alarmid: Unique identifier for each alarm event (integer).
- userid: Unique identifier for the user who triggered the alarm (integer).
- DeviceType: Type of device used to trigger the alarm (integer).
- devices: Description of the device used (object). 193 entries are missing values.
- Response: Indicates if a volunteer responded to the alarm (0 - no response, 1 - responded) (integer).
- ResponseDate: Date and time the volunteer responded (object). 3,603,938 entries are missing values.
- latitude: Latitude coordinate of the alarm location (object).
- longitude: Longitude coordinate of the alarm location (object).

**Data Insights:**

- A large portion of the data (over 87%) has missing values in the RespinseDate column. As at now, no conclusion could be made regarding the missing date values. Depending on its correlation with the target variable, I might result to dropping the column if it is negatively correlated with the Response column.

- There are missing values in the devices column, indicating some descriptions of the triggering devices are unavailable.

- The data seems suitable for analyzing volunteer response patterns, effectiveness, and identifying areas for improvement.


### Alarms (`Alarms.txt`)

In [5]:
alarms = PreliminaryDataInspection('data/Alarms.txt', sep='\t')

alarms.check_info()
alarms.check_head()
alarms.check_missing_values()
alarms.check_duplicates()

Data successfully loaded from data/Alarms.txt.
DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60329 entries, 0 to 60328
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   alarmid         60329 non-null  int64 
 1   alarmlocaltime  60329 non-null  object
 2   Latitude        60329 non-null  object
 3   Longitude       60329 non-null  object
 4   AlarmStatus     60329 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.3+ MB
None
First 5 rows of the DataFrame:
   alarmid           alarmlocaltime   Latitude Longitude  AlarmStatus
0    29977  2020-01-01 00:23:14.200   52,26168  5,603156            1
1    29979  2020-01-01 00:25:36.730  51,900643  4,484577            1
2    29980  2020-01-01 00:34:54.213   52,63205  4,726312            1
3    29982  2020-01-01 00:58:24.897  51,263437  4,045439            1
4    29983  2020-01-01 00:59:01.363  51,334741  6,129295            1
Missing Values 

### Data Summary:

The data is stored in a pandas DataFrame containing 60,329 entries. There are 5 columns in the DataFrame:
- alarmid: Unique identifier for each alarm event (integer).
- alarmlocaltime: Local time of the alarm event (object).
- Latitude: Latitude coordinate of the alarm location (object).
- Longitude: Longitude coordinate of the alarm location (object).
- AlarmStatus: Status of the alarm (integer, likely indicating active/inactive).

**Data Insights:**

- The data has no missing values.
- The alarmlocaltime column contains timestamp information, which can be useful for temporal analysis.
- The Latitude and Longitude columns provide geographic information, enabling spatial analysis.
- The AlarmStatus column can help identify active alarms and potentially correlate with response times or other factors.

### Volunteer availability



In [6]:
volunteer_availability = PreliminaryDataInspection('data/VolunteerAvailability.txt', sep='\t')

volunteer_availability.check_info()
volunteer_availability.check_head()
volunteer_availability.check_missing_values()
volunteer_availability.check_duplicates()

Data successfully loaded from data/VolunteerAvailability.txt.
DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879387 entries, 0 to 2879386
Data columns (total 8 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   userid                int64 
 1   FromHour              int64 
 2   FromMin               int64 
 3   ToHour                int64 
 4   ToMin                 int64 
 5   AvailableWeekDay      int64 
 6   AvailabilityLocation  int64 
 7   CreatedDate           object
dtypes: int64(7), object(1)
memory usage: 175.7+ MB
None
First 5 rows of the DataFrame:
                userid  FromHour  FromMin  ToHour  ToMin  AvailableWeekDay  \
0  2210044967363094894         0        0      23     59                 0   
1  2210044967363094894         0        0      23     59                 1   
2  2210044967363094894         0        0      23     59                 2   
3  2210044967363094894         0        0      23     59              

In [7]:
volunteers = PreliminaryDataInspection('data/Volunteers.txt', sep='\t')

volunteers.check_info()
volunteers.check_head()
volunteers.check_missing_values()
volunteers.check_duplicates()

Data successfully loaded from data/Volunteers.txt.
DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246088 entries, 0 to 246087
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   userid         246088 non-null  int64 
 1   DateOfBirth    246088 non-null  object
 2   Latitude       246088 non-null  object
 3   Longitude      246088 non-null  object
 4   WorkLatitude   246088 non-null  object
 5   WorkLongitude  246088 non-null  object
dtypes: int64(1), object(5)
memory usage: 11.3+ MB
None
First 5 rows of the DataFrame:
                userid              DateOfBirth          Latitude  \
0 -4402443911528580314  1973-10-16 00:00:00.000         53,239679   
1  4147689039375744256  1989-07-11 00:00:00.000        52,3007863   
2 -7843085838038396722  1997-07-01 00:00:00.000  52,3327560424805   
3 -1646437989330162528  1976-01-30 00:00:00.000  52,6543579101563   
4  5678998634590506361  1962-05-14 00:0

## Create a Unified Dataset

In [8]:
merged_data = pd.merge(alarm_response.df, alarms.df, on='alarmid', how='left')
merged_data = pd.merge(merged_data, volunteers.df, on='userid', how='left')
merged_data = pd.merge(merged_data, volunteer_availability.df, on='userid', how='left')
merged_data.head()

Unnamed: 0,alarmid,userid,DeviceType,devices,Response,ResponseDate,latitude,longitude,alarmlocaltime,Latitude_x,...,Longitude_y,WorkLatitude,WorkLongitude,FromHour,FromMin,ToHour,ToMin,AvailableWeekDay,AvailabilityLocation,CreatedDate
0,78100,2210044967363094894,4,Sms,0,,519987411499023,477543687820435,2023-09-08 15:19:27.160,51999415,...,477543687820435,0,0,0.0,0.0,23.0,59.0,0.0,0.0,2023-05-25 19:59:53.413
1,78100,2210044967363094894,4,Sms,0,,519987411499023,477543687820435,2023-09-08 15:19:27.160,51999415,...,477543687820435,0,0,0.0,0.0,23.0,59.0,1.0,0.0,2023-05-25 19:59:53.413
2,78100,2210044967363094894,4,Sms,0,,519987411499023,477543687820435,2023-09-08 15:19:27.160,51999415,...,477543687820435,0,0,0.0,0.0,23.0,59.0,2.0,0.0,2023-05-25 19:59:53.413
3,78100,2210044967363094894,4,Sms,0,,519987411499023,477543687820435,2023-09-08 15:19:27.160,51999415,...,477543687820435,0,0,0.0,0.0,23.0,59.0,3.0,0.0,2023-05-25 19:59:53.413
4,78100,2210044967363094894,4,Sms,0,,519987411499023,477543687820435,2023-09-08 15:19:27.160,51999415,...,477543687820435,0,0,0.0,0.0,23.0,59.0,4.0,0.0,2023-05-25 19:59:53.413


In [27]:
data = PreliminaryDataInspection(df=merged_data)

data.check_info()
data.check_head()
data.check_missing_values()
data.check_duplicates()

DataFrame provided directly.
DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48348212 entries, 0 to 48348211
Data columns (total 24 columns):
 #   Column                Dtype  
---  ------                -----  
 0   alarmid               int64  
 1   userid                int64  
 2   DeviceType            int64  
 3   devices               object 
 4   Response              int64  
 5   ResponseDate          object 
 6   latitude              object 
 7   longitude             object 
 8   alarmlocaltime        object 
 9   Latitude_x            object 
 10  Longitude_x           object 
 11  AlarmStatus           int64  
 12  DateOfBirth           object 
 13  Latitude_y            object 
 14  Longitude_y           object 
 15  WorkLatitude          object 
 16  WorkLongitude         object 
 17  FromHour              float64
 18  FromMin               float64
 19  ToHour                float64
 20  ToMin                 float64
 21  AvailableWeekDay      float64


## DataPreprocessor Class

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

class DataPreprocessor:
    """
    A class for handling data preprocessing tasks, including cleaning,
    feature engineering, and preparation for analysis or modeling.
    """
    
    def __init__(self, df=None, file_path=None, sep=","):
        """
        Initializes the DataPreprocessor class.

        Args:
            df (pd.DataFrame): A DataFrame provided directly. Defaults to None.
            file_path (str): Path to the data file (CSV). Defaults to None.
            sep (str): Separator used in the file. Defaults to ",".
        """
        if df is not None:
            if isinstance(df, pd.DataFrame):
                self.df = df
                print("DataFrame provided directly.")
            else:
                raise ValueError("Provided data is not a valid DataFrame.")
        elif file_path is not None:
            self.df = self.load_data(file_path, sep)
        else:
            raise ValueError("Either a DataFrame or file path must be provided.")

    def load_data(self, file_path, sep=","):
        """
        Loads a dataset from a file.

        Args:
            file_path (str): Path to the data file.
            sep (str): Separator used in the file. Defaults to ",".

        Returns:
            pd.DataFrame: Loaded DataFrame.
        """
        try:
            df = pd.read_csv(file_path, sep=sep)
            print(f"Data successfully loaded from {file_path}.")
            return df
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {file_path}")
        except Exception as e:
            raise Exception(f"An error occurred while loading data: {e}")

    def handle_missing_values(self, threshold=50):
        """
        Handles missing values by dropping columns with high percentages
        of missing values and imputing remaining missing values.

        Args:
            threshold (float): Percentage threshold for dropping columns. Defaults to 50%.
        """
        missing_percentage = (self.df.isnull().sum() / len(self.df)) * 100
        print("Missing value percentages:\n", missing_percentage)

        # Drop columns with missing values above the threshold
        columns_to_drop = missing_percentage[missing_percentage > threshold].index
        self.df.drop(columns=columns_to_drop, axis=1, inplace=True)
        print(f"Columns dropped: {list(columns_to_drop)}")

        # Fill missing values in numeric columns with median
        numeric_cols = self.df.select_dtypes(include=['float64', 'int64']).columns
        self.df[numeric_cols] = self.df[numeric_cols].fillna(self.df[numeric_cols].median())

        # Fill missing values in categorical columns with "Unknown"
        categorical_cols = self.df.select_dtypes(include=['object']).columns
        self.df[categorical_cols] = self.df[categorical_cols].fillna("Unknown")

    def fix_data_types(self):
        """
        Fixes data types by converting latitude/longitude to numeric
        and converting date columns to datetime.
        """
        # Fix latitude and longitude columns
        lat_long_cols = ['latitude', 'longitude', 'Latitude_x', 'Longitude_x', 'Latitude_y', 'Longitude_y']
        for col in lat_long_cols:
            if col in self.df.columns:
                # Replace commas with dots and non-numeric values with NaN
                self.df[col] = (
                    self.df[col]
                    .replace(',', '.', regex=True)
                    .apply(pd.to_numeric, errors='coerce')
                )

        # Convert date columns to datetime
        date_cols = ['ResponseDate', 'alarmlocaltime', 'CreatedDate']
        for col in date_cols:
            if col in self.df.columns:
                self.df[col] = pd.to_datetime(self.df[col], errors='coerce')

    def remove_duplicates(self):
        """
        Removes duplicate rows from the dataset.
        """
        initial_rows = len(self.df)
        self.df.drop_duplicates(inplace=True)
        final_rows = len(self.df)
        print(f"Removed {initial_rows - final_rows} duplicate rows.")

    def scale_numeric_features(self, features):
        """
        Scales numeric features using StandardScaler.

        Args:
            features (list): List of numeric feature column names to scale.
        """
        scaler = StandardScaler()
        self.df[features] = scaler.fit_transform(self.df[features])

    def engineer_features(self):
        """
        Performs feature engineering, such as deriving response time
        and creating date-related features.
        """
        if 'ResponseDate' in self.df.columns and 'alarmlocaltime' in self.df.columns:
            self.df['response_time'] = (self.df['ResponseDate'] - self.df['alarmlocaltime']).dt.total_seconds() / 60

        # Add new features based on dates
        for col in ['ResponseDate', 'alarmlocaltime']:
            if col in self.df.columns:
                self.df[f"{col}_year"] = self.df[col].dt.year
                self.df[f"{col}_month"] = self.df[col].dt.month
                self.df[f"{col}_day"] = self.df[col].dt.day

    def visualize_data(self, feature):
        """
        Visualizes the distribution of a numeric feature.

        Args:
            feature (str): The feature to visualize.
        """
        if feature in self.df.columns:
            sns.histplot(self.df[feature], bins=50)
            plt.title(f"Distribution of {feature}")
            plt.show()
        else:
            print(f"Feature {feature} not found in the DataFrame.")

    def save_data(self, file_path):
        """
        Saves the cleaned DataFrame to a CSV file.

        Args:
            file_path (str): Path to save the file.
        """
        self.df.to_csv(file_path, index=False)
        print(f"Data saved to {file_path}.")


In [None]:
# Initialize the preprocessor with a DataFrame or file path
preprocessor = DataPreprocessor(df=data.df)

# Handle missing values
preprocessor.handle_missing_values(threshold=50)

# Fix data types
preprocessor.fix_data_types()

# Remove duplicates
preprocessor.remove_duplicates()

# Scale numeric features
numeric_features = ['FromHour', 'ToHour', 'FromMin', 'ToMin', 'AvailableWeekDay', 'AvailabilityLocation']
preprocessor.scale_numeric_features(numeric_features)

# Engineer new features
preprocessor.engineer_features()

# Visualize a feature
preprocessor.visualize_data('response_time')

# Save the cleaned data
preprocessor.save_data("data/cleaned_data.csv")


DataFrame provided directly.
Missing value percentages:
 alarmid                 0.0
userid                  0.0
DeviceType              0.0
devices                 0.0
Response                0.0
latitude                0.0
longitude               0.0
alarmlocaltime          0.0
Latitude_x              0.0
Longitude_x             0.0
AlarmStatus             0.0
DateOfBirth             0.0
Latitude_y              0.0
Longitude_y             0.0
WorkLatitude            0.0
WorkLongitude           0.0
FromHour                0.0
FromMin                 0.0
ToHour                  0.0
ToMin                   0.0
AvailableWeekDay        0.0
AvailabilityLocation    0.0
CreatedDate             0.0
dtype: float64
Columns dropped: []
