## Import Libraries

In [1]:
import pandas as pd
import numpy as np


## Read the data


In [2]:
df = pd.read_csv("../datasets/railway.csv" , sep =",")

## check missing values

In [3]:
df.isnull().sum()

Transaction ID             0
Date of Purchase           0
Time of Purchase           0
Purchase Type              0
Payment Method             0
Railcard               20918
Ticket Class               0
Ticket Type                0
Price                      0
Departure Station          0
Arrival Destination        0
Date of Journey            0
Departure Time             0
Arrival Time               0
Actual Arrival Time     1880
Journey Status             0
Reason for Delay       27481
Refund Request             0
dtype: int64

## check for duplicates

In [4]:
df.duplicated().sum()

np.int64(0)

### there is no duplicated values

## Check types of columns

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31653 entries, 0 to 31652
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Transaction ID       31653 non-null  object
 1   Date of Purchase     31653 non-null  object
 2   Time of Purchase     31653 non-null  object
 3   Purchase Type        31653 non-null  object
 4   Payment Method       31653 non-null  object
 5   Railcard             10735 non-null  object
 6   Ticket Class         31653 non-null  object
 7   Ticket Type          31653 non-null  object
 8   Price                31653 non-null  int64 
 9   Departure Station    31653 non-null  object
 10  Arrival Destination  31653 non-null  object
 11  Date of Journey      31653 non-null  object
 12  Departure Time       31653 non-null  object
 13  Arrival Time         31653 non-null  object
 14  Actual Arrival Time  29773 non-null  object
 15  Journey Status       31653 non-null  object
 16  Reas

### all coulmns don't have dedicated data type

## Change type of columns

### For date columns

In [6]:
df['Date of Purchase'] = pd.to_datetime(df['Date of Purchase'], errors='coerce')
df['Date of Journey'] = pd.to_datetime(df['Date of Journey'], errors='coerce')
df['Time of Purchase'] = pd.to_datetime(df['Time of Purchase'], format="%H:%M:%S", errors='coerce').dt.time
df['Departure Time'] = pd.to_datetime(df['Departure Time'], format="%H:%M:%S", errors='coerce').dt.time
df['Arrival Time'] = pd.to_datetime(df['Arrival Time'], format="%H:%M:%S", errors='coerce').dt.time
df['Actual Arrival Time'] = pd.to_datetime(df['Actual Arrival Time'], format="%H:%M:%S", errors='coerce').dt.time

### For categeory columns

In [8]:
categorical_cols = [
    "Purchase Type",       
    "Payment Method",     
    "Railcard",            
    "Ticket Class",        
    "Ticket Type",        
    "Departure Station",  
    "Arrival Destination",
    "Journey Status",     
    "Reason for Delay",   
    "Refund Request"     
]
df[categorical_cols] = df[categorical_cols].astype("category")

### For Numeric data

In [9]:
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

## Data after change the categores

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31653 entries, 0 to 31652
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Transaction ID       31653 non-null  object        
 1   Date of Purchase     31653 non-null  datetime64[ns]
 2   Time of Purchase     31653 non-null  object        
 3   Purchase Type        31653 non-null  category      
 4   Payment Method       31653 non-null  category      
 5   Railcard             10735 non-null  category      
 6   Ticket Class         31653 non-null  category      
 7   Ticket Type          31653 non-null  category      
 8   Price                31653 non-null  int64         
 9   Departure Station    31653 non-null  category      
 10  Arrival Destination  31653 non-null  category      
 11  Date of Journey      31653 non-null  datetime64[ns]
 12  Departure Time       31653 non-null  object        
 13  Arrival Time         31653 non-

## Clean the String Data

In [11]:
categorical_cols = [
    "Purchase Type",       
    "Payment Method",     
    "Railcard",            
    "Ticket Class",        
    "Ticket Type",        
    "Departure Station",  
    "Arrival Destination",
    "Journey Status",     
    "Reason for Delay",   
    "Refund Request"     
]

for col in categorical_cols:
    df[col] = df[col].astype(str).str.strip().str.title()




## check the data

In [12]:
df.head()

Unnamed: 0,Transaction ID,Date of Purchase,Time of Purchase,Purchase Type,Payment Method,Railcard,Ticket Class,Ticket Type,Price,Departure Station,Arrival Destination,Date of Journey,Departure Time,Arrival Time,Actual Arrival Time,Journey Status,Reason for Delay,Refund Request
0,da8a6ba8-b3dc-4677-b176,2023-12-08,12:41:11,Online,Contactless,Adult,Standard,Advance,43,London Paddington,Liverpool Lime Street,2024-01-01,11:00:00,13:30:00,13:30:00,On Time,Nan,No
1,b0cdd1b0-f214-4197-be53,2023-12-16,11:23:01,Station,Credit Card,Adult,Standard,Advance,23,London Kings Cross,York,2024-01-01,09:45:00,11:35:00,11:40:00,Delayed,Signal Failure,No
2,f3ba7a96-f713-40d9-9629,2023-12-19,19:51:27,Online,Credit Card,Nan,Standard,Advance,3,Liverpool Lime Street,Manchester Piccadilly,2024-01-02,18:15:00,18:45:00,18:45:00,On Time,Nan,No
3,b2471f11-4fe7-4c87-8ab4,2023-12-20,23:00:36,Station,Credit Card,Nan,Standard,Advance,13,London Paddington,Reading,2024-01-01,21:30:00,22:30:00,22:30:00,On Time,Nan,No
4,2be00b45-0762-485e-a7a3,2023-12-27,18:22:56,Online,Contactless,Nan,Standard,Advance,76,Liverpool Lime Street,London Euston,2024-01-01,16:45:00,19:00:00,19:00:00,On Time,Nan,No


## Handle missing values

In [13]:
df.isnull().sum()

Transaction ID            0
Date of Purchase          0
Time of Purchase          0
Purchase Type             0
Payment Method            0
Railcard                  0
Ticket Class              0
Ticket Type               0
Price                     0
Departure Station         0
Arrival Destination       0
Date of Journey           0
Departure Time            0
Arrival Time              0
Actual Arrival Time    1880
Journey Status            0
Reason for Delay          0
Refund Request            0
dtype: int64

In [20]:
df['Railcard'] = df['Railcard'].fillna("None")
df[df['Journey Status']!='On Time']['Reason for Delay']
df['Reason for Delay'] = df['Reason for Delay'].fillna("Not Delayed")
df.isna().sum()

Transaction ID            0
Date of Purchase          0
Time of Purchase          0
Purchase Type             0
Payment Method            0
Railcard                  0
Ticket Class              0
Ticket Type               0
Price                     0
Departure Station         0
Arrival Destination       0
Date of Journey           0
Departure Time            0
Arrival Time              0
Actual Arrival Time    1880
Journey Status            0
Reason for Delay          0
Refund Request            0
dtype: int64

## Fix date missing

In [21]:
import datetime
df['Actual Arrival Time'] = df['Actual Arrival Time'].fillna(datetime.time(0, 0, 0))

## Final look the data after cleaning

In [23]:
df.head()
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31653 entries, 0 to 31652
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Transaction ID       31653 non-null  object        
 1   Date of Purchase     31653 non-null  datetime64[ns]
 2   Time of Purchase     31653 non-null  object        
 3   Purchase Type        31653 non-null  object        
 4   Payment Method       31653 non-null  object        
 5   Railcard             31653 non-null  object        
 6   Ticket Class         31653 non-null  object        
 7   Ticket Type          31653 non-null  object        
 8   Price                31653 non-null  int64         
 9   Departure Station    31653 non-null  object        
 10  Arrival Destination  31653 non-null  object        
 11  Date of Journey      31653 non-null  datetime64[ns]
 12  Departure Time       31653 non-null  object        
 13  Arrival Time         31653 non-

Transaction ID         0
Date of Purchase       0
Time of Purchase       0
Purchase Type          0
Payment Method         0
Railcard               0
Ticket Class           0
Ticket Type            0
Price                  0
Departure Station      0
Arrival Destination    0
Date of Journey        0
Departure Time         0
Arrival Time           0
Actual Arrival Time    0
Journey Status         0
Reason for Delay       0
Refund Request         0
dtype: int64