In [11]:
import pandas as pd
from datetime import datetime

class NJCleaner():
    """Step-by-step cleaner for a train-stop CSV held in ``self.data``.

    Each transformation method rebinds ``self.data`` to a *new* DataFrame and
    returns it. A frame returned by an earlier step is therefore never altered
    by a later one, which keeps notebook variables that captured an
    intermediate result consistent with what was displayed.

    The methods expect the columns they touch to be present and raise
    ``KeyError`` otherwise, so they must be called in the order used by
    ``prep_df``.
    """

    # Layout of the ``scheduled_time`` strings.
    _TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Part-of-day labels in four-hour buckets: entry ``i`` covers the hours
    # ``4*i`` .. ``4*i + 3`` (0-3 late_night, 4-7 early_morning, ...).
    _PARTS_OF_DAY = ('late_night', 'early_morning', 'morning',
                     'afternoon', 'evening', 'night')

    def __init__(self, csv_path: str):
        """Read the CSV at ``csv_path`` into ``self.data``."""
        self.data = pd.read_csv(csv_path)

    def order_by_scheduled_time(self) -> pd.DataFrame:
        """Sort the rows ascending by ``scheduled_time`` and return the result."""
        self.data = self.data.sort_values(by="scheduled_time")
        return self.data

    def drop_columns_and_nan(self) -> pd.DataFrame:
        """Drop the ``from`` and ``to`` columns, then every row containing a NaN."""
        self.data = self.data.drop(columns=['from', 'to']).dropna()
        return self.data

    def convert_day_to_date(self) -> pd.DataFrame:
        """Replace the ``date`` column with a ``day`` column of English weekday names."""
        # One vectorised parse instead of calling pd.to_datetime once per row.
        day_names = pd.to_datetime(self.data["date"]).dt.day_name()
        self.data = self.data.assign(day=day_names).drop(columns=["date"])
        return self.data

    def convert_scheduled_time_to_part_of_the_day(self) -> pd.DataFrame:
        """Replace ``scheduled_time`` with a ``part_of_day`` label column.

        The labels are the same ones ``convert_datetime_to_time_of_day``
        returns for a single timestamp string.
        """
        hours = pd.to_datetime(self.data["scheduled_time"],
                               format=NJCleaner._TIMESTAMP_FORMAT).dt.hour
        # hour // 4 is the index of the four-hour bucket in _PARTS_OF_DAY.
        labels = (hours // 4).map(dict(enumerate(NJCleaner._PARTS_OF_DAY)))
        self.data = self.data.assign(part_of_day=labels).drop(columns=["scheduled_time"])
        return self.data

    def convert_datetime_to_time_of_day(self, datetime_string):
        """Return the part-of-day label for one ``'%Y-%m-%d %H:%M:%S'`` string.

        Raises:
            ValueError: if ``datetime_string`` does not match the format.
            TypeError: if ``datetime_string`` is not a string.
        """
        hour = datetime.strptime(datetime_string, NJCleaner._TIMESTAMP_FORMAT).hour
        return NJCleaner._PARTS_OF_DAY[hour // 4]

    def convert_delay_minutes_to_bool(self, delay) -> int:
        """Return 0 when ``0 <= int(delay) < 5``, otherwise 1.

        ``int()`` truncates toward zero, so fractional values in (-1, 5) map
        to 0; anything at or above 5, or at or below -1, maps to 1.
        """
        return 0 if 0 <= int(delay) < 5 else 1

    def convert_delay(self) -> pd.DataFrame:
        """Add a ``delay`` column (0/1) derived from ``delay_minutes``."""
        flags = self.data["delay_minutes"].apply(self.convert_delay_minutes_to_bool)
        self.data = self.data.assign(delay=flags)
        return self.data

    def drop_unnecessary_columns(self) -> pd.DataFrame:
        """Drop the ``train_id``, ``actual_time`` and ``delay_minutes`` columns."""
        self.data = self.data.drop(columns=['train_id', 'actual_time', 'delay_minutes'])
        return self.data

    def save_first_60k(self, path: str):
        """Write the first 60000 rows of ``self.data`` to ``path`` as CSV, without the index."""
        self.data.head(60000).to_csv(path, index=False)

    def prep_df(self, csv_path: str='data/NJ.csv'):
        """Run every cleaning step in order and save the first 60000 rows.

        Args:
            csv_path: destination of the cleaned CSV. Despite the name this is
                the *output* path; the input was read in ``__init__``.
        """
        self.order_by_scheduled_time()
        self.drop_columns_and_nan()
        self.convert_day_to_date()
        self.convert_scheduled_time_to_part_of_the_day()
        self.convert_delay()
        self.drop_unnecessary_columns()
        self.save_first_60k(csv_path)

In [13]:
# Build the cleaner from the 2018_03.csv extract and display the raw frame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that drive layout exists — consider a path relative to the
# notebook instead.
cleaner = NJCleaner("E:\\Bevadat\\BEVADAT2022232\\HAZI\\HAZI06\\data\\2018_03.csv")
data = cleaner.data
data

Unnamed: 0,date,train_id,stop_sequence,from,from_id,to,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
0,2018-03-01,3805,1.0,New York Penn Station,105,New York Penn Station,105,2018-03-02 01:22:00,2018-03-02 01:21:05,0.000000,departed,Northeast Corrdr,NJ Transit
1,2018-03-01,3805,2.0,New York Penn Station,105,Secaucus Upper Lvl,38187,2018-03-02 01:31:00,2018-03-02 01:31:08,0.133333,departed,Northeast Corrdr,NJ Transit
2,2018-03-01,3805,3.0,Secaucus Upper Lvl,38187,Newark Penn Station,107,2018-03-02 01:40:00,2018-03-02 01:40:07,0.116667,departed,Northeast Corrdr,NJ Transit
3,2018-03-01,3805,4.0,Newark Penn Station,107,Newark Airport,37953,2018-03-02 01:45:00,2018-03-02 01:45:10,0.166667,departed,Northeast Corrdr,NJ Transit
4,2018-03-01,3805,5.0,Newark Airport,37953,North Elizabeth,109,2018-03-02 01:49:00,2018-03-02 01:49:10,0.166667,departed,Northeast Corrdr,NJ Transit
...,...,...,...,...,...,...,...,...,...,...,...,...,...
256503,2018-03-31,0534,2.0,Bay Street,14,Glen Ridge,50,2018-03-31 19:02:00,2018-03-31 19:05:07,3.116667,departed,Montclair-Boonton,NJ Transit
256504,2018-03-31,0534,3.0,Glen Ridge,50,Bloomfield,19,2018-03-31 19:05:00,2018-03-31 19:08:06,3.100000,departed,Montclair-Boonton,NJ Transit
256505,2018-03-31,0534,4.0,Bloomfield,19,Watsessing Avenue,154,2018-03-31 19:07:00,2018-03-31 19:10:11,3.183333,departed,Montclair-Boonton,NJ Transit
256506,2018-03-31,0534,5.0,Watsessing Avenue,154,Newark Broad Street,106,2018-03-31 19:13:00,2018-03-31 19:21:02,8.033333,departed,Montclair-Boonton,NJ Transit


In [30]:
# Sort the rows by `scheduled_time`. The result is named `sorted_df` rather
# than `sorted` so the built-in `sorted()` stays usable in later cells.
sorted_df = cleaner.order_by_scheduled_time()
sorted_df

Unnamed: 0,date,train_id,stop_sequence,from,from_id,to,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
5283,2018-03-01,3806,1.0,Trenton,148,Trenton,148,2018-03-01 03:48:00,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit
1270,2018-03-01,0042,1.0,Port Jervis,123,Port Jervis,123,2018-03-01 03:50:00,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit
5284,2018-03-01,3806,2.0,Trenton,148,Hamilton,32905,2018-03-01 03:54:00,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit
208,2018-03-01,3202,1.0,Long Branch,74,Long Branch,74,2018-03-01 03:58:00,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit
5285,2018-03-01,3806,3.0,Hamilton,32905,Princeton Junction,125,2018-03-01 04:00:00,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit
...,...,...,...,...,...,...,...,...,...,...,...,...,...
256459,2018-03-31,A663,,Trenton,148,Philadelphia,1,,2018-03-31 11:20:08,,departed,KEYSTONE,Amtrak
256460,2018-03-31,A664,,Philadelphia,1,Philadelphia,1,,2018-03-31 11:26:07,,departed,Amtrak,Amtrak
256461,2018-03-31,A664,,Philadelphia,1,Trenton,148,,2018-03-31 12:07:04,,departed,Amtrak,Amtrak
256462,2018-03-31,A664,,Trenton,148,Newark Penn Station,107,,2018-03-31 12:47:04,,departed,Amtrak,Amtrak


In [31]:
# Drop the `from` / `to` columns, then every row that still contains a NaN.
droped = cleaner.drop_columns_and_nan()
droped

Unnamed: 0,date,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
5283,2018-03-01,3806,1.0,148,148,2018-03-01 03:48:00,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit
1270,2018-03-01,0042,1.0,123,123,2018-03-01 03:50:00,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit
5284,2018-03-01,3806,2.0,148,32905,2018-03-01 03:54:00,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit
208,2018-03-01,3202,1.0,74,74,2018-03-01 03:58:00,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit
5285,2018-03-01,3806,3.0,32905,125,2018-03-01 04:00:00,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit
...,...,...,...,...,...,...,...,...,...,...,...
253745,2018-03-31,4705,7.0,15,141,2018-04-01 03:04:00,2018-04-01 03:04:03,0.050000,departed,No Jersey Coast,NJ Transit
256387,2018-03-31,0709,13.0,117,49,2018-04-01 03:05:00,2018-04-01 03:04:00,0.000000,estimated,Gladstone Branch,NJ Transit
253746,2018-03-31,4705,8.0,141,79,2018-04-01 03:07:00,2018-04-01 03:08:07,1.116667,departed,No Jersey Coast,NJ Transit
253747,2018-03-31,4705,9.0,79,122,2018-04-01 03:13:00,2018-04-01 03:13:09,0.150000,departed,No Jersey Coast,NJ Transit


In [32]:
# Replace the `date` column with a `day` column holding the weekday name.
day = cleaner.convert_day_to_date()
day

Unnamed: 0,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type,day
5283,3806,1.0,148,148,2018-03-01 03:48:00,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit,Thursday
1270,0042,1.0,123,123,2018-03-01 03:50:00,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit,Thursday
5284,3806,2.0,148,32905,2018-03-01 03:54:00,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit,Thursday
208,3202,1.0,74,74,2018-03-01 03:58:00,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit,Thursday
5285,3806,3.0,32905,125,2018-03-01 04:00:00,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit,Thursday
...,...,...,...,...,...,...,...,...,...,...,...
253745,4705,7.0,15,141,2018-04-01 03:04:00,2018-04-01 03:04:03,0.050000,departed,No Jersey Coast,NJ Transit,Saturday
256387,0709,13.0,117,49,2018-04-01 03:05:00,2018-04-01 03:04:00,0.000000,estimated,Gladstone Branch,NJ Transit,Saturday
253746,4705,8.0,141,79,2018-04-01 03:07:00,2018-04-01 03:08:07,1.116667,departed,No Jersey Coast,NJ Transit,Saturday
253747,4705,9.0,79,122,2018-04-01 03:13:00,2018-04-01 03:13:09,0.150000,departed,No Jersey Coast,NJ Transit,Saturday


In [33]:
# Replace `scheduled_time` with a `part_of_day` label (four-hour buckets,
# e.g. hours 0-3 -> late_night, 4-7 -> early_morning).
convert = cleaner.convert_scheduled_time_to_part_of_the_day()
convert

Unnamed: 0,train_id,stop_sequence,from_id,to_id,actual_time,delay_minutes,status,line,type,day,part_of_day
5283,3806,1.0,148,148,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night
1270,0042,1.0,123,123,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit,Thursday,late_night
5284,3806,2.0,148,32905,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night
208,3202,1.0,74,74,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit,Thursday,late_night
5285,3806,3.0,32905,125,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit,Thursday,early_morning
...,...,...,...,...,...,...,...,...,...,...,...
253745,4705,7.0,15,141,2018-04-01 03:04:03,0.050000,departed,No Jersey Coast,NJ Transit,Saturday,late_night
256387,0709,13.0,117,49,2018-04-01 03:04:00,0.000000,estimated,Gladstone Branch,NJ Transit,Saturday,late_night
253746,4705,8.0,141,79,2018-04-01 03:08:07,1.116667,departed,No Jersey Coast,NJ Transit,Saturday,late_night
253747,4705,9.0,79,122,2018-04-01 03:13:09,0.150000,departed,No Jersey Coast,NJ Transit,Saturday,late_night


In [34]:
# Add a 0/1 `delay` column derived from `delay_minutes`
# (0 when the whole-minute delay is in 0..4, otherwise 1).
delay = cleaner.convert_delay()
delay

Unnamed: 0,train_id,stop_sequence,from_id,to_id,actual_time,delay_minutes,status,line,type,day,part_of_day,delay
5283,3806,1.0,148,148,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,1
1270,0042,1.0,123,123,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit,Thursday,late_night,0
5284,3806,2.0,148,32905,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,1
208,3202,1.0,74,74,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit,Thursday,late_night,0
5285,3806,3.0,32905,125,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit,Thursday,early_morning,0
...,...,...,...,...,...,...,...,...,...,...,...,...
253745,4705,7.0,15,141,2018-04-01 03:04:03,0.050000,departed,No Jersey Coast,NJ Transit,Saturday,late_night,0
256387,0709,13.0,117,49,2018-04-01 03:04:00,0.000000,estimated,Gladstone Branch,NJ Transit,Saturday,late_night,0
253746,4705,8.0,141,79,2018-04-01 03:08:07,1.116667,departed,No Jersey Coast,NJ Transit,Saturday,late_night,0
253747,4705,9.0,79,122,2018-04-01 03:13:09,0.150000,departed,No Jersey Coast,NJ Transit,Saturday,late_night,0


In [35]:
# Remove the columns that are no longer needed: `train_id`, `actual_time`, `delay_minutes`.
drop = cleaner.drop_unnecessary_columns()
drop

Unnamed: 0,stop_sequence,from_id,to_id,status,line,type,day,part_of_day,delay
5283,1.0,148,148,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,1
1270,1.0,123,123,departed,Bergen Co. Line,NJ Transit,Thursday,late_night,0
5284,2.0,148,32905,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,1
208,1.0,74,74,departed,No Jersey Coast,NJ Transit,Thursday,late_night,0
5285,3.0,32905,125,departed,Northeast Corrdr,NJ Transit,Thursday,early_morning,0
...,...,...,...,...,...,...,...,...,...
253745,7.0,15,141,departed,No Jersey Coast,NJ Transit,Saturday,late_night,0
256387,13.0,117,49,estimated,Gladstone Branch,NJ Transit,Saturday,late_night,0
253746,8.0,141,79,departed,No Jersey Coast,NJ Transit,Saturday,late_night,0
253747,9.0,79,122,departed,No Jersey Coast,NJ Transit,Saturday,late_night,0


In [None]:
# Write the first 60000 cleaned rows to data/NJ.csv (relative path, no index column).
cleaner.save_first_60k("data/NJ.csv")