In [1]:
from preprocessing.preprocessing import DataPreprocessing
from training.train import DataTraining
from mining.routes import RoutesMining
from hmmlearn import hmm
import pandas as pd
from joblib import dump, load
import numpy as np
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from generate.generate import GenerateData



In [196]:
preprocessing = DataPreprocessing("data/train_trips.csv")
df = preprocessing.create_formatted()

### TEST FÜR 1 TRIP

In [279]:
trip = df.loc[df['TripID'] == 1]

In [46]:
df['going_next_station'].shift(1).fillna(0)

0           0.0
1          89.0
2         104.0
3         110.0
4         116.0
          ...  
272255     69.0
272256     65.0
272257     52.0
272258     29.0
272259     22.0
Name: going_next_station, Length: 272260, dtype: float64

In [280]:
trip['percent_leaving'] = (trip['Alightings'] / trip['going_next_station'].shift(1).fillna(0)).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip['percent_leaving'] = (trip['Alightings'] / trip['going_next_station'].shift(1).fillna(0)).fillna(0)


In [281]:
trip['percent_staying'] = None

# Set the first row for the transition from Station 1 to Station 1 to 1
trip.loc[0, 'percent_staying'] = 1

# Calculate the percentage of staying for the rest of the DataFrame
for i in range(1, len(trip)):
    trip.loc[i, 'percent_staying'] = trip.loc[i - 1, 'percent_staying'] * (1 - trip.loc[i - 1, 'percent_leaving'])

# Handle NaN values if necessary (e.g., fill with 0 or drop)
trip.fillna(0, inplace=True)

trip['probability_leaving'] =  trip['percent_staying'].shift(1).fillna(1) - trip['percent_staying']

# View the DataFrame
trip

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip['percent_staying'] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trip['probability_leaving'] =  trip['percent_staying'].shift(1).fillna(1) - trip['percent_staying']


Unnamed: 0,TrainID,Station,Boardings,Alightings,Arrival,Departure,StationNameShort,sBahnID,ArrivalDate,ArrivalHour,TripID,on_train,going_next_station,percent_leaving,percent_staying,probability_leaving
0,247283,Wedel,89,0,2016-12-10 11:39:06,2016-12-10 11:43:29,WL,S1,2016-12-10,11,1,89,89,0.0,1.0,0.0
1,247283,Rissen,17,2,2016-12-10 11:47:17,2016-12-10 11:47:50,RI,S1,2016-12-10,11,1,15,104,0.022472,1.0,0.0
2,247283,Sülldorf,14,8,2016-12-10 11:50:32,2016-12-10 11:52:07,SDF,S1,2016-12-10,11,1,6,110,0.076923,0.977528,0.022472
3,247283,Iserbrook,6,0,2016-12-10 11:53:21,2016-12-10 11:53:57,IS,S1,2016-12-10,11,1,6,116,0.0,0.902334,0.075194
4,247283,Blankenese,12,7,2016-12-10 11:55:54,2016-12-10 11:57:31,B,S1,2016-12-10,11,1,5,121,0.060345,0.902334,0.0
5,247283,Hochkamp,5,3,2016-12-10 11:59:45,2016-12-10 12:00:22,HPS,S1,2016-12-10,11,1,2,123,0.024793,0.847882,0.054451
6,247283,Klein Flottbek,3,0,2016-12-10 12:01:49,2016-12-10 12:02:26,FB,S1,2016-12-10,12,1,3,126,0.0,0.826861,0.021022
7,247283,Othmarschen,46,7,2016-12-10 12:03:58,2016-12-10 12:04:49,OH,S1,2016-12-10,12,1,39,165,0.055556,0.826861,0.0
8,247283,Bahrenfeld,9,5,2016-12-10 12:06:28,2016-12-10 12:07:10,BAF,S1,2016-12-10,12,1,4,169,0.030303,0.780924,0.045937
9,247283,Altona,136,53,2016-12-10 12:09:59,2016-12-10 12:11:50,AS,S1,2016-12-10,12,1,83,252,0.313609,0.75726,0.023664


### NUMERATE STATIONS WITHIN A TRIP

In [87]:
def numerate_stations(group):
    i = 0
    for idx in range(0, len(group)-1):
        i = i + 1
        group.iloc[idx, group.columns.get_loc('station_id')] = i 

    return group

In [112]:
df = df.groupby(['TripID']).apply(numerate_stations)

### Aggregate DATA - Average Boardings, Alightings, Next Station

In [122]:
aggregated_data = df.groupby('Station',  sort=False).mean(['Boardings', 'Alightings','going_next_station']).reset_index()

In [114]:
aggregated_data.iloc[3:len(aggregated_data)]

Unnamed: 0,Station,Boardings,Alightings,ArrivalHour,TripID,on_train,going_next_station,station_id
3,Iserbrook,7.312748,7.181999,13.100085,5174.196763,0.13075,52.060619,14.267462
4,Blankenese,16.941216,17.75336,13.273422,5189.807986,-0.812143,48.914532,13.351542
5,Hochkamp,5.648071,3.459441,13.33762,5189.807986,2.18863,62.212221,13.669728
6,Klein Flottbek,8.734313,10.140772,13.355023,5189.807986,-1.406458,67.235425,13.666731
7,Othmarschen,13.239393,13.184885,13.35044,5191.25215,0.054509,81.76225,13.662511
8,Bahrenfeld,13.874456,14.463999,13.356528,5191.25215,-0.589543,98.7003,13.659901
9,Altona,43.250385,40.152408,13.377746,5189.408189,3.097977,106.156069,13.616667
10,Königstraße,5.824566,5.752505,13.338632,5189.408189,0.072062,103.302312,13.616859
11,Reeperbahn,18.838536,16.306166,13.329287,5189.408189,2.53237,113.094316,13.617052
12,Landungsbrücken,10.369846,12.854432,13.327649,5189.408189,-2.484586,116.011561,13.617245


### Try to iterate over dataset - WIP ...

In [94]:
n_stations = len(aggregated_data)
transition_matrix = np.zeros((n_stations, n_stations))

In [270]:
def calculate_probabilities(group):
    id = group['TripID'].unique()
    id = id[0]
    

    for idx in range(0, len(group)):

        group['percent_leaving_' + str(idx)] =  (group['Alightings'] / group['going_next_station'].shift(idx + 1).fillna(1)).fillna(0)
        #print(group['percent_leaving_' + str(idx)])

        group['percent_staying_' + str(idx)] = None
        # Set the first row for the transition from Station 1 to Station 1 to 1
        group.loc[idx, 'percent_staying_' + str(idx)] = 1

        j = 1

        if idx == 1:
            #print('tt',  group['percent_leaving_' + str(idx)])
            print(group['going_next_station'].shift(idx + 1).fillna(1))
            print(group['Alightings'][idx:len(group)])

        for j in range(j, len(group)):
        
            group.loc[j, 'percent_staying_' + str(idx)] = group.loc[j - 1, 'percent_staying_' + str(idx)] * (1 - group.loc[j - 1, 'percent_leaving_' + str(idx)])

        # Handle NaN values if necessary (e.g., fill with 0 or drop)
        group.fillna(0)
        
        j = j + 1
        # Initialize 'percent_staying'
        group['probability_leaving_' + str(idx)] =  group['percent_staying_' + str(idx)].shift(idx + 1).fillna(idx + 1) - group['percent_staying_' + str(idx)]
        #print(group['probability_leaving_' + str(idx)])
        #print('probability', 'mm')

    return group

##### gibt es irgendein Fehler...das problem mit der zweiten Runde..ich erstelle dynamisch Spalte für Wahrescheinlichkeiten für jeden Trip. Allerdings muss man auch die Wahrscheinlichkeiten für den Trip zurück betrachten. Von 2 nach 1 und nach 0...

In [272]:
test = trip.groupby(['TripID']).apply(calculate_probabilities)

0       1.0
1       1.0
2      89.0
3     104.0
4     110.0
5     116.0
6     121.0
7     123.0
8     126.0
9     165.0
10    169.0
11    252.0
12    253.0
13    261.0
14    254.0
15    249.0
16    130.0
17     95.0
18    102.0
19    100.0
20    103.0
21     99.0
22     94.0
23     99.0
24     96.0
25     87.0
Name: going_next_station, dtype: float64
1       2
2       8
3       0
4       7
5       3
6       0
7       7
8       5
9      53
10      1
11     20
12     43
13     11
14    131
15     89
16      5
17      8
18     14
19     21
20      9
21     17
22      6
23     18
24     41
25     82
Name: Alightings, dtype: int64


KeyError: 'TripID'