In this script we analyse the results from outlier detection using isolation forest.

The result csv was normalised but we matched it again with the original data.

In [1]:
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine


# Connection settings for the PostgreSQL database that holds the
# isolation-forest results.
# NOTE(review): the password is hard-coded in the notebook; consider loading
# it from the environment or a local config file before sharing this file.
dbname = 'DataMining'
user = 'postgres'
password = 'datamining'
host = 'localhost'  # localhost or the server address
port = '5433'  # non-default port; PostgreSQL's default is 5432

# Build the database URL. User name and password are URL-escaped so that
# credentials containing characters such as '@', ':' or '/' do not corrupt
# the URL; for the current values the resulting string is unchanged.
connection_str = (
    f"postgresql://{quote_plus(user)}:{quote_plus(password)}"
    f"@{host}:{port}/{dbname}"
)

# Create the SQLAlchemy engine that the queries below pass to pandas.
engine = create_engine(connection_str)

In [2]:
# Count every row stored in the iso_forest_analysis table.
query = """
select count(*) from iso_forest_analysis;
"""

# Run the query, keep the one-row result in a DataFrame and show it.
total_count = pd.read_sql_query(query, engine)
print(total_count)

    count
0  176666


In [3]:
# Count the rows in which both rpm columns are exactly zero.
query = """
select count(*) from iso_forest_analysis
where (RS_E_RPM_PC1 = 0) and (RS_E_RPM_PC2 = 0);
"""

# Run the query, keep the one-row result in a DataFrame and show it.
zero_rpm_count = pd.read_sql_query(query, engine)
print(zero_rpm_count)

    count
0  133039


Here we see that a large share of the detected outliers (133,039 of 176,666) are entries where both engines report 0 rpm, i.e. the train is not moving. This is a result that could be expected. We can therefore continue with the results where the train is moving, which the next query selects as the rows where both rpm values are non-zero. We already analysed the points where the train is not moving (see research question 12).

In [9]:
# Select the rows in which both rpm columns are non-zero.
query = """
select * from iso_forest_analysis
where (RS_E_RPM_PC1 != 0) and (RS_E_RPM_PC2 != 0);
"""

# Load the selected rows into a DataFrame; later cells reuse `df`.
df = pd.read_sql_query(query, engine)

# Summarise the two rs_e_inairtemp_* columns of that selection.
inairtemp_columns = ['rs_e_inairtemp_pc1', 'rs_e_inairtemp_pc2']
stats = df[inairtemp_columns].describe()

print(stats)


       rs_e_inairtemp_pc1  rs_e_inairtemp_pc2
count        18603.000000        18603.000000
mean           534.215491          719.048116
std           5860.538483         6795.348801
min              0.000000            0.000000
25%              3.000000            2.000000
50%              5.000000            5.000000
75%              7.000000            7.000000
max          65535.000000        65535.000000


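So here we see the air-temperature statistics for the rows where the rpm is non-zero for both engines. Note that the maximum of 65535 in both columns lies far above the 75% quantile of 7 and strongly inflates the mean and standard deviation; these values are probably sensor error codes rather than real temperatures.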

We can start by looking at the data for one vehicle id. Let's start with veh_id = 102

In [10]:
# Restrict the selection to the rows recorded for vehicle 102.
vehicle_mask = df['mapped_veh_id'] == 102
df_102 = df[vehicle_mask]

In [13]:
# Show the vehicle-102 subset; as the last expression of the cell it is
# rendered as a table by the notebook.
df_102

Unnamed: 0,mapped_veh_id,timestamps_utc,lat,lon,rs_e_inairtemp_pc1,rs_e_inairtemp_pc2,rs_e_oilpress_pc1,rs_e_oilpress_pc2,rs_e_rpm_pc1,rs_e_rpm_pc2,rs_e_wattemp_pc1,rs_e_wattemp_pc2,rs_t_oiltemp_pc1,rs_t_oiltemp_pc2,pg_point,utm_point
68,102,2023-05-19 01:37:50,50.855865,3.607164,10.0,8.000000,596.0,569.000000,801.0,802.000000,17.0,18.0,16.000000,14.0,0101000020E6100000E6779ACC78DB0C404165FCFB8C6D...,0101000020777F0000D57C3F502090204156C66216ED7D...
179,102,2023-04-22 03:33:15,51.016010,3.774037,6.0,8.000000,562.0,583.000000,802.0,801.000000,20.0,18.0,14.000000,16.0,0101000020E61000006B195F6A3A310E40C474C69C0C82...,0101000020777F00009EC8B1AF6DEA20414F8AD4956C8F...
203,102,2023-04-22 03:32:14,51.016027,3.774070,6.0,8.000000,565.0,610.000000,788.0,801.000000,16.0,16.0,13.000000,14.0,0101000020E61000003F8B4A8F4B310E40EDABBC2E0D82...,0101000020777F000084C5AB3B72EA20414DD934136D8F...
300,102,2023-03-27 02:28:47,50.774428,3.872819,4.0,5.000000,558.0,565.000000,797.0,799.000000,25.0,21.0,8.000000,8.0,0101000020E6100000909E228788FB0E403D4DC1752063...,0101000020777F0000DC1699BE0A232141CF7AF1034475...
345,102,2023-03-27 02:26:51,50.774074,3.872653,4.0,4.000000,576.0,589.000000,799.0,800.000000,21.0,17.0,15.000000,6.0,0101000020E6100000F920BA5631FB0E40CC18D5D81463...,0101000020777F0000E5724239F42221413EE9F6203A75...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17868,102,2023-03-12 22:53:01,50.856710,3.608108,10.0,9.000000,596.0,579.000000,801.0,799.000000,18.0,15.0,9.000000,9.0,0101000020E6100000F590DF9167DD0C405012B7AFA86D...,0101000020777F00009218BD9CA3902041ACD4E1B9047E...
18038,102,2023-02-27 05:39:49,51.013972,3.778861,1.0,1.000000,596.0,527.000000,799.0,797.000000,16.0,33.0,12.000000,19.0,0101000020E61000008DFCB0941B3B0E407B08F4D3C981...,0101000020777F00000819023A17ED2041841FF7D0348F...
18183,102,2023-01-25 22:39:17,51.015482,3.775960,3.0,3.333333,1.0,36.666667,34.0,80.333333,26.0,27.0,77.333333,80.0,0101000020E61000009D1CF45B2A350E400817974FFB81...,0101000020777F00004EA770A07CEB20416DBFD7425E8F...
18249,102,2023-04-03 08:56:56,51.014289,3.779160,6.0,6.000000,552.0,569.000000,802.0,799.000000,28.0,21.0,11.000000,10.0,0101000020E6100000AEE3AE14B83B0E4052DA7635D481...,0101000020777F00002FC6115C40ED2041E617C5AD3D8F...


By analysing this data we see that the rpm and oil-pressure values are high for both engines, but all the temperatures are extremely low. This might indicate that something is wrong. As we don't know what an acceptable oil pressure is, we cannot draw further conclusions.

In [16]:
# Export the vehicle-102 subset so it can be inspected outside the notebook.
df_102.to_csv('iso_forest_102.csv')

# Print the summary explicitly: a bare `df_102.describe()` that is not the
# last statement of the cell is evaluated and then silently discarded, so
# the statistics would never be shown.
print(df_102.describe())

In [15]:
# Summary statistics for the numeric columns of the full non-zero-rpm
# selection held in `df` (all vehicles, not only vehicle 102).
df.describe()

Unnamed: 0,mapped_veh_id,lat,lon,rs_e_inairtemp_pc1,rs_e_inairtemp_pc2,rs_e_oilpress_pc1,rs_e_oilpress_pc2,rs_e_rpm_pc1,rs_e_rpm_pc2,rs_e_wattemp_pc1,rs_e_wattemp_pc2,rs_t_oiltemp_pc1,rs_t_oiltemp_pc2
count,18603.0,18603.0,18603.0,18603.0,18603.0,18603.0,18603.0,18603.0,18603.0,18603.0,18603.0,18603.0,18603.0
mean,150.885986,50.779155,4.204294,534.215491,719.048116,564.025603,575.834971,800.694411,805.607056,19.621102,18.230022,12.484307,11.809591
std,27.653919,0.287674,0.555511,5860.538483,6795.348801,101.255078,93.621472,170.228904,157.761327,9.080518,8.474851,9.363052,9.197544
min,102.0,50.062296,3.582963,0.0,0.0,0.0,0.0,6.666667,13.333333,0.0,-17.0,0.0,0.0
25%,128.0,50.455,3.774702,3.0,2.0,552.0,562.0,798.0,798.0,13.0,12.0,8.0,7.0
50%,151.0,50.856217,3.873062,5.0,5.0,576.0,586.0,800.0,800.0,19.0,17.0,11.0,10.0
75%,174.0,51.014326,4.527237,7.0,7.0,603.0,610.0,801.0,802.0,25.0,23.0,16.0,14.5
max,197.0,51.246052,5.541531,65535.0,65535.0,690.0,690.0,1994.0,2011.0,97.0,93.0,94.5,94.666667
