In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the semicolon-separated export and discard exact duplicate rows in one pass.
df = (
    pd.read_csv('ar41_for_ulb.csv', sep=";")
    .drop_duplicates()
)

Drop rows with missing values (duplicate rows were already removed right after loading)

In [None]:
# Keep only complete rows: any row holding at least one missing value is discarded.
df_dropped = df.dropna(axis=0, how='any')

Drop rows where both motors are off

In [None]:
# Boolean mask (one entry per row): True where both RPM columns read exactly 0.
df_motor_stopped = df_dropped[['RS_E_RPM_PC1', 'RS_E_RPM_PC2']].eq(0).all(axis=1)
# Keep every row that is NOT flagged by the mask.
df_dropped = df_dropped.loc[~df_motor_stopped]




In [None]:
# Base names of the sensor columns; each one exists in the frame with a
# `_PC1` and a `_PC2` suffix.
features = [
    'RS_E_InAirTemp',
    'RS_E_OilPress',
    'RS_E_RPM',
    'RS_E_WatTemp',
    'RS_T_OilTemp',
]

In [None]:
# Plot one histogram per feature on the data as it stands at this point and
# write each figure to before_<feature>.png.
for feature in features:
    # Pool the `_PC1` and `_PC2` readings of the feature into a single sample.
    combined_values = pd.concat(
        [df_dropped[f'{feature}_PC1'], df_dropped[f'{feature}_PC2']]
    )

    # `bins` holds the bin EDGES (one more entry than `hist`).
    hist, bins = np.histogram(combined_values, bins='auto')

    fig, ax = plt.subplots(figsize=(12, 6))
    # Anchor every bar on its left edge and give it the width of its own bin.
    # With the default align='center' the bars would be centred on the left
    # edges, shifting the whole histogram half a bin to the left.
    ax.bar(
        bins[:-1],
        hist,
        width=np.diff(bins),
        align='edge',
        color='skyblue',
        edgecolor='black',
    )

    ax.set_title(f'Histogram of {feature}', size=15)
    ax.set_xlabel(feature, size=12)
    ax.set_ylabel('Quantity', size=12)

    # No explicit plt.show(): the figure is only written to disk here.
    fig.savefig(f'before_{feature}.png')
    

In [None]:
# Number of rows recorded at each distinct (lat, lon) pair.
location_counts = (
    df.groupby(['lat', 'lon'])
    .size()
    .reset_index(name='count')
)

# Scatter the distinct positions; marker area grows with the row count.
plt.figure(figsize=(12, 8))
plt.scatter(
    x=location_counts['lon'],
    y=location_counts['lat'],
    s=location_counts['count'] * 10,
    alpha=0.5,
)

plt.xlabel('Longitude', size=12)
plt.ylabel('Latitude', size=12)
plt.title('Density of coordinates', size=15)

plt.show()

Clean in_air_temp
- Remove values outside the range [0, 100]: readings outside it are not possible and indicate a sensor problem

In [None]:
# Keep a row only if RS_E_InAirTemp lies in the closed range [0, 100]
# on both the PC1 and the PC2 column.
in_air_temp_ok = (
    df_dropped['RS_E_InAirTemp_PC1'].between(0, 100)
    & df_dropped['RS_E_InAirTemp_PC2'].between(0, 100)
)
df_dropped = df_dropped[in_air_temp_ok]

Clean oil_press
- TODO: no cleaning rule is applied to oil pressure yet


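Clean RPM
- Remove values outside the range (0, 2100]: zero or negative readings and readings above 2100 are discarded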

In [None]:
# Keep a row only if RS_E_RPM lies in the half-open range (0, 2100]
# on both the PC1 and the PC2 column.
# NOTE(review): the strict `> 0` bound also drops rows where only ONE of the
# two RPM columns is 0 - confirm this is intended.
rpm_ok = (
    df_dropped['RS_E_RPM_PC1'].gt(0) & df_dropped['RS_E_RPM_PC1'].le(2100)
    & df_dropped['RS_E_RPM_PC2'].gt(0) & df_dropped['RS_E_RPM_PC2'].le(2100)
)
df_dropped = df_dropped[rpm_ok]

Clean water_temp
- Remove values outside the range [0, 100]: readings outside it are abnormal and indicate a sensor problem

In [None]:
# Keep a row only if RS_E_WatTemp lies in the closed range [0, 100]
# on both the PC1 and the PC2 column.
water_temp_ok = (
    df_dropped['RS_E_WatTemp_PC1'].between(0, 100)
    & df_dropped['RS_E_WatTemp_PC2'].between(0, 100)
)
df_dropped = df_dropped[water_temp_ok]

Clean oil_temp
- Remove values outside the range [0, 150]: readings outside it are abnormal and indicate a sensor problem

In [None]:
# Keep a row only if RS_T_OilTemp lies in the closed range [0, 150]
# on both the PC1 and the PC2 column.
oil_temp_ok = (
    df_dropped['RS_T_OilTemp_PC1'].between(0, 150)
    & df_dropped['RS_T_OilTemp_PC2'].between(0, 150)
)
df_dropped = df_dropped[oil_temp_ok]

Clean coordinates
- TODO: no cleaning rule is applied to the coordinates yet

In [None]:
# Number of rows left at this point (displayed as the cell output).
df_dropped.shape[0]

In [None]:
# Plot one histogram per feature on the data as it stands at this point,
# write each figure to after_<feature>.png and display it.
for feature in features:
    # Pool the `_PC1` and `_PC2` readings of the feature into a single sample.
    combined_values = pd.concat(
        [df_dropped[f'{feature}_PC1'], df_dropped[f'{feature}_PC2']]
    )

    # `bins` holds the bin EDGES (one more entry than `hist`).
    hist, bins = np.histogram(combined_values, bins='auto')

    fig, ax = plt.subplots(figsize=(12, 6))
    # Anchor every bar on its left edge and give it the width of its own bin.
    # With the default align='center' the bars would be centred on the left
    # edges, shifting the whole histogram half a bin to the left.
    ax.bar(
        bins[:-1],
        hist,
        width=np.diff(bins),
        align='edge',
        color='skyblue',
        edgecolor='black',
    )

    ax.set_title(f'Histogram of {feature}', size=15)
    ax.set_xlabel(feature, size=12)
    ax.set_ylabel('Quantity', size=12)

    # Save BEFORE showing: once plt.show() has run, the current figure may
    # already be released, and a later plt.savefig() would then write an
    # empty canvas to disk.
    fig.savefig(f'after_{feature}.png')
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)