In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the five predictive-maintenance CSV tables, one DataFrame per file.
errors, failures, machines, maintenance, telemetry = (
    pd.read_csv(csv_name)
    for csv_name in (
        "PdM_errors.csv",
        "PdM_failures.csv",
        "PdM_machines.csv",
        "PdM_maint.csv",
        "PdM_telemetry.csv",
    )
)

In [4]:
# Preview the error log loaded from PdM_errors.csv (columns: datetime, machineID, errorID).
errors

Unnamed: 0,datetime,machineID,errorID
0,2015-01-03 07:00:00,1,error1
1,2015-01-03 20:00:00,1,error3
2,2015-01-04 06:00:00,1,error5
3,2015-01-10 15:00:00,1,error4
4,2015-01-22 10:00:00,1,error4
...,...,...,...
3914,2015-11-21 08:00:00,100,error2
3915,2015-12-04 02:00:00,100,error1
3916,2015-12-08 06:00:00,100,error2
3917,2015-12-08 06:00:00,100,error3


In [5]:
# Preview the failure log loaded from PdM_failures.csv (columns: datetime, machineID, failure).
failures

Unnamed: 0,datetime,machineID,failure
0,2015-01-05 06:00:00,1,comp4
1,2015-03-06 06:00:00,1,comp1
2,2015-04-20 06:00:00,1,comp2
3,2015-06-19 06:00:00,1,comp4
4,2015-09-02 06:00:00,1,comp4
...,...,...,...
756,2015-11-29 06:00:00,99,comp3
757,2015-12-14 06:00:00,99,comp4
758,2015-02-12 06:00:00,100,comp1
759,2015-09-10 06:00:00,100,comp1


In [7]:
# Failure counts per machine and component: one row per machineID, one column per
# component label, NaN where a component has no recorded failure on that machine.
new_failures = failures.drop(columns='datetime')
rating_data = (
    new_failures
    .groupby('machineID')['failure']
    .value_counts()
    .unstack()
)
rating_data

failure,comp1,comp2,comp3,comp4
machineID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,2.0,,4.0
2,1.0,3.0,,
3,1.0,4.0,,
4,2.0,4.0,,
5,4.0,3.0,,
...,...,...,...,...
96,,,5.0,
97,,1.0,3.0,4.0
98,4.0,4.0,5.0,3.0
99,2.0,6.0,6.0,5.0


In [12]:
# Build one feature row per machine: static attributes (model, age), per-type error
# counts, per-component failure counts, mean telemetry readings and maintenance count.
# The raw input frames are only read here; no helper columns are added to them.

# Occurrences of each error type per machine (0 where a type never occurred).
error_features = pd.crosstab(errors["machineID"], errors["errorID"]).reset_index()

# Failure counts per component, with machineID moved from the index to a column.
failure_features = rating_data.reset_index()

# Number of maintenance records logged per machine.
maint_counts = maintenance.groupby("machineID").size().reset_index(name="maint_count")

# Mean of each telemetry signal per machine.
telemetry_agg = (
    telemetry
    .groupby("machineID")[["volt", "rotate", "pressure", "vibration"]]
    .mean()
    .reset_index()
)

# Left-join everything onto the machine list so every machine keeps a row, then
# zero-fill the gaps. Note that fillna(0) applies to every column, so a machine
# missing from any of the joined tables gets 0 for those features.
machine_features = (
    machines
    .merge(error_features, on="machineID", how="left")
    .merge(failure_features, on="machineID", how="left")
    .merge(telemetry_agg, on="machineID", how="left")
    .merge(maint_counts, on="machineID", how="left")
    .fillna(0)
)

Unnamed: 0,machineID,model,age,error1,error2,error3,error4,error5,comp1,comp2,comp3,comp4,volt,rotate,pressure,vibration,maint_count
0,1,model3,18,11,6,9,5,4,1.0,2.0,0.0,4.0,170.833898,446.336502,100.668306,40.586309,37
1,2,model4,7,5,10,5,5,3,1.0,3.0,0.0,0.0,170.760482,446.388915,100.539930,40.301539,32
2,3,model3,8,11,10,8,8,2,1.0,4.0,0.0,0.0,170.659235,446.583238,100.653114,40.475376,37
3,4,model3,7,10,11,8,2,0,2.0,4.0,0.0,0.0,170.659728,446.053801,100.569902,40.341620,33
4,5,model3,2,14,8,7,6,3,4.0,3.0,0.0,0.0,171.044435,446.858577,101.066188,40.264846,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,model2,10,5,10,8,12,4,0.0,0.0,5.0,0.0,170.614552,448.311054,101.463938,40.248600,33
96,97,model2,14,12,8,5,15,5,0.0,1.0,3.0,4.0,170.343383,447.032374,101.043793,40.552751,34
97,98,model2,20,9,12,8,11,5,4.0,4.0,5.0,3.0,170.782478,446.066974,101.705498,40.499608,37
98,99,model1,14,9,17,15,8,5,2.0,6.0,6.0,5.0,170.727884,444.766488,101.568852,40.524730,34


In [None]:
# Need to build a table that maps each machine to its features: model, age, error counts, volt, rotate, etc.
# Errors need special handling: count the frequency of each error type per machine before joining, similar to the pivot table used for the failure-count table.

# machine_features_df = machines[['machineID', 'model', 'age']]
# machine_features_df = pd.get_dummies(machine_features_df, columns=['model'], prefix='model')
# machine_features_df['age_normalized'] = machine_features_df['age'] / machine_features_df['age'].max()


# historical_features = telemetry.groupby('machineID')[['volt', 'rotate', 'pressure', 'vibration']].mean().reset_index()
# historical_features.columns = ['machineID', 'avg_volt', 'avg_rot', 'avg_pressure', 'avg_vib']

# the errors stuff goes here

# # Combine all machine features include the errors stuff even though this mock version does not join the errors
# machine_features_final = machine_features_df.merge(historical_features, on='machineID', how='left').fillna(0)
# machine_features_final = machine_features_final.set_index('machineID')

In [None]:
# Need to build a table that shows the number of replacements/maintenance events each component has logged.
# May also need a column that captures how recently the component was maintained, because a component that was just maintained is less likely to fail next.
# component_features_df = 

In [None]:
# k must be between 1 and 4 (inclusive) because there are only 4 components.
# target_user must be passed as the machineID number, between 1 and 100 (inclusive).
def feature_aware_user_CF(target_user, k, similarity_metric='Cosine', feature_weight=0.5, rating_data=None,
                          machine_features_df=None, component_features_df=None):
    """Rank components for a machine from its failure history (work in progress).

    Machines play the role of "users" and components the role of "items"; the
    "rating" is the number of times a component failed on a machine.

    Parameters
    ----------
    target_user : label
        Index label of the machine in ``rating_data``.
    k : int
        Currently unused; kept for the planned ranking step.
    similarity_metric : {'Cosine', 'L2'}
        How machines are compared on their mean-centred failure counts.
        'L2' uses the negative Euclidean distance so that larger means more similar.
    feature_weight : float
        Currently unused; kept for the planned blend with feature similarity.
    rating_data : pandas.DataFrame, optional
        Machines x components failure counts, NaN where no failure was recorded.
        When omitted, the notebook-level ``rating_data`` is looked up at call time.
    machine_features_df, component_features_df : optional
        Currently unused. They default to None so that defining the function does
        not depend on tables that have not been built yet.

    Returns
    -------
    pandas.Series or None
        If the machine has a recorded failure for every component, its counts
        sorted from most to least failures. Otherwise None: the machine is
        unknown, the metric is unknown, or the collaborative-filtering ranking is
        not implemented yet.

    Raises
    ------
    NameError
        If ``rating_data`` is omitted and no notebook-level table exists.
    """
    if rating_data is None:
        # Resolve at call time so re-running the cell that builds the table is
        # picked up without having to re-define this function.
        rating_data = globals().get("rating_data")
        if rating_data is None:
            raise NameError("rating_data is not defined; build it first or pass rating_data explicitly.")

    if target_user not in rating_data.index:
        print(f"Error: Machine {target_user} not found.")
        return None

    target_ratings = rating_data.loc[target_user]
    if not target_ratings.isna().any():
        print(f"Note: Every component in Machine {target_user} has failed at least once before. Instead of using a CF approach we will just return the components from most fails to least fails in this machine. \n")
        return target_ratings.sort_values(ascending=False)

    if similarity_metric not in ("L2", "Cosine"):
        print(f"Error: Unknown similarity_metric {similarity_metric!r}. Use 'Cosine' or 'L2'.")
        return None

    # Mean-centre each machine's counts; missing entries become 0 so they do not
    # contribute to the similarity calculation.
    user_means = rating_data.mean(axis=1)
    data_filled = rating_data.sub(user_means, axis=0).fillna(0)
    target_vector = data_filled.loc[target_user].to_numpy()

    # Similarity is based on failure counts only at this stage.
    if similarity_metric == "L2":
        # L2 is a distance, so negate it to obtain a similarity.
        scores = -np.linalg.norm(data_filled.to_numpy() - target_vector, axis=1)
    else:
        scores = cosine_similarity(data_filled.to_numpy(), target_vector.reshape(1, -1))[:, 0]

    # Similarity of every other machine to the target, keyed by machine label.
    history_similarity_scores = {
        user: score
        for user, score in zip(data_filled.index, scores)
        if user != target_user
    }

    # TODO: combine history_similarity_scores with feature-based similarity and rank
    # the components that have not failed yet. Until then this path returns nothing.
    return None