In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
errors = pd.read_csv("PdM_errors.csv")
failures = pd.read_csv("PdM_failures.csv")
machines = pd.read_csv("PdM_machines.csv")
maintenance = pd.read_csv("PdM_maint.csv")
telemetry = pd.read_csv("PdM_telemetry.csv")

In [3]:
errors

Unnamed: 0,datetime,machineID,errorID
0,2015-01-03 07:00:00,1,error1
1,2015-01-03 20:00:00,1,error3
2,2015-01-04 06:00:00,1,error5
3,2015-01-10 15:00:00,1,error4
4,2015-01-22 10:00:00,1,error4
...,...,...,...
3914,2015-11-21 08:00:00,100,error2
3915,2015-12-04 02:00:00,100,error1
3916,2015-12-08 06:00:00,100,error2
3917,2015-12-08 06:00:00,100,error3


In [4]:
failures

Unnamed: 0,datetime,machineID,failure
0,2015-01-05 06:00:00,1,comp4
1,2015-03-06 06:00:00,1,comp1
2,2015-04-20 06:00:00,1,comp2
3,2015-06-19 06:00:00,1,comp4
4,2015-09-02 06:00:00,1,comp4
...,...,...,...
756,2015-11-29 06:00:00,99,comp3
757,2015-12-14 06:00:00,99,comp4
758,2015-02-12 06:00:00,100,comp1
759,2015-09-10 06:00:00,100,comp1


In [5]:
maintenance

Unnamed: 0,datetime,machineID,comp
0,2014-06-01 06:00:00,1,comp2
1,2014-07-16 06:00:00,1,comp4
2,2014-07-31 06:00:00,1,comp3
3,2014-12-13 06:00:00,1,comp1
4,2015-01-05 06:00:00,1,comp4
...,...,...,...
3281,2015-10-10 06:00:00,100,comp3
3282,2015-10-25 06:00:00,100,comp4
3283,2015-11-09 06:00:00,100,comp4
3284,2015-12-09 06:00:00,100,comp2


In [6]:
new_failures = failures.drop(columns=['datetime'], axis=1, inplace=False)
rating_data = new_failures.groupby('machineID')['failure'].value_counts().unstack()
rating_data

failure,comp1,comp2,comp3,comp4
machineID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,2.0,,4.0
2,1.0,3.0,,
3,1.0,4.0,,
4,2.0,4.0,,
5,4.0,3.0,,
...,...,...,...,...
96,,,5.0,
97,,1.0,3.0,4.0
98,4.0,4.0,5.0,3.0
99,2.0,6.0,6.0,5.0


In [7]:
errors["count"] = 1

error_features = errors.pivot_table(
    index="machineID",
    columns="errorID",
    values="count",
    aggfunc="sum",
    fill_value=0
).reset_index()

telemetry_agg = telemetry.groupby("machineID").agg({
    "volt": "mean",
    "rotate": "mean",
    "pressure": "mean",
    "vibration": "mean"
}).reset_index()

machine_features = machines.copy()

machine_features = (
    machine_features
    .merge(error_features, on="machineID", how="left")
    .merge(telemetry_agg, on="machineID", how="left")
)
machine_features

Unnamed: 0,machineID,model,age,error1,error2,error3,error4,error5,volt,rotate,pressure,vibration
0,1,model3,18,11,6,9,5,4,170.833898,446.336502,100.668306,40.586309
1,2,model4,7,5,10,5,5,3,170.760482,446.388915,100.539930,40.301539
2,3,model3,8,11,10,8,8,2,170.659235,446.583238,100.653114,40.475376
3,4,model3,7,10,11,8,2,0,170.659728,446.053801,100.569902,40.341620
4,5,model3,2,14,8,7,6,3,171.044435,446.858577,101.066188,40.264846
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,model2,10,5,10,8,12,4,170.614552,448.311054,101.463938,40.248600
96,97,model2,14,12,8,5,15,5,170.343383,447.032374,101.043793,40.552751
97,98,model2,20,9,12,8,11,5,170.782478,446.066974,101.705498,40.499608
98,99,model1,14,9,17,15,8,5,170.727884,444.766488,101.568852,40.524730


In [8]:
machine_features = pd.get_dummies(machine_features, 
                                 columns=["model"], 
                                 drop_first=False)
machine_features

Unnamed: 0,machineID,age,error1,error2,error3,error4,error5,volt,rotate,pressure,vibration,model_model1,model_model2,model_model3,model_model4
0,1,18,11,6,9,5,4,170.833898,446.336502,100.668306,40.586309,0,0,1,0
1,2,7,5,10,5,5,3,170.760482,446.388915,100.539930,40.301539,0,0,0,1
2,3,8,11,10,8,8,2,170.659235,446.583238,100.653114,40.475376,0,0,1,0
3,4,7,10,11,8,2,0,170.659728,446.053801,100.569902,40.341620,0,0,1,0
4,5,2,14,8,7,6,3,171.044435,446.858577,101.066188,40.264846,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,10,5,10,8,12,4,170.614552,448.311054,101.463938,40.248600,0,1,0,0
96,97,14,12,8,5,15,5,170.343383,447.032374,101.043793,40.552751,0,1,0,0
97,98,20,9,12,8,11,5,170.782478,446.066974,101.705498,40.499608,0,1,0,0
98,99,14,9,17,15,8,5,170.727884,444.766488,101.568852,40.524730,1,0,0,0


In [9]:
maintenance["total_maint_count"] = 1
maint_counts = maintenance.groupby(["machineID", "comp"])["total_maint_count"].sum().reset_index()

maintenance['datetime'] = pd.to_datetime(maintenance['datetime'])
CURRENT_DATE = maintenance['datetime'].max()
recency_dates = maintenance.groupby(['machineID', 'comp'])['datetime'].max().reset_index(name='last_maintenance_date')
recency_dates['how_recent'] = (CURRENT_DATE - recency_dates['last_maintenance_date']).dt.days
recency_dates

component_features = (
    maint_counts
    .merge(recency_dates, on=["machineID", "comp"], how="inner")
)
component_features

Unnamed: 0,machineID,comp,total_maint_count,last_maintenance_date,how_recent
0,1,comp1,11,2015-12-31 06:00:00,1
1,1,comp2,9,2015-11-16 06:00:00,46
2,1,comp3,8,2015-12-01 06:00:00,31
3,1,comp4,9,2015-12-16 06:00:00,16
4,2,comp1,7,2015-12-14 06:00:00,18
...,...,...,...,...,...
395,99,comp4,7,2015-12-14 06:00:00,18
396,100,comp1,10,2015-10-10 06:00:00,83
397,100,comp2,4,2015-12-24 06:00:00,8
398,100,comp3,7,2015-10-10 06:00:00,83


In [10]:
def calculate_component_bias(component_features, alpha=0.6):
    component_features = component_features.copy()

    scaler_count = MinMaxScaler()
    component_features['total_maint_count'] = scaler_count.fit_transform(component_features[['total_maint_count']])

    scaler_recency = MinMaxScaler()
    component_features['how_recent'] = 1 - scaler_recency.fit_transform(component_features[['how_recent']])

    final_bias = (alpha * component_features['total_maint_count']) + ((1 - alpha) * component_features['how_recent'])
    centered_bias = final_bias - final_bias.mean()
    return pd.Series(centered_bias, index=component_features.index)

In [11]:
def find_topk_similar_machines(target_user,rating_data, machine_features_df, similarity_metric, k, feature_weight):
        data_copy = rating_data.copy()
        user_means = data_copy.mean(axis=1)
        data_centered = data_copy.sub(user_means, axis=0)
        data_filled = data_centered.fillna(0) # For similarity calculation

        scaler = StandardScaler()
        scaled_features = pd.DataFrame(scaler.fit_transform(machine_features_df), 
                                    index=machine_features_df.index, 
                                    columns=machine_features_df.columns)

        target_ratings = rating_data.loc[target_user]
        if not target_ratings.isna().any():
            print(f"Note: Every component in Machine {target_user} has failed at least once before. Instead of using a CF approach we will just return the components from most fails to least fails in this machine. \n")
            s = pd.Series(target_ratings)
            return s.sort_values(ascending=False)
        
        # similarity calculations only based on failure counts
        history_similarity_scores = {}
        target_vector_1d = data_filled.loc[target_user].values
        target_vector_2d = target_vector_1d.reshape(1, -1)

        # similarity between machines based on features
        feature_similarity_scores = {}
        target_feature_vector_1d = scaled_features.loc[target_user].values
        target_feature_vector_2d = target_feature_vector_1d.reshape(1, -1) 

        if similarity_metric == "L2":
            for user in data_filled.index:
                if user == target_user: continue
                user_vector = data_filled.loc[user].values
                # L2 is distance, so we use negative distance for similarity
                history_similarity_scores[user] = -np.linalg.norm(target_vector_1d - user_vector)

            for user in data_filled.index:
                if user == target_user: continue
                user_feature_vector = scaled_features.loc[user].values
                # L2 is distance, so we use negative distance for similarity
                feature_similarity_scores[user] = -np.linalg.norm(target_feature_vector_1d - user_feature_vector)

        elif similarity_metric == "Cosine":
            all_similarities = cosine_similarity(data_filled.values, target_vector_2d)
            for i, user in enumerate(data_filled.index):
                if user != target_user:
                    history_similarity_scores[user] = all_similarities[i, 0]

            all_feature_similarities = cosine_similarity(scaled_features.values, target_feature_vector_2d)
            for i, user in enumerate(scaled_features.index):
                if user != target_user:
                    feature_similarity_scores[user] = all_feature_similarities[i, 0]

        else:
            print("Error: Unsupported similarity metric. Use 'L2' or 'Cosine'.")
            return None

        # combine similarity scores
        similar_history = pd.Series(history_similarity_scores)
        similar_machines = pd.Series(feature_similarity_scores)
        new_scaler = MinMaxScaler()
        similar_history_scaled = pd.Series(new_scaler.fit_transform(similar_history.values.reshape(-1, 1)).flatten(), index=similar_history.index)
        similar_machines_scaled = pd.Series(new_scaler.fit_transform(similar_machines.values.reshape(-1, 1)).flatten(), index=similar_machines.index)
        combined_similarity = (1 - feature_weight) * similar_history_scaled + feature_weight * similar_machines_scaled # weighted average

        # get top k similar users
        top_k_users = combined_similarity.nlargest(k)
        return top_k_users
        

In [12]:
# k must be between 1 and 4 (inclusive) as there are only 4 components 
# target_user needs to be inputed as the machineID number -> between 1 and 100 (inclusive)
def feature_aware_user_CF(target_user, k=4, similarity_metric = 'Cosine', feature_weight=0.4, rating_data = rating_data, 
                          machine_features_df = machine_features, component_features_df = component_features):
        if target_user not in rating_data.index:
            print(f"Error: Machine {target_user} not found.")
            return None
        
        top_k_users = find_topk_similar_machines(target_user, rating_data, machine_features_df, similarity_metric, k, feature_weight)

        target_ratings = rating_data.loc[target_user]
        target_unrated = target_ratings[target_ratings.isna()].index

        OGdata = rating_data.copy()
        similar_users_df = OGdata.loc[top_k_users.keys(), target_unrated]
        user_means = OGdata.loc[top_k_users.keys()].mean(axis=1)
        similar_users_df = similar_users_df.apply(lambda col: col.fillna(user_means))

        # using newly scored ratings to get the weighted avg
        weights = pd.Series(top_k_users)
        weighted_avg = ((similar_users_df.T * weights).T.sum(axis=0))/ weights.sum()
        # just in case there are any NaNs left (if all similar users had NaN for a component)
        weighted_avg = weighted_avg.fillna(OGdata[target_unrated].mean())

        # finally introducing the risk of the components based on maintenance history
        component_bias = calculate_component_bias(component_features_df)
        #bias_for_target = component_bias.loc[weighted_avg.index.get_level_values('machineID') == target_user]
        bias_for_target = component_bias.loc[weighted_avg.index]
        final_preds = weighted_avg + bias_for_target

        predictions_ranked = final_preds.sort_values(ascending=False)
        
        return predictions_ranked

In [13]:
feature_aware_user_CF(target_user=10)

KeyError: "None of [Index(['comp3', 'comp4'], dtype='object', name='failure')] are in the [index]"