<a href="https://colab.research.google.com/github/hana2013-lab/breast-cancer-random-forest/blob/main/robot_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Upload kaggle.json (the Kaggle API credentials file) from the local machine.
from google.colab import files

# Bind the return value instead of leaving the call as the cell's last
# expression: files.upload() returns a {filename: file bytes} dict, and
# echoing it writes the API key into the saved notebook output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"hanamuh","key":"<REDACTED>"}'}

In [8]:
# Copy the uploaded credentials file into ~/.kaggle, creating the directory
# first (-p: no error if it already exists).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [9]:
#Install kaggle API
!pip install kaggle



In [10]:
# Download the dataset archive from Kaggle into the current working directory
# using the kaggle command-line client.
!kaggle datasets download -d ziya07/robotic-operations-performance-dataset

Dataset URL: https://www.kaggle.com/datasets/ziya07/robotic-operations-performance-dataset
License(s): CC0-1.0
Downloading robotic-operations-performance-dataset.zip to /content
  0% 0.00/7.91k [00:00<?, ?B/s]
100% 7.91k/7.91k [00:00<00:00, 21.1MB/s]


In [11]:
#Unzip data
!unzip robotic-operations-performance-dataset.zip
!ls

Archive:  robotic-operations-performance-dataset.zip
  inflating: robot_dataset.csv       
kaggle.json	   robotic-operations-performance-dataset.zip
robot_dataset.csv  sample_data


In [33]:
# Load the extracted CSV into a DataFrame and preview its first five rows.
import pandas as pd

df = pd.read_csv("robot_dataset.csv")
df.head(n=5)

Unnamed: 0,Robot_ID,Task_Type,Component_ID,Sensor_Type,Sensor_Data,Processing_Time (s),Accuracy (%),Environmental_Status,Energy_Consumption (kWh),Human_Intervention_Needed,Obstacle_Detected,Defect_Detected
0,RBT_001,Inspection,CMP_460,LIDAR,1 (obstacle detected),67.0,90.4,Stable,2.2,No,Yes,Yes
1,RBT_002,Assembly,CMP_252,Thermal,85.3 (°C),71.2,98.1,Stable,2.7,Yes,No,No
2,RBT_003,Inspection,CMP_248,Thermal,92% (visual fit),49.2,95.3,Unstable,2.4,No,No,No
3,RBT_004,Welding,CMP_433,Camera,98% (defect-free),74.5,90.2,Stable,2.4,Yes,No,Yes
4,RBT_005,Assembly,CMP_992,Camera,92% (visual fit),64.5,97.2,Unstable,1.8,No,No,No


In [34]:
# Number of rows and columns in the raw dataset, as a (rows, columns) tuple.
df.shape

(500, 12)

In [35]:
# Basic info: column names, non-null counts and dtypes. Useful for spotting
# missing values and telling numeric columns from text (object) columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Robot_ID                   500 non-null    object 
 1   Task_Type                  500 non-null    object 
 2   Component_ID               500 non-null    object 
 3   Sensor_Type                500 non-null    object 
 4   Sensor_Data                500 non-null    object 
 5   Processing_Time (s)        500 non-null    float64
 6   Accuracy (%)               500 non-null    float64
 7   Environmental_Status       500 non-null    object 
 8   Energy_Consumption (kWh)   500 non-null    float64
 9   Human_Intervention_Needed  500 non-null    object 
 10  Obstacle_Detected          500 non-null    object 
 11  Defect_Detected            500 non-null    object 
dtypes: float64(3), object(9)
memory usage: 47.0+ KB


In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# features.
df.describe()

Unnamed: 0,Processing_Time (s),Accuracy (%),Energy_Consumption (kWh)
count,500.0,500.0,500.0
mean,59.937,94.9206,2.014
std,11.688808,2.900949,0.582789
min,40.1,90.0,1.0
25%,49.675,92.5,1.5
50%,59.6,94.9,2.0
75%,70.0,97.5,2.5
max,80.0,99.9,3.0


DATA PREPROCESSING

In [17]:
# Feature typing: drop non-informative columns, separate the target, and list
# the categorical and numeric columns handled by the preprocessing step.

# Drop identifier / free-text columns. Robot_ID is included so that X holds
# exactly the eight feature columns listed below.
# errors="ignore" keeps the cell re-runnable: `df` is reassigned here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=["Robot_ID", "Component_ID", "Sensor_Data"], errors="ignore")

target = "Human_Intervention_Needed"
X = df.drop(columns=[target])
y = df[target]

# Text-valued columns, one-hot encoded later.
categorical_features = [
    "Task_Type", "Sensor_Type", "Environmental_Status",
    "Obstacle_Detected", "Defect_Detected"
]

# Float-valued columns, scaled later.
numeric_features = [
    "Processing_Time (s)", "Accuracy (%)", "Energy_Consumption (kWh)"
]

In [37]:
# Train-test split: hold out 20% of the rows for testing, stratified on the
# target so both splits keep the same class proportions. The fixed seed makes
# the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [40]:
# Preprocessing pipeline: per-type transformers combined in one ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Columns not named in either feature list are dropped from the output.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

In [42]:
# Apply preprocessing and report the resulting number of rows and features.
# The preprocessor is fitted on the training split only; the test split is
# transformed with those already-fitted parameters.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

for label, matrix in (
    ("Train shape after preprocessing:", X_train_preprocessed),
    ("Test shape after preprocessing:", X_test_preprocessed),
):
    print(label, matrix.shape)

Train shape after preprocessing: (400, 17)
Test shape after preprocessing: (100, 17)


In [44]:
# Before preprocessing: preview the raw training features.
# Left as the cell's last expression (instead of print) so the notebook renders
# a table rather than line-wrapped plain text.
X_train.head()

      Task_Type Sensor_Type  Processing_Time (s)  Accuracy (%)  \
161  Inspection      Camera                 69.6          94.0   
310  Inspection       LIDAR                 47.0          98.9   
34      Welding     Thermal                 66.2          98.3   
448    Painting     Thermal                 44.6          95.4   
23      Welding      Camera                 55.1          92.0   

    Environmental_Status  Energy_Consumption (kWh) Obstacle_Detected  \
161             Unstable                       2.8                No   
310             Unstable                       2.1                No   
34              Unstable                       2.2               Yes   
448               Stable                       2.9               Yes   
23                Stable                       2.2                No   

    Defect_Detected  
161             Yes  
310              No  
34              Yes  
448             Yes  
23              Yes  


In [45]:
# After preprocessing: first five rows of the transformed training matrix
# (the scaled numeric columns come first, followed by the one-hot encoded
# categorical columns).
print(X_train_preprocessed[:5])

[[ 0.83321371 -0.32262746  1.31973909  0.          1.          0.
   0.          1.          0.          0.          0.          0.
   1.          1.          0.          0.          1.        ]
 [-1.10344634  1.34760452  0.12963621  0.          1.          0.
   0.          0.          1.          0.          0.          0.
   1.          1.          0.          1.          0.        ]
 [ 0.54185777  1.14308632  0.2996509   0.          0.          0.
   1.          0.          0.          0.          1.          0.
   1.          0.          1.          0.          1.        ]
 [-1.30910935  0.15458167  1.48975379  0.          0.          1.
   0.          0.          0.          0.          1.          1.
   0.          0.          1.          0.          1.        ]
 [-0.40933367 -1.0043548   0.2996509   0.          0.          0.
   1.          1.          0.          0.          0.          1.
   0.          1.          0.          0.          1.        ]]


In [46]:
# Raw vs. transformed training data: one-hot encoding widens the feature matrix.
for stage, matrix in (("Before:", X_train), ("After:", X_train_preprocessed)):
    print(stage, matrix.shape)

Before: (400, 8)
After: (400, 17)


In [47]:
# Feature names after preprocessing: one name per output column of the fitted
# preprocessor, prefixed with the transformer name ("num__" / "cat__").
feature_names = preprocessor.get_feature_names_out()
print(feature_names)

['num__Processing_Time (s)' 'num__Accuracy (%)'
 'num__Energy_Consumption (kWh)' 'cat__Task_Type_Assembly'
 'cat__Task_Type_Inspection' 'cat__Task_Type_Painting'
 'cat__Task_Type_Welding' 'cat__Sensor_Type_Camera'
 'cat__Sensor_Type_LIDAR' 'cat__Sensor_Type_LIDAR + Camera'
 'cat__Sensor_Type_Thermal' 'cat__Environmental_Status_Stable'
 'cat__Environmental_Status_Unstable' 'cat__Obstacle_Detected_No'
 'cat__Obstacle_Detected_Yes' 'cat__Defect_Detected_No'
 'cat__Defect_Detected_Yes']


In [48]:
# RandomForestClassifier baseline: fit on the preprocessed training data and
# report accuracy on the held-out test split.
# NOTE(review): the recorded run scored 0.44 on a roughly 50/50 target, i.e. no
# better than guessing. Confirm whether these features carry any signal.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42).fit(X_train_preprocessed, y_train)

test_accuracy = model.score(X_test_preprocessed, y_test)
print("Accuracy:", test_accuracy)

Accuracy: 0.44


In [49]:
# Check class balance: share of each target class over the full dataset
# (normalize=True returns proportions instead of raw counts).
y.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Human_Intervention_Needed,Unnamed: 1_level_1
Yes,0.506
No,0.494


In [50]:
# Full metrics: per-class precision, recall, F1 and support on the test split.
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_preprocessed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.43      0.47      0.45        49
         Yes       0.45      0.41      0.43        51

    accuracy                           0.44       100
   macro avg       0.44      0.44      0.44       100
weighted avg       0.44      0.44      0.44       100



In [54]:
# Save the processed feature matrices to .npy files in the working directory.
import numpy as np

for filename, matrix in {
    "X_train_preprocessed.npy": X_train_preprocessed,
    "X_test_preprocessed.npy": X_test_preprocessed,
}.items():
    np.save(filename, matrix)