In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns

In [2]:
dataset = pd.read_csv('/content/data.csv')

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Catalyst ID                       500 non-null    object 
 1   Composition                       500 non-null    object 
 2   Synthesis Method                  500 non-null    object 
 3   Surface Area (m2/g)               500 non-null    float64
 4   Pore Volume (cm3/g)               500 non-null    float64
 5   Active Sites Density (sites/nm2)  500 non-null    float64
 6   Reaction Temperature (°C)         500 non-null    int64  
 7   Pressure (bar)                    500 non-null    int64  
 8   Time (h)                          500 non-null    int64  
 9   Product Selectivity (%)           500 non-null    float64
 10  Conversion (%)                    500 non-null    float64
 11  Performance                       500 non-null    object 
dtypes: float

In [4]:
print(dataset.isnull == True)

False


In [5]:
df = dataset.copy()

In [6]:
# 1. تحميل ملف البيانات الأصلي
df = pd.read_csv("data.csv")

# --- التأكد من الأعمدة الأصلية المطلوبة (تصحيح الخطأ) ---
required_cols = ['Composition', 'Synthesis Method']
for col in required_cols:
    if col not in df.columns:
        # إذا كان العمود مفقوداً، قم بإنشاء عمود فارغ (للتطبيق الآمن)
        df[col] = "Unknown"
        print(f"⚠️ تنبيه: العمود '{col}' غير موجود، تم إنشاء عمود 'Unknown'.")

# --- الثوابت والدوال (كما هي من التصحيح الأخير) ---
SUB_MAP = str.maketrans("₀₁₂₃₄₅₆₇₈₉", "0123456789")
SUP_MAP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")
ELEMENTS = set(["Ag","Al","Au","B","C","Ca","Ce","Co","Cr","Cu","Fe","Mg","Mn","Mo","N","Ni","O","P","Pd","Pt","Rh","Ru","Si","Sn","Ti","V","W","Zn","Zr"])
NONMETALS = {"C","N","O","P","S","Si"}

# ... (بقية الدوال: normalize_text, detect_type, extract_form, extract_active_metal, canonicalize_basic, infer_structure) ...

# الدوال كما هي تماماً من الرد السابق (للتكرار الذاتي)
def normalize_text(s: str) -> str:
    s = str(s).strip().translate(SUB_MAP).translate(SUP_MAP)
    s = re.sub(r"\s+", " ", s).replace("–", "-").replace("—", "-").replace("−", "-")
    s = re.sub(r"\b(supported on|on)\b", " / ", s, flags=re.IGNORECASE)
    return s or "Unknown"

def detect_type(s: str) -> str:
    low = s.lower()
    if "mof" in low or "uio-" in low or "mil-" in low or "zif-" in low: return "MOF"
    if re.search(r'\b(oxide|o\d+)\b', low) and not ("mof" in low or "zif" in low or "cof" in low): return "OXIDE"
    if "carbide" in low: return "CARBIDE"
    return "COMPOSITE"

def extract_form(s: str) -> str:
    low = " " + s.lower() + " "
    if "single-atom" in low or " sa " in low or re.search(r"\bSA[-\s]", s): return "SA"
    if "nanosheet" in low: return "NS"
    if "nanoparticle" in low: return "NP"
    if re.search(r"\bhybrid|\bcomposite", low): return "HYB"
    return "Unknown"

def extract_active_metal(s: str) -> str:
    s_modified = s
    if '/' in s: s_modified = s.split('/')[0]
    match_paren = re.search(r'\(([A-Za-z0-9, \-]*)\)$', s_modified)
    if match_paren: s_modified = match_paren.group(1)

    tokens = re.findall(r"[A-Z][a-z]?", s_modified.replace("-"," ").replace(","," "))
    found_metals = []

    for t in tokens:
        if t in ELEMENTS and t not in found_metals:
            found_metals.append(t)

    if not found_metals:
        if 'ZIF-8' in s: found_metals.append('Zn')
        if 'HKUST-1' in s: found_metals.append('Cu')
        if 'MOF-5' in s: found_metals.append('Zn')

    if any(m not in NONMETALS for m in found_metals):
        found_metals = [m for m in found_metals if m not in NONMETALS]

    if not found_metals: return "Unknown"

    return ",".join(sorted(found_metals))

def canonicalize_basic(name: str):
    clean = normalize_text(name)
    metals = extract_active_metal(clean)
    typ = detect_type(clean)
    form = extract_form(clean)
    return clean, metals, typ, form

def infer_structure(name_clean: str, current_struct: str) -> str:
    cur = (current_struct or "").strip()
    if cur and cur.lower() not in {"–","-","none","nan",""}: return cur
    name = (name_clean or "").lower()
    if re.search(r"\bsingle[-\s]?atom\b|\bsa[-\s]", name): return "SA"
    if "nanosheet" in name or "nanosheets" in name: return "NS"
    if "nanoparticle" in name or "nanoparticles" in name: return "NP"
    if "nanowire" in name or "nanowires" in name: return "NW"
    if "hybrid" in name or "composite" in name: return "HYB"
    return "Unknown"

# --- 3. تطبيق الاستخلاص على البيانات (لا تغيير) ---

res = df["Composition"].astype(str).apply(canonicalize_basic)
df["Catalyst_clean"] = res.apply(lambda x: x[0])
df["Metal"] = res.apply(lambda x: x[1])
df["Type"] = res.apply(lambda x: x[2])
df["Structure"] = res.apply(lambda x: x[3])

df["Structure"] = [infer_structure(n, s) for n, s in zip(df["Catalyst_clean"], df["Structure"])]

# --- 4. الترميز (OHE) ---

# 4.1. جمع كافة الأعمدة التصنيفية
# تم التأكد الآن من وجود 'Synthesis Method'
categorical_features = ['Synthesis Method', 'Type', 'Structure']

# 4.2. ترميز المعادن (Multi-Label OHE)
all_elements = set()
for metal_list in df['Metal'].str.split(','):
    if isinstance(metal_list, list):
        for el in metal_list:
            if el and el.strip() != 'Unknown': all_elements.add(el.strip())
sorted_elements = sorted(list(all_elements))

for element in sorted_elements:
    col_name = f"Metal_{element}"
    df[col_name] = df['Metal'].apply(lambda x: 1 if re.search(r'\b{}\b'.format(re.escape(element)), x) else 0)

# 4.3. تطبيق OHE على Synthesis Method, Type, Structure
# تم حل المشكلة: الآن يتم تمرير الأعمدة التي تم التأكد من وجودها
df_ohe_TS = pd.get_dummies(df[categorical_features], prefix=categorical_features)
df = df.drop(columns=categorical_features)
df = pd.concat([df, df_ohe_TS], axis=1)

# --- 5. الحفظ النهائي ---

output_file_name = "data_ml_ready_final_fix.csv"
df.to_csv(output_file_name, index=False)

print(f"✅ تم تصحيح الخطأ بنجاح وإتمام عملية الترميز. الملف النهائي هو: {output_file_name}")

✅ تم تصحيح الخطأ بنجاح وإتمام عملية الترميز. الملف النهائي هو: data_ml_ready_final_fix.csv


In [7]:
# 2. تحديد جميع الأعمدة ذات النوع المنطقي (bool)
# هذه الأعمدة هي التي نتجت عن ترميز OHE (مثل Type_MOF, Synthesis Method_Solvothermal, Metal_Pt)
boolean_cols = df.select_dtypes(include=['bool']).columns

# 3. تطبيق التحويل من (True/False) إلى (1/0)
# يتم التحويل عبر تغيير نوع البيانات إلى int (حيث True تصبح 1 و False تصبح 0)
if not boolean_cols.empty:
    df[boolean_cols] = df[boolean_cols].astype(int)
    print(f"✅ تم تحويل {len(boolean_cols)} عمودًا من منطقي (True/False) إلى عددي (1/0).")
else:
    print("⚠️ لم يتم العثور على أعمدة من النوع المنطقي (bool). ربما تم التحويل مسبقًا.")


# 4. حفظ الملف الجديد النهائي
output_file_name = "data_ml_ready_numeric_final.csv"
df.to_csv(output_file_name, index=False)

# 5. عرض جزء من الأعمدة التي تم تحويلها للتأكد من النتيجة
print("\nأول 5 صفوف بعد التحويل إلى 1 و 0:")
# اختيار أعمدة OHE التي كنا نراها سابقاً في المخرجات
example_cols = [col for col in df.columns if col.startswith('Synthesis Method_') or col.startswith('Type_') or col.startswith('Metal_')]
print(df[example_cols].head().T) # عرض الأعمدة بشكل رأسي لسهولة القراءة

✅ تم تحويل 14 عمودًا من منطقي (True/False) إلى عددي (1/0).

أول 5 صفوف بعد التحويل إلى 1 و 0:
                                           0  1  2  3  4
Metal_Ag                                   0  0  0  0  0
Metal_Al                                   0  0  0  0  0
Metal_Au                                   0  0  0  0  0
Metal_B                                    0  0  0  0  0
Metal_C                                    0  0  0  0  0
Metal_Ce                                   0  0  0  0  0
Metal_Co                                   0  0  0  0  0
Metal_Cr                                   0  0  0  0  0
Metal_Cu                                   0  0  0  0  0
Metal_Fe                                   0  0  0  0  0
Metal_Mg                                   0  0  0  0  0
Metal_Mn                                   0  0  0  0  0
Metal_Mo                                   0  0  0  0  0
Metal_N                                    0  0  0  0  0
Metal_Ni                                   0  0  0 

In [8]:
print(df.isnull().sum())

Catalyst ID                                  0
Composition                                  0
Surface Area (m2/g)                          0
Pore Volume (cm3/g)                          0
Active Sites Density (sites/nm2)             0
Reaction Temperature (°C)                    0
Pressure (bar)                               0
Time (h)                                     0
Product Selectivity (%)                      0
Conversion (%)                               0
Performance                                  0
Catalyst_clean                               0
Metal                                        0
Metal_Ag                                     0
Metal_Al                                     0
Metal_Au                                     0
Metal_B                                      0
Metal_C                                      0
Metal_Ce                                     0
Metal_Co                                     0
Metal_Cr                                     0
Metal_Cu     

In [9]:
df.head()

Unnamed: 0,Catalyst ID,Composition,Surface Area (m2/g),Pore Volume (cm3/g),Active Sites Density (sites/nm2),Reaction Temperature (°C),Pressure (bar),Time (h),Product Selectivity (%),Conversion (%),...,Synthesis Method_Ion exchange,Synthesis Method_PVD,Synthesis Method_Precipitation,Synthesis Method_Solvothermal,Synthesis Method_Thermal decomposition,Synthesis Method_Wet chemical,Type_COMPOSITE,Type_MOF,Type_OXIDE,Structure_Unknown
0,CAT-001,UiO-66(Zr),1250.5,1.55,8.2,180,15,5,93.5,95.2,...,0,0,0,1,0,0,0,1,0,1
1,CAT-002,Pt/TiO2,250.8,0.35,3.1,350,5,12,75.1,68.1,...,0,0,0,0,0,0,1,0,0,1
2,CAT-003,ZIF-8,1890.3,2.2,10.5,150,10,8,89.0,85.5,...,0,0,0,0,0,0,0,1,0,1
3,CAT-004,Ni/Al2O3,150.7,0.25,2.5,420,50,20,55.0,45.9,...,0,0,0,0,0,0,1,0,0,1
4,CAT-005,MOF-5,2100.9,2.85,12.1,200,20,6,96.8,91.0,...,0,0,0,1,0,0,0,1,0,1


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 52 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Catalyst ID                                500 non-null    object 
 1   Composition                                500 non-null    object 
 2   Surface Area (m2/g)                        500 non-null    float64
 3   Pore Volume (cm3/g)                        500 non-null    float64
 4   Active Sites Density (sites/nm2)           500 non-null    float64
 5   Reaction Temperature (°C)                  500 non-null    int64  
 6   Pressure (bar)                             500 non-null    int64  
 7   Time (h)                                   500 non-null    int64  
 8   Product Selectivity (%)                    500 non-null    float64
 9   Conversion (%)                             500 non-null    float64
 10  Performance               

In [11]:
df.head()

Unnamed: 0,Catalyst ID,Composition,Surface Area (m2/g),Pore Volume (cm3/g),Active Sites Density (sites/nm2),Reaction Temperature (°C),Pressure (bar),Time (h),Product Selectivity (%),Conversion (%),...,Synthesis Method_Ion exchange,Synthesis Method_PVD,Synthesis Method_Precipitation,Synthesis Method_Solvothermal,Synthesis Method_Thermal decomposition,Synthesis Method_Wet chemical,Type_COMPOSITE,Type_MOF,Type_OXIDE,Structure_Unknown
0,CAT-001,UiO-66(Zr),1250.5,1.55,8.2,180,15,5,93.5,95.2,...,0,0,0,1,0,0,0,1,0,1
1,CAT-002,Pt/TiO2,250.8,0.35,3.1,350,5,12,75.1,68.1,...,0,0,0,0,0,0,1,0,0,1
2,CAT-003,ZIF-8,1890.3,2.2,10.5,150,10,8,89.0,85.5,...,0,0,0,0,0,0,0,1,0,1
3,CAT-004,Ni/Al2O3,150.7,0.25,2.5,420,50,20,55.0,45.9,...,0,0,0,0,0,0,1,0,0,1
4,CAT-005,MOF-5,2100.9,2.85,12.1,200,20,6,96.8,91.0,...,0,0,0,1,0,0,0,1,0,1


In [12]:
print(df.isnull().sum())

Catalyst ID                                  0
Composition                                  0
Surface Area (m2/g)                          0
Pore Volume (cm3/g)                          0
Active Sites Density (sites/nm2)             0
Reaction Temperature (°C)                    0
Pressure (bar)                               0
Time (h)                                     0
Product Selectivity (%)                      0
Conversion (%)                               0
Performance                                  0
Catalyst_clean                               0
Metal                                        0
Metal_Ag                                     0
Metal_Al                                     0
Metal_Au                                     0
Metal_B                                      0
Metal_C                                      0
Metal_Ce                                     0
Metal_Co                                     0
Metal_Cr                                     0
Metal_Cu     

In [13]:
print(df.columns.tolist())

['Catalyst ID', 'Composition', 'Surface Area (m2/g)', 'Pore Volume (cm3/g)', 'Active Sites Density (sites/nm2)', 'Reaction Temperature (°C)', 'Pressure (bar)', 'Time (h)', 'Product Selectivity (%)', 'Conversion (%)', 'Performance', 'Catalyst_clean', 'Metal', 'Metal_Ag', 'Metal_Al', 'Metal_Au', 'Metal_B', 'Metal_C', 'Metal_Ce', 'Metal_Co', 'Metal_Cr', 'Metal_Cu', 'Metal_Fe', 'Metal_Mg', 'Metal_Mn', 'Metal_Mo', 'Metal_N', 'Metal_Ni', 'Metal_O', 'Metal_Pd', 'Metal_Pt', 'Metal_Rh', 'Metal_Ru', 'Metal_Ti', 'Metal_V', 'Metal_W', 'Metal_Zn', 'Metal_Zr', 'Synthesis Method_Co-precipitation', 'Synthesis Method_Deposition-Precipitation', 'Synthesis Method_Hydrothermal', 'Synthesis Method_Impregnation', 'Synthesis Method_Ion exchange', 'Synthesis Method_PVD', 'Synthesis Method_Precipitation', 'Synthesis Method_Solvothermal', 'Synthesis Method_Thermal decomposition', 'Synthesis Method_Wet chemical', 'Type_COMPOSITE', 'Type_MOF', 'Type_OXIDE', 'Structure_Unknown']


In [14]:
df.head(5)

Unnamed: 0,Catalyst ID,Composition,Surface Area (m2/g),Pore Volume (cm3/g),Active Sites Density (sites/nm2),Reaction Temperature (°C),Pressure (bar),Time (h),Product Selectivity (%),Conversion (%),...,Synthesis Method_Ion exchange,Synthesis Method_PVD,Synthesis Method_Precipitation,Synthesis Method_Solvothermal,Synthesis Method_Thermal decomposition,Synthesis Method_Wet chemical,Type_COMPOSITE,Type_MOF,Type_OXIDE,Structure_Unknown
0,CAT-001,UiO-66(Zr),1250.5,1.55,8.2,180,15,5,93.5,95.2,...,0,0,0,1,0,0,0,1,0,1
1,CAT-002,Pt/TiO2,250.8,0.35,3.1,350,5,12,75.1,68.1,...,0,0,0,0,0,0,1,0,0,1
2,CAT-003,ZIF-8,1890.3,2.2,10.5,150,10,8,89.0,85.5,...,0,0,0,0,0,0,0,1,0,1
3,CAT-004,Ni/Al2O3,150.7,0.25,2.5,420,50,20,55.0,45.9,...,0,0,0,0,0,0,1,0,0,1
4,CAT-005,MOF-5,2100.9,2.85,12.1,200,20,6,96.8,91.0,...,0,0,0,1,0,0,0,1,0,1


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 52 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Catalyst ID                                500 non-null    object 
 1   Composition                                500 non-null    object 
 2   Surface Area (m2/g)                        500 non-null    float64
 3   Pore Volume (cm3/g)                        500 non-null    float64
 4   Active Sites Density (sites/nm2)           500 non-null    float64
 5   Reaction Temperature (°C)                  500 non-null    int64  
 6   Pressure (bar)                             500 non-null    int64  
 7   Time (h)                                   500 non-null    int64  
 8   Product Selectivity (%)                    500 non-null    float64
 9   Conversion (%)                             500 non-null    float64
 10  Performance               

In [16]:
print(df["Performance"].value_counts())


Performance
Good         181
Excellent    172
Poor         106
Average       41
Name: count, dtype: int64


In [17]:
print(df.sample(5))


    Catalyst ID    Composition  Surface Area (m2/g)  Pore Volume (cm3/g)  \
477     CAT-478  MOF-74(Fe-Cr)               1750.8                 2.05   
434     CAT-435  Cr-MIL-53-NH2               1450.0                 1.70   
18      CAT-019        IRMOF-3               1400.8                 1.70   
455     CAT-456  Fe/TiO2-Al2O3                280.4                 0.35   
427     CAT-428  MOF-74(Fe-Co)               1750.8                 2.05   

     Active Sites Density (sites/nm2)  Reaction Temperature (°C)  \
477                              10.2                        215   
434                               9.0                        200   
18                                8.8                        210   
455                               3.6                        440   
427                              10.2                        210   

     Pressure (bar)  Time (h)  Product Selectivity (%)  Conversion (%)  ...  \
477              14         8                     88.5 

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [20]:
# 2. تعيين الهدف (y) والفئات
performance_mapping = {'Poor': 0, 'Average': 1, 'Good': 2, 'Excellent': 3}
y = df['Performance'].map(performance_mapping)

In [21]:
cols_to_drop = [
    'Catalyst ID', 'Composition', 'Performance',
    'Catalyst_clean', 'Metal',
    'Type', 'Structure', 'Synthesis Method',
    'Product Selectivity (%)', 'Conversion (%)'
]
X = df.drop(columns=[col for col in cols_to_drop if col in df.columns], errors='ignore')
X = X.select_dtypes(include=[np.number])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.42, random_state=0, stratify=y
)

In [23]:
# 5. تدريب النموذج
model = RandomForestClassifier(n_estimators=400, max_depth=4, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# 6. التنبؤ والتقييم
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

In [27]:
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.8857142857142857
F1 Score: 0.8833467256075951
Confusion Matrix:
<function confusion_matrix at 0x7ab968dcc4a0>
