# Expand the annotation from evaluation_final and perform some inspection

In [150]:
import pandas as pd
import numpy as np

# Headers of the results summary table
columns = [
    'Macro F1 score weight distribution',
    'Result for baseline model',
    'Result for finetuned model',
]

# Placeholder table: rows 0-2, every cell NaN
df = pd.DataFrame(
    data=np.full((3, len(columns)), np.nan),
    index=[0, 1, 2],
    columns=columns,
)
df


Unnamed: 0,Macro F1 score weight distribution,Result for baseline model,Result for finetuned model
0,,,
1,,,
2,,,


In [151]:
import pandas as pd
import ast

# Read the annotation CSV into a DataFrame
annotation_df = pd.read_csv(filepath_or_buffer="q1_submission_notebooks/annotation.csv")


In [152]:
# Flatten the annotations: every "test_*" entry inside a row's parsed `output`
# dictionary becomes one record that also carries the row-level fields.
expanded_data = []
for paper_num, raw_output in zip(annotation_df['first_num'], annotation_df['output']):
    parsed = ast.literal_eval(raw_output)  # `output` holds the dictionary as text

    # Fields shared by every test of this row (absent keys become None)
    shared = {"paper_num": paper_num}  # first_num identifies the paper
    for field in (
        "perovskite_composition",
        "electron_transport_layer",
        "hole_transport_layer",
        "structure_pin_nip",
    ):
        shared[field] = parsed.get(field)

    # One record per test entry; the test's own details are merged in last
    for name, details in parsed.items():
        if not name.startswith("test_"):
            continue
        record = dict(shared, test=name)  # copy of the shared fields + test name
        record.update(details)
        expanded_data.append(record)

# Build the long-format table from the collected records
df_expanded = pd.DataFrame(expanded_data)

# Disabled step, kept for reference:
# # Fill missing passivating_molecule values based on the first test in each group
# df_expanded['passivating_molecule'] = df_expanded.groupby('perovskite_composition')['passivating_molecule'].transform(lambda x: x.ffill())

In [153]:
# Display the expanded table (one row per annotation row / test entry pair)
df_expanded

Unnamed: 0,paper_num,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_cont,efficiency_tret
0,0,Cs0.05FA0.85MA0.1PbI3,C60,2PACz,PIN,test_1,ISOSL3,4-chlorobenzenesulfonate,50.0,65.0,1200.0,24.0,26.9,,1.18,,95.0
1,0,Cs0.05FA0.85MA0.1PbI3,C60,2PACz,PIN,test_1_2,ISOSL3,4-chlorobenzenesulfonate,50.0,85.0,540.0,24.0,26.9,,,,87.0
2,0,Cs0.05FA0.85MA0.1PbI3,C60,2PACz,PIN,test_2,ISOSD2,4-chlorobenzenesulfonate,,85.0,1500.0,24.0,26.9,,,,95.0
3,1,,TinOxide,PTAA,PIN,test_1,ISOST,phenethylammonium,,85.0,500.0,,19.1,,1.16,,
4,2,(FAPbI3)0.95(MAPbBr 3)0.05,tin dioxide,Spiro-OMeTAD,NIP,test_2,ISOSLT,butylammonium lead tetra iodide,,,1620.0,,24.0,,,,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,147,FAPbI 3,,Spiro-OMeTAD,NIP,test_1,ISOSD,carbazole-triphenylamine and phenylammonium io...,85.0,85.0,1000.0,22.3,24.7,,1.11,,92.3
246,147,FAPbI 3,,Spiro-OMeTAD,NIP,test_2,ISOSLT,carbazole-triphenylamine and phenylammonium io...,,,1100.0,,24.7,,,66.6,94.6
247,148,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1,ISOSLT,phenylethylammonium iodide,50-70,,500.0,21.2,22.7,1.09,1.12,70.0,84.0
248,148,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1_2,ISOSLT,4-tert-butyl-benzylammonium iodide,50-70,,500.0,,,,,70.0,95.0


In [164]:
# Show every expanded row that belongs to paper 90
df_expanded.loc[df_expanded['paper_num'].eq(90)]

Unnamed: 0,paper_num,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_cont,efficiency_tret
132,90,(FAPbI 3 ) 0.85 (MAPbBr 3 ) 0.15,,Spiro-OMeTAD,NIP,test_1,,MA +,,,,18.7,19.8,,1.17,,


## Inspect having both passivating and perovskite

In [159]:
# Paper numbers having at least one row where both the perovskite composition
# and the passivating molecule are present
df_expanded.dropna(subset=['perovskite_composition', 'passivating_molecule'])['paper_num'].unique()

array([  0,   2,   6,   7,   8,   9,  10,  12,  13,  14,  15,  16,  19,
        20,  22,  23,  31,  36,  39,  42,  46,  48,  50,  53,  55,  59,
        60,  62,  64,  71,  72,  73,  74,  75,  77,  79,  85,  88,  90,
        95,  97, 113, 114, 116, 118, 120, 126, 131, 133, 143, 145, 146,
       147, 148, 149], dtype=int64)

## Inspect Missingness

In [145]:
# Paper numbers having at least one row with no perovskite composition
df_expanded.loc[df_expanded['perovskite_composition'].isna(), 'paper_num'].unique()

array([  1,  11,  18,  27,  37,  40,  41,  44,  49,  82,  83,  86,  91,
        92,  94,  99, 100, 101, 105, 106, 109, 115, 119, 121, 127, 129,
       130, 135, 137, 141, 142, 144], dtype=int64)

# Efficiency cont Check - Done

In [136]:
# Distinct efficiency_cont values, most frequent first
# (value_counts sorts by descending count and leaves out NaN by default).
data_inspect = df_expanded['efficiency_cont'].value_counts()

# Series.iteritems() was removed in pandas 2.0. Only the distinct values are
# printed (not their counts), so iterate the index directly.
for index in data_inspect.index:
    print(index)

60.0
80.0
40.0
70.0
0.0
50.0
65.0
85.0
27.0
38.0
63.0
20.0
67.2
29.0
84.0
30.0
68.0
58.0
90.0
76.0
75.8
89.0
10.0
71.0
4.0
96.5
87.0
78.0
75.6
55.0
72.0
73.2
77.6
83.0
56.0
6.0
58.6
92.7
74.0
98.0
82.0
51.0
47.0
9.65
50.6
59.1
70.8
57.0
44.0
93.0
32.0
49.0
71.3
25.0
94.0
95.0
66.6


In [137]:
# Rows whose efficiency_cont is exactly 0.7, together with their paper number
efficiency_cont = df_expanded.loc[
    df_expanded['efficiency_cont'] == 0.7,
    ["paper_num", "efficiency_cont"],
]
efficiency_cont

Unnamed: 0,paper_num,efficiency_cont


# Efficiency treat Check - Done

In [138]:
# Distinct efficiency_tret values, most frequent first
# (value_counts sorts by descending count and leaves out NaN by default).
data_inspect = df_expanded['efficiency_tret'].value_counts()

# Series.iteritems() was removed in pandas 2.0. Only the distinct values are
# printed (not their counts), so iterate the index directly.
for index in data_inspect.index:
    print(index)

90.0
95.0
80.0
100.0
92.0
85.0
98.0
96.0
91.0
86.0
93.0
75.0
97.0
94.0
87.0
74.0
99.0
98.2
73.0
88.0
53.0
82.0
65.0
92.2
45.0
97.5
98.7
92.3
91.8
95.2
71.0
90.5
98.9
88.7
89.0
92.6
94.6
94.5
67.0
60.0
1.0
96.7
91.5
84.8
65.1
86.2
104.0
47.9
99.4
96.8
93.8
95.7
96.2
81.0
35.0
83.0
50.0
82.1
91.1
84.0


In [139]:
# Rows whose efficiency_tret is exactly 0.92, together with their paper number
efficiency_tret = df_expanded.loc[
    df_expanded['efficiency_tret'] == 0.92,
    ["paper_num", "efficiency_tret"],
]
efficiency_tret

Unnamed: 0,paper_num,efficiency_tret


# Composition - DONE

In [140]:
# Distinct perovskite_composition strings, most frequent first
# (value_counts sorts by descending count and leaves out NaN by default).
data_inspect = df_expanded['perovskite_composition'].value_counts()

# Series.iteritems() was removed in pandas 2.0. Only the distinct values are
# printed (not their counts), so iterate the index directly.
for index in data_inspect.index:
    print(index)

FAPbI 3
(FAPbI3)0.95(MAPbBr3)0.05
MAPbI 3
MAPbI3
FA0.85MA0.1Cs0.05PbI2.9Br0.1
Cs0.05(MA)0.16(FA)0.79Pb(I0.83Br0.17 )3
FAPbI3
FA0.98Cs0.02PbI3
FA0.83Cs0.17PbI2.7Br0.3
Cs0.1FA0.9PbI3
FA0.98MA0.02Pbl3
Cs 0.05(FA0.98MA0.02)0.95Pb(I0.98Br0.02)3
Rb0.05Cs0.05MA0.05FA0.85Pb(I0.95Br0.05)3
(FAPbI3)0.97(MC)0.03
F- Cs 0.05 FA 0.81 MA 0.14 PbI 2.55 Br 0.45
Cs0.05(FAPbI3)0.85(MAPbBr3)0.15
(FAPbI3)0.95(MAPbBr 3)0.05
Cs0.15FA0.85PbI2.55Br0.45
MAPbBr 3
Cs0.05FA0.85MA0.1PbI3
(FAPbI3)0.94(MAPbBr3)0.06
Cs0.05FA0.81MA0.14PbI2.55Br0.45
FA0.83MA0.17Pb-(I0.83Br0.17)3
Cs0.05MA0.05FA0.9Pb(I0.95Br0.05)3\
Cs 0.05 (FA0.92MA0.08)0.95Pb(I0.92Br0.08)3
Cs 0.05(FA0.95MA0.05)0.95Pb(I0.95Br0.05)3
2-phenylethylammonium iodide
Cs0.05FA0.95PbI3
FA0.9Cs0.1PbI3
FA0.95MA0.05Pb(I0.92Br0.08)3
Cs0.05(FA5/6MA1/6)0.95Pb(I0.9Br0.1)3
Cs0.12FA0.8MA0.08PbI1.8Br1
Cs0.05(FA0.95MA0.05)0.95Pb(I0.95Br0.05)3
Cs0.05MA0.1FA0.85PbI3
FA0.95Cs0.05PbI3
(FAPbI3)0.77(MAPbBr3)0.14(CsPbI3)0.09
Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3
Cs0.05FA0.9MA0.05Pb(I0.

## Perovskite Composition problem
- F- Cs 0.05 FA 0.81 MA 0.14 PbI 2.55 Br 0.45
- Cl-contained FAPbI3


In [141]:
# Locate the rows (and papers) carrying the problematic "F- ..." composition string
composition = df_expanded.loc[
    df_expanded['perovskite_composition'] == "F- Cs 0.05 FA 0.81 MA 0.14 PbI 2.55 Br 0.45",
    ["paper_num", "perovskite_composition"],
]
composition

Unnamed: 0,paper_num,perovskite_composition
128,88,F- Cs 0.05 FA 0.81 MA 0.14 PbI 2.55 Br 0.45
129,88,F- Cs 0.05 FA 0.81 MA 0.14 PbI 2.55 Br 0.45
130,88,F- Cs 0.05 FA 0.81 MA 0.14 PbI 2.55 Br 0.45


# Inspect passivating molecule

In [142]:
# Distinct passivating_molecule names, most frequent first
# (value_counts sorts by descending count and leaves out NaN by default).
data_inspect = df_expanded['passivating_molecule'].value_counts()

# Series.iteritems() was removed in pandas 2.0. Only the distinct values are
# printed (not their counts), so iterate the index directly.
for index in data_inspect.index:
    print(index)

phenethylammonium iodide
4-chlorobenzenesulfonate
1,4-butane diammonium iodide
ortho-(phenylene)di(ethylammonium) iodide
Dimethylpyrroline Diammonium Iodide
butylammonium lead tetra iodide
4-tert-butyl-benzylammonium
4-fluorophenylethylammonium iodide
butylammonium
phenylethylammonium iodide
oleylammonium iodide
phenylethylammonium lead iodide
3,4,5-trifluoroanilinium
4-fluoro-phenylethylammonium iodide
(phenethylamino)methaniminium iodide
octylammonium iodide
ethylenediammonium diiodide
carbazole-triphenylamine and phenylammonium iodide units
4-trifluorophenylethylammonium iodide
ortho-carborane decorated with phenylamino groups
Lead Iodide
phenyl-C61-butyric acid methyl ester
L-α-phosphatidylcholine
choline chloride
n-hexyl trimethyl ammonium bromide
FA
4-tert-butyl-benzylammonium iodide
MA +
FAPbI3
oleylamine
chlorine-rich mixed-halide perovskite interlayer
Formamidinium Tin(III) Chloride
4-fluoroaniline
sodium thioglycolate
Piperazinium Diiodide
piperazinium iodide
Titanium dioxide

### Passivating issues
Issue raised by Kang and ChatGPT
- iso-butylamine iodide --> paper 5: butylamine iodide is convertible
- EDBE --> paper 9: wasn't the tested passivating. Fixed.
- MAPbBr3 --> paper 79: Done. This was a perovskite molecule, not a composition and not a passivating molecule.
- FAPbI3 --> paper 89: Changed into perovskite composition, but there is no passivation in this paper.
- FAPbI3 --> paper 91 Paper about additive and no passivation was mentioned
- chlorophenylethylammonium iodide --> paper 14: this was 4-chlorophenylethylammonium iodide and an additive.
- fluorophenylethylammonium iodide --> paper 15: Checking with Kelly whether this is consistently a passivating molecule or an additive
- fluorophenylethylammonium iodide --> paper 143: fluorophenylethylammonium lead iodide was passivating. Fixed. 
- 3,4,5-trifluoroanilinium --> paper 19: This is passivating. Done. 
- tri-octyl phosphine oxide --> paper 36 There was no metric associated with this passivating. 
- azetidinium lead iodide --> paper 42 nothing wrong, perfect paper 
- DMePDAI 2 --> paper 53 Dimethylpyrroline Diammonium Iodide
- (phenethylamino)methaniminium iodide --> paper 77 nothing wrong, perfect paper
- NOTE:This is how to prepare the PSC,relevent...? --> paper 60 Fixed. the treatment was 4-vinylbenzylammonium bromide
- lead iodide --> paper 102 perovskite molecule. This paper had no passivating. skip. 
- formate --> paper 110 formate is an additive to composition. Passivating was not mentioned, skip
- europium ion pair --> Already reviewed; there was no passivation, so skipped.
- ortho-carborane --> paper 146 This is indeed passivating. Additionally, CB-NH2 is also another passivating. 
- "OATsO" & "OABF4" ---> paper 10
- CF3-phenethylammonium --> paper 16 Already reviewed. hydrophobic 3-(trifluoromethyl)phenethylamine hydroiodide
- CF3 -PEAI --> paper 73 3,5-bis(trifluoromethyl)phenethylammonium iodide
- 4-chlorobenzenesulfonate (4Cl-BZS) --> paper 0 Perfect paper, done. 
- lead oxalate --> paper 126 The passivation is correct. PbC2O4, 
- sodium thioglycolate --> paper 133 passivating correct
- quanternary ammonium halides --> paper 82 this is not a specific passivating name. Already corrected. 
- ferrocenyl-bis-thiophene-2-carboxylate --> paper 120 correct passivation (FcTc2) tested. 


- PCBM --> 68 [6 6']-phenyl-C61-butyric acid methyl ester. Has passivating and perovskite, but this is tested on ETL bilayer. Maybe record only PCE
- PCBM --> 69 [6 6']-phenyl-C61-butyric acid methyl ester. Has passivating and perovskite, but testing on different way ETL is treated. Maybe record only PCE
- PS Plastic Foam 1% --> paper 48  want to know the difference between PS plastic foam and PS 0.02%. Asking Kelly
- 4-trifluoromethyl-phenylammonium --> paper 149 Issue with identifying passivating cation. Also with composition. Asking kelly
- 4-fluorophenylethylammonium iodide --> paper 74 Differentiation between 2D and CLP confusion. Asking Kelly

<br>
- poly(methyl methacrylate)


In [143]:
# Rows (and papers) whose passivating molecule is recorded as poly(methyl methacrylate)
passivatin = df_expanded.loc[
    df_expanded['passivating_molecule'] == "poly(methyl methacrylate)",
    ["paper_num", "passivating_molecule"],
]
passivatin

Unnamed: 0,paper_num,passivating_molecule


In [81]:
# Rows (and papers) whose passivating molecule is recorded as 2-thiopheneethylammonium iodide
passivatin = df_expanded.loc[
    df_expanded['passivating_molecule'] == "2-thiopheneethylammonium iodide",
    ["paper_num", "passivating_molecule"],
]
passivatin

Unnamed: 0,paper_num,passivating_molecule
95,75,2-thiopheneethylammonium iodide


In [86]:
import pubchempy as pcp
import numpy as np

In [87]:
def fetch_smiles_from_name(molecule_name):
    """Look up a molecule by name on PubChem and return its isomeric SMILES.

    Parameters
    ----------
    molecule_name : str
        Name to search for; passed to ``pcp.get_compounds`` with the
        ``'name'`` namespace.

    Returns
    -------
    str or float
        ``isomeric_smiles`` of the first compound returned, or ``np.nan``
        when the search yields no compound or the lookup raises. A single
        missing-value sentinel is used for both cases so the result can be
        tested uniformly (e.g. with ``pd.isna``).
    """
    try:
        # Search for the molecule in PubChem by name
        compounds = pcp.get_compounds(molecule_name, 'name')
        if not compounds:
            return np.nan
        # Only the first match is used
        return compounds[0].isomeric_smiles
    except Exception as e:
        # Best-effort lookup: report the failure and fall back to the same
        # sentinel as "no match" (previously this path returned None).
        print(f"Error fetching SMILES for {molecule_name}: {e}")
        return np.nan

In [88]:
# Try the PubChem name lookup for the passivating molecule recorded for paper 75
fetch_smiles_from_name("2-thiopheneethylammonium iodide")

nan