In [2]:
# read data/images-tem.json
import pandas as pd
import json

# df display options: show up to 100 characters per cell so long URLs stay readable
pd.set_option('display.max_colwidth', 100)

# read json file
# JSON is UTF-8 by specification; pass the encoding explicitly so the result
# does not depend on the platform's default locale encoding.
with open('data/images-tem.json', encoding='utf-8') as f:
    data = json.load(f)

# read json file into pandas dataframe
# (nested `metaData` keys are flattened into `metaData.<key>` columns)
df = pd.json_normalize(data, meta=['metaData'])

# clean up dataframe: shorten the flattened column names and drop the keywords.
# Done as one chained, non-in-place step so the cell stays idempotent on re-run.
df = (
    df
    .rename(columns={
        'metaData.doi': 'doi',
        'metaData.title': 'title',
        'metaData.id': 'id',
        'microscopyType': 'microscopy_type',
    })
    .drop(columns=['metaData.keywords'])
)


In [3]:
# show the column names of the cleaned dataframe as a plain Python list
column_names = df.columns.tolist()
print(f'Columns: {column_names}')

Columns: ['file', 'description', 'microscopy_type', 'type', 'title', 'id', 'doi']


In [4]:
# how many distinct DOIs (publications) the images come from
# (Series.unique keeps a missing value, if any, as one distinct entry)
unique_dois = df["doi"].unique()
print(f'Number of unique DOIs: {len(unique_dois)}')

Number of unique DOIs: 63


In [5]:

# `id` identifies a sample; several images can share one id when they were
# taken from the same sample
sample_ids = df["id"].unique()
print(f'Number of unique samples: {len(sample_ids)}')

# the blob id is the last path segment of the file URL; store it in its own column
df['blob_id'] = df['file'].map(lambda url: url.rsplit('/', 1)[-1])


Number of unique samples: 871


In [6]:
## work on a separate frame that keeps only the columns needed below
## (`drop` returns a new dataframe, so `df` itself is left untouched)
df2 = df.drop(columns=['title', 'microscopy_type', 'type', 'description'])

# number of files per id (may be more than 1), repeated on every row of that id
df2 = df2.assign(count=df2.groupby('id')['file'].transform('count'))

# collapse to one row per sample: id, doi and count are kept as columns and the
# file URLs of the sample are gathered into a list
# NOTE(review): groupby drops rows whose key is missing by default
# (dropna=True). That may be why 871 unique ids were counted earlier but only
# 870 rows come out of this step -- TODO confirm whether a sample lacks a doi.
df2 = (
    df2
    .groupby(['id', 'doi', 'count'])['file']
    .apply(list)
    .reset_index(name='urls')
)


In [7]:
# Some useful stats: how many samples have 1, 2, 3, ... images.
# Counts 1-5 are always reported; the range is extended automatically when a
# sample has more than 5 images, so the breakdown always adds up to the total.
max_images = max(5, int(df2["count"].max())) if len(df2) else 5
for n_images in range(1, max_images + 1):
    noun = 'image' if n_images == 1 else 'images'
    n_matching = int((df2["count"] == n_images).sum())
    print(f'Number of samples with {n_images} {noun}: {n_matching}')
print(f'Total number of samples : {len(df2)}')


Number of samples with 1 image: 679
Number of samples with 2 images: 165
Number of samples with 3 images: 20
Number of samples with 4 images: 5
Number of samples with 5 images: 1
Total number of samples : 870


In [8]:
# examples of samples with 2 images: print the URL list of the first few
num_samples = 5
# Select with a boolean mask through .loc rather than feeding index labels to
# the positional .iloc, which is only correct while df2 has a default index.
for urls in df2.loc[df2["count"] == 2, "urls"].head(num_samples):
    print(urls)


In [9]:
# examples of samples with 3 images: print the URL list of the first few
num_samples = 2
# Select with a boolean mask through .loc rather than feeding index labels to
# the positional .iloc, which is only correct while df2 has a default index.
for urls in df2.loc[df2["count"] == 3, "urls"].head(num_samples):
    print(urls)

In [10]:
# examples of samples with 4 images: print the URL list of the first few
num_samples = 2
# Select with a boolean mask through .loc rather than feeding index labels to
# the positional .iloc, which is only correct while df2 has a default index.
for urls in df2.loc[df2["count"] == 4, "urls"].head(num_samples):
    print(urls)

In [11]:
# add binarized file location on disk: each URL maps to images/<blob id>.png,
# where the blob id is the last path segment of the URL
df2['bin_file'] = df2['urls'].map(
    lambda urls: ['images/' + url.rsplit("/", 1)[-1] + '.png' for url in urls]
)
df2.head()


Unnamed: 0,id,doi,count,urls,bin_file
0,58587c9be74a1d205f4ea8c8,10.1039/c4ra15178j,1,[https://qa.materialsmine.org/api/files/58580ef2e74a1d205f4e8b12],[images/58580ef2e74a1d205f4e8b12.png]
1,5876fa3de74a1d6c0d2dd154,10.1039/c4ra15178j,1,[https://qa.materialsmine.org/api/files/58583dd8e74a1d205f4e9757],[images/58583dd8e74a1d205f4e9757.png]
2,5a1deb51e74a1d03cd1e5fea,10.1021/ma400553c,1,[https://qa.materialsmine.org/api/files/59668196e74a1d62877b9728],[images/59668196e74a1d62877b9728.png]
3,5a1deb54e74a1d03cd1e5ff5,10.1016/j.polymer.2011.12.019,2,"[https://qa.materialsmine.org/api/files/5978e5dbe74a1d157722955f, https://qa.materialsmine.org/a...","[images/5978e5dbe74a1d157722955f.png, images/5978e5d5e74a1d157722955d.png]"
4,5a1deb55e74a1d03cd1e5ffa,10.1016/j.compscitech.2006.01.030,1,[https://qa.materialsmine.org/api/files/59113c7be74a1d36e1b7ea5a],[images/59113c7be74a1d36e1b7ea5a.png]


**How to connect every sample in `df2` (and its images) to the corresponding sample stored in Nanomine?**

I have a list of 870 samples, each with one or more images, that I need to connect to the corresponding sample in Nanomine. A sample in Nanomine is identified by a slug. For example, `https://qa.materialsmine.org/explorer/sample/e408-s23-prasad-2021` has the slug `e408-s23-prasad-2021`.

The following SPARQL query retrieves all sample images from the knowledge graph.

```sparql
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX mm: <http://materialsmine.org/ns/>
PREFIX prov: <http://www.w3.org/ns/prov#>
SELECT DISTINCT * WHERE {
    ?sample a mm:PolymerNanocomposite ;
            sio:isRepresentedBy ?image .
	FILTER(!REGEX(STR(?image),"localhost"))
    FILTER(!REGEX(STR(?image),"XMLCONV"))
  }
```

This query returns a table with the following columns:
| sample | image |
|---|---|
| http://materialsmine.org/sample/e408-s16-prasad-2021 | https://nanomine.org/nmr/blob?id=60b932a68558b7dc6dd6ebe5 |

The slug of a sample can be extracted from the `sample` column, and the id of the image in Nanomine from the `image` column.



In [16]:
# read SPARQL query results (this contains the slug for each sample)
sparqldf = pd.read_csv('data/queryResults.csv')


def _strip_quotes_and_spaces(values):
    """Return `values` with every double quote and space removed.

    Works on anything exposing the pandas `.str` accessor (a Series or an
    Index); missing entries stay missing instead of raising.
    """
    return values.str.replace('"', '', regex=False).str.replace(' ', '', regex=False)


# clean up column names
sparqldf.columns = _strip_quotes_and_spaces(sparqldf.columns)

# create a new column with the slug (last path segment of the sample URI).
# The value is cleaned the same way as the header and the image column, so a
# stray quote or space cannot end up inside the slug.
sparqldf['slug'] = _strip_quotes_and_spaces(sparqldf['sample']).str.rsplit('/', n=1).str[-1]

# clean up the image column
sparqldf['image'] = _strip_quotes_and_spaces(sparqldf['image'])

# extract the blob id (the value after the last '=') into its own column;
# the image column is already clean, so no further cleanup is needed
sparqldf['blob_id'] = sparqldf['image'].str.rsplit('=', n=1).str[-1]

# check how many blob ids (images) of df are in the sparql query results.
# `isin` does the membership test in one vectorised pass instead of rebuilding
# a Python list for every row.
# NOTE(review): df's ids come from `.../api/files/<id>` URLs while the SPARQL
# ids come from `.../nmr/blob?id=<id>` -- confirm both services share blob ids
# if this count is unexpectedly 0.
count = int(df['blob_id'].isin(sparqldf['blob_id']).sum())
print(f'Number of blob ids in sparql query: {count}')

Number of blob ids in sparql query: 0
