In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.auto import tqdm
import time
from datetime import date
import datetime
import numpy as np

In [16]:
# Create Dataset folder for Download result 
from os import mkdir, makedirs
# DATASETS_FOLDER_PATH = "Datasets" #-%s" % date.today().strftime("%Y%m%d")
# makedirs(DATASETS_FOLDER_PATH)

In [17]:
def download_data(PREFIX = "GSM", ID_LIST = list(range(2987694, 2987923 + 1)), ATTRIBUTES = ["ID", "Title", "self-administration", "challenge"]):

    dfObj = pd.DataFrame([], columns = ATTRIBUTES)

    for id in tqdm(ID_LIST, ncols=134):
        web_url = "https://www.ncbi.nlm.nih.gov/biosample/%s" % PREFIX + str(id)
        r = requests.get(web_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        trs = soup.find_all("tr") # tr has th and td
        appendObj = {}
        appendObj["ID"] = PREFIX + str(id)
        appendObj["Title"] = soup.find("title").text.split(" ")[0]

        for atr in ATTRIBUTES:

            for tr in trs:

                if tr.th.text == atr:
                    appendObj[atr] = tr.td.text
                    break
        # dfObj.append(appendObj, ignore_index=True)
        append_df = pd.DataFrame(np.array([appendObj[key] for key in ATTRIBUTES]), index = ATTRIBUTES)
        dfObj = pd.concat([dfObj, append_df.T])

    return(dfObj)

In [18]:
download_result = download_data()

  0%|                                                                                                         …

In [19]:
download_result

Unnamed: 0,ID,Title,self-administration,challenge
0,GSM2987694,BLA-011,Saline,Saline
0,GSM2987695,BLA-012,Cocaine,Cocaine
0,GSM2987696,BLA-014,Cocaine,Saline
0,GSM2987697,BLA-031,Cocaine,Cocaine
0,GSM2987698,BLA-035,Saline,Saline
...,...,...,...,...
0,GSM2987919,VTA-203,Cocaine,
0,GSM2987920,VTA-204,Saline,
0,GSM2987921,VTA-205,Saline,
0,GSM2987922,VTA-214,Cocaine,


In [20]:
label_arr = []
for i in range(0, len(download_result)):
    label_arr.append([item[0] for item in download_result["self-administration"] ][i] + [item[0] for item in download_result["challenge"] ][i])
label_arr
download_result["status"] = label_arr

label_arr = []
for v in download_result["Title"]:
    label_arr.append(v.split("-")[0])
label_arr
download_result["region"] = label_arr

In [21]:
download_result

Unnamed: 0,ID,Title,self-administration,challenge,status,region
0,GSM2987694,BLA-011,Saline,Saline,SS,BLA
0,GSM2987695,BLA-012,Cocaine,Cocaine,CC,BLA
0,GSM2987696,BLA-014,Cocaine,Saline,CS,BLA
0,GSM2987697,BLA-031,Cocaine,Cocaine,CC,BLA
0,GSM2987698,BLA-035,Saline,Saline,SS,BLA
...,...,...,...,...,...,...
0,GSM2987919,VTA-203,Cocaine,,CN,VTA
0,GSM2987920,VTA-204,Saline,,SN,VTA
0,GSM2987921,VTA-205,Saline,,SN,VTA
0,GSM2987922,VTA-214,Cocaine,,CN,VTA


In [22]:
df_meta = pd.DataFrame(
                        index = ["BLA", "Cpu", "Hipp", "NAc", "PFC", "VTA"],
                        columns = ["SS", "SC", "CS", "CC", "CN", "SN"]
                        )
for region in ["BLA", "Cpu", "Hipp", "NAc", "PFC", "VTA"]:
    for status in ["SS", "SC", "CS", "CC", "CN", "SN"]:    
        df_meta.loc[region, status] = len(download_result[(download_result["region"] == region) & (download_result["status"] == status)])

In [23]:
df_meta

Unnamed: 0,SS,SC,CS,CC,CN,SN
BLA,7,7,5,6,6,7
Cpu,7,6,6,7,8,8
Hipp,6,7,6,5,8,7
NAc,6,5,5,7,8,8
PFC,7,7,5,7,8,7
VTA,6,5,3,5,6,6


In [40]:
def post_modification_and_save_to_csv(dfObj, result_id = datetime.datetime.now().strftime("%H%M%S")):
    df = pd.read_csv("DATASETS/original.CSV")
    df_bi = pd.read_csv("DATASETS/Behavioral_index_data.csv")
    df_with_denoted_colnames = pd.DataFrame(data=[], index=df["Gene"])
    
    for i in df.columns[1:231]:
        challenge = str(dfObj.loc[dfObj['Title'] == i]["challenge"].values)[2:3]
        sa = str(dfObj.loc[dfObj['Title'] == i]["self-administration"].values)[2:3]

        region = i[0:3]
        sample_num = i[-3:]

        ai = str(df_bi.loc[df_bi['Animal.ID'] == int(sample_num)]["Addiction Index"].values[0])

#         if ai != "0.0":
        col_name = i[-3:len(i)] + "_" + i[0:3].upper() + "_"  + sa + challenge + "-" + ai
        df_with_denoted_colnames[col_name] = df[i].values
    df_with_denoted_colnames.to_csv(DATASETS_FOLDER_PATH + "/Total_Conditions_Counts_with_AI%s.csv" % result_id)

    df = pd.read_csv(DATASETS_FOLDER_PATH + "/Total_Conditions_Counts_with_AI%s.csv" % result_id)
    df_with_denoted_colnames = pd.read_csv(DATASETS_FOLDER_PATH + "/Total_Conditions_Counts_with_AI%s.csv" % result_id)


    regions = ["BLA", "VTA", "CPU", "HIP", "NAC", "PFC"] 

    for region in tqdm(regions, ncols=134, desc="for %s"%region):
        arr = []
        for col in df.columns[1:]:
            if col[4:7] == region:
                arr.append(col)
        df_tmp = pd.DataFrame(columns=arr)
        for s in arr:
            df_tmp[s] = df[s]
        df_tmp.index = df_with_denoted_colnames.Gene

    #     df_tmp.to_csv("Dataset_with_AI-20200216/%s_Count_Dataset.csv" % arr[0][4:7], index=True)
    #     df_tmp.to_csv("Dataset_with_AI-20200216/%s_Count_Dataset_no_index.csv" % arr[0][4:7], index=False)

        df_tmp.to_csv(DATASETS_FOLDER_PATH + "/%s_Count_Dataset%s.csv" % (arr[0][4:7], result_id), index=True)
        df_tmp.to_csv(DATASETS_FOLDER_PATH + "/%s_Count_Dataset_no_index%s.csv" % (arr[0][4:7], result_id), index=False)

In [41]:
post_modification_and_save_to_csv(dfObj = download_result, result_id="")

  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_denoted_colnames[col_name] = df[i].values
  df_with_de

NameError: name 'DATASETS_FOLDER_PATH' is not defined