# Data Download
This script is adapted from [here](https://github.com/baicalin/GAN-WGCNA/blob/main/codes/RNASeq_pipeline/1_RNASeq_Dataset_Downloader.ipynb).

In [42]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.auto import tqdm
import time
from datetime import date
import datetime
import numpy as np

In [43]:
DATASETS_FOLDER_PATH = "../../data/00_data-gathering"
os.makedirs(DATASETS_FOLDER_PATH, exist_ok=True)

In [44]:
def download_data(PREFIX = "GSM", ID_LIST = list(range(2987694, 2987923 + 1)), ATTRIBUTES = ["ID", "Title", "self-administration", "challenge"]):

    dfObj = pd.DataFrame([], columns = ATTRIBUTES)

    for id in tqdm(ID_LIST, ncols=134):
        web_url = "https://www.ncbi.nlm.nih.gov/biosample/%s" % PREFIX + str(id)
        r = requests.get(web_url)
        soup = BeautifulSoup(r.text, 'html.parser')
        trs = soup.find_all("tr") # tr has th and td
        appendObj = {}
        appendObj["ID"] = PREFIX + str(id)
        appendObj["Title"] = soup.find("title").text.split(" ")[0]

        for atr in ATTRIBUTES:

            for tr in trs:

                if tr.th.text == atr:
                    appendObj[atr] = tr.td.text
                    break

        # NOTE from Jane: I got an error about halfway through downloading the data that said attribute 'self-administration'
        # was not found in one of th appendObjs; I changed the line to cath this error, but we may want to check whether our
        # data is the same as the paper's.
        # Original line: np.array([appendObj[key] for key in ATTRIBUTES)
        # Revised line: np.array([appendObj[key] for key in ATTRIBUTES if key in appendObj.keys()])
        try:
            append_df = pd.DataFrame(np.array([appendObj[key] for key in ATTRIBUTES if key in appendObj.keys()]), index = ATTRIBUTES)
            dfObj = pd.concat([dfObj, append_df.T])
        except Exception as e:
            print(e)
            print(appendObj)
            print()
            print(dfObj.head(5))

    return(dfObj)

In [45]:
download_result = download_data()

  0%|                                                                                                         …

### Warning!
Jane: I got a 404 for mouse ID GSM2987724

In [46]:
def label_data(download_result):
    label_arr = []
    for i in range(0, len(download_result)):
        label_arr.append([item[0] for item in download_result["self-administration"] ][i] + [item[0] for item in download_result["challenge"] ][i])
    label_arr
    download_result["status"] = label_arr

    label_arr = []
    for v in download_result["Title"]:
        label_arr.append(v.split("-")[0])
        
    label_arr
    download_result["region"] = label_arr
    download_result.to_csv(os.path.join(DATASETS_FOLDER_PATH, "downloaded_data.csv"), index = False)
    return download_result

In [47]:
download_result = label_data(download_result)
download_result

Unnamed: 0,ID,Title,self-administration,challenge,status,region
0,GSM2987694,BLA-011,Saline,Saline,SS,BLA
0,GSM2987695,BLA-012,Cocaine,Cocaine,CC,BLA
0,GSM2987696,BLA-014,Cocaine,Saline,CS,BLA
0,GSM2987697,BLA-031,Cocaine,Cocaine,CC,BLA
0,GSM2987698,BLA-035,Saline,Saline,SS,BLA
...,...,...,...,...,...,...
0,GSM2987919,VTA-203,Cocaine,,CN,VTA
0,GSM2987920,VTA-204,Saline,,SN,VTA
0,GSM2987921,VTA-205,Saline,,SN,VTA
0,GSM2987922,VTA-214,Cocaine,,CN,VTA


In [48]:
df_meta = pd.DataFrame(
                        index = ["BLA", "Cpu", "Hipp", "NAc", "PFC", "VTA"],
                        columns = ["SS", "SC", "CS", "CC", "CN", "SN"]
                        )
for region in ["BLA", "Cpu", "Hipp", "NAc", "PFC", "VTA"]:
    for status in ["SS", "SC", "CS", "CC", "CN", "SN"]:    
        df_meta.loc[region, status] = len(download_result[(download_result["region"] == region) & (download_result["status"] == status)])
df_meta

Unnamed: 0,SS,SC,CS,CC,CN,SN
BLA,7,7,5,6,6,7
Cpu,7,6,6,7,8,8
Hipp,6,7,6,5,8,7
NAc,6,5,5,7,8,8
PFC,7,7,5,7,8,7
VTA,6,5,3,5,6,6


In [49]:
df_meta.to_csv(os.path.join(DATASETS_FOLDER_PATH, "metadata.csv"))  

### Warning
Jane: We are off-by-1 here at BLA-SN (original paper has 7, we only have six because of the 404)

### Warning
Jane: I couldn't find in the repo where "DATASETS/original.CSV" in post_modification_and_save_to_csv() comes from; I am taking a wild guess that it is [this data linked from the paper](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE110344) so I wrote the code below to extract it.

In [50]:
import gzip
import urllib.request

def download_file(url, name):
    out_file = DATASETS_FOLDER_PATH + name + '.tsv'

    # Download archive
    try:
        # Read the file inside the .gz archive located at url
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=response) as uncompressed:
                file_content = uncompressed.read()

        # write to file in binary mode 'wb'
        with open(out_file, 'wb') as f:
            f.write(file_content)
            return 0

    except Exception as e:
        print(e)
        return 1
    

download_file('https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE110344&format=file&file=GSE110344%5Fcounttab%5Fself%5Fadministration%2Etxt%2Egz', 'original')

0

In [51]:
original = pd.read_csv(DATASETS_FOLDER_PATH + "original.tsv", sep="\t")
original

Unnamed: 0,Gene,BLA-011,BLA-012,BLA-014,BLA-031,BLA-035,BLA-043,BLA-052,BLA-065,BLA-072,...,VTA-182,VTA-191,VTA-193,VTA-195,VTA-201,VTA-203,VTA-204,VTA-205,VTA-214,VTA-223
0,ENSMUSG00000000001,706,788,657,712,780,953,1046,440,782,...,717,847,725,674,1014,856,768,773,1036,828
1,ENSMUSG00000000003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSMUSG00000000028,27,33,28,19,35,62,37,43,30,...,39,45,31,39,36,35,40,28,51,49
3,ENSMUSG00000000031,0,0,0,1,0,0,2,0,1,...,0,4,2,0,2,6,1,3,1,0
4,ENSMUSG00000000037,27,40,23,40,42,38,45,19,25,...,21,27,17,23,37,37,21,23,50,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43341,ENSMUSG00000104523,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43342,ENSMUSG00000104524,1,0,0,0,1,0,3,1,0,...,1,1,1,0,1,2,1,3,2,5
43343,ENSMUSG00000104525,0,3,2,2,1,0,1,1,1,...,2,1,0,0,1,2,0,3,2,2
43344,ENSMUSG00000104526,2,0,1,1,0,1,0,0,1,...,0,0,1,2,1,1,0,2,3,1


In [52]:
# Code to download the behavioral index data from the paper's repo:

behavioral_index_url = "https://media.githubusercontent.com/media/baicalin/GAN-WGCNA/refs/heads/main/codes/RNASeq_pipeline/Datasets/Behavioral_index_data.csv"

# download and save as csv
response = requests.get(behavioral_index_url)
with open(os.path.join(DATASETS_FOLDER_PATH, "behavioral_index_data.csv"), 'wb') as f:
    f.write(response.content)

In [53]:
def post_modification_and_save_to_csv(dfObj, result_id=None):
    # If no result_id was provided, create one based on the current time.
    if result_id is None:
        result_id = datetime.datetime.now().strftime("%H%M%S")
    
    # Define paths for the input files.
    original_path = os.path.join(DATASETS_FOLDER_PATH, "downloaded_data.tsv")
    behavioral_path = os.path.join(DATASETS_FOLDER_PATH, "behavioral_index_data.csv")
    
    # Read the input datasets.
    df = pd.read_csv(original_path, sep="\t")
    df_bi = pd.read_csv(behavioral_path)
    
    # Build new DataFrame columns using a dictionary to avoid iterative insertion.
    new_columns = {}
    for col in df.columns[1:231]:
        # Extract metadata from dfObj.
        # (The [2:3] slicing mimics the original behavior that extracts a character from
        # the string representation of the numpy array. Adjust as needed.)
        challenge = str(dfObj.loc[dfObj['Title'] == col, "challenge"].values)[2:3]
        sa = str(dfObj.loc[dfObj['Title'] == col, "self-administration"].values)[2:3]
        
        # Determine region and sample number based on column name.
        region = col[0:3]
        sample_num = col[-3:]
        
        # Get the Addiction Index from the behavioral data.
        ai = str(df_bi.loc[df_bi['Animal.ID'] == int(sample_num), "Addiction Index"].values[0])
        
        # Create a new column name (e.g., "001_BLA_AB-3.0").
        new_col_name = f"{col[-3:]}_{region.upper()}_{sa}{challenge}-{ai}"
        new_columns[new_col_name] = df[col].values
    
    # Create a new DataFrame with the new columns, using the 'Gene' column as the index.
    df_with_denoted_colnames = pd.DataFrame(new_columns, index=df["Gene"])
    
    # Save the complete DataFrame to CSV. The index is labeled "Gene" so that it is saved and
    # can be reloaded easily.
    output_file = os.path.join(DATASETS_FOLDER_PATH, f"Total_Conditions_Counts_with_AI{result_id}.csv")
    df_with_denoted_colnames.to_csv(output_file, index_label="Gene")
    
    # Reload the CSV file.
    # Using index_col="Gene" ensures that the gene names become the DataFrame index.
    df_loaded = pd.read_csv(output_file, index_col="Gene")
    
    # List of regions to process.
    regions = ["BLA", "VTA", "CPU", "HIP", "NAC", "PFC"]
    
    # For each region, select the columns that belong to it and save the DataFrame.
    for region in tqdm(regions, ncols=134, desc="Processing Regions"):
        # Identify all columns where positions 4 to 6 match the region.
        # (e.g., in "001_BLA_AB-3.0", characters 4 to 7 are "BLA".)
        region_cols = [col for col in df_loaded.columns if col[4:7] == region]
        
        # Create a DataFrame with these columns.
        df_region = df_loaded[region_cols].copy()
        
        # Save the region-specific DataFrame with and without the index.
        region_out_with_index = os.path.join(DATASETS_FOLDER_PATH, f"{region}_Count_Dataset{result_id}.csv")
        region_out_without_index = os.path.join(DATASETS_FOLDER_PATH, f"{region}_Count_Dataset_no_index{result_id}.csv")
        df_region.to_csv(region_out_with_index, index=True)
        df_region.to_csv(region_out_without_index, index=False)

In [54]:
post_modification_and_save_to_csv(dfObj = download_result, result_id="")

FileNotFoundError: [Errno 2] No such file or directory: '../../data/00_data-gathering/downloaded_data.tsv'