# Py General Utilities

In [None]:
import os
import io
import zipfile
import requests

def simple_bool(message):
    """Ask a yes/no question on stdin and return the answer as a bool.

    Parameters
    ----------
    message : str
        Question shown to the user; " (y/n): " is appended to it.

    Returns
    -------
    bool
        True when the reply is "y", "yes", "yea" or "sure" (case-insensitive,
        surrounding whitespace ignored); False for anything else.
    """
    # Strip the reply so that an accidental leading/trailing space is not read as "no".
    choose = input(message+" (y/n): ").strip().lower()
    return choose in ("y", "yes", "yea", "sure")

# >> Downloaders

In [None]:
import os
import io
import requests
import zipfile


# git clone
def git_clone(repo_url,
              save_dir = None):
    """Clone a git repository with the ``git`` command-line client.

    Parameters
    ----------
    repo_url : str
        URL of the repository to clone.
    save_dir : str, optional
        Destination directory handed to ``git clone``. When omitted, the
        current working directory at call time is used.

    Prints 'done' when git exits with status 0, otherwise a failure message.
    """
    # Local import: the notebook cell defining this function does not import subprocess.
    import subprocess

    # Resolve the default at call time so an os.chdir() made after the
    # definition cell was run is respected.
    if save_dir is None:
        save_dir = os.getcwd()

    # An argument list (no shell) keeps URLs/paths containing spaces or shell
    # metacharacters intact.
    try:
        result = subprocess.run(['git', 'clone', repo_url, save_dir])
    except OSError as error:
        # Raised e.g. when the git executable cannot be started.
        print(f'git clone could not be started: {error}')
        return

    if result.returncode == 0:
        print('done')
    else:
        print(f'git clone failed with exit code {result.returncode}')


def git_clone_sub(repo_url,
                  subfolder,
                  temp_dir = None):
    """Clone a repository into a scratch directory and keep only one subfolder.

    The repository is cloned into `temp_dir`, `subfolder` is moved into the
    current working directory, and `temp_dir` is then deleted.

    Parameters
    ----------
    repo_url : str
        URL of the repository to clone.
    subfolder : str
        Path of the folder inside the repository to keep.
    temp_dir : str, optional
        Scratch directory for the clone; it is removed afterwards. Defaults to
        a ``temp`` folder inside the current working directory.

    Raises
    ------
    FileExistsError
        If the destination ``<cwd>/<subfolder>`` already exists.
    FileNotFoundError
        If `subfolder` is not present in the cloned repository.
    """
    # Local imports: the notebook cell defining this function does not import them.
    import shutil
    import stat
    import subprocess

    if temp_dir is None:
        temp_dir = os.path.join(os.getcwd(), 'temp')

    destination_path = os.path.join(os.getcwd(), subfolder)
    if os.path.exists(destination_path):
        # Fail before cloning instead of after a potentially long download.
        raise FileExistsError(f"destination already exists: {destination_path}")

    os.makedirs(temp_dir, exist_ok=True)
    try:
        # Argument list (no shell) so paths with spaces survive.
        result = subprocess.run(['git', 'clone', repo_url, temp_dir])
        if result.returncode != 0:
            print(f'git clone failed with exit code {result.returncode}')
            return
        # os.path.join instead of string concatenation: works whether or not
        # temp_dir ends with a separator, on any platform.
        original_path = os.path.join(temp_dir, subfolder)
        shutil.move(original_path, destination_path)
        print('done')
    finally:
        # Files may be read-only, which blocks deletion on Windows; make them
        # writable first, then drop the scratch directory in every outcome.
        for root, _dirs, files in os.walk(temp_dir):
            for name in files:
                file_path = os.path.join(root, name)
                if not os.path.islink(file_path):
                    os.chmod(file_path, stat.S_IREAD | stat.S_IWRITE)
        shutil.rmtree(temp_dir)


# Simple Downloader
def get_file(url,
             file_name,
             dir = os.getcwd()):
    """Download `url` and store the payload as `file_name` inside `dir`.

    The file is written only when the server answers with HTTP status 200;
    otherwise a short notice is printed and nothing is saved.
    """
    reply = requests.get(url)
    if reply.status_code != 200:
        print("Unable to download the file.")
        return
    target = os.path.join(dir, file_name)
    with open(target, 'wb') as out:
        out.write(reply.content)
    print(f"File downloaded successfully. Saved as {file_name}")


# Download single GitHub file from repository
def get_gitfile(url,
                flag='',
                dir = os.getcwd()):
    """Download a single file from a GitHub repository page URL.

    Parameters
    ----------
    url : str
        GitHub file URL; a ``/blob/`` path segment is rewritten to ``/raw/``
        so the raw file content is requested.
    flag : str, optional
        Prefix prepended to the saved file name.
    dir : str, optional
        Directory the file is written to.

    The file name is the last ``/``-separated component of the URL. The file
    is written only when the server answers with HTTP status 200.
    """
    # Rewrite only the '/blob/' path segment, and only once: a bare
    # replace('blob', 'raw') would also mangle user, repository or file names
    # that merely contain the substring "blob".
    url = url.replace('/blob/', '/raw/', 1)
    response = requests.get(url)
    file_name = flag + url.rsplit('/',1)[1]
    file_path = os.path.join(dir, file_name)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"File downloaded successfully. Saved as {file_name}")
    else:
        print("Unable to download the file.")


# Download and extract a zip file from Zenodo
def get_and_extract_zenodo(file,
                           zenodoid = 8205724,
                           dir = os.getcwd(),
                           ext = '.zip'):
    """Download an archive from a Zenodo record and extract it into `dir`.

    Parameters
    ----------
    file : str
        Archive name on Zenodo, without extension.
    zenodoid : int, optional
        Zenodo record id used to build the download URL.
    dir : str, optional
        Folder the archive content is extracted into.
    ext : str, optional
        Archive extension appended to `file`. The payload is opened with
        ``zipfile``, so it must be a ZIP archive whatever the extension.

    The archive is held in memory and never written to disk. Nothing is
    extracted unless the server answers with HTTP status 200.
    """
    zip_file_name = file+ext
    # Build the URL from the same file name that is reported below, so that
    # `ext` is honoured instead of a hard-coded '.zip'.
    url='https://zenodo.org/record/'+str(zenodoid)+'/files/'+zip_file_name+'?download=1'
    extracted_folder_name = dir
    # Download the ZIP file
    response = requests.get(url)
    if response.status_code == 200:
        # Extract the ZIP contents straight from the in-memory buffer
        with io.BytesIO(response.content) as zip_buffer:
            with zipfile.ZipFile(zip_buffer, 'r') as zip_ref:
                zip_ref.extractall(extracted_folder_name)
        print(f"ZIP file '{zip_file_name}' extracted to '{extracted_folder_name}' successfully.")
    else:
        print("Failed to download the ZIP file.")


In [None]:
# Clone into a "temp" folder under the current working directory.
# os.path.join keeps the path valid on every platform (no hard-coded '\\').
git_clone('https://github.com/johndef64/pychatgpt.git', save_dir = os.path.join(os.getcwd(), 'temp'))

In [None]:
# Example: fetch only the 'gpt-cli' subfolder of the repository, using git_clone_sub's default temp_dir.
git_clone_sub('https://github.com/johndef64/pychatgpt.git', 
              'gpt-cli')

In [None]:
# Example: fetch only the 'starfish' subfolder of the spacetx/starfish repository.
git_clone_sub('https://github.com/spacetx/starfish.git','starfish')

In [None]:
# NOTE(review): the destination is a machine-specific absolute Windows path; adapt it before running elsewhere.
git_clone('https://github.com/johndef64/pychatgpt.git', r'C:\Users\giova\Documents\GitHub\pyutilities_datascience\newrepo')

# Download Datasets.csv

In [16]:
# Fetch three sample CSV files from GitHub; the first two share the name
# "pokemon.csv", so each gets a distinguishing prefix.
handle = "https://github.com/"
file1 = f"{handle}SeniorMars/pokemon-csv/blob/master/pokemon.csv"
file2 = f"{handle}zehnzwanzig/PokemonGo_CSV/blob/master/pokemon.csv"
file3 = f"{handle}johndef64/GRPM_system/blob/main/human_genes_repo/H_GENES_proteincoding_genes.csv"

get_gitfile(file1, flag='base_')
get_gitfile(file2, flag='go_')
get_gitfile(file3)

File downloaded successfully. Saved as base_pokemon.csv
File downloaded successfully. Saved as go_pokemon.csv
File downloaded successfully. Saved as H_GENES_proteincoding_genes.csv


# >> File Operations

In [None]:
import os
import glob
import pandas as pd


# Check files in folder (with extension)
def file_display(ext,
                 contains='',
                 path=os.getcwd()):
    """Print the names of the ``*.<ext>`` files in `path` that match `contains`.

    Parameters
    ----------
    ext : str
        File extension without the leading dot.
    contains : str, optional
        Pattern passed to ``Series.str.contains`` (interpreted as a regular
        expression); the default empty string matches every name.
    path : str, optional
        Folder to search (not recursive).
    """
    file_pattern = os.path.join(path, "*."+ext)
    files_name = [os.path.basename(file) for file in glob.glob(file_pattern)]

    print('Available .'+ext+' files:')
    # Explicit object dtype: a Series built from an empty list may otherwise
    # get a non-string dtype, on which the .str accessor is unavailable.
    files_df = pd.Series(files_name, dtype=object)
    file = files_df[files_df.str.contains(contains)]
    print(file)


def file_display_subfolders(folder_path=os.getcwd()):
    """Print a header line followed by the name of every directory directly inside `folder_path`."""
    names = []
    with os.scandir(folder_path) as entries:
        for entry in entries:
            if entry.is_dir():
                names.append(entry.name)
    print("Subfolders in", folder_path, ":")
    for name in names:
        print(name)


def file_get_subfolders(folder_path=os.getcwd()):
    """Return the names of the directories located directly inside `folder_path`."""
    found = []
    for entry in os.scandir(folder_path):
        if entry.is_dir():
            found.append(entry.name)
    return found


def file_get_files(ext,
                   contains='',
                   path=os.getcwd()):
    """Return the names of the ``*.<ext>`` files in `path` whose name contains `contains`.

    `contains` is matched as a plain substring; the default empty string
    keeps every file.
    """
    matches = glob.glob(os.path.join(path, "*."+ext))
    base_names = map(os.path.basename, matches)
    return [name for name in base_names
            if isinstance(name, str) and contains in name]


def file_get_files_pd(ext, contains='',
                      path=os.getcwd()):
    """Return, as a pandas Series, the ``*.<ext>`` file names in `path` matching `contains`.

    Parameters
    ----------
    ext : str
        File extension without the leading dot.
    contains : str, optional
        Pattern passed to ``Series.str.contains`` (interpreted as a regular
        expression); the default empty string matches every name.
    path : str, optional
        Folder to search (not recursive).

    Returns
    -------
    pandas.Series
        Matching file names; the index keeps the positions of the unfiltered list.
    """
    # Create a file path pattern to match 'ext' files
    file_pattern = os.path.join(path, "*."+ext)
    # Use glob to get the matching paths and keep only their base names
    files_name = [os.path.basename(file) for file in glob.glob(file_pattern)]
    # Explicit object dtype: a Series built from an empty list may otherwise
    # get a non-string dtype, on which the .str accessor is unavailable.
    files_sr = pd.Series(files_name, dtype=object)
    return files_sr[files_sr.str.contains(contains)]


def file_delete(filename,
                path=os.getcwd()):
    """Delete `filename` from `path`, printing the outcome instead of raising."""
    try:
        os.remove(os.path.join(path, filename))
        print(f"File {filename} deleted successfully.")
    except Exception as err:
        # Report the most specific reason available.
        if isinstance(err, FileNotFoundError):
            print(f"File {filename} not found.")
        elif isinstance(err, PermissionError):
            print("Permission denied.")
        else:
            print(f"Unable to delete file {filename}. Error: {str(err)}")

# Load & Display CSV

In [None]:
import pandas as pd
# NOTE(review): placeholder absolute path — point it at your own CSV before running this cell.
path =  r'C:\Users\yourpath\dataset.csv'
df = pd.read_csv(path, encoding='utf-8')

# base_pokemon.csv / go_pokemon.csv are the names under which the download cell saved the two pokemon files.
poke_base = pd.read_csv('base_pokemon.csv')
poke_go = pd.read_csv('go_pokemon.csv', encoding='latin-1')
display(df)

# >> Dataframe Functions

## functions

In [None]:
import pandas as pd
import re
import ast


def pd_choose(my_list):
    """Show `my_list` as an indexed pandas Series and return the item at the position the user types."""
    menu = str(pd.Series(my_list))
    answer = input('choose index:\n'+menu)
    return my_list[int(answer)]


def pd_choose_col(my_df):
    """Show the columns of `my_df` as an indexed pandas Series and return the label at the position the user types."""
    columns = my_df.columns
    answer = input('choose column:\n'+str(pd.Series(columns)))
    return columns[int(answer)]


# Df merger
def pd_merge_base(df1, df2, column1, column2, how= 'inner'):
    """Merge two DataFrames on differently named key columns.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Left and right frames.
    column1, column2 : label
        Key column in `df1` and in `df2` respectively.
    how : str, optional
        Join type forwarded to ``pd.merge``. Defaults to 'inner', the same
        default used by ``pd_merge_select``; the previous default (an empty
        string) is not a join type accepted by ``pd.merge``.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    return pd.merge(df1, df2, left_on=column1, right_on=column2, how=how)


def pd_merge_select(df1, df2, how= 'inner'):
    """Merge `df1` and `df2` on key columns picked interactively.

    The column list of each frame is shown as the input prompt (first `df1`,
    then `df2`) and the user answers with the position of the key column.
    """
    left_key = df1.columns[int(input(pd.Series(df1.columns)))]
    right_key = df2.columns[int(input(pd.Series(df2.columns)))]
    return pd.merge(df1, df2, left_on=left_key, right_on=right_key, how=how)


def pd_merge_select_multi(df1, df2):
    """Merge `df1` and `df2`, asking the user for the join type and both key columns.

    The join type is picked through ``pd_choose``; the key columns are picked
    by position from the column lists shown as input prompts.
    """
    join_types = ['inner', 'outer','left', 'right','cross' ]
    how = pd_choose(join_types)
    left_key = df1.columns[int(input(pd.Series(df1.columns)))]
    right_key = df2.columns[int(input(pd.Series(df2.columns)))]
    return pd.merge(df1, df2, left_on=left_key, right_on=right_key, how=how)


def pd_groupby_describe(df1, include=None, column=None):
    """Group `df1` by one column and describe every group.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to summarise.
    include : optional
        Value forwarded to ``describe(include=...)``. When omitted the user
        picks one of 'all', 'number', 'object', 'bool' through ``pd_choose``.
    column : label, optional
        Column to group by. When omitted the user picks it by position from
        the column list shown as the input prompt.

    Returns
    -------
    pandas.DataFrame
        ``describe()`` output with all-NaN columns dropped and the group key
        moved back into a regular column (columns stay a MultiIndex).
    """
    # Both choices can be passed in, so the function is usable without prompts.
    if include is None:
        include = pd_choose(['all','number','object','bool'])
    if column is None:
        column = df1.columns[int(input(pd.Series(df1.columns)))]
    df1_count = df1.groupby(column).describe(include=include).dropna(axis=1,how='all').reset_index()
    return df1_count


def pd_groupby_describe_flat(df1, include=None, column=None):
    """Group `df1` by one column, describe every group and flatten the column names.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to summarise.
    include : optional
        Value forwarded to ``describe(include=...)``. When omitted the user
        picks one of 'all', 'number', 'object', 'bool' through ``pd_choose``.
    column : label, optional
        Column to group by. When omitted the user picks it by position from
        the column list shown as the input prompt.

    Returns
    -------
    pandas.DataFrame
        ``describe()`` output with all-NaN columns dropped, the group key as a
        regular column, and each two-level column label joined with "_"
        (e.g. ``x_mean``).
    """
    if include is None:
        include = pd_choose(['all','number','object','bool'])
    if column is None:
        column = df1.columns[int(input(pd.Series(df1.columns)))]
    df1_count = df1.groupby(column).describe(include=include).dropna(axis=1,how='all').reset_index()
    # Join the levels of every column label. Parts are converted with str() so
    # non-string labels do not break the join, and empty parts are skipped so
    # the group key (whose second level is '') does not end with a stray "_".
    df1_count.columns = [
        "_".join(str(part) for part in parts if part != "")
        for parts in df1_count.columns.to_flat_index()
    ]
    return df1_count

## simple operations

### Create random df

In [None]:
import pandas as pd
import random

# Build a small demo frame: A = random integers in 1..10, B = random floats in [0, 1],
# C = values drawn at random from the NAME_ENGLISH column of poke_go.
data = {'A': [], 'B': [], 'C': []}
my_list = poke_go.NAME_ENGLISH

for _ in range(20):  # This will create a dataframe with 20 rows
    data['A'].append(random.randint(1, 10))
    data['B'].append(random.uniform(0.0, 1.0))
    data['C'].append(random.choice(my_list))

simple_df = pd.DataFrame(data)
simple_df

### Add columns

In [None]:
# Label of the first column of simple_df
simple_df.columns[0]

In [None]:
# New columns derived from the row position and from existing columns
import math
import random
# Keep the multiplier under its own name: rebinding `random` to an int would
# shadow the module, so this cell (and any later use of `random`) would fail on re-run.
factor = random.randint(1, 10)

simple_df['new_col'] = [(int(i))*2.2 for i in range(len(simple_df))]
simple_df['new_col_2'] = simple_df[simple_df.columns[0]].apply(lambda x: math.sqrt(x))
simple_df['new_col_3'] = simple_df[simple_df.columns[1]].apply(lambda x: x*factor)
simple_df

In [None]:
# Add a combined "TYP1,TYP2" column; `df` and `poke_go` refer to the same frame afterwards.
poke_go['Type'] = poke_go['TYP1'] + ',' + poke_go['TYP2']
df = poke_go

#### Add boolean column

In [None]:
# Method: flag the rows whose 'core' text contains any of the hook substrings
df = pd.read_csv('df.csv', index_col=False)

hook = ['linmor', 'vonn', 'janus','deast','gio','papa','gius','iavn','dean','def','giul','lorem','crest','hor','vita','van','gian','imtd']
# The hooks are joined with '|' so str.contains matches any one of them
df['bool'] = df['core'].str.contains('|'.join(hook))
# A bare expression in the middle of a cell is not rendered; display it explicitly
display(df[['core','bool']])

#df = df.drop('index', axis=1)
if simple_bool('save csv?'):
    # index=False: otherwise the row index is written as an extra unnamed
    # column that shows up as 'Unnamed: 0' when the file is read back.
    df.to_csv('filtered_df.csv', index=False)

In [None]:
df = pd.read_csv('filtered_df.csv', index_col=False)
column_names = df.columns.tolist()

# Move the last column to the beginning of the list
column_names = [column_names[-1]] + column_names[:-1]
# Reorder the columns in the DataFrame using the updated column names
df = df[column_names]

if simple_bool('save csv?'):
    # index=False: otherwise every save/load round trip adds one more
    # unnamed index column to the file.
    df.to_csv('filtered_df.csv', index=False)

#### apply style to df

In [None]:
# Reload the frame saved by the previous cells
df = pd.read_csv('filtered_df.csv', index_col=False)
#df = df.drop('Unnamed: 0', axis=1)

def highlight_true(value):
    """
    Return the CSS for a yellow background when `value` is the object True,
    and an empty string for anything else.
    """
    return 'background-color: yellow' if value is True else ''

# Apply highlight_true to every cell of the frame.
# NOTE(review): recent pandas releases reportedly deprecate Styler.applymap in favour of Styler.map — confirm for the installed version.
df_style = df.style.applymap(highlight_true)
df_style

#### remove bool column

In [None]:
# Keep only the rows flagged True, then drop the helper 'bool' column
df = df[df['bool']==True]
df = df.drop('bool', axis=1)

# Prompt typo fixed ("csc" -> "csv") to match the other save prompt
if simple_bool('save csv?'):
    df.to_csv('log64_bool.csv')

### misc op

In [None]:
# Demo of pd_choose: prompts for a position and shows the selected item
my_list = ['all','number','object','bool']
pd_choose(my_list)

In [None]:
# Small sample frame with two text columns (name, city) and two numeric ones (age, salary)
data = {
    'name': ['John', 'Alice', 'Bob', 'Alice'],
    'age': [25, 30, 35, 27],
    'salary': [5000, 6000, 5500, 7000],
    'city': ['New York', 'Chicago', 'New York', 'Chicago']
}
df = pd.DataFrame(data)

# Labels of the columns selected with include='object'
df.select_dtypes(include='object').columns

### Merge Data

In [None]:
# Inner-merge the two pokemon tables on interactively chosen keys and keep three columns.
# Note: this rebinds simple_df, replacing the random demo frame built earlier.
simple_df = pd_merge_select(poke_go, poke_base, 'inner')[['NAME_ENGLISH','NAME_FRENCH','Ability1']]

In [None]:
# Merge two-column slices of both tables; join type and key columns are chosen interactively
pd_merge_select_multi(poke_go[['NAME_ENGLISH','NAME_FRENCH']], poke_base[['Name','Ability1']])

### Groupby Data

In [None]:
# groupby describe
group_0 = pd_groupby_describe(poke_go)
group_0[group_0.TYP1.count]

In [None]:
# groupby describe flat
# Add the combined "TYP1,TYP2" column (same statements as in the "Add columns" section)
df = poke_go
df['Type'] = df['TYP1'] + ','+ df['TYP2']
poke_go =df

# NOTE(review): a second positional argument is passed here — confirm that
# pd_groupby_describe_flat accepts it in the version of the function you run.
group_1 = pd_groupby_describe_flat(poke_go, 'number')#.T#, 'str')
group_1

In [None]:
# NOTE(review): `gr` is not assigned anywhere in the visible notebook; this line
# needs it to exist already (possibly group_1 was meant — confirm).
type(gr.columns[0])
# Transposed view of the flattened describe() table
group_1.T

In [None]:
import pandas as pd

# Create a sample DataFrame
data = {
    'name': ['John', 'Alice', 'Bob', 'Alice'],
    'age': [25, 30, 35, 27],
    'salary': [5000, 6000, 5500, 7000],
    'city': ['New York', 'Chicago', 'New York', 'Chicago']
}
df = pd.DataFrame(data)

# Group by multiple columns
grouped = df.groupby(['city', 'name'])

# Example usage
print(grouped.size())  # Grouped size based on 'city' and 'name'
# Select the column before aggregating so only 'salary' is summed
print(grouped['salary'].sum())  # Grouped sum of 'salary' based on 'city' and 'name'
# Call the method: a bare `grouped.count` only shows the bound method, not the counts
grouped.count()

# >> Data Visualization

While Pandas provides some basic plotting functionality, you may still need to use Matplotlib for more advanced and customized plots. Here are a few reasons why you might consider using Matplotlib alongside Pandas for plotting:

1. **Flexibility**: Matplotlib is a powerful visualization library that offers a wide range of plot types and customization options. It provides fine-grained control over plot elements such as labels, titles, color schemes, and annotations. If you need to create complex or specialized plots that go beyond the capabilities of Pandas, Matplotlib can be a valuable tool.

2. **Additional Plot Types**: While Pandas offers several basic plot types (e.g., line, bar, scatter), Matplotlib provides a larger variety of plot types such as histograms, pie charts, box plots, heatmaps, and 3D plots. If you need to create these types of plots, Matplotlib is a great choice.

3. **Integration with Pandas**: Matplotlib integrates well with Pandas, allowing you to directly plot Pandas DataFrames and Series objects. You can use the `plot()` function from Pandas to quickly create basic plots, and then use Matplotlib to customize them further if needed. Matplotlib provides fine-grained control over plot elements, allowing you to tweak the plots created using Pandas.

4. **Publication-Quality Plots**: Matplotlib is widely used in scientific research and data analysis because it offers a high level of customization and can create publication-quality plots. If you need to create professional-looking plots for papers, reports, or presentations, Matplotlib provides the necessary tools and options.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Create a sample DataFrame
# Same four-row sample frame used in the groupby examples
data = {
    'name': ['John', 'Alice', 'Bob', 'Alice'],
    'age': [25, 30, 35, 27],
    'salary': [5000, 6000, 5500, 7000],
    'city': ['New York', 'Chicago', 'New York', 'Chicago']
}
df = pd.DataFrame(data)

In [None]:
#---------------------------------------------------------
# Switch to the pokemon table, group it by an interactively chosen column
# and summarise every group with describe().
df = poke_go
grouped_data = df.groupby(pd_choose_col(df)).describe()
#grouped_data = pd_groupby_describe(df)
grouped_data

In [None]:
# describe() statistics of the MAX_ATT column, one row per group
grouped_data['MAX_ATT']

In [None]:
# Box plot of the describe() statistics of one fixed column (MAX_ATT)
#include = pd_choose(['number','object','bool','category','datetime'])
#string_columns = grouped_data.select_dtypes(include=include).columns

column = 'MAX_ATT'
grouped_data[column].boxplot()
#grouped_data[column].plot(kind='bar')
plt.title(f"{column} distribution by chosen col")
plt.show()

In [None]:
# Pick a dtype family interactively and collect the matching columns of df.
# The result is stored as `string_columns` whatever family is picked.
# NOTE(review): some pandas versions reject 'str' in select_dtypes and ask for 'object' instead — confirm.
include = pd_choose(['str','int','float','number','object','bool','category','datetime'])
string_columns = df.select_dtypes(include=[include]).columns

In [None]:
# Columns selected in the previous cell
string_columns

In [None]:
import matplotlib.pyplot as plt

# Wide bar chart of the describe() statistics for one interactively chosen column
column = pd_choose_col(df)
fig, ax = plt.subplots(figsize=(20, 5))
grouped_data[column].plot(kind='bar', ax=ax)
ax.set_title(f"{column} distribution by chosen col")

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Same chart as the previous cell, with the title set before drawing
column = pd_choose_col(df)
fig, ax = plt.subplots(figsize=(20, 5))

ax.set_title(f"{column} distribution by chosen col")
grouped_data[column].plot(kind='bar', ax=ax)

# Render the figure
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Choose a column, then draw two bar charts: all describe() statistics, and the 'mean' only
column = pd_choose_col(df)

#grouped_data[column].boxplot()
grouped_data[column].plot(kind='bar')
plt.title(f"{column} distribution by chosen col")
plt.show()

grouped_data[column]['mean'].plot(kind='bar')
plt.title(f"{column} distribution by chosen col")
plt.show()

In [None]:
# Bar chart of the describe() statistics for one interactively chosen column (default figure size)
column = pd_choose_col(df)
grouped_data[column].plot(kind='bar')
plt.title(f"{column} distribution by chosen col")
plt.show()

In [None]:
# After confirmation, draw one box plot per column in `string_columns`;
# columns absent from grouped_data are reported instead of plotted.
if simple_bool('loop plot?'):
    for column in string_columns:
        if column in grouped_data.columns:
            #grouped_data[column].plot(kind='bar')
            grouped_data[column].boxplot()
            plt.title(f"{column} distribution by chosen col")
            plt.show()
        else:
            print(f"Column '{column}' does not exist in the DataFrame.")

In [None]:

# Collect the columns of df selected with include=['int', 'float']
int_columns = df.select_dtypes(include=['int', 'float']).columns
int_columns

In [None]:
# After confirmation, draw one box plot per column in `int_columns`;
# columns absent from grouped_data are reported instead of plotted.
if simple_bool('loop plot?'):
    for column in int_columns:
        if column in grouped_data.columns:
            grouped_data[column].boxplot()
            plt.title(f"{column} distribution by chosen col")
            plt.show()
        else:
            print(f"Column '{column}' does not exist in the DataFrame.")

In [None]:
# NOTE(review): this cell only imports the plotting libraries and calls plt.show();
# nothing is drawn here and `sns` is not used in this cell.
import seaborn as sns
import pandas as pd


import matplotlib.pyplot as plt
plt.show()

In [None]:
# Display the pokemon table (consider .head() for a shorter output)
poke_go

In [None]:
# NOTE(review): `gr` is not assigned anywhere in the visible notebook — confirm where it should come from.
gr

### General data

In [None]:
import pandas as pd
#import matplotlib.pyplot as plt

# Create a sample DataFrame
data = {
    'name': ['John', 'Alice', 'Bob', 'Alice'],
    'age': [25, 30, 35, 27],
    'salary': [5000, 6000, 5500, 7000],
    'city': ['New York', 'Chicago', 'New York', 'Chicago']
}
df = pd.DataFrame(data)
# Perform groupby and describe; the key is named once so the plot title cannot drift from it
group_key = 'name'
grouped_data = df.groupby([group_key]).describe(include=['object'])

# Visualize string columns
string_columns = df.select_dtypes(include=['object']).columns
for column in string_columns:
    if column in grouped_data.columns:
        grouped_data[column].plot(kind='bar')
        # The data is grouped by `group_key` ('name'), not by city
        plt.title(f"{column} distribution by {group_key}")
        plt.show()
    else:
        print(f"Column '{column}' does not exist in the DataFrame.")

In [None]:
# Display the describe() table built in the previous cell
grouped_data

In [None]:
import pandas as pd
df = pd.DataFrame(data)
# Perform groupby and describe; the key is named once so the plot title cannot drift from it
group_key = 'name'
grouped_data = df.groupby([group_key]).describe(include=['number'])
# Visualize integer columns
int_columns = df.select_dtypes(include=['int', 'float']).columns
for column in int_columns:
    if column in grouped_data.columns:
        grouped_data[column].boxplot()
        # The data is grouped by `group_key` ('name'), not by city
        plt.title(f"{column} distribution by {group_key}")
        plt.show()
    else:
        print(f"Column '{column}' does not exist in the DataFrame.")

In [None]:
# subdivision of a multiindex object
doit = grouped_data['age']
print(type(doit))
print(type(grouped_data))
doit.boxplot()

In [None]:
# Line plot of the per-group 'count' statistic of age (a stray '§' made this line a syntax error)
grouped_data['age']['count'].plot()

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Kernel density plot of the second column of `gr`.
# NOTE(review): `gr` is not assigned anywhere in the visible notebook — confirm where it should come from.
sns.kdeplot(data=gr, x=gr.columns[1])
plt.show()

## other operations

Ci sono molte operazioni che puoi eseguire sui DataFrame di Pandas. Ecco alcuni esempi:

1. **Selezione dei dati**: Puoi selezionare dati specifici utilizzando il nome della colonna o condizioni specifiche.

   ```python
   df['colonna']  # seleziona una colonna
   df[df['colonna'] > 0]  # seleziona righe dove 'colonna' è maggiore di 0
   ```

2. **Manipolazione dei dati**: Puoi modificare i tuoi dati in molti modi, come ad esempio aggiungere nuove colonne, modificare valori esistenti, ecc.

   ```python
   df['nuova_colonna'] = df['colonna1'] + df['colonna2']  # aggiunge una nuova colonna
   df['colonna'] = df['colonna'].apply(lambda x: x*2)  # modifica i valori in 'colonna'
   ```

3. **Ordinamento**: Puoi ordinare i tuoi dati in base ai valori di una o più colonne.

   ```python
   df.sort_values(by='colonna')  # ordina in base a 'colonna'
   ```

4. **Grouping**: Puoi raggruppare i tuoi dati in base ai valori di una o più colonne e calcolare statistiche aggregate.

   ```python
   df.groupby('colonna').mean()  # calcola la media per ogni gruppo in 'colonna'
   ```

5. **Pivot**: Puoi pivotare i tuoi dati per creare una tabella pivot.

   ```python
   df.pivot_table(values='colonna1', index='colonna2', columns='colonna3')
   ```

6. **Join**: Oltre al merge, puoi anche unire DataFrame utilizzando `join`.

   ```python
   df1.join(df2, on='colonna_comune')
   ```

7. **Reshaping**: Puoi modificare la forma del tuo DataFrame utilizzando operazioni come `melt`, `pivot`, ecc.

8. **Handling Missing Values**: Puoi gestire i valori mancanti utilizzando metodi come `dropna`, `fillna`, ecc.

Questi sono solo alcuni esempi delle operazioni che puoi eseguire sui DataFrame di Pandas. Pandas è una libreria molto potente e flessibile che offre molte altre funzionalità.

# Get pychatgpt

In [None]:
import os
import sys
# On Google Colab, work inside /content/pychatgpt
if 'google.colab' in sys.modules:
    os.chdir('/content/pychatgpt') #google colab

# `op` and `model` are only defined when the user answers yes; the chat cells below rely on both.
if simple_bool('Do you have an openai API-key?'):
    # Get pychatgpt
    url="https://raw.githubusercontent.com/johndef64/pychatgpt/main/pychatgpt.py"
    get_gitfile(url)

    import pychatgpt as op
    model = 'gpt-3.5-turbo-16k'
    # Example usage
    message = "Describe the Cosmic Holographic Principle"
    # NOTE(review): this call passes model='gpt-4' rather than the `model` variable set above — confirm which is intended.
    response = op.send_message(message, model='gpt-4')

else:
    print('get your api-key at https://platform.openai.com/account/api-keys\n'
          'or simply use web playground at https://platform.openai.com/playground?model=gpt-3.5-turbo-16k')

In [None]:
# Reset op.chat_gpt to an empty list, then send a message with a persona
op.chat_gpt=[]
character = 'Friedrich Nietzsche'
m = '''
Please tell me about your thoughts about society.
'''
op.send_message(m,
                persona=character,
                model=model)

In [None]:
# Follow-up message with the same persona and model
m = '''
Please, tell me more.
'''
op.send_message(m,
                persona=character,
                model=model)

In [None]:
# Presumably stores the current conversation — see pychatgpt for the exact behaviour
op.save_chat()

In [None]:
# Presumably restores a previously saved conversation — see pychatgpt for the exact behaviour
op.load_chat()

In [None]:
# Clear the chat: send the 'clearchat' keyword, then print op.chat_gpt to inspect the result
m = 'clearchat'
op.send_message(m)
print(op.chat_gpt)

# open with notepad, subprocess

In [None]:
import subprocess

def open_in_notepadpp(file_path,
                      notepadpp_path=r"C:\Program Files\Notepad++\notepad++.exe"):
    """Open `file_path` in Notepad++ without waiting for the editor to close.

    Parameters
    ----------
    file_path : str
        File to open.
    notepadpp_path : str, optional
        Path to the Notepad++ executable; defaults to the standard Windows
        install location. Pass another path for a non-default installation.
    """
    # Popen returns immediately, so the notebook is not blocked by the editor.
    subprocess.Popen([notepadpp_path, file_path])

# Usage: open the chat log (a path relative to the current working directory)
file_path = r"chat_log.txt"  # Replace with the actual file path
open_in_notepadpp(file_path)

# Prompt for keyword-mesh

In [None]:
# Define Biomedical topics
# Every variable is a list of topic groups; each group is a list of one to three related topic strings.
nutritional_topic = [['diseases and disorders realted to nutrition and diet ', 'diet, food consuption, eating behaviour and nutrition']]
infective_topic = [['infective agents, bacteria, virus and protozoan','infective diseases']]
reproductive_topic = [['reproductive system physiology','reproductive system pathology', 'Assisted reproductive technology']]
female_infertility_topic = [['female infertility, genetic imprinting and maternal effect']]
special_issue = [['Diagnosis and Therapies for Genetic Diseases']]

nutritional_topics = [
    ['Obesity, overweight and body weight control', 'compulsive eating behavior'],
    ['cardiovascular diseases','physiological processes realted to cardiovascular diseases','lipid metabolism in the context of cardiovascular diseases'],
    ['Diabetes Melitus Type II and metabolic syndrome'],
    ['Vitamin metabolism and Vitamins recommended intake levels','Micronutrients metabolism and Micronutrient recommended intake levels', 'disease related to vitamins and micronutrients deficiency'],
    ['eating behaviour and taste sensation'],
    ['food intolerances'],
    ['food allergies'],
    ['diet-induced oxidative stress'],
    ['metabolism of xenobiotics'],
]
# Topic list used by the prompt-building cell below
chosen_topic = special_issue
pd.Series(chosen_topic)

In [None]:
# GPT prompts
# Builds up to three prompts (one per topic string of the selected group) and copies them to the clipboard.

# parameters--------------------------------------
object1 = 'Pubmed MeSH terms'
object2 = 'Pubmed keywords'

# NOTE(review): `object` (and `format` below) rebind Python builtins for the rest of the session.
object= object1
num_mesh = 100
topics = chosen_topic
topic_id = 0
#-----------------------------------------------
# Second and third topic are None when the selected group has fewer entries
topic_01  = topics[topic_id][0]
topic_02  = topics[topic_id][1] if len(topics[topic_id])>=2 else None
topic_03  = topics[topic_id][2] if len(topics[topic_id])>=3 else None

# Output-format instruction appended to every prompt; the 'csv' variant is the one used
format = {'list': ". Create a python list format like this:\n gpt_01 = [\"term1\",\n \"term2\",\n \"term3\",...]",
          'csv':  ". Create a CSV file like this:\n gpt_terms,\n \"term1\",\n \"term2\",\n \"term3\", ..."}
format = format['csv']

# The trailing conditional applies to the whole concatenation, so a missing topic yields None
prompt_01 = "give me a comprehensive list of "+str(num_mesh)+" real "+object+" terms related to "+ topic_01+format+"\n"
prompt_02 = "give me a comprehensive list of "+str(num_mesh)+" real "+object+" terms related to "+ topic_02 +format+"\n" if len(topics[topic_id])>=2 else None
prompt_03 = "give me a comprehensive list of "+str(num_mesh)+" real "+object+" terms related to "+ topic_03 +format+"\n" if len(topics[topic_id])>=3 else None

prompts = [prompt_01, prompt_02, prompt_03]
# If you do not have an openai API key, paste these prompts at https://platform.openai.com/playground?model=gpt-3.5-turbo-16k

# Each later copy overwrites the previous one, so the clipboard ends up with all available prompts
import pyperclip
pyperclip.copy(prompt_01)
pyperclip.copy(prompt_01+prompt_02) if len(topics[topic_id])>=2 else None
pyperclip.copy(prompt_01+prompt_02+prompt_03) if len(topics[topic_id])>=3 else None

print('prompt_01:',prompt_01)

In [None]:
# get GPT-terms
# Reset op.chat_gpt to an empty list and send the first prompt
import pychatgpt as op
op.chat_gpt =[]
response = op.send_message_gpt(prompt_01, model='gpt-4', maxtoken=2000)
#response = op.send_message_gpt('clearchat')

# Reminder of the manual step that follows
print('''\n\nGet the GPT terms from 'chat_log.txt'
=> save them manually in csv format in "ref-mesh-archive/gpt_terms/yourterms.csv"''')

In [None]:
# Target file and the term list to store in it: a 'gpt_terms' header followed by one quoted term per line
file_path = 'special_issue_2.csv'
content = '''gpt_terms
"Genetic Diseases Diagnosis",
"Genetic Testing",
"Molecular Diagnostics",
"Genetic Screenings",
"DNA Sequencing",
"Genome Mapping",
"Chromosomal Abnormalities",
"Prenatal Diagnosis",
"Newborn Screening",
"Personalized Medicine",
"Genetic Counseling",
"Carrier Testing",
"Genomic Medicine",
"Pharmacogenetics",
"Predictive Testing",
"Presymptomatic Testing",
"Biochemical Testing",
"Genetic Therapies",
"Gene Therapy",
"Gene Editing",
"CRISPR-Cas9",
"Stem Cell Therapy",
"RNA Therapy",
"Genetic Surgery",
"Molecular Therapy",
"Enzyme Replacement Therapy",
"Antisense Therapy",
"Gene Silencing",
"Genetic Vaccine",
"Pharmacological Chaperones"
'''
def write_to_file(file_path, content, encoding='utf-8'):
    """Write `content` to `file_path`, replacing any existing file.

    Parameters
    ----------
    file_path : str
        Destination path.
    content : str
        Text to write.
    encoding : str, optional
        Text encoding of the file. An explicit default keeps the output
        identical across platforms instead of depending on the locale.
    """
    with open(file_path, 'w', encoding=encoding) as file:
        file.write(content)

# Save the term list defined above
write_to_file(file_path, content)

In [None]:
# Scratch question sent to the model (unrelated to the term list above)
op.send_message_gpt('how to say, if a apackahger is not instaled, intall it?')

# Get Zenodo Nbib Dataset (full)

In [None]:
import os
import io
import sys
import glob
import zipfile
import requests
from datetime import datetime

def simple_bool(message):
    """Ask a yes/no question on stdin and return the answer as a bool.

    Parameters
    ----------
    message : str
        Question shown to the user; " (y/n): " is appended to it.

    Returns
    -------
    bool
        True when the reply is "y", "yes", "yea" or "sure" (case-insensitive,
        surrounding whitespace ignored); False for anything else.
    """
    # Strip the reply so that an accidental leading/trailing space is not read as "no".
    choose = input(message+" (y/n): ").strip().lower()
    return choose in ("y", "yes", "yea", "sure")

def get_and_extract_zenodo(file, dir = os.getcwd(), ext = '.zip', zenodoid = 8205724):
    """Download an archive from a Zenodo record and extract it into `dir`.

    Parameters
    ----------
    file : str
        Archive name on Zenodo, without extension.
    dir : str, optional
        Folder the archive content is extracted into.
    ext : str, optional
        Archive extension appended to `file`. The payload is opened with
        ``zipfile``, so it must be a ZIP archive whatever the extension.
    zenodoid : int, optional
        Zenodo record id used to build the download URL. Added as the last
        parameter so existing positional calls keep working.

    The archive is held in memory and never written to disk. Nothing is
    extracted unless the server answers with HTTP status 200.
    """
    zip_file_name = file+ext
    # Build the URL from the same file name that is reported below, so that
    # `ext` is honoured instead of a hard-coded '.zip'.
    url='https://zenodo.org/record/'+str(zenodoid)+'/files/'+zip_file_name+'?download=1'
    extracted_folder_name = dir
    # Download the ZIP file
    response = requests.get(url)
    if response.status_code == 200:
        # Extract the ZIP contents straight from the in-memory buffer
        with io.BytesIO(response.content) as zip_buffer:
            with zipfile.ZipFile(zip_buffer, 'r') as zip_ref:
                zip_ref.extractall(extracted_folder_name)
        print(f"ZIP file '{zip_file_name}' extracted to '{extracted_folder_name}' successfully.")
    else:
        print("Failed to download the ZIP file.")

# Optional download step: ask first, then time the download and extraction
if simple_bool('Download nbib-data from Zenodo?\n (careful! 5GB unpacked)'):
    timea = datetime.now()
    get_and_extract_zenodo('nbib_data')
    print('Download and extraction time ',datetime.now()-timea)

#set source dataset:-----------------------
db_tag = 'pcg'
db_name = 'grpm_db_' + db_tag
db_path = 'grpm_dataset/'+db_name

time1 = datetime.now()
#import gene-fullnbib
# NOTE(review): `pd` is not among this cell's own imports; it must already be imported by an earlier cell.
dummy_nbib = pd.read_csv(db_path+'/complete_nbibtable.csv', index_col=0)
# Store pubmed_id as text
dummy_nbib['pubmed_id'] = dummy_nbib['pubmed_id'].astype(str)
time2 = datetime.now()
print('time import nbib: ', time2-time1)
# Sum of memory_usage() converted from bytes to MB
print(dummy_nbib.memory_usage().sum() / 1024 / 1024, 'MB')

display(dummy_nbib)

# R in Python

You can run R code in a Jupyter notebook. You need to install the rpy2 package and load the R extension in Jupyter using the following steps:

1. Install the rpy2 package using pip (Python's package installer). In a command line window run:
   ```bash
   pip install rpy2
   ```

2. Then, in your Jupyter notebook, load the rpy2 extension by adding this to a cell and running it:
   ```python
   %load_ext rpy2.ipython
   ```

3. Now you can use R in any cell by starting the cell with `%%R`. For example:
   ```python
   %%R
   x <- seq(0, 2*pi, length.out=50)
   y <- sin(x)
   plot(x, y, main="y = sin(x)")
   ```

Remember, the rpy2 package requires R to be installed on your machine and relies on the dynamic libraries that R uses, so make sure R is installed and can be found by rpy2.

In [None]:
!pip install rpy2

In [None]:
# Load the rpy2 extension, which provides the %%R cell magic used below
%load_ext rpy2.ipython

In [None]:
%%R
# Plot y = sin(x) on 50 evenly spaced points between 0 and 2*pi
x <- seq(0, 2*pi, length.out=50)
y <- sin(x)
plot(x, y, main="y = sin(x)")

# Repo Links

In [None]:
# quick open in colab
# Builds a Colab URL for a notebook hosted on GitHub and copies it to the clipboard.

handle = "https://colab.research.google.com/github/"
# Note: `handle` and `path` reuse names assigned in earlier cells; only `path1` is copied below.
path = "johndef64/pychatgpt/blob/main/pychatgpt_trial.ipynb"
path1 = "spacetx/starfish/blob/master/notebooks/DARTFISH.ipynb"
import pyperclip
pyperclip.copy(handle+path1)