In [None]:
import os
import numpy as np
import pandas as pd
import pandas_profiling
import plotnine
from plotnine import *  # Provides a ggplot-like interface to matplotlib.
from IPython.display import display

## Plot setup.
# Make the black-and-white theme (11pt base font) the default for every plot.
default_plot_theme = theme_bw(base_size=11)
theme_set(default_plot_theme)

def get_boxplot_fun_data(df):
  """Builds the one-row annotation frame used to label a ggplot boxplot.

  Args:
    df: The collection of values belonging to one box; it must support both
      max() and len().
  Returns:
    A single-row data frame (index 0) whose column y holds max(df) and whose
    column label holds the text 'N = <len(df)>'.
  """
  annotation = pd.DataFrame(
      data={'y': max(df), 'label': f'N = {len(df)}'},
      index=[0],
  )
  return annotation

# NOTE: if you get any errors from this cell, restart your kernel and run it again.


In [None]:
# Raise pandas' display limits so wide/long tables are shown with less truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Saving each table as data frames to workspace

In [None]:

# This code copies a file from your Google Bucket and loads it into a dataframe

name_of_file_in_bucket = 'person_df.csv'

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file from the bucket to the current working space;
# os.system returns the command's exit status, so non-zero means the copy failed
copy_status = os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")
if copy_status != 0:
    # Stop here instead of reporting success and reading a missing or stale local file.
    raise RuntimeError(f'gsutil cp failed for {name_of_file_in_bucket} (exit status {copy_status})')

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')

# load the downloaded csv file into a dataframe and preview it
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()

In [None]:
# This code copies a file from your Google Bucket and loads it into a dataframe

name_of_file_in_bucket = 'condition_df.csv'

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file from the bucket to the current working space;
# os.system returns the command's exit status, so non-zero means the copy failed
copy_status = os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")
if copy_status != 0:
    # Stop here instead of reporting success and reading a missing or stale local file.
    raise RuntimeError(f'gsutil cp failed for {name_of_file_in_bucket} (exit status {copy_status})')

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')

# load the downloaded csv file into a dataframe and preview it
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()

In [None]:
# This code copies a file from your Google Bucket and loads it into a dataframe

name_of_file_in_bucket = 'measurement_df.csv'

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file from the bucket to the current working space;
# os.system returns the command's exit status, so non-zero means the copy failed
copy_status = os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")
if copy_status != 0:
    # Stop here instead of reporting success and reading a missing or stale local file.
    raise RuntimeError(f'gsutil cp failed for {name_of_file_in_bucket} (exit status {copy_status})')

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')

# load the downloaded csv file into a dataframe and preview it
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()

In [None]:
# This code copies a file from your Google Bucket and loads it into a dataframe

name_of_file_in_bucket = 'drug_df.csv'

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file from the bucket to the current working space;
# os.system returns the command's exit status, so non-zero means the copy failed
copy_status = os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")
if copy_status != 0:
    # Stop here instead of reporting success and reading a missing or stale local file.
    raise RuntimeError(f'gsutil cp failed for {name_of_file_in_bucket} (exit status {copy_status})')

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')

# load the downloaded csv file into a dataframe and preview it
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()

In [None]:
# This code copies a file from your Google Bucket and loads it into a dataframe

name_of_file_in_bucket = 'survey_df.csv'

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file from the bucket to the current working space;
# os.system returns the command's exit status, so non-zero means the copy failed
copy_status = os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")
if copy_status != 0:
    # Stop here instead of reporting success and reading a missing or stale local file.
    raise RuntimeError(f'gsutil cp failed for {name_of_file_in_bucket} (exit status {copy_status})')

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')

# load the downloaded csv file into a dataframe and preview it
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()

In [None]:
# This code copies a file from your Google Bucket and loads it into a dataframe

name_of_file_in_bucket = 'observation_df.csv'

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file from the bucket to the current working space;
# os.system returns the command's exit status, so non-zero means the copy failed
copy_status = os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")
if copy_status != 0:
    # Stop here instead of reporting success and reading a missing or stale local file.
    raise RuntimeError(f'gsutil cp failed for {name_of_file_in_bucket} (exit status {copy_status})')

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')

# load the downloaded csv file into a dataframe and preview it
my_dataframe = pd.read_csv(name_of_file_in_bucket)
my_dataframe.head()

# Cleaning data each separately

In [None]:
# Load each table from its CSV copy in the notebook's working directory.
import pandas as pd
person_df = pd.read_csv('person_df.csv')
condition_df = pd.read_csv('condition_df.csv')
measurement_df = pd.read_csv('measurement_df.csv')
drug_df = pd.read_csv('drug_df.csv')
survey_df = pd.read_csv('survey_df.csv')
observation_df = pd.read_csv('observation_df.csv')

## cleaning person data

In [None]:
import pandas as pd
# Start the person cleaning step from a fresh copy of the raw CSV.
person_df = pd.read_csv('person_df.csv')

## Cleaning and reshaping observation data

In [None]:
# Start the observation cleaning step from a fresh copy of the raw CSV.
observation_df = pd.read_csv('observation_df.csv')

In [None]:
# Number of distinct participants present in the observation table.
observation_df['person_id'].nunique()

In [None]:
# Remove rows whose recorded answer is the 'PMI: Skip' placeholder.
skipped_observation_rows = observation_df[observation_df['value_as_concept_name'] == 'PMI: Skip'].index
observation_df = observation_df.drop(skipped_observation_rows)

In [None]:
# Reshape long -> wide: one row per (observation_datetime, person_id) and one
# column per standard_concept_name; aggfunc='first' picks a single answer when
# a concept appears more than once in the same row group.
observation_df = observation_df.pivot_table(
    values='value_as_concept_name',
    index=['observation_datetime', 'person_id'],
    columns='standard_concept_name',
    aggfunc='first',
).reset_index()

In [None]:
# Map the long survey-question column names to short analysis-friendly names.
# NOTE(review): 'howmuch_emotional_problem_7yrs' is the name given to a question
# about the past 7 *days*; the name is kept because the saved CSV uses it.
observation_short_names = {
    'Alcohol: 6 or More Drinks Occurrence': 'alcohol6or_more',
    'Are you covered by health insurance or some other kind of health care plan [PhenX]': 'Health_insurance_cov',
    'Cigar Smoking: Cigar Smoke Participant': 'cigar_smoke',
    'Current occupational status [SAMHSA]': 'Current_occupation_status',
    'Electronic Smoking: Electric Smoke Participant': 'electronic_cigarate_smoke',
    'Health Insurance: Health Insurance Type': 'Health_insurance_type',
    'Home Own: Current Home Own': 'current_home_own_rent',
    'Hookah Smoking: Hookah Smoke Participant': 'hookah_smoke',
    'How often have you been bothered by emotional problems such as feeling anxious, depressed or irritable in past 7 days [PROMIS]': 'howmuch_emotional_problem_7yrs',
    'Insurance: Healthcare Coverage': 'insurance_healthcare_coverage',
    'Living Situation: Stable House Concern': 'stable_house_concern',
    'Marital status': 'marital_status',
    'Race': 'race',
    'Race: What Race Ethnicity': 'race_ethnicity',
    'Recreational Drug Use: Which Drugs Used': 'resreational_drug',
    'Sex': 'sex',
    'Smoked at least 100 cigarettes in entire life': 'smoked100cigarettes',
    'Total combined household income range in last year': 'hh_income_last_yr',
    'What is the highest grade or level of schooling you completed [SAMHSA]': 'highest_grade_edu',
}
observation_df = observation_df.rename(columns=observation_short_names)

In [None]:
# Preview the first 60 rows ordered by participant and observation time.
observation_sort_keys = ['person_id', 'observation_datetime']
observation_df.sort_values(by=observation_sort_keys).head(60)

In [None]:
# Drop the sex, race and Age columns from the wide observation table.
# NOTE(review): 'race' was listed twice here; confirm no other column
# (e.g. 'race_ethnicity') was meant to be dropped as well.
observation_columns_to_drop = ['sex', 'race', 'Age']
observation_df = observation_df.drop(columns=observation_columns_to_drop)

In [None]:
# Alias for the cleaned wide table (same object, not a copy); the save cell writes it out.
observation_df_w = observation_df

In [None]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = observation_df_w

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'observation_df_wide.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system returns the command's exit status,
# so a non-zero value means the upload did not succeed
upload_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if upload_status != 0:
    # Do not report success when nothing reached the bucket.
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (exit status {upload_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


## Cleaning and reshaping survey data

In [None]:
# Start the survey cleaning step from a fresh copy of the raw CSV.
survey_df = pd.read_csv('survey_df.csv')

In [None]:
# Remove rows whose answer is the 'PMI: Skip' placeholder.
skipped_survey_rows = survey_df[survey_df['answer'] == 'PMI: Skip'].index
survey_df = survey_df.drop(skipped_survey_rows)

In [None]:
# Reshape long -> wide: one row per (survey_datetime, person_id) and one column
# per question; aggfunc='first' picks a single answer when a question repeats.
survey_df_w = survey_df.pivot_table(
    values='answer',
    index=['survey_datetime', 'person_id'],
    columns='question',
    aggfunc='first',
).reset_index()

In [None]:
# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = survey_df_w

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'survey_df_wide.csv'

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system returns the command's exit status,
# so a non-zero value means the upload did not succeed
upload_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if upload_status != 0:
    # Do not report success when nothing reached the bucket.
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (exit status {upload_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')

## Cleaning condition data and creating the exacerbation variable

In [None]:
# Reload the raw condition table and print its column/dtype summary.
condition_df = pd.read_csv('condition_df.csv')
condition_df.info()

In [None]:
import numpy as np
# Flag (1/0) condition rows whose concept name mentions an exacerbation.
# - case=False also catches names that begin with a capital "Exacerbation ...".
# - na=False makes a missing concept name count as "no match"; without it
#   str.contains yields NaN, which np.where treats as truthy and flags as 1.
mentions_exacerbation = condition_df['standard_concept_name'].str.contains(
    'exacerbation', case=False, na=False)
condition_df['exacerbation'] = np.where(mentions_exacerbation, 1, 0)

In [None]:
# Preview the first 60 rows ordered by participant and condition start time.
condition_sort_keys = ['person_id', 'condition_start_datetime']
condition_df.sort_values(by=condition_sort_keys).head(60)

In [None]:
# How many condition rows are flagged as exacerbation (1) versus not (0).
exacerbation_counts = condition_df['exacerbation'].value_counts()
exacerbation_counts

In [None]:
# Parse the condition start/end timestamp columns into datetimes.
for datetime_column in ('condition_start_datetime', 'condition_end_datetime'):
    condition_df[datetime_column] = pd.to_datetime(condition_df[datetime_column])

## Subsetting, cleaning, and pivoting (wide) measurement data

In [None]:
import pandas as pd
# Start the measurement cleaning step from a fresh copy of the raw CSV.
measurement_df = pd.read_csv('measurement_df.csv')

In [None]:
# Discard the measurement columns that are not used in the rest of the analysis.
unused_measurement_columns = [
    'range_high',
    'range_low',
    'value_as_concept_name',
    'measurement_type_concept_name',
    'standard_vocabulary',
    'visit_occurrence_concept_name',
]
measurement_df = measurement_df.drop(columns=unused_measurement_columns)


In [None]:
# Keep only measurements that carry a numeric value, then show the remaining size.
measurement_df = measurement_df.dropna(subset=['value_as_number'])
measurement_df.shape

In [None]:
measurement_df=measurement_df[measurement_df['standard_concept_name'].str.contains('Monocytes', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Leukocytes', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Neutrophils', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Eosinophils', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Lymphocytes', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Basophills', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('BMI', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Albumin [Mass/volume] in Serum or Plasma', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Bilirubin.total [Mass/volume] in Serum or Plasma', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Bilirubin.total [Mass/volume] in Blood', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('IgE', case=False, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Hemoglobin [Mass/volume] in Blood', case=True, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Nitrite', case=False, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Protein', case=False, regex=True)|
                              measurement_df['standard_concept_name'].str.contains('Urobilinogen', case=False, regex=True)]

In [None]:
import numpy as np
conditions=[(measurement_df['standard_concept_name'].str.contains('Monocytes', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Leukocytes', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Neutrophils', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Eosinophils', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Lymphocytes', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Basophills', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('BMI', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Albumin', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Bilirubin', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('IgE', case=False, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Hemoglobin', case=True, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Nitrite', case=False, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Protein', case=False, regex=True)),
           (measurement_df['standard_concept_name'].str.contains('Urobilinogen', case=False, regex=True))]

values=['Monocytes','Leukocytes','Neutrophils','Eosinophils',
        'Lymphocytes','Basophills','BMI','Albumin','Bilirubin',
        'IgE','Hemoglobin','Nitrite', 'Protein','Urobilinogen']

measurement_df['variable'] = np.select(conditions, values)

In [None]:
# Reshape long -> wide: one row per (measurement_datetime, person_id) and one
# column per variable. pivot_table's default aggregation is the mean, so
# repeated values of a variable within a row group are averaged.
# The result is stored as measurement_df_w, the name the save cell reads;
# the long-format measurement_df is left untouched so this cell can be re-run.
measurement_df_w = measurement_df.pivot_table(
    index=['measurement_datetime', 'person_id'],
    columns='variable',
    values='value_as_number',
).reset_index()

In [None]:
# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

my_dataframe = measurement_df_w


destination_filename = 'measurement_df_wide.csv'


# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system returns the command's exit status,
# so a non-zero value means the upload did not succeed
upload_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if upload_status != 0:
    # Do not report success when nothing reached the bucket.
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (exit status {upload_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


## cleaning drug data

In [None]:
import pandas as pd
# Start the drug cleaning step from a fresh copy of the raw CSV.
drug_df = pd.read_csv('drug_df.csv')

In [None]:
# Preview the first 60 rows ordered by participant and exposure start time.
drug_sort_keys = ['person_id', 'drug_exposure_start_datetime']
drug_df.sort_values(by=drug_sort_keys).head(60)

In [None]:
# List every distinct drug concept name, one per line.
distinct_drug_names = drug_df['standard_concept_name'].unique()
for val in distinct_drug_names:
    print(val)

In [None]:
# Let pandas infer its preferred dtype for every column.
drug_df = drug_df.convert_dtypes()

In [None]:

# Drug rows without a concept name cannot be classified; drop them and show the new size.
drug_df = drug_df.dropna(subset=['standard_concept_name'])
drug_df.shape

In [None]:
import numpy as np
# Substrings (matched case-insensitively) that identify each drug class.
# np.select assigns the FIRST class whose list matches, so the class order
# below matters and a name listed under two classes always goes to the earlier one.
SABA=['albuterol','ipratropium','Levalbuterol', 'Terbutaline','Pirbuterol','Salbutamol']
Inhaled_corticosteroids=['budesonide','Beclomethasone','Pulmicort','Ciclesonide','Flunisolide',
                         'Fluticasone','Mometasone']
# 'Fluticasone' is no longer repeated here: it is already claimed by the
# inhaled-corticosteroid list above. 'Olodaterol'/'vilanterol' spellings corrected
# (the misspelled 'Olodaerol' could not match the drug's name).
LABA=['formoterol','salmeterol','vilanterol','Indacaterol','Olodaterol']
# NOTE(review): 'Arformoterol' contains 'formoterol', so the LABA list above claims it first.
Leukotriene_modifiers=['Montelukast','Zafirlukast','Zileuton','Arformoterol','Carmoterol']
Methylxanthines_and_Monoclonals=['Theophylline','Omalizumab','Aminophylline']
# 'methylprednisolone' removed from this list: because this class is evaluated
# before Syst_corticosteroids, it was being labelled an anti-histamine.
# Repeated 'Levocetirizine' entries collapsed to one.
# NOTE(review): 'Phenylephrine' is also listed under Sympathomimetics but is
# always claimed here first — confirm which class is intended.
Anti_histamines=['Azelastine','Phenylephrine','diphenhydramine',
                 'Chlorpheniramine', 'Desloratadine', 'Fexofenadine','Levocetirizine',
                 'Loratadine','Cetirizine','pseudoephedrine','Oxymetazoline',
                 'Tetrahydrozoline', 'Olopatadine','Ketotifen','Triprolidine','Cromoglicic',
                 'Xylometazoline','Antazoline']
Syst_corticosteroids=['cortisone','prednisone','prednisolone','methylprednisolone',
                      'dexamethasone','betamethasone','hydrocortisone','Triamcinolone']
Sympathomimetics=['Phenylpropanolamine','Amphetamine','Methoxamine','Epinephrine','Metaraminol',
                  'Labetalol','Phenylephrine','Norepinephrine','Midodrine','Phentermine']

# (class label, keyword list) in priority order.
drug_class_keywords=[
    ('SABA', SABA),
    ('Inhaled corticosteroids', Inhaled_corticosteroids),
    ('LABA', LABA),
    ('Leukotriene modifiers', Leukotriene_modifiers),
    ('Methylxanthines and Monoclonals', Methylxanthines_and_Monoclonals),
    ('Anti-histamines', Anti_histamines),
    ('Syst_corticosteroids', Syst_corticosteroids),
    ('Sympathomimetics', Sympathomimetics),
]

drug_names=drug_df['standard_concept_name'].astype(str)

# One boolean mask per class: does the name contain any of the class keywords?
conditions=[
    drug_names.str.contains('|'.join(keywords), case=False, na=False)
    for _, keywords in drug_class_keywords
]

choices=[label for label, _ in drug_class_keywords]

# Names matching no class are labelled 'other'.
drug_df['drug_class'] =np.select(conditions, choices, 'other')

In [None]:
# Number of drug rows assigned to each class (including 'other').
drug_class_counts = drug_df['drug_class'].value_counts()
drug_class_counts

In [None]:
# Alias for the classified drug table (same object, not a copy); the save cell writes it out.
drug_df_w = drug_df

In [None]:
# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = drug_df_w

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'drug_df_wide.csv'

# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system returns the command's exit status,
# so a non-zero value means the upload did not succeed
upload_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if upload_status != 0:
    # Do not report success when nothing reached the bucket.
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (exit status {upload_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


In [None]:
#!pip install --upgrade pip
#!pip install MedCodes
from medcodes.drugs.standardization import Drug

# Merging the cleaned data frames

## merge person data to condition data

In [None]:
import pandas as pd
# Reload the raw person and condition tables and inner-join them on person_id.
person_df = pd.read_csv('person_df.csv')
condition_df = pd.read_csv('condition_df.csv')
person_cond = person_df.merge(condition_df, on='person_id', how='inner')

In [None]:
# Print the column names, non-null counts and dtypes of the person+condition table.
person_cond.info()

## merge person_condition data to observation data

In [None]:
# Load the wide observation table written by the observation cleaning step.
observation_df_wide = pd.read_csv('observation_df_wide.csv')

In [None]:

# Parse the timestamp columns of both tables into datetimes.
condition_start_times = pd.to_datetime(person_cond['condition_start_datetime'])
person_cond['condition_start_datetime'] = condition_start_times
observation_times = pd.to_datetime(observation_df_wide['observation_datetime'])
observation_df_wide['observation_datetime'] = observation_times

In [None]:
# Order condition records chronologically by start time.
person_cond = person_cond.sort_values('condition_start_datetime')

In [None]:

# Inner-join on person_id only: every condition row is paired with every
# observation row belonging to the same participant.
person_cond_obsv = person_cond.merge(observation_df_wide, on='person_id', how='inner')

In [None]:
# Inspect how condition and observation timestamps line up per participant.
merge_check_columns = ['person_id', 'condition_start_datetime', 'observation_datetime']
person_cond_obsv[merge_check_columns].sort_values(by=merge_check_columns).head(100)

In [None]:
# Show the merged table ordered by participant, condition start and observation time.
person_cond_obsv_order = ['person_id', 'condition_start_datetime', 'observation_datetime']
person_cond_obsv.sort_values(by=person_cond_obsv_order)

## merge person_condition_observ data to measurement data

In [None]:
# Order rows chronologically by condition start time ahead of the as-of merge.
person_cond_obsv = person_cond_obsv.sort_values('condition_start_datetime')

In [None]:
# Make sure the merge key is a datetime column.
parsed_condition_start = pd.to_datetime(person_cond_obsv['condition_start_datetime'])
person_cond_obsv['condition_start_datetime'] = parsed_condition_start

In [None]:
# The stop_reason column is not used downstream; remove it.
person_cond_obsv = person_cond_obsv.drop(columns=['stop_reason'])

In [None]:
import numpy as np
# Flag (1/0) rows whose condition concept name mentions an exacerbation.
# - case=False also catches names that begin with a capital "Exacerbation ...".
# - na=False makes a missing concept name count as "no match"; without it
#   str.contains yields NaN, which np.where treats as truthy and flags as 1.
mentions_exacerbation = person_cond_obsv['standard_concept_name'].str.contains(
    'exacerbation', case=False, na=False)
person_cond_obsv['exacerbation'] = np.where(mentions_exacerbation, 1, 0)

In [None]:
# Load the wide measurement table written by the measurement cleaning step.
measurement_df_wide = pd.read_csv('measurement_df_wide.csv')

In [None]:
# Parse the measurement timestamp so it can be used as an as-of merge key.
measurement_times = pd.to_datetime(measurement_df_wide['measurement_datetime'])
measurement_df_wide['measurement_datetime'] = measurement_times

In [None]:
# For each measurement, attach the same participant's nearest condition record
# that starts at or after the measurement time (direction='forward'), looking
# at most about 5 years ahead.
# - merge_asof requires both frames to be sorted by their time key, so both are
#   sorted here instead of relying on the order left behind by earlier cells.
# - The tolerance is spelled in days: pandas deprecated and then removed year
#   units ("5y") for Timedelta because a year is not a fixed duration.
person_cond_obsv_measu = pd.merge_asof(
    measurement_df_wide.sort_values('measurement_datetime'),
    person_cond_obsv.sort_values('condition_start_datetime'),
    left_on='measurement_datetime',
    right_on='condition_start_datetime',
    by='person_id',
    allow_exact_matches=True,
    direction='forward',
    tolerance=pd.Timedelta(days=5 * 365),
)

In [None]:
# Inspect the first 500 merged rows (ordered by participant and time) on the key columns.
measu_preview_columns = ['person_id', 'measurement_datetime', 'observation_datetime',
                         'condition_start_datetime', 'standard_concept_name']
measu_sorted_head = person_cond_obsv_measu.sort_values(
    by=['person_id', 'measurement_datetime', 'condition_start_datetime']).head(500)
measu_sorted_head[measu_preview_columns]


In [None]:
# Spot-check the merged rows of a single participant.
is_participant_1000109 = person_cond_obsv_measu['person_id'] == 1000109
participant_check_columns = ['person_id', 'measurement_datetime', 'observation_datetime',
                             'condition_start_datetime', 'standard_concept_name']
person_cond_obsv_measu[is_participant_1000109][participant_check_columns]

In [None]:
# Preview only the measurements that found a matching condition record.
matched_measurements = person_cond_obsv_measu.dropna(subset=['condition_start_datetime'])
matched_sorted = matched_measurements.sort_values(by=['person_id', 'condition_start_datetime'])
matched_preview_columns = ['person_id', 'measurement_datetime', 'observation_datetime',
                           'condition_start_datetime', 'standard_concept_name']
matched_sorted[matched_preview_columns].head(200)

In [None]:
# Keep only measurements that were matched to a condition record.
person_cond_obsv_measu = person_cond_obsv_measu.dropna(
    subset=['condition_start_datetime'],
)

## merge person_condition_observation_measurement data to drug data

In [None]:
# Load the classified drug table written by the drug cleaning step.
drug_df_wide = pd.read_csv('drug_df_wide.csv')

In [None]:
# Parse the exposure start timestamp so it can be used as an as-of merge key.
exposure_start_times = pd.to_datetime(drug_df_wide['drug_exposure_start_datetime'])
drug_df_wide['drug_exposure_start_datetime'] = exposure_start_times

In [None]:
# Order drug exposures chronologically ahead of the as-of merge.
drug_df_wide = drug_df_wide.sort_values(by=['drug_exposure_start_datetime'])

In [None]:
# Order the merged table chronologically by condition start ahead of the as-of merge.
person_cond_obsv_measu = person_cond_obsv_measu.sort_values('condition_start_datetime')

In [None]:
# For each condition record, attach the same participant's most recent drug
# exposure that started at or before the condition start (direction='backward'),
# looking back at most about 10 years.
# - merge_asof requires both frames to be sorted by their time key, so both are
#   sorted here instead of relying on the order left behind by earlier cells.
# - The tolerance is spelled in days: pandas deprecated and then removed year
#   units ("10y") for Timedelta because a year is not a fixed duration.
person_cond_obsv_measu_drug = pd.merge_asof(
    person_cond_obsv_measu.sort_values('condition_start_datetime'),
    drug_df_wide.sort_values('drug_exposure_start_datetime'),
    left_on='condition_start_datetime',
    right_on='drug_exposure_start_datetime',
    by='person_id',
    allow_exact_matches=True,
    direction='backward',
    tolerance=pd.Timedelta(days=10 * 365),
)

In [None]:
# Print the column names, non-null counts and dtypes of the table after the drug merge.
person_cond_obsv_measu_drug.info()

In [None]:
# Inspect the first 200 rows after the drug merge, ordered by participant and time.
# standard_concept_name_x / _y are the suffixed copies of the column that exists
# in both merged frames.
drug_preview_columns = ['person_id',
                        'measurement_datetime', 'observation_datetime', 'drug_class',
                        'condition_start_datetime', 'standard_concept_name_x',
                        'standard_concept_name_y', 'drug_exposure_start_datetime']
drug_merge_sorted = person_cond_obsv_measu_drug.sort_values(
    by=['person_id', 'drug_exposure_start_datetime', 'condition_start_datetime'])
drug_merge_sorted[drug_preview_columns].head(200)

In [None]:
# Spot-check the merged rows of a single participant.
is_participant_1000151 = person_cond_obsv_measu_drug['person_id'] == 1000151
participant_drug_columns = ['person_id', 'measurement_datetime',
                            'observation_datetime', 'drug_class',
                            'condition_start_datetime', 'standard_concept_name_x',
                            'standard_concept_name_y', 'drug_exposure_start_datetime']
person_cond_obsv_measu_drug[is_participant_1000151][participant_drug_columns].head(200)


In [None]:
# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = person_cond_obsv_measu_drug

destination_filename = 'asthma_exaceb_df.csv'


# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system returns the command's exit status,
# so a non-zero value means the upload did not succeed
upload_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if upload_status != 0:
    # Do not report success when nothing reached the bucket.
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (exit status {upload_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')


## merge person_condition_observation_measurement_drug data to survey data

In [None]:
# Load the wide survey table written by the survey cleaning step.
survey_df_wide = pd.read_csv('survey_df_wide.csv')

In [None]:

# Spot-check one participant's rows on the key timestamp, drug and asthma-survey columns.
# NOTE(review): `person_co_dr_mr_svy` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. The cell that merges
# survey_df_wide into person_cond_obsv_measu_drug appears to be missing — restore it.
person_co_dr_mr_svy[person_co_dr_mr_svy['person_id']==1000109][['person_id','measurement_datetime',
                                            'observation_datetime','survey_datetime',
                                            'condition_start_datetime','standard_concept_name_x','drug_class',
                                            'standard_concept_name_y','drug_exposure_start_datetime',
                                                                'Respiratory: Asthma Currently',
                                                               'Respiratory: How Old Were You Asthma',
                                                               'Respiratory: Rx Meds for Asthma']].head(200)

# Saving the combined and cleaned data to the workspace Google Bucket

In [None]:
# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
# NOTE(review): `person_co_dr_mr_svy_obs` is not assigned anywhere in the visible
# notebook; the cell that builds it must be restored before this one can run.
my_dataframe = person_co_dr_mr_svy_obs

destination_filename = 'person_cond_drug_mearsu_svy_obs.csv'


# save dataframe in a csv file in the same workspace as the notebook
my_dataframe.to_csv(destination_filename, index=False)

# get the bucket name
my_bucket = os.getenv('WORKSPACE_BUCKET')

# copy csv file to the bucket; os.system returns the command's exit status,
# so a non-zero value means the upload did not succeed
upload_status = os.system(f"gsutil cp './{destination_filename}' '{my_bucket}/data/'")
if upload_status != 0:
    # Do not report success when nothing reached the bucket.
    raise RuntimeError(f'gsutil cp failed for {destination_filename} (exit status {upload_status})')
print(f'[INFO] {destination_filename} is successfully uploaded in your bucket.')