### Clean the CSV and create the database and table in PostgreSQL

In [None]:
!pip install sqlalchemy
!pip install psycopg2

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import sqlalchemy as db 

In [None]:
!pip install sqlalchemy_utils

### Create database in PostgreSQL

In [None]:
from sqlalchemy_utils import create_database, database_exists

# Connection URL for the target database: user "postgres", password "admin",
# server localhost:5434, database "musicdb".
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables before sharing this notebook.
engine = db.create_engine('postgresql://postgres:admin@localhost:5434/musicdb')

try:
    # Creating a database that already exists fails, so check first to keep
    # this cell safe to re-run.
    if not database_exists(engine.url):
        create_database(engine.url)
finally:
    # release resources associated with engine, even if creation failed
    engine.dispose()

### Clean the CSV data

In [None]:
# Load the survey results CSV from its raw GitHub URL into a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/leepiau/music_mental_health/main/mxmh_survey_results.csv')

In [None]:
# Show the (rows, columns) dimensions of the freshly loaded data
print(df.shape)

In [None]:
# Print a summary of the columns, their non-null counts and dtypes
df.info()

In [None]:
# Drop duplicate rows (drop_duplicates compares whole rows, not columns)
df=df.drop_duplicates()

In [None]:
# Remove the 'Timestamp' and 'Permissions' columns from df in place
df.drop(labels=['Timestamp', 'Permissions'], axis='columns', inplace=True)

In [None]:
# Clean column names: strip leading/trailing whitespace, convert to lowercase,
# replace spaces with underscores, drop square brackets and spell '&' as '_n_'
# (e.g. a header like 'Frequency [R&B]' becomes 'frequency_r_n_b').
keys = list(df.columns)

formatted_keys = {
    # strip() first so stray padding in a CSV header cannot turn into
    # leading/trailing underscores in the cleaned name
    key: key.strip().lower().replace(' ', '_').replace('[', '').replace(']', '').replace('&', '_n_')
    for key in keys
}

df = df.rename(columns=formatted_keys)

In [None]:
# Collect, for each row, up to three columns answered 'Very frequently'
def find_frequencies(row):
    """Return the first three column labels whose value is 'Very frequently'.

    The result holds the column labels themselves (not the cell values), in
    the order they appear in the row.

    Args:
        row: One DataFrame row as a Series, as passed by
            ``df.apply(find_frequencies, axis=1)``.

    Returns:
        A Series of exactly three items, padded with None when fewer than
        three columns match.
    """
    # Walk the row's own index instead of the global ``df`` so the function
    # works on any row and does not depend on notebook state.
    frequencies = [col for col in row.index if row[col] == 'Very frequently']
    # Pad before truncating so the result always has length three.
    return pd.Series((frequencies + [None] * 3)[:3])

# Apply the function to every row (axis=1) and store the three results
# as the new columns freq1, freq2 and freq3
df[['freq1', 'freq2', 'freq3']] = df.apply(find_frequencies, axis=1)

# Display the DataFrame with the new columns
print(df)

In [None]:
# Bucket each 'age' value into a labelled group and insert the result as
# the second column ('age_group').
# right=False makes the bins left-closed:
# [0, 18), [18, 35), [35, 60), [60, 75), [75, 100).
age_group = pd.cut(
    x=df['age'],
    bins=[0, 18, 35, 60, 75, 100],
    labels=['early_years', 'young_adults', 'middle_age', 'mature_adults', 'elderly'],
    right=False,
)
df.insert(loc=1, column='age_group', value=age_group)

In [2]:
# Display the current contents of the DataFrame
print(df)

In [None]:
# Show the (rows, columns) dimensions of the DataFrame at this point
print(df.shape)

### Create table in PostgreSQL

In [None]:
# Create connection engine
# (user "postgres", password "admin", server localhost:5434, database "musicdb")
engine = db.create_engine('postgresql://postgres:admin@localhost:5434/musicdb') 

# Obtain a raw DBAPI connection from the engine
conn = engine.raw_connection()

In [None]:
# Create the music_survey table in PostgreSQL (skipped if it already exists)

commands = ('''CREATE TABLE IF NOT EXISTS music_survey (
    id SERIAL PRIMARY KEY,
    age INTEGER,
    age_group VARCHAR,
    primary_streaming_service VARCHAR,
    hours_per_day FLOAT,
    while_working VARCHAR,
    instrumentalist VARCHAR,
    composer VARCHAR,
    fav_genre VARCHAR,
    exploratory VARCHAR,
    foreign_languages VARCHAR,
    bpm INTEGER,
    frequency_classical VARCHAR,
    frequency_country VARCHAR,
    frequency_edm VARCHAR,
    frequency_folk VARCHAR,
    frequency_gospel VARCHAR,
    frequency_hip_hop VARCHAR,
    frequency_jazz VARCHAR,
    frequency_k_pop VARCHAR,
    frequency_latin VARCHAR,
    frequency_lofi VARCHAR,
    frequency_metal VARCHAR,
    frequency_pop VARCHAR,
    frequency_r_n_b VARCHAR,
    frequency_rap VARCHAR,
    frequency_rock VARCHAR,
    frequency_video_game_music VARCHAR,
    anxiety VARCHAR,
    depression VARCHAR,
    insomnia VARCHAR,
    ocd VARCHAR,
    music_effects VARCHAR,
    freq1 VARCHAR,
    freq2 VARCHAR,
    freq3 VARCHAR
);''')

try:
    # Create cursor to execute SQL commands
    cur = conn.cursor()
    try:
        cur.execute(commands)

        # Commit changes
        conn.commit()
    finally:
        # Release the cursor even if the statement failed
        cur.close()
finally:
    # Close communication with server, whether or not the table was created
    conn.close()

In [None]:
# Copy data to table
# NOTE(review): if_exists='replace' makes pandas drop an existing music_survey
# table and recreate it from the DataFrame, so the hand-written CREATE TABLE
# schema in this notebook (id SERIAL PRIMARY KEY, explicit column types) does
# not survive this call -- confirm whether 'append' was intended.
df.to_sql(name= 'music_survey', con = engine, if_exists= 'replace', index= False)

### Read from PostgreSQL

In [None]:
# Open a fresh engine and raw DBAPI connection to the musicdb database
engine = db.create_engine('postgresql://postgres:admin@localhost:5434/musicdb') 
conn = engine.raw_connection() 

In [None]:
# Read the whole table back into a DataFrame.
# Pass the SQLAlchemy engine rather than a raw DBAPI connection: pandas only
# supports SQLAlchemy connectables (or sqlite3 connections) here and warns on
# other DBAPI objects; pandas also opens and closes its own connection when
# given an engine, so nothing is left open.
pd.read_sql('SELECT * FROM music_survey', engine)