# Import libraries

In [29]:
# import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy as db
from dotenv import load_dotenv

In [32]:
# Load environment variables from .env file
load_dotenv()

# Access environment variables
survey_results_csv=os.environ.get('survey_results_csv')
db_elephant=os.environ.get('db_elephant')

# Import file and clean data

In [33]:
# create a dataframe from csv file (Note: get 'raw' file)
df = pd.read_csv(survey_results_csv)

ValueError: Invalid file path or buffer object type: <class 'NoneType'>

In [6]:
# take a look at contents
print(df.shape)

(736, 33)


In [4]:
print(df.info)

<bound method DataFrame.info of                Timestamp   Age Primary streaming service  Hours per day  \
0     8/27/2022 19:29:02  18.0                   Spotify            3.0   
1     8/27/2022 19:57:31  63.0                   Pandora            1.5   
2     8/27/2022 21:28:18  18.0                   Spotify            4.0   
3     8/27/2022 21:40:40  61.0             YouTube Music            2.5   
4     8/27/2022 21:54:47  18.0                   Spotify            4.0   
..                   ...   ...                       ...            ...   
731  10/30/2022 14:37:28  17.0                   Spotify            2.0   
732   11/1/2022 22:26:42  18.0                   Spotify            1.0   
733   11/3/2022 23:24:38  19.0   Other streaming service            6.0   
734   11/4/2022 17:31:47  19.0                   Spotify            5.0   
735    11/9/2022 1:55:20  29.0             YouTube Music            2.0   

    While working Instrumentalist Composer         Fav genre Explor

In [None]:
# make sure all columns are displayed
pd.set_option('display.max_columns', None)

# peek at df
print(df.head())

In [None]:
# check 'Permissions' values. If only 1 unique value, drop the column
print(df['Permissions'].unique())

In [None]:
# check % of nulls in each column
total_nulls = df.isnull().sum()
print(total_nulls)
# percent_missing = total_nulls * 100 / len(df)
# print(percent_missing)

In [None]:
df.head()

In [None]:
# check the rows where 'Music Effects' is null to consider dropping the rows
null_rows = df.loc[df['Music effects'].isnull()]
print(null_rows)

In [None]:
# check for duplicated rows
print(df[df.duplicated()])

In [None]:
# decide to drop rows where 'Music effects' has null values
df.dropna(subset=['Music effects'], inplace=True)

# confirm the rows where 'Music effects' has null values have been dropped
null_rows = df.loc[df['Music effects'].isnull()]
print(null_rows)

In [None]:
# decide to drop 'Timestamp' and 'Permissions' columns
df.drop(columns=['Timestamp', 'Permissions'], inplace=True)
print(df.head())

In [None]:
# reset the index if working on df
df.reset_index(drop=True, inplace=True)

In [None]:
print(df.head())

In [None]:
print(df.dtypes)

In [None]:
# clean column names - change to lowercase, underscores
# extract list of column names
keys = list(df.columns)

# Create a dictionary {old: new}. New has lowercase, underscores, no square brackets
formatted_keys = {key: key.lower().replace(' ', '_').replace('[', '').replace(']', '').replace('&', '_n_') for key in keys}

# Replace old column names with new ones
df = df.rename(columns = formatted_keys)

In [None]:
# add column 'age_group'
age_group = pd.cut(df['age'],
                         bins=[0, 18, 35, 60, 75, 100],
                         labels=['early_years', 'young_adults', 'middle_age', 'mature_adults', 'elderly'], right = False)
df.insert(1, 'age_group', age_group)

In [None]:
print(df.head())

In [None]:
# Change genre names to lower case and reflect Spotify values
df['fav_genre'] = df['fav_genre'].str.lower().str.replace('&', '-n-').str.replace(' ', '-')

In [None]:
df['fav_genre'].unique()

In [None]:
# Create a column with a count of occurrence of 'Very frequently' on each row
df['count'] = df.apply(lambda row: row.str.contains('Very frequently').sum(), axis=1)

In [None]:
# Verify column is created
df.head()

In [None]:
# Get a sense of the frequency where respondents listed more than 1 genre 'Very frequently'
df['count'].plot(bins=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], kind='hist', edgecolor='black')
plt.title('Frequency of "Very frequently"')
plt.xlabel('Count of "Very frequently"')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Decide to create 3 new cols per row and populate with the genres consumed 'Very frequently'
def find_frequencies(row):
    frequencies = [col for col in df.columns if row[col] == 'Very frequently']
    return pd.Series(frequencies + [None]*3)[:3]

# Apply the function to the DataFrame
df[['freq1', 'freq2', 'freq3']] = df.apply(find_frequencies, axis=1)

In [None]:
# Verify cols created
df.head()

In [None]:
print(df['freq1'].unique())

In [None]:
# Change genre values in freq1-3 to reflect Spotify values
for i in range(1,4):
    df[f'freq{i}'] = df[f'freq{i}'].str.replace('frequency_', '').str.replace('_', '-')

In [None]:
print(df['freq3'].unique())

In [None]:
# Check for nulls in freq1
check_nulls = df['freq1'].isnull().sum()
print(check_nulls)

In [None]:
# Replace nulls in freq1 with value from fav_genre
df['freq1'] = df['freq1'].fillna(df['fav_genre'])

In [None]:
# Check for nulls in freq1 after replacement
check_nulls = df['freq1'].isnull().sum()
print(check_nulls)

In [None]:
# Add new column 'respondent' to be Primary Key in DB table
df.insert(0, 'respondent', range(101, 101 + len(df)))

In [None]:
df.head()

In [None]:
list(df.columns)

In [None]:
# Copy freq* cols to another df as backup
df_freq = df[['respondent',
 'frequency_classical',
 'frequency_country',
 'frequency_edm',
 'frequency_folk',
 'frequency_gospel',
 'frequency_hip_hop',
 'frequency_jazz',
 'frequency_k_pop',
 'frequency_latin',
 'frequency_lofi',
 'frequency_metal',
 'frequency_pop',
 'frequency_r_n_b',
 'frequency_rap',
 'frequency_rock',
 'frequency_video_game_music']].copy()

In [None]:
df_freq.shape

In [None]:
df_freq.head()

In [None]:
# Remove un-necessary cols from original df
df = df.drop(['count', 'frequency_classical',
 'frequency_country',
 'frequency_edm',
 'frequency_folk',
 'frequency_gospel',
 'frequency_hip_hop',
 'frequency_jazz',
 'frequency_k_pop',
 'frequency_latin',
 'frequency_lofi',
 'frequency_metal',
 'frequency_pop',
 'frequency_r_n_b',
 'frequency_rap',
 'frequency_rock',
 'frequency_video_game_music'], axis=1)

In [None]:
print(df.shape)

In [None]:
df.head()

In [None]:
list(df.columns)

# Exploratory Data Analysis

In [None]:
# Does music have an effect on mood?
sections = df['music_effects'].value_counts()
labels = sections.index.to_list()
plt.figure(figsize=(5,5))
plt.pie(sections, labels=labels,autopct='%.1f%%',shadow=True, pctdistance=0.8)
plt.title('Music effects', fontsize=16)
plt.legend(title='Music effects',loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Count of disorders

# Create a dictionary to store the counts
disorder_counts = {
    'depression': {},
    'anxiety': {},
    'insomnia': {},
    'ocd': {}
}

# Loop through the disorder columns
for disorder in ['depression', 'anxiety', 'insomnia', 'ocd']:
    # Get the value counts for the current disorder
    value_counts = df[disorder].value_counts()

    # Extract the counts for the desired categories
    for level in range(11):
        disorder_counts[disorder][level] = value_counts.get(level, 0)

# Create a DataFrame from the dictionary
df_disorder_counts = pd.DataFrame(disorder_counts)

# Plot the bar chart for each disorder
df_disorder_counts.plot(kind='bar')
plt.title('Count of Disorder Levels')
plt.xlabel('Disorder Level')
plt.ylabel('Count')
plt.legend(title='Disorder', loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Age anaylsis
labels = df['age'].value_counts().index.to_list()
sections = df['age'].value_counts().to_numpy()
plt.figure(figsize=(10, 7))
plt.bar(labels, sections)
plt.title('Age Distribution', fontsize=16)
plt.xlabel('age', fontsize=12)
plt.ylabel('count', fontsize=12)
plt.show()

In [None]:
# Age group distribution
age_group_counts = df['age_group'].value_counts()
labels = age_group_counts.index.to_list()
plt.figure(figsize=(5,5))
plt.pie(age_group_counts, labels=labels,autopct='%.2f%%',shadow=True, pctdistance=0.6)
plt.title('Distribution of Age Group', fontsize=16)
plt.legend(title='Age Group', loc='upper left',bbox_to_anchor=(1,1))
plt.show()

In [None]:
# Distribution of Favourite Genre
labels = df['fav_genre'].value_counts().index.to_list()
sections = df['fav_genre'].value_counts().to_numpy()
colors = sns.color_palette('husl', len(sections))
plt.figure(figsize=(10,5))
plt.bar(labels, sections, color=colors)
plt.title('Favorite Genre', fontsize=16)
plt.xlabel('Genre', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=90)
plt.show()

# Connect to DB, create table and load data into DB

In [None]:
# Connect to DB
engine = db.create_engine(db_elephant) 
conn = engine.raw_connection()

In [None]:
# Create new table 'survey' in PostgreSQL (Create only 1 column, to_sql seems to create other columns automatically)
commands = ('''CREATE TABLE IF NOT EXISTS survey (
    respondent INT PRIMARY KEY
);''')
            
# Initialize connection to PostgreSQL
cur = conn.cursor()

# Create cursor to execute SQL commands
#for command in commands:
cur.execute(commands)

# Commit changes
conn.commit()

In [None]:
# Copy data to table
df.to_sql(name= 'survey', con = engine, if_exists= 'replace', index= False)

In [None]:
# Create new table 'frequency' in PostgreSQL (Create only 1 column, to_sql seems to create other columns automatically)
commands = ('''CREATE TABLE IF NOT EXISTS frequency (
    respondent INT PRIMARY KEY
);''')
            
# Initialize connection to PostgreSQL
cur = conn.cursor()

# Create cursor to execute SQL commands
#for command in commands:
cur.execute(commands)

# Commit changes
conn.commit()

In [None]:
# Copy data to table
df_freq.to_sql(name= 'frequency', con = engine, if_exists= 'replace', index= False)

In [None]:
# Close communication with server
cur.close()
conn.close()