In [1]:
import sqlite3
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
def create_db_connection(db_file):
    """
    Create a database connection to the SQLite database
    specified by the db_file
    
    :param db_file: database file
    :return: Connection object or None
    """
    connection = None
    try:
        connection = sqlite3.connect(db_file)
    except Error as e:
        print(e)

    return connection

In [3]:
db_connection = create_db_connection("genres.db") # can be None!

genres_data = pd.read_sql_query("SELECT * from genrepath", db_connection)
genres_data = genres_data.drop('filename', axis=1)

# Encode Labels
genres_list = genres_data.iloc[:,-1]
encoder = LabelEncoder()
y = encoder.fit_transform(genres_list)

# Replace genres from 'genres_data' with encoded labels
genres_data['genre'] = y

# Print first five rows
genres_data.head()

Unnamed: 0,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,genre
0,0.349943,0.130225,1784.420446,2002.650192,3806.485316,0.083066,-113.596748,121.557297,-19.158825,42.351032,...,8.810669,-3.667368,5.751691,-5.162763,0.750948,-1.691938,-0.409953,-2.300209,1.219929,0
1,0.340983,0.095918,1529.835316,2038.617579,3548.820207,0.056044,-207.556793,124.006721,8.93056,35.874687,...,5.376803,-2.23912,4.216963,-6.012273,0.93611,-0.716537,0.293876,-0.287431,0.531573,0
2,0.363603,0.175573,1552.481958,1747.165985,3040.514948,0.076301,-90.754387,140.4599,-29.109968,31.689013,...,5.789265,-8.905224,-1.08372,-9.218359,2.455806,-7.726901,-1.815723,-3.433434,-2.226821,0
3,0.404779,0.141191,1070.119953,1596.333948,2185.028454,0.033309,-199.431152,150.099213,5.647593,26.871927,...,6.087676,-2.476421,-1.07389,-2.874778,0.780977,-3.316932,0.637982,-0.61969,-3.408233,0
4,0.30859,0.091563,1835.494603,1748.362448,3580.945013,0.1015,-160.266037,126.198799,-35.60545,22.153301,...,-2.806384,-6.934123,-7.558618,-9.173553,-4.512165,-5.453538,-0.924161,-4.409333,-11.703781,0


In [4]:
### Step 2: Visualize and explore dataset in order to gain insights

# correlation of each numeric feature with target 'SalePrice' feature
# feature_corr = genres_data.corr()
# print(feature_corr['genre'].sort_values(ascending=False))

# # Scatter Matrix
# scatter_features = [
#     'chroma_stft', 'rmse', 'spectral_centroid', 'spectral_bandwidth', 'rolloff',
#     'zero_crossing_rate', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6',
#     'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 'mfcc11', 'mfcc12', 'mfcc13', 'mfcc14'
#     'mfcc15', 'mfcc16', 'mfcc17', 'mfcc18', 'mfcc19', 'mfcc20'
# ]
# pd.plotting.scatter_matrix(genres_data[scatter_features], figsize=(15,10))
# plt.show()

# print(genres_data[['mfcc14', 'mfcc15']])

# # Pivot table to evaluate relationship of SaleCondition and SalePrice
# condition_pivot = genres_data.pivot_table(index='Neighborhood', values='SalePrice', aggfunc=np.median)
# condition_pivot.plot(kind='bar', color='blue')


In [5]:
### Step 3: Transform and select features

# Separating features and target variable
train_data_features = genres_data.drop('genre', axis =1)

train_data_target = genres_data["genre"].copy()
train_data_target.columns = ['genre']

# convert this to a DataFrame
train_data_target = pd.DataFrame(train_data_target)

print('Shape of features:', train_data_features.shape)
print('Shape of target:', train_data_target.shape)

print(train_data_features.info())

print(train_data_target.info())
train_data_target.head(10)

# print features
train_data_features.head(10)

Shape of features: (1000, 26)
Shape of target: (1000, 1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   chroma_stft         1000 non-null   float64
 1   rmse                1000 non-null   float64
 2   spectral_centroid   1000 non-null   float64
 3   spectral_bandwidth  1000 non-null   float64
 4   rolloff             1000 non-null   float64
 5   zero_crossing_rate  1000 non-null   float64
 6   mfcc1               1000 non-null   float64
 7   mfcc2               1000 non-null   float64
 8   mfcc3               1000 non-null   float64
 9   mfcc4               1000 non-null   float64
 10  mfcc5               1000 non-null   float64
 11  mfcc6               1000 non-null   float64
 12  mfcc7               1000 non-null   float64
 13  mfcc8               1000 non-null   float64
 14  mfcc9               1000 non-null   float64
 15 

Unnamed: 0,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,...,mfcc11,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20
0,0.349943,0.130225,1784.420446,2002.650192,3806.485316,0.083066,-113.596748,121.557297,-19.158825,42.351032,...,-8.324325,8.810669,-3.667368,5.751691,-5.162763,0.750948,-1.691938,-0.409953,-2.300209,1.219929
1,0.340983,0.095918,1529.835316,2038.617579,3548.820207,0.056044,-207.556793,124.006721,8.93056,35.874687,...,-5.560388,5.376803,-2.23912,4.216963,-6.012273,0.93611,-0.716537,0.293876,-0.287431,0.531573
2,0.363603,0.175573,1552.481958,1747.165985,3040.514948,0.076301,-90.754387,140.4599,-29.109968,31.689013,...,-13.123111,5.789265,-8.905224,-1.08372,-9.218359,2.455806,-7.726901,-1.815723,-3.433434,-2.226821
3,0.404779,0.141191,1070.119953,1596.333948,2185.028454,0.033309,-199.431152,150.099213,5.647593,26.871927,...,-3.196314,6.087676,-2.476421,-1.07389,-2.874778,0.780977,-3.316932,0.637982,-0.61969,-3.408233
4,0.30859,0.091563,1835.494603,1748.362448,3580.945013,0.1015,-160.266037,126.198799,-35.60545,22.153301,...,-13.083821,-2.806384,-6.934123,-7.558618,-9.173553,-4.512165,-5.453538,-0.924161,-4.409333,-11.703781
5,0.302346,0.103468,1831.942368,1729.483241,3480.937285,0.09404,-177.869049,118.196907,-17.550674,30.758635,...,-11.776275,-2.420614,-9.339365,-9.939324,-3.909892,-5.570624,-1.839023,-2.77842,-3.046866,-8.115808
6,0.291308,0.141796,1459.078483,1388.913312,2795.616429,0.073028,-190.14946,130.296951,-36.344139,33.01305,...,-7.840328,-3.125678,-6.593118,-9.942267,-6.537224,-10.064754,-10.912171,-6.972478,-3.449033,-6.49551
7,0.307921,0.131785,1451.754147,1577.369917,2955.348796,0.061435,-179.395432,136.459244,-26.656359,39.988026,...,-8.414556,-6.954826,-3.544536,-8.051242,-8.959538,-8.424336,-10.558884,-10.788157,-4.693748,-8.638613
8,0.409037,0.142438,1719.213163,2031.643884,3781.318802,0.064028,-121.361023,122.513107,-14.74213,46.143444,...,-6.72119,7.010944,-12.741831,5.066004,-5.175478,-1.444024,-7.03707,2.697452,3.409809,-2.698353
9,0.274009,0.081352,1817.516386,1973.73907,3944.451148,0.079215,-213.180801,115.152794,-11.716267,39.029469,...,-6.039595,3.784771,0.225668,-5.113637,-0.413064,-1.184537,-1.920715,-2.293727,2.891266,-4.233204


In [6]:
# Transformers in Pipeline steps for numeric and categorical features
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                     ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value ='NA')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# ColumnTransformer to apply transformations to the correct columns in the dataframe
numeric_features = train_data_features.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_data_features.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [7]:
# Fit a simple linear regression

# Train a simple linear regression model
features_train, features_test, target_train, target_test = train_test_split(train_data_features, train_data_target, test_size=0.2, random_state=0)

lr = Pipeline(steps=[('preprocessor', preprocessor),
                    ('regression', LinearRegression())])


model = lr.fit(features_train, target_train)

# Make predictions using the trained model on test set
features_test_pred = model.predict(features_test)

# Evaluate model on train set
features_train_pred = model.predict(features_train)
RSME_train = metrics.mean_squared_error(target_train, features_train_pred)
RMSE_test = metrics.mean_squared_error(target_test, features_test_pred)
print("RMSE on Train set :", RSME_train)
print("RMSE on Test set :", RMSE_test)


RMSE on Train set : 5.955314914346473
RMSE on Test set : 6.129541435909212
