In [1]:
import re
import time
import glob
import pywt

import numpy as np
import pandas as pd

import seaborn as sns
sns.set_style('darkgrid')
color = sns.color_palette()

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

%matplotlib inline

### Concatenate all the files

In [2]:
# get path to all files
path = 'data/spectra/simulated_data/'


def spectrum_file_index(filepath):
    """
    Return the first run of digits in the file-name part of ``filepath``.

    Only the last path component is searched (the pattern refuses to cross a
    '/' or '\\'), so digits in a directory name cannot change the sort order.

    Raises
    ------
    ValueError
        If the file name contains no digits.
    """
    match = re.search(r'(\d+)[^/\\]*$', filepath)
    if match is None:
        raise ValueError('no numeric index in file name: {!r}'.format(filepath))
    return int(match.group(1))


# Sort numerically so that file 10 comes after file 2, not after file 1.
files = sorted(glob.glob(path + 'model_parameters_data*.txt'), key=spectrum_file_index)
# write_data = open('model_parameters_data_all.txt', 'w')
# write_params  = open('model_parameters_all.txt', 'w')

# # start timer
# start_timer = time.time()
# # read in all files
# for file in files:
#     with open(file, 'r') as f:
#         # skip the first 10 lines in each file
#         for i in range(10):
#             f.readline()
#             # read in the data from each file
#         data = f.readlines()
#         # get the length of the data in each file
#         length = len(data)
#         # write the data to the output file in the same order as the files
#         for i in range(length):
#             write_data.write(data[i])
            
# # close the files
# write_data.close()
# # # check how much time it took
# print(time.time() - start_timer)      

In [4]:
# start_timer = time.time()
# # get the params data
# df_param = pd.DataFrame()
# for file in files:
#     # get the parameters used
#     df_param = df_param.append(pd.read_csv(file, skiprows=1, nrows=6, header=None, sep=' ', names=['A', 'B', 'C', 'D']))
#     # save the dataframe to a csv file
#     df_param.to_csv('model_parameters_data.csv', index=False)
    
# print(time.time() - start_timer)

#### Load the files 

In [5]:
# Read the concatenated spectra table (space separated, no header row) and
# print how long the read took, in seconds.
start_time = time.time()
df_data = pd.read_csv(
    'data/params/model_parameters_data_all.txt',
    sep=' ',
    header=None,
)
print(time.time() - start_time)
df_data.head()

724.204806804657


Unnamed: 0,0,1
0,238.858,5.856426e-15
1,238.85801,5.856426e-15
2,238.85802,5.856426e-15
3,238.85803,5.856426e-15
4,238.85804,5.856426e-15


In [6]:
# (rows, columns) of the concatenated spectra table.
df_data.shape

(178500000, 2)

In [None]:
df_param = pd.read_csv('data/params/model_parameters_data.csv')
df_param.head()

In [None]:
# drop the second and fourth column
df_param.drop(['B', 'D'], axis=1, inplace=True)

In [None]:
# transform the data to the right dataframe
df_param = df_param.assign(g = df_param.groupby('A').cumcount()).pivot(index='g', columns='A', values='C')

In [None]:
# Preview the first ten rows of the pivoted parameter table.
df_param.head(10)

In [None]:
# Column labels after the pivot (the distinct values of the former 'A' column).
df_param.columns

In [None]:
# Name the two data columns; assigning to .columns relabels df_data in place.
columns = ['Frequency', 'Intensity']
df_data.columns = columns
df_data.shape

In [None]:
# Preview the first rows under the new column names.
df_data.head()

In [None]:
# Split each column into one row per spectrum.
# Assumes the table stores the spectra back to back with the same number of
# samples each, one spectrum per entry of `files` - confirm against the
# concatenation step.
# reshape does this in a single step instead of building one intermediate
# pandas object per spectrum, and raises a ValueError when the row count is
# not a multiple of len(files) rather than producing a ragged array.
# copy=True keeps both arrays independent of df_data, as before.
freq_sig = df_data['Frequency'].to_numpy(copy=True).reshape(len(files), -1)
signal = df_data['Intensity'].to_numpy(copy=True).reshape(len(files), -1)

# plot the first five spectra in one plot
plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
for spectrum_idx, line_color in enumerate(['r', 'g', 'b', 'y', 'k']):
    plt.plot(
        freq_sig[spectrum_idx],
        signal[spectrum_idx],
        color=line_color,
        label='Spectrum {}'.format(spectrum_idx + 1),
    )
plt.xlabel("Frequency (GHz)")
plt.ylabel("Intensity (K)")
plt.legend()
plt.show()

In [None]:
# plot the first five spectra side by side, one panel per spectrum
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(15, 6))
sns.set_style("whitegrid")
axis_label_font = {'fontsize': 14}
for panel, ax in enumerate(axes.ravel()):
    spectrum_number = panel + 1
    ax.plot(freq_sig[panel], signal[panel], label='Spectrum {}'.format(spectrum_number))
    ax.set_xlabel("Frequency (GHz)", fontdict=axis_label_font)
    ax.set_ylabel("Intensity (K)", fontdict=axis_label_font)
    ax.set_title("Spectrum {}".format(spectrum_number))
    
# plt.savefig('../spectra/simulated_data/spectrum_plots.png')

In [None]:
# print(pywt.wavelist())

### Feature Extraction - Wavelet Decomposition

In [None]:
# Decomposition settings; they are also the default arguments of the
# wavelet functions defined below.
level = 6
wname = 'db35'

# Number of samples in one spectrum (total rows divided by number of spectra files).
samples_per_spectrum = df_data.shape[0] // len(files)

# In periodization mode ('per', the default mode of decompose_signal_dwt)
# every DWT level halves the length and rounds up, so the coarsest level
# holds ceil(samples_per_spectrum / 2**level) coefficients. This equals the
# previously hard-coded 558 whenever that value matched the data.
n_coarse_coeffs = -(-samples_per_spectrum // 2 ** level)

# create the output buffers, one row per spectrum
feature_vector = np.zeros((len(signal), samples_per_spectrum))
detail_coeffs = np.zeros((len(signal), n_coarse_coeffs))
approx_coeffs = np.zeros((len(signal), n_coarse_coeffs))
def decompose_signal_dwt(_signal, wavelet=wname, mode='per', level=level):
    """
    Run a multilevel discrete wavelet decomposition on every spectrum.

    For each entry of ``_signal`` the result of ``pywt.wavedec`` is written,
    row by row, into three module-level arrays:

    * ``approx_coeffs``  - approximation coefficients of the coarsest level,
    * ``detail_coeffs``  - detail coefficients of the coarsest level,
    * ``feature_vector`` - the flattened coefficient array from
      ``pywt.coeffs_to_array``, truncated to the width of ``feature_vector``.

    Parameters
    ----------
    _signal : sequence of 1-D arrays
        One spectrum per entry.
    wavelet : str
        Wavelet name passed to ``pywt.wavedec``.
    mode : str
        Signal extension mode passed to ``pywt.wavedec``.
    level : int
        Number of decomposition levels.

    Returns
    -------
    tuple
        ``(coeff_arr, coeff_slices)`` from ``pywt.coeffs_to_array`` for the
        LAST spectrum only.

    Raises
    ------
    ValueError
        If ``_signal`` is empty.
    """
    if len(_signal) == 0:
        raise ValueError("decompose_signal_dwt needs at least one spectrum")

    # Width of the feature buffer; read from the buffer itself so the function
    # does not depend on df_data or files.
    n_features = feature_vector.shape[1]

    # loop through all the spectra
    for spectra_index in range(len(_signal)):
        # coeffs == [cA_n, cD_n, cD_(n-1), ..., cD_1] with n == level
        coeffs = pywt.wavedec(_signal[spectra_index], wavelet=wavelet, mode=mode, level=level)
        coeff_arr, coeff_slices = pywt.coeffs_to_array(coeffs)

        approx_coeffs[spectra_index, :] = coeffs[0]  # coarsest-level approximation coeffs
        detail_coeffs[spectra_index, :] = coeffs[1]  # coarsest-level detail coeffs
        feature_vector[spectra_index, :] = coeff_arr[:n_features]
        # TODO: storing the feature vector as a df_data column was tried here
        # and disabled because it needed optimizing.

    return coeff_arr, coeff_slices

In [None]:
# Shape of the feature buffer; it holds only zeros until decompose_signal_dwt is called.
feature_vector.shape

In [None]:
# Decompose every spectrum and print the elapsed wall-clock time in seconds.
start_timer = time.time()
# The returned pair describes the last spectrum only; the per-spectrum
# results are in detail_coeffs, approx_coeffs and feature_vector.
coeff_arr, coeff_slices = decompose_signal_dwt(signal)
print(time.time() - start_timer)
# NOTE(review): df_data is not modified by this cell; displaying it here looks
# like a leftover from the disabled column assignment - confirm and remove.
df_data

In [None]:
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly to appear next to the array.
print(detail_coeffs.shape)
detail_coeffs

## Wavelet Decomposition Plots

In [None]:
def decomposed_dwt_detail_coeffs_plots(_signal, wavelet=wname, level=level, mode='per'):
    """
    Plot the detail coefficients of every decomposition level, one figure per
    spectrum and one panel per level.

    Each spectrum is decomposed inside the loop, so every figure shows that
    spectrum's own coefficients rather than those of whichever spectrum was
    decomposed last.

    Parameters
    ----------
    _signal : sequence of 1-D arrays
        One spectrum per entry.
    wavelet : str
        Wavelet name passed to ``pywt.wavedec``.
    level : int
        Number of decomposition levels, which is also the number of panels.
    mode : str
        Signal extension mode passed to ``pywt.wavedec``.
    """
    sns.set_style("whitegrid")
    for spectra_index in range(len(_signal)):
        # coeffs == [cA_n, cD_n, cD_(n-1), ..., cD_1] with n == level
        coeffs = pywt.wavedec(_signal[spectra_index], wavelet=wavelet, mode=mode, level=level)
        # squeeze=False keeps `axes` a 2-D array even when level == 1
        fig, axes = plt.subplots(nrows=1, ncols=level, dpi=400, sharey='none', sharex='all',
                                 figsize=(18, 5), squeeze=False)
        for position, ax in enumerate(axes.ravel(), start=1):
            # coeffs[1] is the coarsest level, coeffs[level] is level 1
            dwt_level = level - position + 1
            ax.plot(coeffs[position], label='Level {}'.format(dwt_level))
            # the x axis is the position within the coefficient array, not a frequency
            ax.set_xlabel("Coefficient index", fontdict={'fontsize': 14})
            ax.set_ylabel("Coefficient value", fontdict={'fontsize': 14})
            ax.set_title("Spectrum {} detail coeffiecients at level {} for {} ".format(spectra_index+1, dwt_level, wavelet))

        plt.show()
        # release the figure; this loop creates one per spectrum
        plt.close(fig)

In [None]:
# decomposed_dwt_detail_coeffs_plots(signal) 

In [None]:
def decomposed_dwt_approx_coeffs_plots(_signal, wavelet=wname, level=level, mode='per'):
    """
    Plot the approximation coefficients at levels 1..``level``, one figure
    per spectrum and one panel per level.

    The level-i approximation is obtained by applying the single-level DWT
    i times and keeping the approximation half at each step.

    Parameters
    ----------
    _signal : sequence of 1-D arrays
        One spectrum per entry.
    wavelet : str
        Wavelet name passed to ``pywt.dwt``.
    level : int
        Deepest level to plot, which is also the number of panels.
    mode : str
        Signal extension mode passed to ``pywt.dwt``.
    """
    sns.set_style("whitegrid")
    for spectra_index in range(len(_signal)):
        # squeeze=False keeps `axes` a 2-D array even when level == 1
        fig, axes = plt.subplots(nrows=1, ncols=level, figsize=(10, 6), squeeze=False)
        approx = _signal[spectra_index]
        for i, ax in enumerate(axes.ravel(), start=1):
            # one more DWT step: `approx` becomes the level-i approximation
            approx, _ = pywt.dwt(approx, wavelet=wavelet, mode=mode)
            ax.plot(approx)
            ax.set_title("Spectrum {} approximation coeffiecients at level {} for {} ".format(spectra_index+1, i,  wavelet))

        plt.show()
        # release the figure; this loop creates one per spectrum
        plt.close(fig)

In [None]:
# decomposed_dwt_approx_coeffs_plots(signal)

### Get Features and Labels

In [None]:
# fv = np.zeros((len(signal), int(df.shape[0]/len(files))))
# for spectra_index in range(len(signal)):
#     # get the level 3 detail coefficients
#     detail_coeffs = coeff_arr[coeff_slices[3]['d']]

# fv  = detail_coeffs
# labels =dff

In [None]:
# len(signal)

# (number of spectra, coefficients per spectrum) of the approximation buffer.
approx_coeffs.shape

#### Take a quick look at one signal and its detail and approximation coefficients

In [None]:

# for spectra_index in range(len(signal)):                     
#         # compute the maximum useful level of decomposition for each wavelet                        
#         # max_level = pywt.dwt_max_level(len(_signal[spectra_index]), wavelet)
#         fig, axes = plt.subplots(nrows=1, ncols=10, sharey='none', sharex='all', figsize=(20, 7))
#         sns.set_style("whitegrid")
#         for i, ax in enumerate(axes.ravel(), start=1):
#             ax.plot(signal[spectra_index], label='Spectrum {}'.format(spectra_index))
#             # ax.set_xlabel("Frequency (GHz)", fontdict={'fontsize': 14})
#             # ax.set_ylabel("Intensity (K)", fontdict={'fontsize': 14})
#             # ax.set_title("Spectrum {} detail coeffiecients at level {} for {} ".format(spectra_index+1, i, wavelet)) 

#     # ax[0,1].plot(signal[spectra_index], label='original')
#     # ax[1,1].plot(approx_coeffs[spectra_index], label='spectrum {} approx coeff'.format(spectra_index+1))
#     # ax[2,1].plot(detail_coeffs[spectra_index], label='spectrum {} detail coeff'.format(spectra_index+1))
# plt.legend()
# plt.show()

In [None]:
# Intensity trace of the spectrum at index 243, saved as a PNG.
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(signal[243], label='original - 243')
ax.legend()
fig.savefig("data/5K_gen_files/original.png")
plt.show()

In [None]:
# Coarsest-level approximation coefficients of the spectrum at index 243, saved as a PNG.
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(approx_coeffs[243], label='approx coeff - 243')
ax.legend()
fig.savefig("data/5K_gen_files/approx_level6.png")
plt.show()

In [None]:
# Coarsest-level detail coefficients of the spectrum at index 243, saved as a PNG.
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(detail_coeffs[243], label='detail - 243')
ax.legend()
fig.savefig("data/5K_gen_files/detail_level6.png")
plt.show()

In [None]:
# Second name for the detail-coefficient array filled by decompose_signal_dwt.
# NOTE(review): the X/y cell further down is built from approx_coeffs, not
# from `features` - confirm which coefficients are meant to be the inputs.
features = detail_coeffs
# NOTE(review): this binds a second name to the SAME DataFrame object, not a
# copy; in-place changes made through `labels_` are also visible through
# `df_param`.
labels_ = df_param

print('feature_shape: ', features.shape, 'labels_shape: ', labels_.shape)


In [None]:
# Remove the 'tcmb' column from the label table, modifying it in place.
# Re-running this cell raises a KeyError because the column is already gone.
labels_.drop('tcmb', axis=1, inplace=True)

#### Save the true parameter values to a file

In [None]:
# Preview the first rows of the label table.
labels_.head()

In [None]:
# Rebind df_param to its final 1500 rows and write them to disk.
# Only the name df_param changes here; labels_ still refers to the full table.
n_true_rows = 1500
df_param = df_param.iloc[-n_true_rows:]
df_param.to_csv("data/5K_gen_files/true_param_vals.csv")
df_param.head()


In [None]:
# Display the label table.
labels_

In [None]:
# Wrap the approximation-coefficient array in a DataFrame: one row per
# spectrum, one integer-labelled column per coefficient.
df = pd.DataFrame(approx_coeffs)
# df = pd.concat([approx_coeffs, labels_], axis=1)

In [None]:
# Display the coefficient table.
df

In [None]:
# convert the ntot column to log values (natural logarithm)
# NOTE(review): this overwrites the column in place, so running the cell a
# second time takes the logarithm of the logarithm.
labels_['ntot'] = np.log(labels_['ntot'])

In [None]:
# Display the label table after the log transform of 'ntot'.
labels_

In [None]:
# Build the combined table from approx_coeffs directly rather than from the
# previous value of `df`, so re-running this cell cannot append the label
# columns a second time. Rows are matched on the index by pd.concat.
df = pd.concat([pd.DataFrame(approx_coeffs), labels_], axis=1)

In [None]:
# Display the combined table: coefficient columns followed by the label columns.
df

In [None]:
# The leading columns of df are the approximation coefficients, so take their
# count from the array itself instead of repeating its width as a literal.
n_coeff_columns = approx_coeffs.shape[1]
# The trailing five columns are the targets - presumably every labels_ column
# left after 'tcmb' was dropped; confirm.
n_label_columns = 5

X = df.iloc[:, :n_coeff_columns]
y = df.iloc[:, -n_label_columns:]


In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target matrix.
y

## Modelling

#### Split the data into training and test sets

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV