In [1]:
from astropy.table import Table
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

In [2]:
def open_fits(filename: str):
    """"
    Open fits file and load it as a pandas dataframe

    :param filename: location of the fits file

    :returns: data, pandas dataframe
    """
    dat = Table.read(filename, format='fits')
    data = dat.to_pandas()
    return data

In [3]:
# Creating file paths
folder_path = "../../Data/Fangyou_data/Cleaned/"
files = os.listdir(folder_path)
file_paths = [folder_path + file for file in files]

In [4]:
data = []
for file in file_paths:
    dat = pd.read_csv(file)
    print(file, " has ", len(dat), " rows.")
    data.append(dat)

../../Data/Fangyou_data/Cleaned/Bootes_clean.csv  has  17729  rows.
../../Data/Fangyou_data/Cleaned/Combined.csv  has  77609  rows.
../../Data/Fangyou_data/Cleaned/Elais-N1_clean.csv  has  30248  rows.
../../Data/Fangyou_data/Cleaned/Lockman_clean.csv  has  29632  rows.


In [5]:
# Finding unique columns across the 3 datasets
all_columns = [list(data[0].columns), list(data[1].columns), list(data[2].columns)]
all_columns = [item for sublist in all_columns for item in sublist]
unique_columns, indexes = np.unique(all_columns, return_index=True)
unique_columns = [all_columns[index] for index in sorted(indexes)]

# Column for filling in all the data about which columns are in which datasets
binary_columns = np.zeros((3,len(unique_columns)))

for i, dat in enumerate(data):
    current_columns = list(dat.columns)
    for j, has_column in enumerate(unique_columns):
        if has_column in current_columns:
            binary_columns[i,j]=1

IndexError: index 3 is out of bounds for axis 0 with size 3

In [None]:
# Dataframe containing which columns are in which dataset
has_column_dataframe = pd.DataFrame(binary_columns.T, index=unique_columns, columns=["Bootes", "Elais-N1", "Lockman Hole"])

In [None]:
has_column_dataframe['Sum'] = has_column_dataframe.sum(axis=1)

In [None]:
with pd.option_context('display.max_rows', 500):
    display(has_column_dataframe)

In [None]:

















column_frequencies = has_column_dataframe['Sum'].value_counts()
column_frequencies = column_frequencies/sum(column_frequencies)
column_frequencies = column_frequencies.sort_index(axis=0)
column_frequencies.plot(kind='bar')

In [None]:
column_frequencies

In [None]:
print("Bootes NaN's per column:\n")

Bootes_nan_columns = pd.DataFrame()
classes = ['jet-mode radio AGN/low-excitation radio galaxy', 'quasar-like radio AGN / high-excitation radio galaxy', 
           'radio-quiet AGN', 'star-forming galaxy']
for classification in classes:
    dat = data[0]
    Bootes_nan_columns[classification] = dat[dat["Classification"] == classification].isna().sum()
    # Normalising
    Bootes_nan_columns[classification] = Bootes_nan_columns[classification]/len(dat[dat["Classification"] == classification])

Bootes_nan_columns = Bootes_nan_columns.sort_values(by=["star-forming galaxy"], ascending=False)
with pd.option_context('display.max_rows', 500):
    display(Bootes_nan_columns)

In [None]:
Bootes_nan_columns.hist(figsize=(20,20))
plt.tight_layout()
plt.show()

In [None]:
print(data[0].isna().sum().to_string())

In [None]:
plt.hist(data[0].isna().sum(axis=1).values, bins=20)
plt.title("NaN's per row")
plt.show()

In [None]:
print("EN1 NaN's per column:\n")

print(data[1].isna().sum().to_string())

In [None]:
plt.hist(data[1].isna().sum(axis=1).values, bins=20)
plt.title("NaN's per row")
plt.show()

In [None]:
print("Lockman Hole NaN's per column:\n")

print(data[2].isna().sum().to_string())

In [None]:
plt.hist(data[2].isna().sum(axis=1).values, bins=20)
plt.title("NaN's per row")
plt.show()

In [None]:
sns.pairplot(data[0].iloc[:, 15:25])
plt.show()
#fig = swarm_plot.get_figure()
#fig.savefig("./Output/Bootes_pairplot.pdf")

In [None]:
data[0].hist(figsize = (20,20), bins=50)
plt.tight_layout()
plt.savefig("./Output/histograms_Bootes.pdf")

In [None]:
indices = [3, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73]
corr = data[0].iloc[:,indices].corr()

In [None]:
fig = plt.figure(figsize=(20,20))
frame = fig.add_subplot(1,1,1)

plot = frame.matshow(corr)
fig.colorbar(plot)

plt.xticks(range(corr.shape[1]), corr.columns, rotation=45)
plt.yticks(range(corr.shape[1]), corr.columns)

#frame.set_xticklabels(ax.get_xticks(), rotation = 45)
plt.show()

In [None]:
corr.columns

In [None]:
data[0].columns