In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the comma-separated training file, relative to the notebook's
# working directory.
file_path = 'Regression/trainReg.txt'

try:
    df_tracks = pd.read_csv(file_path, delimiter=',')
except FileNotFoundError:
    # Note: `df_tracks` stays undefined in this case, so the cells below will
    # fail until the path is corrected and this cell is re-run.
    print("File not found. Please provide the correct file path.")
else:
    # Preview the frame that was just loaded. The original referenced an
    # undefined name (`df`), which raised NameError on every successful load.
    print(df_tracks.head())


In [None]:
# Preview the first five rows of the loaded frame.
df_tracks.head(5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df_tracks.shape

In [None]:
# Report the dtype of every column, one "name: dtype" line per column.
for col_name, col_dtype in df_tracks.dtypes.items():
    print(f"{col_name}: {col_dtype}")

In [None]:
# Summary statistics produced by DataFrame.describe() with its default arguments.
df_tracks.describe()

In [None]:
# Column labels of the loaded frame.
df_tracks.columns

In [None]:
# Fraction of missing values per column, largest first.
missing_counts = df_tracks.isnull().sum()
row_count = df_tracks.shape[0]
(missing_counts / row_count).sort_values(ascending=False)

In [None]:
column_to_plot = 'V1'

# Count plot of the chosen column: one bar per distinct value, drawn on an
# explicitly created Axes.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(data=df_tracks, x=column_to_plot, ax=ax)
ax.set_title('Bar Graph of Year of Release')
ax.set_xlabel(column_to_plot)
ax.set_ylabel('Count')
# Tilt the x tick labels so neighbouring values stay readable.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Derive the decade of each row from the value in 'V1' (floor to a multiple of 10).
# `assign` returns a new frame, so `df_tracks` itself does not gain a 'Decade'
# column. The previous `df_tracks_decades = df_tracks` was only an alias, which
# leaked the derived column into every later cell that uses `df_tracks`.
df_tracks_decades = df_tracks.assign(Decade=(df_tracks['V1'] // 10) * 10)

# Number of rows per decade, in chronological order
decade_counts = df_tracks_decades['Decade'].value_counts().sort_index()

# Share of all rows that falls into each decade, in percent
total_appearances = decade_counts.sum()
decade_percentages = (decade_counts / total_appearances) * 100

fig, ax1 = plt.subplots(figsize=(8, 6))

# Bars: count per decade. pandas places bar i at x = i (positional), not at the
# decade value itself; the decade values only appear as tick labels.
decade_counts.plot(kind='bar', color='blue', ax=ax1)
ax1.set_ylabel('Count', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Line: percentage per decade on a secondary y-axis. The twin axes share the
# x-axis with the bars, so the line must use the same positional x coordinates;
# plotting against the decade values would stretch the shared axis and push the
# line far away from the bars.
ax2 = ax1.twinx()
bar_positions = range(len(decade_percentages))
ax2.plot(bar_positions, decade_percentages.values, marker='o', color='red', linestyle='-', linewidth=2)
ax2.set_ylabel('Percentage (%)', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Title, x label and tick rotation are set on ax1: the x-axis of the twin axes
# is invisible, so pyplot-level calls made after twinx() had no visible effect
# on the x-axis.
ax1.set_title('Number of Songs and Percentage by Decade')
ax1.set_xlabel('Decade')
ax1.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()


In [None]:
# Grid of pairwise plots over the columns of the frame.
sns.pairplot(data=df_tracks)
plt.show()

In [None]:
# Number of columns in the frame.
df_tracks.shape[1]

In [None]:
# Variable-by-variable heatmap. Both axes are labelled 'Variables', so the plot
# needs a square matrix indexed by column on both sides: the pairwise
# correlation of the numeric columns. Passing the raw frame instead would draw
# (and annotate) one cell per row and column of the data.
correlation_matrix = df_tracks.select_dtypes(include='number').corr()

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Matrix Heatmap')
plt.xlabel('Variables')
plt.ylabel('Variables')
plt.show()

In [None]:
numerical_columns = df_tracks.select_dtypes(include='number').columns.tolist()

# One scatterplot per numeric column, each plotted against the 'V1' column
# ('V1' itself is skipped).
feature_columns = [name for name in numerical_columns if name != 'V1']
for feature in feature_columns:
    fig, ax = plt.subplots()
    sns.scatterplot(data=df_tracks, x=feature, y='V1', ax=ax)
    ax.set_title(f'{feature} vs V1')
    ax.set_xlabel(feature)
    ax.set_ylabel('V1')
    plt.show()

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA on the frame. A float `n_components` between 0 and 1 asks
# scikit-learn to keep as many components as needed to explain that share of
# the variance (here 60%).
df_para_pc = df_tracks
p = PCA(n_components = 0.6)
pc = p.fit_transform(df_para_pc)

# Number of components actually retained by the fit. The original read
# `p.n_features_`, which is the number of input features (not components) and
# no longer exists in current scikit-learn releases.
num_pc = p.n_components_
lista_pc = ["PC" + str(i) for i in range(1, num_pc + 1)]

# Projected data: one row per sample, one column per retained component.
df_f = pd.DataFrame(data = pc, columns = lista_pc)

# Loadings: `components_` has shape (n_components, n_features); transpose it so
# that each row is an input variable and each column a principal component.
l = p.components_
l_df = pd.DataFrame(
    l.T,
    columns=lista_pc,
    index=pd.Index(df_para_pc.columns.values, name='variable'),
)

sns.set(rc={"figure.figsize":(30, 30)})
# Small annotation font so the per-cell values fit inside the cells.
ax = sns.heatmap(l_df, annot=True, cmap='Spectral', fmt='.2f', annot_kws={"size": 8})

# NOTE(review): the heatmap shows the component loadings (eigenvectors), while
# the title reads "Valores propios" (eigenvalues) -- confirm the intended title.
plt.title("Valores propios")
plt.xlabel("Componente Principal")
plt.ylabel("Variable")
plt.show()