# Requirements

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Análise de dados importados do Kaggle

## Dados gerais

In [2]:
# Load ./kaggle/data.csv into `data` and preview its first two rows.
data = pd.read_csv('./kaggle/data.csv')
data.head(2)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936


In [3]:
# Show the dtype pandas inferred for each column of `data`.
data.dtypes

valence             float64
year                  int64
acousticness        float64
artists              object
danceability        float64
duration_ms           int64
energy              float64
explicit              int64
id                   object
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
name                 object
popularity            int64
release_date         object
speechiness         float64
tempo               float64
dtype: object

### Filtrando os anos a serem usados na mastertable

In [17]:
# List the distinct values of the `year` column before filtering.
data['year'].unique()

array([1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
       1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
       1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,
       1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964,
       1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
       1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
       1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
       1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
       2020])

In [18]:
# Keep only the rows whose `year` is 2000 or later, then list the years that remain.
data = data.loc[data["year"].ge(2000)]
data["year"].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [19]:
# (rows, columns) of `data` at this point in the notebook.
data.shape

(41450, 19)

In [20]:
# Drop the `explicit`, `key` and `mode` columns and preview the result.
# `errors="ignore"` keeps this cell re-runnable: `data` is overwritten, so a
# second execution would otherwise raise KeyError because the columns are
# already gone.
data = data.drop(columns=["explicit", "key", "mode"], errors="ignore")
data.head(2)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,liveness,loudness,name,popularity,release_date,speechiness,tempo
15606,0.285,2000,0.00239,['Coldplay'],0.429,266773,0.661,3AJwUDP919kvQ9QcozQPxg,0.000121,0.234,-7.227,Yellow,84,2000-07-10,0.0281,173.372
15607,0.613,2000,0.143,['OutKast'],0.843,270507,0.806,0I3q5fE6wg7LIfHGngUTnV,0.0,0.0771,-5.946,Ms. Jackson,80,2000-10-31,0.269,94.948


In [21]:
# (rows, columns) of `data` at this point in the notebook.
data.shape

(41450, 16)

In [22]:
# Count missing values per column of `data`.
data.isna().sum()

valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
id                  0
instrumentalness    0
liveness            0
loudness            0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
dtype: int64

## Dados por gêneros musicais

In [4]:
# Load ./kaggle/data_by_genres.csv into `genres_df` and preview its first two rows.
genres_df = pd.read_csv('./kaggle/data_by_genres.csv')
genres_df.head(2)

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5


In [5]:
# Show the dtype pandas inferred for each column of `genres_df`.
genres_df.dtypes

mode                  int64
genres               object
acousticness        float64
danceability        float64
duration_ms         float64
energy              float64
instrumentalness    float64
liveness            float64
loudness            float64
speechiness         float64
tempo               float64
valence             float64
popularity          float64
key                   int64
dtype: object

In [24]:
# Drop the `key` and `mode` columns and preview the result.
# `errors="ignore"` keeps this cell re-runnable: `genres_df` is overwritten, so
# a second execution would otherwise raise KeyError on the missing columns.
genres_df = genres_df.drop(columns=["key", "mode"], errors="ignore")
genres_df.head(2)

Unnamed: 0,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity
0,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333
1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5


In [25]:
# Count missing values per column of `genres_df`.
genres_df.isna().sum()

genres              0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
popularity          0
dtype: int64

In [26]:
# (rows, columns) of `genres_df` at this point in the notebook.
genres_df.shape

(2973, 12)

## Dados por ano

In [7]:
# Load ./kaggle/data_by_year.csv into `year_df` and preview its first two rows.
year_df = pd.read_csv('./kaggle/data_by_year.csv')
year_df.head(2)

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10


In [8]:
# Show the dtype pandas inferred for each column of `year_df`.
year_df.dtypes

mode                  int64
year                  int64
acousticness        float64
danceability        float64
duration_ms         float64
energy              float64
instrumentalness    float64
liveness            float64
loudness            float64
speechiness         float64
tempo               float64
valence             float64
popularity          float64
key                   int64
dtype: object

In [28]:
# List the distinct values of the `year` column before filtering.
year_df["year"].unique()

array([1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
       1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
       1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,
       1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964,
       1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
       1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
       1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
       1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
       2020])

In [29]:
# Keep only the rows whose `year` is 2000 or later, then list the years that remain.
year_df = year_df.loc[year_df["year"].ge(2000)]
year_df["year"].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [30]:
# Remove the unwanted `key` and `mode` columns and preview the result.
# `errors="ignore"` keeps this cell re-runnable: `year_df` is overwritten, so a
# second execution would otherwise raise KeyError on the missing columns.
year_df = year_df.drop(columns=["key", "mode"], errors="ignore")
year_df.head(2)

Unnamed: 0,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity
79,2000,0.289323,0.590918,242724.642638,0.625413,0.101168,0.197686,-8.247766,0.089205,118.999323,0.559475,46.684049
80,2001,0.286842,0.583318,240307.79601,0.626986,0.107214,0.187026,-8.305095,0.089182,117.765399,0.541479,48.750125


In [31]:
# Display `year_df` with a fresh 0-based index; the old labels appear as an
# `index` column. The result is not assigned, so `year_df` itself is unchanged.
# NOTE(review): if the intent was to renumber `year_df`, assign the result
# (and consider drop=True) — confirm.
year_df.reset_index()

Unnamed: 0,index,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity
0,79,2000,0.289323,0.590918,242724.642638,0.625413,0.101168,0.197686,-8.247766,0.089205,118.999323,0.559475,46.684049
1,80,2001,0.286842,0.583318,240307.79601,0.626986,0.107214,0.187026,-8.305095,0.089182,117.765399,0.541479,48.750125
2,81,2002,0.282624,0.57616,239503.283,0.64127,0.088048,0.193911,-7.68664,0.084308,119.239738,0.542397,48.6555
3,82,2003,0.256471,0.575763,244670.57523,0.660165,0.083049,0.196976,-7.485545,0.093926,120.914622,0.530504,48.626407
4,83,2004,0.280559,0.56768,237378.708037,0.648868,0.077934,0.202199,-7.601655,0.094239,121.290346,0.524489,49.273143
5,84,2005,0.255764,0.572281,237229.588205,0.653209,0.090194,0.190082,-7.466159,0.093334,121.617967,0.532531,50.953333
6,85,2006,0.279986,0.56823,234042.914359,0.650326,0.077701,0.188289,-7.265501,0.085847,121.798615,0.520028,51.313846
7,86,2007,0.254081,0.563414,241049.962564,0.668305,0.072957,0.196127,-7.044536,0.084347,124.087516,0.516794,51.075897
8,87,2008,0.249192,0.579193,240107.315601,0.671461,0.063662,0.198431,-6.843804,0.077356,123.509934,0.527542,50.630179
9,88,2009,0.261929,0.56419,238140.013265,0.670749,0.075872,0.205252,-7.046015,0.085458,123.463808,0.50717,51.440816


In [32]:
# Count missing values per column of `year_df`.
year_df.isna().sum()

year                0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
popularity          0
dtype: int64

# Análise gráfica

In [33]:
# Line chart (with point markers) of `loudness` against `year`.
fig = px.line(
    year_df,
    x="year",
    y="loudness",
    markers=True,
    title='Variação do loudness conforme os anos',
)
fig.show()

In [34]:
# Plot each audio feature against `year` as its own trace on one figure.
# A single loop replaces seven copy-pasted add_trace calls, so adding or
# removing a feature is a one-line change. Trace order and legend names are
# the same as before.
fig = go.Figure()

for feature in ('acousticness', 'valence', 'danceability', 'energy',
                'instrumentalness', 'liveness', 'speechiness'):
    fig.add_trace(go.Scatter(x=year_df['year'], y=year_df[feature],
                             name=feature.capitalize()))

# Title and axis labels so the chart can be read on its own, like the titled
# loudness chart.
fig.update_layout(title='Variação das features conforme os anos',
                  xaxis_title='year', yaxis_title='value')

fig.show()

### Feature correlation

In [43]:
# Keep only the numeric columns of `data` for the correlation matrix.
# 'number' matches every numeric dtype regardless of bit width, whereas the
# Python `int`/`float` types stand only for specific NumPy widths.
df_numeric = data.select_dtypes(include='number')

In [44]:
# Heatmap of the pairwise correlations between the numeric columns, with each
# coefficient written in its cell.
fig = px.imshow(
    df_numeric.corr(),
    text_auto=True,
)
fig.show()