### Loading and inspecting the data.

In [2]:
#Importing pandas as pd
import pandas as pd

In [3]:
#Reading csv file
filepath = "DataFiles/songs.csv"
df = pd.read_csv(filepath)

In [4]:
#Inspecting data
df.head()

Unnamed: 0,Title,Artist,Album,Genre,Release Date,Duration,Popularity
0,Include name this.,Patrick Anderson,Care.,R&B,2008-01-09,262,71
1,Manage west energy.,Eric Miller,Raise get.,Jazz,2011-08-20,187,37
2,Evening court painting.,Richard Curry,Sport.,Electronic,2010-05-30,212,58
3,Section turn hour.,James Smith,Full.,Hip-Hop,2014-10-12,272,59
4,Five agreement teach.,Amy Rodriguez,Eat.,Blues,2005-06-09,131,34


In [5]:
df.shape

(50000, 7)

In [6]:
df = df.rename(columns={"Duration":"Duration in Minutes"})

In [7]:
df

Unnamed: 0,Title,Artist,Album,Genre,Release Date,Duration in Minutes,Popularity
0,Include name this.,Patrick Anderson,Care.,R&B,2008-01-09,262,71
1,Manage west energy.,Eric Miller,Raise get.,Jazz,2011-08-20,187,37
2,Evening court painting.,Richard Curry,Sport.,Electronic,2010-05-30,212,58
3,Section turn hour.,James Smith,Full.,Hip-Hop,2014-10-12,272,59
4,Five agreement teach.,Amy Rodriguez,Eat.,Blues,2005-06-09,131,34
...,...,...,...,...,...,...,...
49995,Thing.,Brandon Thomas,Lawyer sign.,Rock,2008-06-07,300,95
49996,Option.,Barry White,Cold.,Pop,2016-09-06,220,23
49997,Ability ever.,Leah Obrien,Trade.,Blues,2013-10-16,257,79
49998,Occur claim who.,Ashley James,Explain moment.,Jazz,2018-03-06,262,22


### Cleaning and preparing the data.

In [9]:
df.isnull().sum()

Title                  0
Artist                 0
Album                  0
Genre                  0
Release Date           0
Duration in Minutes    0
Popularity             0
dtype: int64

In [10]:
df[df.duplicated()]

Unnamed: 0,Title,Artist,Album,Genre,Release Date,Duration in Minutes,Popularity


In [11]:
q3 = df["Duration in Minutes"].quantile(0.75)
q1 = df["Duration in Minutes"].quantile(0.25)

iqr = q3-q1

upper_bound=q3+1.5*iqr
lower_bound = q1-1.5*iqr

print(upper_bound)
print(lower_bound)

391.5
27.5


In [12]:
df=df[(df['Duration in Minutes'] >= lower_bound) & (df['Duration in Minutes'] <= upper_bound)]

In [13]:
df.shape

(50000, 7)

In [14]:
q3 = df["Popularity"].quantile(0.75)
q1 = df["Popularity"].quantile(0.25)

iqr = q3-q1

upper_bound=q3+1.5*iqr
lower_bound = q1-1.5*iqr

print(upper_bound)
print(lower_bound)

152.5
-51.5


In [15]:
df=df[(df['Popularity'] >= lower_bound) & (df['Popularity'] <= upper_bound)]

In [16]:
df.shape

(50000, 7)

### Performing data manipulation.

In [18]:
df.head()

Unnamed: 0,Title,Artist,Album,Genre,Release Date,Duration in Minutes,Popularity
0,Include name this.,Patrick Anderson,Care.,R&B,2008-01-09,262,71
1,Manage west energy.,Eric Miller,Raise get.,Jazz,2011-08-20,187,37
2,Evening court painting.,Richard Curry,Sport.,Electronic,2010-05-30,212,58
3,Section turn hour.,James Smith,Full.,Hip-Hop,2014-10-12,272,59
4,Five agreement teach.,Amy Rodriguez,Eat.,Blues,2005-06-09,131,34


In [19]:
df.groupby("Genre")["Album"].count()

Genre
Blues         5101
Classical     4972
Country       5176
Electronic    4968
Hip-Hop       5033
Jazz          4939
Pop           5026
R&B           4973
Reggae        4946
Rock          4866
Name: Album, dtype: int64

In [20]:
df['Release Date'] = pd.to_datetime(df['Release Date'])

In [21]:
# Define the date you want to filter by
date_filter = pd.to_datetime("2010-12-31")

# Filter the DataFrame
First_Decade = df[df['Release Date'] <= date_filter]
First_Decade

Unnamed: 0,Title,Artist,Album,Genre,Release Date,Duration in Minutes,Popularity
0,Include name this.,Patrick Anderson,Care.,R&B,2008-01-09,262,71
2,Evening court painting.,Richard Curry,Sport.,Electronic,2010-05-30,212,58
4,Five agreement teach.,Amy Rodriguez,Eat.,Blues,2005-06-09,131,34
5,Turn child.,Jessica Martin,Cold according.,R&B,2006-09-16,207,58
6,Old.,Cheyenne Powell,Oil.,Country,2010-04-23,163,72
...,...,...,...,...,...,...,...
49986,Fly live.,Katie Rodriguez,Guess sometimes.,Rock,2007-01-17,299,4
49990,Land recent bank.,Christopher Silva,Long.,Pop,2009-08-06,163,2
49993,By population message.,Madeline Dickson,Mean.,Classical,2005-02-13,142,59
49995,Thing.,Brandon Thomas,Lawyer sign.,Rock,2008-06-07,300,95


In [22]:
# Which genre had the highest count between 2000 to 2010(first_decade)

First_Decade.groupby("Genre")["Release Date"].count()

Genre
Blues         2762
Classical     2578
Country       2706
Electronic    2600
Hip-Hop       2660
Jazz          2564
Pop           2632
R&B           2608
Reggae        2519
Rock          2580
Name: Release Date, dtype: int64

In [47]:
#Total count for the first_decade
First_Decade_Count=First_Decade["Genre"].count()
First_Decade_Count

26209

In [24]:
# Define the date you want to filter by
date_filter = pd.to_datetime("2020-12-31")

# Filter the DataFrame
Second_Decade = df[df['Release Date'] <= date_filter]
Second_Decade

Unnamed: 0,Title,Artist,Album,Genre,Release Date,Duration in Minutes,Popularity
0,Include name this.,Patrick Anderson,Care.,R&B,2008-01-09,262,71
1,Manage west energy.,Eric Miller,Raise get.,Jazz,2011-08-20,187,37
2,Evening court painting.,Richard Curry,Sport.,Electronic,2010-05-30,212,58
3,Section turn hour.,James Smith,Full.,Hip-Hop,2014-10-12,272,59
4,Five agreement teach.,Amy Rodriguez,Eat.,Blues,2005-06-09,131,34
...,...,...,...,...,...,...,...
49995,Thing.,Brandon Thomas,Lawyer sign.,Rock,2008-06-07,300,95
49996,Option.,Barry White,Cold.,Pop,2016-09-06,220,23
49997,Ability ever.,Leah Obrien,Trade.,Blues,2013-10-16,257,79
49998,Occur claim who.,Ashley James,Explain moment.,Jazz,2018-03-06,262,22


In [25]:
# Which genre had the highest count between 2011 to 2020(second_decade)

Second_Decade.groupby("Genre")["Release Date"].count()

Genre
Blues         5101
Classical     4972
Country       5176
Electronic    4968
Hip-Hop       5033
Jazz          4939
Pop           5026
R&B           4973
Reggae        4946
Rock          4866
Name: Release Date, dtype: int64

In [45]:
#Total count for the second_decade
Second_Decade_Count=Second_Decade["Genre"].count()
Second_Decade_Count

50000

In [27]:
Percentage_Increase_between_decades= ((Second_Decade_Count-First_Decade_Count)/First_Decade_Count)*100
print(First_Decade_Count)
print(Second_Decade_Count)
print(Percentage_Increase_between_decades)

26209
50000
90.77416154756


#### Summary for the first and second decade 
Production increased in the second decade by 90.7%
Highest Increase in Releases: The genre with the highest increase in the number of releases is Country, increasing from 2,706 in the first decade to 5,176 in the second decade.
Blues: The number of Blues releases rose from 2,762 to 5,101.
Classical: Classical music releases increased from 2,578 to 4,972.
Electronic: Although Electronic music releases grew, it remained consistent in its rank.
Hip-Hop and R&B: Both genres showed substantial increases, with Hip-Hop having 5,033 releases and R&B having 4,973 releases in the second decade.
Rock: Despite an increase in releases, Rock saw the lowest number of releases in the second decade compared to other genres, though it still doubled its count from the first decade.
Pop: Pop music consistently saw a high number of releases, with a notable increase from 2,632 to 5,026.



In [36]:
First_Decade_First_Decade.groupby("Genre")["Popularity"].sum()

Genre
Blues         136816
Classical     130989
Country       134882
Electronic    129253
Hip-Hop       132678
Jazz          129983
Pop           133671
R&B           132582
Reggae        128671
Rock          129652
Name: Popularity, dtype: int64

In [38]:
Second_Decade.groupby("Genre")["Popularity"].sum()

Genre
Blues         252599
Classical     252011
Country       260168
Electronic    247380
Hip-Hop       251484
Jazz          250065
Pop           253299
R&B           250770
Reggae        251568
Rock          243848
Name: Popularity, dtype: int64

#### Summary of the popularity based on Genre for the First and Second Decade
Overall Increase: All genres experienced a significant increase in popularity from the first to the second decade.
Highest Growth: The genre with the highest growth in popularity is Country, increasing from 134,882 in the first decade to 260,168 in the second decade.
Blues: The popularity of Blues rose from 136,816 to 252,599.
Classical: Classical music's popularity increased from 130,989 to 252,011.
Electronic: Although Electronic music's popularity grew, it remained one of the lower-ranking genres in both decades.
Hip-Hop and R&B: Both genres showed substantial increases, maintaining close popularity figures in both decades.
Rock: Despite an increase in popularity, Rock saw the lowest increase among the genres in the second decade, suggesting a slower growth rate compared to other genres.
Pop: Pop music consistently remained popular, with a notable increase from 133,671 to 253,299.

In [69]:
df_duration=df[["Duration in Minutes","Popularity"]]
df_duration

Unnamed: 0,Duration in Minutes,Popularity
0,262,71
1,187,37
2,212,58
3,272,59
4,131,34
...,...,...
49995,300,95
49996,220,23
49997,257,79
49998,262,22


In [71]:
df_duration.head()

Unnamed: 0,Duration in Minutes,Popularity
0,262,71
1,187,37
2,212,58
3,272,59
4,131,34


#### Duration in Minutes versus Popularity
The data presents a range of durations and popularity scores,with the longest duration being 272 minutes and the shortest being 131 minutes. Popularity scores vary from 34 to 71, showing no clear pattern or correlation between the duration of the entries and their popularity.








In [83]:
df.groupby("Genre").agg({"Duration in Minutes":["sum","mean","max","min"]})

Unnamed: 0_level_0,Duration in Minutes,Duration in Minutes,Duration in Minutes,Duration in Minutes
Unnamed: 0_level_1,sum,mean,max,min
Genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Blues,1070646,209.889433,300,120
Classical,1047862,210.752615,300,120
Country,1082050,209.051391,300,120
Electronic,1041113,209.563808,300,120
Hip-Hop,1051160,208.853566,300,120
Jazz,1034456,209.446447,300,120
Pop,1050741,209.061082,300,120
R&B,1045228,210.180575,300,120
Reggae,1036545,209.572382,300,120
Rock,1027344,211.127004,300,120


#### Duration Statistics by Genre
The data reveals that while all genres have songs with durations ranging from 120 to 300 minutes,
there are slight variations in the total and average durations across genres. Country music has the highest total duration, indicating it has more cumulative song length over the period analyzed. Rock has the highest average duration, suggesting that on average, Rock songs are slightly longer compared to other genres. Conversely, Hip-Hop has the lowest average duration, indicating slightly shorter songs on average compared to other genres.