In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../our_analyses/dataset_prepared.csv")
# remove useless variables
column2drop = ['genre','name','artists','album_name']
df.drop(column2drop, axis=1, inplace=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   duration_ms       15000 non-null  int64  
 1   explicit          15000 non-null  bool   
 2   popularity        15000 non-null  int64  
 3   danceability      15000 non-null  float64
 4   energy            15000 non-null  float64
 5   key               15000 non-null  int64  
 6   loudness          15000 non-null  float64
 7   mode              15000 non-null  int64  
 8   speechiness       15000 non-null  float64
 9   acousticness      15000 non-null  float64
 10  instrumentalness  15000 non-null  float64
 11  liveness          15000 non-null  float64
 12  valence           15000 non-null  float64
 13  tempo             15000 non-null  float64
 14  time_signature    15000 non-null  float64
 15  n_beats           15000 non-null  float64
dtypes: bool(1), float64(11), int64(4)
memory

In [4]:
df['popularity'].info     

<bound method Series.info of 0        50
1        52
2        22
3        20
4        22
         ..
14995    15
14996    18
14997    54
14998    56
14999    43
Name: popularity, Length: 15000, dtype: int64>

# Pattern mining preprocessing

In [5]:
df["PopularityBin"] = pd.qcut(df["popularity"], 3) # divido in 3 non pop , medio e alta pop
df["duration_msBin"] = pd.qcut(df["duration_ms"], 2)

df.drop(["popularity", "duration_ms"], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,n_beats,PopularityBin,duration_msBin
0,False,0.401,0.683,8,-5.722,1,0.0401,0.181,0.0,0.0837,0.226,135.951,4.0,654.0,"(35.0, 94.0]","(227826.0, 4120258.0]"
1,False,0.672,0.858,3,-5.233,1,0.145,0.456,0.811,0.104,0.963,166.689,4.0,537.0,"(35.0, 94.0]","(8585.999, 227826.0]"
2,False,0.636,0.826,3,-7.486,1,0.0585,0.461,0.271,0.141,0.731,102.809,4.0,304.0,"(18.0, 35.0]","(8585.999, 227826.0]"
3,False,0.733,0.862,7,-5.813,1,0.0604,0.287,0.000532,0.466,0.745,107.981,4.0,427.0,"(18.0, 35.0]","(227826.0, 4120258.0]"
4,False,0.712,0.225,5,-10.017,1,0.0533,0.93,0.001,0.123,0.429,87.52,4.0,317.0,"(18.0, 35.0]","(8585.999, 227826.0]"


In [7]:
X = df.values.tolist()
X[0]

[False,
 0.401,
 0.683,
 8,
 -5.722,
 1,
 0.0401,
 0.181,
 0.0,
 0.0837,
 0.226,
 135.951,
 4.0,
 654.0,
 Interval(35.0, 94.0, closed='right'),
 Interval(227826.0, 4120258.0, closed='right')]

# Pattern Mining

## Apriori

In [8]:
from fim import apriori

ModuleNotFoundError: No module named 'fim'

### Frequent Itemset

In [None]:
# supp = 2  # 2%
supp = 20  # 20%
zmin = 2  # minimum number of items per item set

itemsets = apriori(X, target="s", supp=supp, zmin=zmin, report="S")
pd.DataFrame(itemsets, columns=["frequent_itemset", "support"])


                antecedents              consequents  antecedent support  \
0            (danceability)                 (energy)            0.660800   
1                  (energy)           (danceability)            0.727733   
2                 (valence)           (danceability)            0.411467   
3            (danceability)                (valence)            0.660800   
4                 (valence)                 (energy)            0.411467   
5                  (energy)                (valence)            0.727733   
6   (valence, danceability)                 (energy)            0.361600   
7         (valence, energy)           (danceability)            0.356200   
8    (danceability, energy)                (valence)            0.521067   
9                 (valence)   (danceability, energy)            0.411467   
10           (danceability)        (valence, energy)            0.660800   
11                 (energy)  (valence, danceability)            0.727733   

    consequ

  subset_data = subset_data.applymap(lambda x: 1 if x >= threshold else 0)


### Closed Itemsets

In [None]:
# supp = 2  # 2%
supp = 20  # 20%
zmin = 2  # minimum number of items per item set

itemsets = apriori(X, target="c", supp=supp, zmin=zmin, report="S")
pd.DataFrame(itemsets, columns=["closed_itemset", "support"])

### Maximal Itemsets

In [None]:
itemsets = apriori(X, target="m", supp=supp, zmin=zmin, report="S")
pd.DataFrame(itemsets, columns=["maximal_itemset", "support"])

### PLOTS

In [None]:
len_max_it = []
len_cl_it = []
max_supp = 25
for i in range(2, max_supp):
    max_itemsets = apriori(X, target="m", supp=i, zmin=zmin)
    cl_itemsets = apriori(X, target="c", supp=i, zmin=zmin)
    len_max_it.append(len(max_itemsets))
    len_cl_it.append(len(cl_itemsets))

plt.plot(np.arange(2, max_supp), len_max_it, label="maximal")
plt.plot(np.arange(2, max_supp), len_cl_it, label="closed")
plt.legend()
plt.xlabel("%support")
plt.ylabel("itemsets")

plt.show()

#### Support vs number of itemset for explicit / not explicit

In [None]:
filter_1 = []
filter_2 = []
for i in range(2, max_supp):
    max_itemsets = apriori(X, target="m", supp=i, zmin=zmin)
    filter_1.append(len([item for item in max_itemsets if "explicit" in item[0]]))
    filter_2.append(len([item for item in max_itemsets if "Not explicit" in item[0]]))

plt.plot(np.arange(2, max_supp), filter_1, label="w/ explicit")
plt.plot(np.arange(2, max_supp), filter_2, label="w/ Not explicit")
plt.legend()
plt.xticks()
plt.xlabel("%support")
plt.ylabel("itemsets")

plt.show()

In [None]:
conf = 60
rules = apriori(X, target="r", supp=supp, zmin=zmin, conf=conf, report="aScl")
rules_df = pd.DataFrame(
    rules,
    columns=[
        "consequent",
        "antecedent",
        "abs_support",
        "%_support",
        "confidence",
        "lift",
    ],
)
rules_df.sort_values(by="lift", axis=0, ascending=False)

In [None]:
rules_df[rules_df["consequent"] == "explicit"]

In [None]:
print("to_predict:", [r for r in rules if r[0] == "explicit"][0][0])
print("how?", [r for r in rules if r[0] == "explicit"][0][1])