### Importation des packages

In [338]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import mlxtend as mlx
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth
import matplotlib.pyplot as plt
from prefixspan import PrefixSpan

### Importation des données

In [351]:
# ==================== DATA MANIPULATION ==================== #
# Load the cleaned training set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable data directory.
titanic = pd.read_csv(
    filepath_or_buffer="/Users/lemauffjulien/Documents/Titanic_project-master/titanic_data/clean_data/Clean_train.csv"
)


In [352]:
# //-- Grouped Age using Age_median \\-- #

# Given the shape of the distribution we can separate the Age by group such as
# Age_group = 0_16 if the age is between 0 and 16 included
# Age_group = 17_30 if the age is between 17 and 30 included
# Age_group = 31_40 if the age is between 31 and 40 included
# Age_group = over_40 if the age is strictly higher than 40

def Age_categorical(x):
    """Map a numeric age to its age-group label.

    Args:
        x: Age value compared against the inclusive upper bounds 16, 30, 40.

    Returns:
        "0_16", "17_30" or "31_40" for the first bound that ``x`` does not
        exceed, otherwise "over_40".
    """
    # Inclusive upper bounds, checked in ascending order.
    upper_bounds = ((16, "0_16"), (30, "17_30"), (40, "31_40"))
    for limit, label in upper_bounds:
        if x <= limit:
            return label
    # Anything that matched no bound lands here, including NaN (every
    # comparison with NaN is False).
    return "over_40"

# Derive the categorical age group from the Age_replace column, element by element.
titanic["Age_group"] = titanic["Age_replace"].apply(Age_categorical)

### Sélection et transformation des variables

In [353]:
# Keep only the variables used for pattern mining.
# The explicit .copy() makes titanic1 an independent frame: the columns are
# overwritten below, and assigning into a bare selection of `titanic` triggers
# pandas' SettingWithCopyWarning (ambiguous view-vs-copy semantics).
titanic1 = titanic[["Survived", "Pclass", "Title", "Famsize", "Age_group"]].copy()

# //-- Survived \\-- #

def Survived_process(x):
    """Translate the survival flag into a readable label.

    Args:
        x: Survival indicator; a value equal to 1.0 means the passenger survived.

    Returns:
        "Survived" when ``x == 1.0``, "Died" for every other value.
    """
    return "Survived" if x == 1.0 else "Died"

# Replace the numeric Survived flag with its text label.
# Plain column assignment swaps in a new (object-dtype) column. The previous
# `.loc[:, "Survived"] = ...` form writes into the existing numeric column in
# place, which recent pandas versions flag as an incompatible-dtype assignment
# (deprecated, slated to become an error).
titanic1["Survived"] = titanic1["Survived"].map(Survived_process)

# //-- Pclass \\-- #

def Pclass_process(x):
    """Translate the numeric passenger class into a readable label.

    Args:
        x: Passenger class code, compared by equality against 1, 2 and 3.

    Returns:
        'First_class', 'Second_class' or 'Third_class' for codes 1, 2 and 3
        respectively; "ERROR" for any other value.
    """
    class_labels = (
        (1, 'First_class'),
        (2, 'Second_class'),
        (3, 'Third_class'),
    )
    for code, label in class_labels:
        if x == code:
            return label
    # Unknown class code: return the sentinel instead of raising.
    return "ERROR"

# Replace the numeric Pclass code with its text label.
# Plain column assignment swaps in a new (object-dtype) column. The previous
# `.loc[:, "Pclass"] = ...` form writes into the existing numeric column in
# place, which recent pandas versions flag as an incompatible-dtype assignment
# (deprecated, slated to become an error).
titanic1["Pclass"] = titanic1["Pclass"].map(Pclass_process)

# Preview the recoded frame.
titanic1.head()

Unnamed: 0,Survived,Pclass,Title,Famsize,Age_group
0,Died,Third_class,Mr,small_family,17_30
1,Survived,First_class,Mrs,small_family,31_40
2,Survived,Third_class,Miss,solo,17_30
3,Survived,First_class,Mrs,small_family,31_40
4,Died,Third_class,Mr,solo,31_40


### Règles d'association et fouille de séquences

In [348]:
# ==================== Dataframe into List ==================== #
# One list of labels per row of titanic1; each row is treated as a transaction.
titanic_list = titanic1.values.tolist()

# Learn the set of distinct labels, then encode every row as a boolean vector.
te = TransactionEncoder()
te.fit(titanic_list)
te_ary = te.transform(titanic_list)

# One boolean column per label, named after the label itself.
df = DataFrame(te_ary, columns=te.columns_)
df.head()

Unnamed: 0,0_16,17_30,31_40,Died,First_class,Master,Miss,Mr,Mrs,Nobility,Officer,Second_class,Survived,Third_class,big_family,over_40,small_family,solo
0,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False
1,False,False,True,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False
2,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,False,False,True
3,False,False,True,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False
4,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True


In [349]:
# ==================== Frequent itemsets ==================== #
# FP-Growth on the boolean frame, with a minimum support of 0.3.
frequent_itemsets = fpgrowth(df, min_support=0.3, use_colnames=True)

# ==================== Association rules ==================== #
# Build rules from the frequent itemsets, filtered on confidence >= 0.01.
resultPD = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.01,
)

# Keep only the columns of interest before printing.
subresult = resultPD.loc[:, ["antecedents", "consequents", "support", "confidence"]]
print(subresult)

            antecedents          consequents   support  confidence
0                (Died)                 (Mr)  0.489338    0.794171
1                  (Mr)               (Died)  0.489338    0.843327
2                  (Mr)               (solo)  0.445567    0.767892
3                (solo)                 (Mr)  0.445567    0.739292
4            (Died, Mr)               (solo)  0.377104    0.770642
5          (Died, solo)                 (Mr)  0.377104    0.898396
6            (Mr, solo)               (Died)  0.377104    0.846348
7                (Died)           (Mr, solo)  0.377104    0.612022
8                  (Mr)         (Died, solo)  0.377104    0.649903
9                (solo)           (Died, Mr)  0.377104    0.625698
10        (Third_class)               (Died)  0.417508    0.757637
11               (Died)        (Third_class)  0.417508    0.677596
12        (Third_class)                 (Mr)  0.358025    0.649695
13                 (Mr)        (Third_class)  0.358025    0.61

In [350]:
# ==================== Sequence mining ==================== #
# Each row of labels in titanic_list is handed to PrefixSpan as one sequence.
ps = PrefixSpan(titanic_list)

# Closed patterns only, with 50 as the first (threshold) argument.
freqSequences = ps.frequent(50, closed=True)

# Tabulate the mined records and print them.
sequence_pd = DataFrame.from_records(freqSequences)
print(sequence_pd)

       0                                     1
0    549                                [Died]
1    372                   [Died, Third_class]
2    283               [Died, Third_class, Mr]
3    200        [Died, Third_class, Mr, 17_30]
4    232         [Died, Third_class, Mr, solo]
5    169  [Died, Third_class, Mr, solo, 17_30]
6     67     [Died, Third_class, small_family]
7    238            [Died, Third_class, 17_30]
8    255             [Died, Third_class, solo]
9    186      [Died, Third_class, solo, 17_30]
10    53            [Died, Third_class, 31_40]
11    50       [Died, Third_class, big_family]
12    51             [Died, Third_class, Miss]
13   436                            [Died, Mr]
14    89              [Died, Mr, small_family]
15   257                     [Died, Mr, 17_30]
16   336                      [Died, Mr, solo]
17    51               [Died, Mr, solo, 31_40]
18   208               [Died, Mr, solo, 17_30]
19    70             [Died, Mr, solo, over_40]
20    75     