In [2]:
import pandas as pd
import glob

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
path_files = r"/content/drive/MyDrive/Colab Notebooks/1M_MinStp_clusters/"

In [5]:
# Collect the entries of the clusters folder (path_files).
# glob filters by shell-style wildcard pattern (not a regex): entries whose name
# contains "km" on one side, "som" on the other.
list_km= glob.glob(f"{path_files}*km*")
list_som= glob.glob(f"{path_files}*som*")

In [6]:
list_km

['/content/drive/MyDrive/Colab Notebooks/1M_MinStp_clusters/km_25',
 '/content/drive/MyDrive/Colab Notebooks/1M_MinStp_clusters/km_100',
 '/content/drive/MyDrive/Colab Notebooks/1M_MinStp_clusters/km_50']

In [7]:
import re

def extr_num(string):
    """Return the first run of decimal digits found in ``string`` as an int.

    Returns -1 when the string contains no digit at all.

    NOTE(review): the search scans the whole string from the left, so when a
    full path is passed the match can come from a parent directory (e.g. the
    "1" of ".../1M_MinStp_clusters/km_25") rather than from the last path
    component. Used as a sort key on such paths, every entry then gets the
    same key. Confirm whether this is intended before relying on the order.
    """
    match = re.search("([0-9]+)", string)
    if match is None:
        return -1
    return int(match.group(1))

In [8]:
def make_liste(liste):
  """List the cleaned and raw files of every cluster directory.

  Args:
    liste: iterable of cluster directory paths.

  Returns:
    A ``(liste_clean, liste_brut)`` tuple. Each element is a list with one
    entry per directory (directories ordered by ``extr_num``); an entry is the
    list of paths in that directory whose name matches ``*clean*`` (resp.
    ``*raw*``).
  """
  # Work on a sorted copy so the caller's list is not reordered as a side effect.
  ordered = sorted(liste, key=extr_num)
  liste_clean = [glob.glob(cluster + '/*clean*') for cluster in ordered]
  liste_brut = [glob.glob(cluster + '/*raw*') for cluster in ordered]
  return liste_clean, liste_brut

In [9]:
liste_clean_som , liste_brut_som = make_liste(list_som)

In [10]:
liste_clean_km , liste_brut_km = make_liste(list_km)

In [11]:
def extr_num2(string):
    """Return the integer captured by the pattern ``cluster_<digits>_`` in ``string``.

    Returns -1 when the pattern does not occur.
    """
    match = re.search("cluster_([0-9]+)_", string)
    if match is None:
        return -1
    return int(match.group(1))

In [12]:
def read_cluster(files):
  """Read every file of a list and return their lines.

  Args:
    files: iterable of file paths.

  Returns:
    A list with one entry per file, in the same order; each entry is the list
    of lines of that file as returned by ``readlines()`` (line endings kept).
  """
  liste = []
  for path in files:
    # Explicit UTF-8: the files hold tweets (accents, emoji), so decoding must
    # not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as handle:
      liste.append(handle.readlines())
  return liste

In [13]:
def matrice(liste):
  """Load the content of every cluster file, grouped as in ``liste``.

  ``liste`` is a list of lists of file paths. Each inner list is sorted in
  place with ``extr_num2`` as key, then read with ``read_cluster``. The result
  has one entry per inner list, each being what ``read_cluster`` returned.
  """
  def _load(paths):
    # In-place sort: the caller's inner list is reordered as well.
    paths.sort(key=extr_num2)
    return read_cluster(paths)

  return [_load(paths) for paths in liste]

In [14]:
clean_cluster_km= matrice(liste_clean_km)

In [None]:
clean_cluster_km[0][24]

In [15]:
brut_cluster_km= matrice(liste_brut_km)

In [16]:
clean_cluster_som= matrice(liste_clean_som)

In [17]:
brut_cluster_som= matrice(liste_brut_som)

In [None]:
brut_cluster_km[0]

In [18]:
def make_df(liste_brut , liste_clean):
  """Pair raw and cleaned lines into DataFrames, one per cluster.

  Both arguments are nested lists indexed as ``[group][cluster]``, each leaf
  being a sequence of lines. For every position of ``liste_brut`` a DataFrame
  is built with the columns ``tweet`` (raw line) and ``clean_twt`` (cleaned
  line); lines are paired with ``zip``, so the shorter of the two sequences
  sets the number of rows.

  Returns a list of lists of DataFrames mirroring the shape of ``liste_brut``.
  """
  columns = ['tweet', 'clean_twt']
  frames = []
  for i, raw_group in enumerate(liste_brut):
    group_frames = []
    for n, raw_lines in enumerate(raw_group):
      pairs = list(zip(raw_lines, liste_clean[i][n]))
      group_frames.append(pd.DataFrame(pairs, columns=columns))
    frames.append(group_frames)
  return frames

In [19]:
data_km = make_df(brut_cluster_km, clean_cluster_km)

In [20]:
data_som= make_df(brut_cluster_som,clean_cluster_som)

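data_som : 
1.   index 0 : som100
2.   index 1 : som225
3.   index 2 : som400

(À vérifier : les cellules d'outliers plus bas utilisent `data_som[1]` pour som400 et `data_som[2]` pour som225.)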


data_km (ordre effectif, celui renvoyé par `glob` : km_25, km_100, km_50) : 
1.   index 0 : km25
2.   index 1 : km100
3.   index 2 : km50

In [None]:
!pip install gensim==4.0
!pip install python-Levenshtein

In [23]:
from gensim.models import KeyedVectors

In [24]:
# Load the embeddings trained on our own data
w2v_minstp = KeyedVectors.load('/content/drive/MyDrive/Colab Notebooks/w2v/w2vec_model_d300_1M_MinStp')

In [25]:
import numpy as np

In [26]:
def encode(msg):
    """Encode a tokenised message as the mean of its word vectors.

    Args:
        msg: iterable of tokens (words).

    Returns:
        The element-wise mean of the ``w2v_minstp`` vectors of the tokens that
        are in the vocabulary. When no token is known (or ``msg`` is empty), a
        zero vector of the model's dimensionality is returned instead.
    """
    vectors = [w2v_minstp[word] for word in msg if word in w2v_minstp]
    if not vectors:
        # Size the fallback from the loaded model rather than a hard-coded 300,
        # so it always matches the shape of the real embeddings.
        return np.zeros(w2v_minstp.vector_size)
    return np.mean(vectors, axis=0)

In [None]:
w2v_minstp.similar_by_word("sale")

In [28]:
def encoded(liste):
  """Encode a list of clusters of messages.

  ``liste`` is a sequence of clusters, each cluster being a sequence of message
  strings. Every message is split on whitespace and passed to ``encode``; the
  vectors of one cluster are stacked into a single NumPy array.

  Returns a list with one array per cluster.
  """
  return [
      np.array([encode(msg.split()) for msg in cluster])
      for cluster in liste
  ]

Encodage des différents set de clusters

In [338]:
len(clean_cluster_km)

3

In [33]:
def liste_encoded(liste_cluster):
  """Apply ``encoded`` to every element of ``liste_cluster``.

  Returns a list holding, for each element and in the same order, the value
  returned by ``encoded``.
  """
  return [encoded(group) for group in liste_cluster]

In [34]:
dataset_encoded_km= liste_encoded(clean_cluster_km)

In [None]:
dataset_encoded_km[0][1]

In [42]:
dataset_encoded_som= liste_encoded(clean_cluster_som)

In [39]:
%%time
#encodage de l'échantillon pour créer le jeu de données
dataset_encoded_km_25 = encoded(clean_cluster_km[0])

CPU times: user 28.2 s, sys: 790 ms, total: 29 s
Wall time: 28.9 s


In [None]:
dataset_encoded_km_25[1]

Entrainement des différents set de clusters

In [44]:
from sklearn.ensemble import IsolationForest

# definir le modèle ainsi que ses paramètres
if_model=IsolationForest(n_jobs=-1, random_state=42)


Créer une matrice qui stocke les différents modèles d'IF correspondant aux différents clusters

In [48]:
def train_if (dataset):
  """Fit one IsolationForest per cluster.

  ``dataset`` is indexed as ``[group][cluster]``; each leaf is the sample
  matrix passed to ``fit``. A fresh ``IsolationForest(n_jobs=-1,
  random_state=42)`` is created for every cluster.

  Returns a list of lists of fitted models with the same layout as ``dataset``.
  """
  def _fit(samples):
    # New estimator for every cluster so that no fitted state is shared.
    forest = IsolationForest(n_jobs=-1, random_state=42)
    return forest.fit(samples)

  return [[_fit(samples) for samples in group] for group in dataset]

Appliquer IF sur les clusters de Km

In [49]:
models = train_if(dataset_encoded_km)

Appliquer IF sur les clusters de SOM

In [50]:
models_som = train_if(dataset_encoded_som)

In [None]:
# `sklearn.externals.joblib` has been removed from scikit-learn; joblib is
# imported directly instead.
import joblib
from google.colab import files

# Serialise the fitted models to a file in the Colab working directory.
joblib.dump(models,  'models_if_km.pkl')

# Trigger a browser download of that file to the local machine.
files.download('models_if_km.pkl')

# To reload the saved models later:
# models = joblib.load('models_if_km.pkl')

Prédictions pour les différents modèles

In [53]:
def pred(dataset,model):
  """Run every model on its matching cluster.

  ``dataset`` and ``model`` share the ``[group][cluster]`` layout. For each
  position, ``model[i][n].predict`` is called on ``dataset[i][n]``.

  Returns a list of lists holding the value returned by each ``predict`` call,
  laid out like ``dataset``.
  """
  return [
      [model[i][n].predict(samples) for n, samples in enumerate(group)]
      for i, group in enumerate(dataset)
  ]

In [54]:
pred_km= pred(dataset_encoded_km, models)

In [57]:
pred_som= pred(dataset_encoded_som, models_som)

In [71]:
pred_km[1][99]

array([1, 1, 1, ..., 1, 1, 1])

In [62]:
def add_pred_to_df(dataset,pred):
  """Store the predictions in a ``pred`` column of each DataFrame, in place.

  ``pred`` and ``dataset`` share the ``[group][cluster]`` layout;
  ``pred[i][n]`` is assigned to ``dataset[i][n]['pred']``. Nothing is returned:
  the objects held in ``dataset`` are modified directly.
  """
  for i, group_preds in enumerate(pred):
    for n, labels in enumerate(group_preds):
      dataset[i][n]['pred'] = labels

In [63]:
add_pred_to_df(data_km, pred_km)

In [65]:
add_pred_to_df(data_som, pred_som)

In [64]:
data_km[0][24]

Unnamed: 0,tweet,clean_twt,pred
0,cette journée m’a épuisé\n,journée m’épuisé\n,1
1,https://t.co/uys4y8Zt1W Très bonne analyse!...\n,bonne analyse! ...\n,1
2,Bonne chance !Et bonne journée !\n,bonne chance! bonne journée!\n,1
3,Suis-je la bonne personne vu que je m'aime ? h...,-je bonne m'aime?\n,1
4,Bonne journée https://t.co/GCHxxvuFuZ\n,bonne journée\n,1
...,...,...,...
3262,bonne journée ♡\n,bonne journée\n,1
3263,Bonne route à toi\n,bonne route\n,1
3264,La journée est refaite merci\n,journée refaite\n,1
3265,Bonne journée Roxane 😊😘😘\n,bonne journée roxane\n,1


In [86]:
data_km[2][3]

Unnamed: 0,tweet,clean_twt,pred
0,mds ele 😭😭😭\n,mds ele\n,1
1,QUE DOR MDS 😭😭😭😭😭\n,dor mds\n,1
2,mds q agoniante esperar vai logo mds vou infar...,mds agoniante esperar vai logo mds vou infartar\n,1
3,Mds que amor\n,mds amor\n,1
4,o george mds https://t.co/mgJdDQNIqM\n,george mds\n,1
...,...,...,...
3169,"Mds, denovo?\n","mds, denovo?\n",1
3170,mds amiga que\n,mds amiga\n,1
3171,Mds eu to gordaaaa 😥\n,mds to gordaaaa\n,-1
3172,KKKKKKKK MDS\n,kkkkkkkk mds\n,1


In [93]:
def outliers(df):
  """Extract the outlier tweets of every cluster and print the label counts.

  Args:
    df: sequence of DataFrames, each with a ``tweet`` and a ``pred`` column.

  Returns:
    A list with one pandas Series per DataFrame, holding the ``tweet`` values
    of the rows whose ``pred`` equals -1 (original row index preserved).

  Side effect: for each cluster, prints its position and the value counts of
  its ``pred`` column.
  """
  flagged = []
  for i, frame in enumerate(df):
    # Outliers are the rows whose prediction (not index) is -1.
    flagged.append(frame.loc[frame['pred'] == -1, 'tweet'])
    print("cluster"+str(i)+":"+"\n")
    print(frame['pred'].value_counts())
    print("\n")
  return flagged

In [None]:
outliers_km_25 = outliers(data_km[0])

In [None]:
outliers_km_50 = outliers(data_km[2])

In [None]:
outliers_km_100 = outliers(data_km[1])

In [None]:
outliers_som_100 = outliers(data_som[0])

In [None]:
outliers_som_400 = outliers(data_som[1])

In [None]:
outliers_som_225 = outliers(data_som[2])

In [107]:
def save_outliers(outliers,string,base_dir="drive/MyDrive/minstp/"):
  """Write the outliers of each cluster to its own text file.

  Args:
    outliers: sequence with one iterable of strings per cluster.
    string: name of the sub-directory of ``base_dir`` to write into.
    base_dir: root output directory. The default is the relative path used so
      far, resolved against the current working directory.

  Cluster ``i`` is written to ``<base_dir>/<string>/outliers<i>``; strings are
  written as they are, without adding any separator.
  """
  import os

  target_dir = os.path.join(base_dir, string)
  # Create the output folder if needed instead of failing on the first open().
  os.makedirs(target_dir, exist_ok=True)
  for i, cluster_outliers in enumerate(outliers):
    # Explicit UTF-8: tweets contain accents and emoji.
    with open(os.path.join(target_dir, f"outliers{i}"), 'w', encoding="utf-8") as f:
      for line in cluster_outliers:
        f.write(line)

In [108]:
save_outliers(outliers_km_25,"km25")

In [109]:
save_outliers(outliers_km_50,"km50")

In [110]:
save_outliers(outliers_km_100,"km100")

In [111]:
save_outliers(outliers_som_100,"som100")

In [112]:
save_outliers(outliers_som_400,"som400")

In [113]:
# som100 is already saved a few cells above; the set still missing is som225.
save_outliers(outliers_som_225,"som225")