In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split, PredefinedSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation

from self_training import SelfTraining
from utils import generate_data, ToArray

In [2]:
# Number of labeled examples requested per category (passed to generate_data).
NB_PER_CAT = 20

# Settings shared by every experiment.
SEED = 42          # random seed handed to the helpers and the classifier
TEST_SIZE = 0.2    # used as `test_size` in train_test_split
SCORING = 'micro'  # used as the `average` argument of f1_score

- Looking at the data.

In [3]:
# Read the complete Enron mail table and preview its first five rows.
data = pd.read_csv("data/complete_enron.csv")
data.head(n=5)

Unnamed: 0,subject,from_,to,attachment_names,body,date,target
0,Translation of articles,enron.com,enron.com;enron.com;enron.com;enron.com;enron....,klay.nsf,\n\nKaren\n\nHere it is!\n\nPlenty of good Hou...,1999-10-18 08:47:00,0
1,TW Gas Sales: PRIVILEGED AND CONFIDENTIAL ATTO...,enron.com,enron.com,dfossum.nsf,In anticipation of potential litigation involv...,2001-03-05 16:23:00,0
2,TW Gas Sales: PRIVILEGED AND CONFIDENTIAL ATTO...,enron.com,enron.com;enron.com,dfossum.nsf,Julia and Steve--here are some questions I've ...,2001-03-06 08:59:00,0
3,TW Gas Sales: PRIVILEGED AND CONFIDENTIAL ATTO...,enron.com,enron.com;enron.com,DFOSSUM (Non-Privileged).pst,Julia and Steve--here are some questions I've ...,2001-03-06 19:59:00,0
4,TW Gas Sales: PRIVILEGED AND CONFIDENTIAL ATTO...,enron.com,enron.com,DFOSSUM (Non-Privileged).pst,In anticipation of potential litigation involv...,2001-03-06 03:23:00,0


- Creation of the dataset with labeled and unlabeled data.

In [4]:
# Features are the raw mail bodies; labels are the integer `target` column.
X, y = data["body"], data["target"]

In [5]:
# Split the row labels into an unlabeled pool (u_index) and a labeled pool
# (l_index); later cells use both with `.loc` on X and y.
# generate_data lives in the project's utils module -- presumably it keeps
# NB_PER_CAT labeled rows per category and uses SEED for its sampling
# (TODO confirm against utils.generate_data).
u_index, l_index = generate_data(X, y, NB_PER_CAT, SEED)

Labeled data PER category: (array([0, 1, 2, 3, 4, 5]), array([20, 20, 20, 20, 20, 20]))


In [6]:
# Hold out a stratified test set drawn from the labeled rows only.
# random_state=SEED pins the shuffle: without it the held-out rows (and so
# the F1 score reported at the end of the notebook) change on every run,
# even though SEED is meant to be shared by every experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X.loc[l_index],
    y.loc[l_index],
    test_size=TEST_SIZE,
    stratify=y.loc[l_index],  # keep the per-category balance in both parts
    random_state=SEED,
)

In [7]:
# Build the semi-supervised training set: the labeled training mails
# followed by every unlabeled mail, renumbered 0..n-1.
X_train_u = pd.concat([X_train, X.loc[u_index]], ignore_index=True)

# Unlabeled rows carry the placeholder label -1.
y_train_u = pd.concat(
    [y_train, pd.Series([-1] * len(u_index))],
    ignore_index=True,
)

# NOTE: this rebinds the names the cell reads from, so re-running the cell
# without re-running the split would append the unlabeled pool a second time.
X_train = X_train_u
y_train = y_train_u

- Creation of the vectorizer that turns each mail into a feature vector *(arbitrary hyper-parameters)*.
- Creation of the semi-supervised wrapper classifier (self-training), which uses a *LogisticRegression* base learner *(arbitrary hyper-parameters)*.

In [8]:
# TF-IDF features over word 1- to 3-grams, with max_features set to 5000.
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    max_features=5000,
)

# Self-training wrapper around a default LogisticRegression learner.
# The remaining arguments are specific to the project's SelfTraining class;
# their values here are arbitrary (see self_training.py for their meaning).
classifier = SelfTraining(
    learner=LogisticRegression(),
    iterations_nb=20,
    pool_size=50,
    training_way=2,
    replacement=True,
    random_state=SEED,
)

- Creation of the pipeline and training.

In [9]:
# Chain the vectorizer and the classifier so raw mail text can be fed directly.
pipeline = Pipeline(steps=[('vect', vectorizer), ('clf', classifier)])

In [10]:
# Fit on labeled + unlabeled mails (unlabeled rows are marked with -1).
pipeline.fit(X_train, y=y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
  ...bose=0, warm_start=False),
       pool_size=50, random_state=42, replacement=True, training_way=2))])

- Testing the model on the held-out test set.

In [11]:
# Predict the held-out labeled mails and report the F1 score, averaged
# according to SCORING.
y_pred = pipeline.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred, average=SCORING)

0.4583333333333333