In [1]:
# Data handling
import pandas as pd
import numpy as np

# Open the SQLite database holding the `emails_main` table. `con` and `cur`
# are module-level on purpose: main() below reads them as globals.
import sqlite3
db = "new_db.db"
con = sqlite3.connect(db)
cur = con.cursor()

# Incrementally trainable linear classifier used for folder prediction
from sklearn.linear_model import SGDClassifier

# Used to test whether a previously saved model file exists
import os.path
from os import path

from sklearn.preprocessing import LabelEncoder
# NOTE(review): `sklearn.externals.joblib` was deprecated and later removed
# from scikit-learn (believed 0.21 / 0.23 -- confirm against the installed
# version); on newer releases this needs to become `import joblib`.
from sklearn.externals import joblib

import pickle

# Wall-clock reference point; the last cell prints the seconds elapsed since
# this line ran (so it also counts any idle time between cell executions).
import time
start = time.time()

In [2]:
def main():
    """Predict a folder for every unfiled email and write it back to SQLite.

    Reads the module-level ``cur``/``con`` (sqlite3 cursor/connection).

    Steps:
      1. Load labelled emails (``folder_directory IS NOT NULL``) as training
         data and unlabelled emails (``folder_directory IS NULL``) as the
         prediction set. Each email is represented by the space-joined
         from/to/cc/bcc/subject/message columns.
      2. Vectorize the text with a ``HashingVectorizer`` (also pickled to
         ``vectorizer.pickle``).
      3. Drop unlabelled emails that a ``OneClassSVM`` fitted on the labelled
         set flags as outliers (prediction ``-1``).
      4. Train an ``SGDClassifier`` incrementally with ``partial_fit`` --
         resuming from ``supervised_model.pkl`` when that file exists -- and
         save it back to that file.
      5. Predict folders for the remaining emails, discard predictions whose
         class code does not correspond to a folder present in the current
         training data, and update ``emails_main.folder_directory`` through a
         temporary ``supervised_temp`` table.

    Returns:
        None. Side effects: writes ``vectorizer.pickle`` and
        ``supervised_model.pkl``, updates ``emails_main`` and commits.

    Returns early (without touching the database) when there is nothing to
    train on, nothing to classify, every candidate is an outlier, or no
    prediction maps to a known folder.

    NOTE(review): class codes come from a ``LabelEncoder`` refitted on every
    run. If the set of folders changes between runs, the same code can refer
    to a different folder than the one the persisted model was trained with.
    Persisting the encoder alongside the model would be needed to fix that.
    """
    # Kept as function-scope imports, as in the original cell, so this
    # definition does not rely on another cell having imported them.
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.svm import OneClassSVM

    model_path = 'supervised_model.pkl'
    # Upper bound on distinct class codes announced to the classifier on its
    # first partial_fit, leaving room for folders that do not exist yet.
    max_classes = 1000

    # Single definition of the text fed to the vectorizer (constant SQL, no
    # user input is interpolated).
    email_text_sql = """(
            COALESCE(email_from, '') || ' ' ||
            COALESCE(email_to, '') || ' ' ||
            COALESCE(email_cc, '') || ' ' ||
            COALESCE(email_bcc, '') || ' ' ||
            COALESCE(email_subject, '') || ' ' ||
            COALESCE(email_message, '')
        )"""

    # Fetch each (key, text) pair in ONE query so the two values always come
    # from the same row; separate unordered SELECTs are not guaranteed to
    # return rows in matching order.
    labelled_rows = cur.execute(
        "SELECT folder_directory, " + email_text_sql + """
           FROM emails_main
          WHERE folder_directory IS NOT NULL""").fetchall()
    pending_rows = cur.execute(
        "SELECT message_id, " + email_text_sql + """
           FROM emails_main
          WHERE folder_directory IS NULL""").fetchall()

    if not labelled_rows:
        print('no labelled emails to train on')
        return
    if not pending_rows:
        print('no unlabelled emails to process')
        return

    emails_folder = [folder for folder, _ in labelled_rows]
    emails = [text for _, text in labelled_rows]
    pending_ids = [message_id for message_id, _ in pending_rows]
    pending_texts = [text for _, text in pending_rows]

    labelencoder = LabelEncoder()
    y_train = labelencoder.fit_transform(emails_folder)

    vectorizer = HashingVectorizer()
    X_train = vectorizer.fit_transform(emails)
    X_predict = vectorizer.transform(pending_texts)

    # Still written out for any external consumer of this file.
    with open('vectorizer.pickle', 'wb') as handle:
        pickle.dump(vectorizer, handle)

    # OneClassSVM - to weed out outliers (outliers = -1, inliers = 1)
    oneclass_preds = OneClassSVM().fit(X_train).predict(X_predict)
    inlier_positions = [i for i, flag in enumerate(oneclass_preds) if flag != -1]
    if not inlier_positions:
        return

    # Rows are vectorized independently, so slicing the existing matrix is
    # equivalent to re-transforming the surviving texts.
    pending_ids = [pending_ids[i] for i in inlier_positions]
    X_predict = X_predict[inlier_positions]

    if path.exists(model_path):
        print('using pickled model')
        # joblib.load unpickles arbitrary objects: only load a model file
        # that this notebook (or another trusted source) produced.
        model = joblib.load(model_path)
        model.partial_fit(X_train, y_train)
    else:
        model = SGDClassifier(warm_start=True)
        model.partial_fit(X_train, y_train, classes=np.arange(max_classes))

    # Persist after BOTH branches; previously a freshly created model was
    # never saved, so the "resume" branch could not be reached via this code.
    joblib.dump(model, model_path)

    # LabelEncoder assigns codes 0..len(classes_)-1; any other predicted code
    # refers to a category not present in the current training data.
    known_codes = set(range(len(labelencoder.classes_)))
    accepted = [(message_id, code)
                for message_id, code in zip(pending_ids, model.predict(X_predict))
                if code in known_codes]
    if not accepted:
        # Avoids unpacking an empty zip(*...) below.
        return

    message_id, predicted_codes = zip(*accepted)

    # Transform the labels back to the original folder names
    folder_directory = list(labelencoder.inverse_transform(list(predicted_codes)))

    supervised_temp_df = pd.DataFrame({'message_id': message_id,
                                       'folder_directory': folder_directory})

    # Create a temporary table to store the results of supervised learning
    supervised_temp_df.to_sql('supervised_temp', con, if_exists='replace', index=False)

    # Update folder_directory in emails table and delete temporary table for supervised learning.
    # Rows without a match in supervised_temp get NULL from the subquery,
    # i.e. they stay unfiled.
    cur.executescript("""UPDATE emails_main
                         SET folder_directory = (
                            SELECT folder_directory 
                            FROM supervised_temp 
                            WHERE message_id = emails_main.message_id)
                         WHERE emails_main.folder_directory IS NULL;

                         DROP TABLE supervised_temp;""")
    con.commit()

In [3]:
# Run the whole classification pipeline once.
main()

# Report elapsed wall-clock seconds since `start` was set in the first cell;
# this includes any time that passed between running that cell and this one.
end = time.time()
print(end - start)



107.0433976650238


In [9]:
# Release the SQLite connection opened in the first cell. After this, `con`
# and `cur` are unusable, so main() cannot be re-run without re-executing
# the first cell.
con.close()