## Imports and constants

In [None]:
import datetime
import os
import re
import requests
import string
import sys
import pandas as pd
import numpy as np
import json
import emoji
from emoji.unicode_codes import UNICODE_EMOJI
import itertools
import multiprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score
from sklearn.model_selection import train_test_split

import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer  # Snowball stemmer for English

# Vowel letters, with "y" counted as a vowel.
VOWELS = ["a", "e", "i", "o", "u", "y"]
# Single stemmer instance shared by stemmer_post().
stemmer = EnglishStemmer()

# Relative path of the CSV that is loaded as the data set.
DFS_PATH = '../data/training_set.csv'

import warnings
# Ignore DeprecationWarning messages.
warnings.simplefilter(action='ignore', category=DeprecationWarning)

### Cleaning Functions

In [None]:
def stopwords_list(language):
    """Return the NLTK stop-word list for ``language``.

    ``language`` is converted with ``str`` before it is handed to
    ``stopwords.words``.
    """
    language_name = str(language)
    return stopwords.words(language_name)


def filter_sentence_stopwords(tokens, list_stopwords):
    """Drop stop words, single-character tokens and non-alphabetic tokens.

    Args:
        tokens: Iterable of string tokens.
        list_stopwords: Collection of stop words to exclude.

    Returns:
        A list with the surviving tokens, in their original order.
    """
    # Build the lookup set once so every membership test is O(1) instead of
    # scanning the whole stop-word list for each token.
    stopword_set = frozenset(list_stopwords)
    return [
        w for w in tokens
        if w not in stopword_set and len(w) > 1 and w.isalpha()
    ]


def lowercase(text):
    """Lower-case every fully upper-case word of ``text``.

    The text is split on single spaces. Words for which ``str.isupper()`` is
    true are lower-cased, every other word is kept as it is, and the pieces
    are joined back with single spaces.
    """
    return ' '.join(
        piece.lower() if piece.isupper() else piece
        for piece in text.split(' ')
    )


def clean_ponctuation(text):
    """Strip punctuation from ``text`` while keeping ``.``, ``!`` and ``?``.

    The characters removed are those of ``string.punctuation`` plus the
    guillemets ``«`` and ``»``, minus the three sentence-ending marks.
    """
    kept_marks = {'.', '!', '?'}
    removable = (set(string.punctuation) | {'«', '»'}) - kept_marks
    return ''.join(filter(lambda ch: ch not in removable, text))


def clean_http(text):
    """Remove every run of non-space characters that starts with ``http``."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)


def cleaning_words(text):
    """Normalise word casing and split CamelCase words.

    The text is split on single spaces and each word is handled as follows:

    * fully upper-case words are lower-cased;
    * fully lower-case words are kept unchanged;
    * any other word is cut in front of every ASCII capital letter, so
      ``"MakeItWork"`` becomes ``"Make It Work"``.

    Args:
        text: The string to normalise.

    Returns:
        The processed words joined with single spaces. Empty words (produced
        by consecutive spaces) are dropped.
    """
    pieces = []
    for word in text.split(' '):
        if word.isupper():
            pieces.append(word.lower())
        elif word.islower():
            pieces.append(word)
        else:
            # The second alternative keeps what the first one cannot match:
            # a leading lowercase prefix ("iPhone" -> "i", "Phone") and words
            # without any ASCII capital (digits, "École"), which would
            # otherwise be silently discarded.
            pieces.extend(re.findall('[A-Z][^A-Z]*|[^A-Z]+', word))

    return ' '.join(pieces)


def text_cleaning(text):
    """Run the three text clean-up steps in sequence.

    Links are removed first (``clean_http``), then punctuation
    (``clean_ponctuation``), and finally word casing is normalised
    (``cleaning_words``).
    """
    return cleaning_words(clean_ponctuation(clean_http(text)))


def word_tokenizer(post):
    """Lower-case ``post`` and tokenize it with ``nltk.word_tokenize``."""
    lowered = post.lower()
    return nltk.word_tokenize(lowered)


def stemmer_post(tokens):
    """Return the stem of every token, using the module-level ``stemmer``."""
    return [stemmer.stem(token) for token in tokens]


def _find_emojis(message):
    emoji_list = []
    for word in message:
        if word in emoji.UNICODE_EMOJI:
            emoji_list.append(word)
    return emoji_list


def _erase_emojis(message, emojis_list):
    for emo in emojis_list:
        message = message.replace(emo, '')
    return message


def _add_emojis_tokens(tokens_list, emojis_list):
    return emojis_list + tokens_list


def emoji_to_text(s):
    """Translate emoji characters into the words of their names.

    Each element of ``s`` is looked up in ``UNICODE_EMOJI``; colons are
    removed from the resulting name and the name is split on underscores.

    Args:
        s: Iterable of emoji characters that are keys of ``UNICODE_EMOJI``.

    Returns:
        A flat list with the name words of every emoji, in input order.

    Raises:
        KeyError: If an element is not a key of ``UNICODE_EMOJI``.
    """
    # str.replace and str.split cannot fail on the looked-up name, so no
    # exception handling is needed around them or around the flattening.
    return [
        part
        for e in s
        for part in UNICODE_EMOJI[e].replace(':', '').split('_')
    ]

def text_to_tokens(text):
    """Turn a raw post into a list of emoji-name tokens and stemmed words.

    Steps: replace the heart symbol with the word "love", extract the emoji
    characters, strip them from the text, clean the text, lower-case and
    split it on spaces, drop English stop words / short / non-alphabetic
    tokens, and stem what remains.

    Args:
        text: The post to process. Anything that is not a ``str`` (for
            example ``None`` or a NaN from a data frame) yields no tokens.

    Returns:
        A list with the emoji name words first, then the stemmed words.
    """
    if not isinstance(text, str):
        # Non-string input used to slip past a bare ``except`` and crash on
        # the next line; treat it as an empty post instead.
        return []

    text = text.replace('❤', 'love')

    # Keep the emoji characters and their textual names apart: the characters
    # are what must be erased from the text, the names are what is returned.
    emoji_chars = _find_emojis(text)
    emoji_tokens = emoji_to_text(emoji_chars)

    wes = _erase_emojis(text, emoji_chars)
    wes = text_cleaning(wes)
    english_stopwords = stopwords_list('english')
    wes = wes.lower().split(' ')
    wes = filter_sentence_stopwords(wes, english_stopwords)
    wes = stemmer_post(wes)

    return _add_emojis_tokens(wes, emoji_tokens)

def processing_post(post):
    """Return ``post`` as one string of its tokens joined by single spaces."""
    return ' '.join(text_to_tokens(post))

def processing_corpus(corpus):
    """Apply ``processing_post`` to every post and return the results as a list."""
    return [processing_post(document) for document in corpus]

### Import Data Set

In [None]:
# Load the CSV found at DFS_PATH into a DataFrame.
data = pd.read_csv(DFS_PATH)

### Clean Corpus

In [None]:
# Replace each raw post in 'content' with its cleaned, space-joined tokens.
data['content'] = processing_corpus(data['content'])

### Split Train/Test

In [None]:
# Hold out 10% of the rows with a fixed seed. `.values` passes plain NumPy
# arrays to the splitter, so the four results are NumPy arrays as well.
X_train, X_test, y_train, y_test = train_test_split(data['content'].values, 
                                                    data['label'].values, 
                                                    test_size=0.1, 
                                                    random_state=42)

# ULMFiT

### Prepare corpus

In [None]:
# Build the frames directly from the split arrays: train_test_split was fed
# `.values`, so its outputs are NumPy arrays, which pd.concat does not accept.
training = pd.DataFrame({'text': X_train, 'label': y_train})
# The held-out split is named X_test / y_test; it serves as the validation set.
validating = pd.DataFrame({'text': X_test, 'label': y_test})

# Order the columns alphabetically so that 'label' comes before 'text'
# (presumably to match the default column positions of the fastai `from_df`
# constructors — confirm against the installed fastai version).
validating = validating.reindex(sorted(validating.columns), axis=1)
training = training.reindex(sorted(training.columns), axis=1)

In [None]:
# Language model data: built from the training and validation frames.
data_lm = TextLMDataBunch.from_df(train_df = training, valid_df = validating, path = "")

# Classifier model data: same frames, reusing the language model's vocabulary
# (vocab=data_lm.train_ds.vocab), with a batch size of 32.
data_clas = TextClasDataBunch.from_df(path = "", 
                                      train_df = training, 
                                      valid_df = validating, 
                                      vocab=data_lm.train_ds.vocab,
                                      bs=32)

### Fit ULMFiT model

In [None]:
# Create a language-model learner on data_lm from the URLs.WT103 pretrained
# model with drop_mult=0.5, then run a 10-epoch one-cycle fit at 1e-2.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn.fit_one_cycle(10, 1e-2)

In [None]:
# Save the language model's encoder under the name 'ft_enc'.
learn.save_encoder('ft_enc')

### Fine-tuning with our data

In [None]:
# Rebind `learn` to a classifier learner on data_clas (drop_mult=0.7) and
# load the encoder stored under the name 'ft_enc' into it.
learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')

In [None]:
# Train the classifier with a 25-epoch one-cycle fit at 1e-2.
learn.fit_one_cycle(25, 1e-2)

### Predictions and results

In [None]:
# Get the model outputs together with the matching targets.
# NOTE(review): fastai text learners may return rows in sampler order rather
# than data-frame order — score `predictions` against `targets`, which come
# from the same call, rather than against a separately held label array.
preds, targets = learn.get_preds()
# Index of the highest-scoring class for every row.
predictions = np.argmax(preds, axis = 1)

In [None]:
# Score against `targets` from get_preds(): they share the row order and
# class encoding of `predictions`, which y_test is not guaranteed to do.
accuracy_score(targets, predictions)

In [None]:
# Per-class F1, scored against `targets` from get_preds(): they share the row
# order and class encoding of `predictions`, which y_test may not.
f1_score(targets, predictions, average=None)

In [None]:
# Per-class precision, scored against `targets` from get_preds(): they share
# the row order and class encoding of `predictions`, which y_test may not.
precision_score(targets, predictions, average=None)

In [None]:
# Confusion matrix against `targets` from get_preds(): they share the row
# order and class encoding of `predictions`, which y_test may not.
confusion_matrix(targets, predictions)