In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from nltk.corpus import stopwords
from tqdm import tqdm
import numpy as np

In [None]:
# Load the French stop-word list and build the French Snowball stemmer.
# The corpus reader is imported under an alias here so that this cell stays
# re-runnable: rebinding `stopwords` from the corpus reader to a plain list
# made a second execution fail with "'list' object has no attribute 'words'".
from nltk.corpus import stopwords as stopwords_corpus
import Stemmer

stopwords = stopwords_corpus.words('french')
stemmer = Stemmer.Stemmer('french')

In [None]:
def stem(x, stop_words=None, stem_words=None):
    """Normalise a text and return its stop-word-free, stemmed form.

    Steps: lower-case, drop the elided articles ``l'`` / ``d'``, turn commas
    into separators, split on whitespace, remove stop words, stem the
    remaining tokens and join them with single spaces.

    Parameters
    ----------
    x : str
        Raw text. Any non-string value (e.g. NaN from a missing cell) yields
        an empty string instead of raising.
    stop_words : container of str, optional
        Tokens to discard. Defaults to the notebook-level ``stopwords``.
    stem_words : callable, optional
        Function mapping a list of tokens to a list of stems. Defaults to
        ``stemmer.stemWords`` of the notebook-level ``stemmer``.

    Returns
    -------
    str
        Space-separated stems, without commas.
    """
    if not isinstance(x, str):
        # Missing cells arrive as float NaN; treat them as empty documents.
        return ''
    if stop_words is None:
        stop_words = stopwords
    if stem_words is None:
        stem_words = stemmer.stemWords

    # Treat the typographic apostrophe like the ASCII one so elisions match.
    text = x.lower().replace('\u2019', "'")
    text = text.replace("l'", '').replace("d'", '')
    # Commas must go *before* tokenising: otherwise "de," escapes the
    # stop-word filter and "ventes," is stemmed with the comma attached.
    text = text.replace(',', ' ')
    # split() (no argument) collapses runs of whitespace and never yields
    # empty tokens.
    tokens = [token for token in text.split() if token not in stop_words]
    return ' '.join(stem_words(tokens))

# Register tqdm's pandas integration; this provides the `progress_apply`
# method used on the text columns further down.
tqdm.pandas()

In [None]:
# Training data: pipe-separated file, first column used as the index,
# rows containing any missing value discarded.
train = pd.read_csv('data/raw/naf_activite.csv', sep='|', index_col=0).dropna()

# Test data: default (comma) separator, first column used as the index.
# No missing-value filtering is applied here.
test = pd.read_csv('data/raw/test.csv', index_col=0)

In [None]:
# Replace each free-text column with its stemmed version, showing a progress
# bar per column. The columns are overwritten in place.
for frame, column in ((train, 'ACTIVITE'), (test, 'text')):
    frame[column] = frame[column].progress_apply(stem)

In [None]:
# Download the code mapping table (columns used here: id_2 and id_5).
mapping = pd.read_csv('https://www.data.gouv.fr/fr/datasets/r/7bb2184b-88cb-4c6c-a408-5a0081816dcd', sep=',')

# The codes on the training side (NAF_CODE) are presumably written without
# in-line separators, so strip the dots from the mapping's id_5 codes.
# regex=False is explicit on purpose: as a regular expression '.' matches
# every character, and the default of `regex` differs between pandas versions.
mapping['id_5'] = mapping['id_5'].str.replace('.', '', regex=False)

# Attach the id_2 code to every training row whose NAF_CODE appears in the
# mapping; rows without a match are dropped by the inner join.
train = train.merge(
    mapping[['id_2', 'id_5']],
    left_on='NAF_CODE', right_on='id_5',
    how='inner')

# Keep only the text and the target label.
train = train[['ACTIVITE','id_2']]

In [None]:
# Cap the training set at its first 250,000 rows.
train = train.head(250_000)

# TF-IDF restricted to the 300 most frequent terms, ignoring terms that
# appear in more than 70% of the documents.
vectorizer = TfidfVectorizer(max_features=300, max_df=0.7)

# NOTE(review): the vocabulary and IDF weights are fitted on the *test* texts
# and only applied to the training texts - confirm this is intentional.
test_text = vectorizer.fit_transform(test['text']).toarray()
train_text = vectorizer.transform(train['ACTIVITE']).toarray()

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble using all CPU cores, with verbose progress output.
# random_state is fixed so that a fresh "Restart & Run All" reproduces the
# same forest and therefore the same predictions.
classifier = ExtraTreesClassifier(n_jobs=-1, verbose=10, random_state=42)

In [None]:
# Fit the classifier on the TF-IDF features of the training texts, using the
# id_2 code as the target label.
classifier.fit(train_text, train['id_2'])

In [None]:
# Per-class probabilities for every test document: one row per document,
# one column per class label known to the classifier.
class_probabilities = classifier.predict_proba(test_text)
preds = pd.DataFrame(class_probabilities, columns=classifier.classes_)

In [None]:
# Load the `employees` column indexed by `naf2` and take its natural log.
priors = (
    pd.read_csv('data/priors.csv')
    .set_index('naf2')[['employees']]
    .pipe(np.log)
)
# Exclude the index values 84, 85 and 86.
priors = priors[~priors.index.isin([84, 85, 86])]

In [None]:
# For every test row, keep the labels of its 10 highest-scoring classes as a
# single space-separated string, best first.
# The addition of the log-priors is currently disabled, so the ranking is
# driven by the log-probabilities alone.
multiplied_preds = {}
for row_id, row in preds.iterrows():
    log_scores = np.log(row)
    ranked = log_scores.sort_values(ascending=False)
    top_labels = ranked.index[:10]
    multiplied_preds[row_id] = ' '.join(map(str, top_labels))

In [None]:
# Preview only the first few entries: echoing the whole dict prints one line
# per test row and bloats the notebook output.
pd.Series(multiplied_preds).head()

In [None]:
# One-column frame ('Predicted') indexed by the keys of multiplied_preds.
pred_df = pd.Series(multiplied_preds, name='Predicted').to_frame()

In [None]:
# Display pred_df to inspect the 'Predicted' strings before they are exported.
pred_df

In [None]:
# Label the index 'NewsId' so that the exported CSV gets that header.
pred_df.index = pred_df.index.set_names('NewsId')

In [None]:
# Write the predictions (index plus the 'Predicted' column) to tfidf.csv in
# the current working directory.
pred_df.to_csv('tfidf.csv')

In [None]:
# Display the log-transformed priors. In the cells visible here they are not
# applied to the predictions (the line that added them is commented out).
priors

In [None]:
# Display the per-class probability frame built from predict_proba.
preds