# Notebook de test 

#### Sommaire
<ol>
    <li ><a href=#Approche-non-supervisée> Approche non-supervisée</a></li>
    <ol>
        <li> <a href=#LatentDirichletAllocation>LatentDirichletAllocation</a></li>
        <li> <a href=#NonNegativeMatrixFactorisation>NonNegativeMatrixFactorisation</a></li>
    </ol>
    <li> <a href=#Approche-mixte> Approche mixte</a></li>
    <li> <a href=#Approche-supervisée> Approche supervisée</a></li>
    <li> <a href=#Modèle-retenu>Modèle retenu</a></li>
</ol>

## Import des bibliothèques

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from bs4 import BeautifulSoup
import xml
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD,LatentDirichletAllocation,NMF
from sklearn.cluster import DBSCAN,AgglomerativeClustering,SpectralClustering, AffinityPropagation
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BaseNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from joblib import Parallel, delayed


In [2]:
query_df=pd.read_csv("exported_query_df.csv",index_col='Id')
corpus_by_tag_df=pd.read_csv("corpus_by_tag.csv")

In [3]:
corpus_by_tag_df.index=corpus_by_tag_df['Unnamed: 0']
corpus_by_tag_df.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
query_df.columns

Index(['CreationDate', 'AnswerCount', 'Body', 'CommentCount', 'FavoriteCount',
       'Score', 'Title', 'ViewCount', 'Body_text', 'Tag_list0', 'Tag_list1',
       'Tag_list2', 'Tag_list3', 'Tag_list4'],
      dtype='object')

In [5]:
all_tag=[]
for i in query_df.columns:
    if 'Tag_l' in i :
        all_tag.extend(query_df[i].unique())
all_tag=set(all_tag)

In [6]:
len(all_tag-set(corpus_by_tag_df.index))
map_dict={i:np.nan for i in all_tag-set(corpus_by_tag_df.index)}

In [7]:
for i in query_df.columns:
    if 'Tag_l' in i :
        query_df[i]=query_df[i].replace(map_dict)

In [8]:
#for k in np.arange(3):
#    print(k,'-iter')
#    try:
#        for i in query_df[[i for i in query_df.columns if 'Tag_l' in i]].columns:
#            print(i)
#            for j in query_df.index:
#                if query_df.loc[j,i] is np.nan:
#                    query_df.loc[j,i]=query_df.loc[j,i[:-1]+str(int(i[-1])+1)]
#    except:
#        pass


In [9]:
query_df[[i for i in query_df.columns if 'Tag_l' in i]].count()

Tag_list0    65161
Tag_list1    43785
Tag_list2    19272
Tag_list3     6155
Tag_list4     1287
dtype: int64

In [10]:
test_df=query_df[query_df.Tag_list0.notna()].copy()
test_df['Tag']=test_df['Tag_list0']
for i in query_df.columns:
    if 'Tag_l' in i and i[-1]!='0':
        x=query_df[query_df[i].notna()].copy()
        x['Tag']=x[i]
        test_df=pd.concat([test_df,x],axis=0)

In [11]:
keys_to_keep=list(test_df.Tag.value_counts()[test_df.Tag.value_counts()>=400].keys())

In [12]:
test_df.index=np.arange(test_df.shape[0])

In [13]:
idx=[i for i in test_df.index if test_df.loc[i,'Tag'] in keys_to_keep]

In [14]:
test_df=test_df.loc[idx,:]

## Approche non-supervisée

In [15]:
query_df.shape

(65666, 14)

### LatentDirichletAllocation

In [16]:
lda=LatentDirichletAllocation(n_components=30)

In [17]:
tfidf=TfidfVectorizer(ngram_range=(1,3),stop_words='english',max_df=0.9,min_df=0.0025,sublinear_tf=True)
tfidf_titre=TfidfVectorizer(ngram_range=(1,3),max_df=0.95,min_df=0.001,stop_words='english',sublinear_tf=True)

In [18]:
tf_df=tfidf.fit_transform(query_df.Body_text.apply(str))

In [19]:
lda_df=lda.fit_transform(tf_df)

In [20]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 15
display_topics(lda, tfidf.get_feature_names(), no_top_words)


Topic 0:
app ios android google device user api application play using iphone facebook use maps video
Topic 1:
div class div class html css div div li href width td src img id bootstrap body
Topic 2:
windows command install file installed files path error directory version run program folder python exe
Topic 3:
table database sql query select mysql db id column sql server insert tables data server create
Topic 4:
page chrome window browser button way using want click like javascript tab open firefox know
Topic 5:
git project xcode error build branch file repository commit app github android run files eclipse
Topic 6:
dataframe list df column columns data values pandas like value want string data frame row frame
Topic 7:
android layout_width layout_height android layout_width android layout_height xml android id id id android id id wrap_content id xmlns match_parent wrap_content android schemas
Topic 8:
int function std class foo use return code object does like type variable value diff

In [21]:
tf_df=tfidf.fit_transform(corpus_by_tag_df['0'])

In [47]:
lda_df=lda.fit_transform(tf_df)

In [48]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 15
display_topics(lda, tfidf.get_feature_names(), no_top_words)


Topic 0:
component react import const int export props std return include render angular int main main class
Topic 1:
use code int like question does way function just know class using understand example time
Topic 2:
xcode app docker ios error run running build file server device using version project application
Topic 3:
file files command script directory txt line bash path echo want shell folder like filename
Topic 4:
android layout_width layout_height android layout_width android layout_height android id android id id id id wrap_content id xml match_parent wrap_content android main string static void main
Topic 5:
df dataframe column data columns number pandas 10 like list values array want numbers row
Topic 6:
python py packages import module file pip install py line line error lib windows installed site packages
Topic 7:
npm 00 js node 12 node_modules 01 10 11 error 13 15 16 date 00 00
Topic 8:
numpy np plot import csv array matplotlib python axis data matrix notebook import num

In [49]:
corpus_by_tag_df

Unnamed: 0_level_0,0
Unnamed: 0,Unnamed: 1_level_1
javascript,"['how', 'can', 'i', 'get', 'react', 'to', 're..."
android,"['i', 'have', 'a', 'multi', 'project', '10', ..."
java,"['i', 'have', 'intellij', 'ultimate', '11', '..."
python,"['if', 'ones', 'catches', 'an', 'exception', ..."
ios,"['i', 've', 'been', 'trying', 'to', 'find', '..."
...,...
vector,"['over', 'the', 'past', 'few', 'days', 'i', '..."
ionic2,"['i', 'm', 'trying', 'to', 'implement', 'a', ..."
templates,"['how', 'can', 'i', 'output', 'the', 'value',..."
events,"['how', 'can', 'i', 'catch', 'the', 'event', ..."


### NonNegativeMatrixFactorisation

In [50]:
nmf =NMF(n_components=30,random_state=2)

In [51]:
tf_df=tfidf.fit_transform(query_df.Body_text.apply(str))

In [52]:
nmf_df=nmf.fit_transform(tf_df)

In [53]:
display_topics(nmf, tfidf.get_feature_names(), no_top_words)


Topic 0:
way like use know need using just does question ve work example possible make time
Topic 1:
android gradle app android studio build com google com android studio support project sdk android support build gradle compile
Topic 2:
android layout_width android layout_width layout_height android layout_height id android id android id id id id wrap_content match_parent wrap_content android match_parent android android com layout_height wrap_content
Topic 3:
public class void new public class public void private static override method public static override public return static void null
Topic 4:
div class div class div div li div div class id span div id div div div href ul bootstrap li li col
Topic 5:
function var console log console log return code false true object document scope data function return javascript
Topic 6:
python py import line module py line packages lib site packages print pip site traceback python2 traceback recent
Topic 7:
table select query sql database id mysq

## Approche mixte

## Approche supervisée

On va maintenant s'intéresser à des méthode supervisée pour classifier les textes. On va utiliser un nombre restreints de colonnes ainsi qu'une grille de recherche et des fonctions de préprocessing.

### Séparation des données et préparation des modèles.

In [54]:
query_df.columns

Index(['CreationDate', 'AnswerCount', 'Body', 'CommentCount', 'FavoriteCount',
       'Score', 'Title', 'ViewCount', 'Body_text', 'Tag_list0', 'Tag_list1',
       'Tag_list2', 'Tag_list3', 'Tag_list4', 'year'],
      dtype='object')

In [55]:
query_df['year']=pd.to_datetime(query_df.CreationDate).dt.year

In [56]:
test_df['year']=pd.to_datetime(test_df.CreationDate).dt.year

In [57]:
non_word_column=['AnswerCount','CommentCount', 'FavoriteCount',
       'Score','ViewCount','year']
words_columns=['Title','Body_text']
target_columns=['Tag_list0', 'Tag_list1',
       'Tag_list2', 'Tag_list3', 'Tag_list4']

In [58]:
rsmpl_df=resample(test_df,n_samples=45000)

In [59]:
rsmpl_df

Unnamed: 0,CreationDate,AnswerCount,Body,CommentCount,FavoriteCount,Score,Title,ViewCount,Body_text,Tag_list0,Tag_list1,Tag_list2,Tag_list3,Tag_list4,Tag,year
119208,2015-03-13 08:33:20,7,<p>I created a <code>carousel</code> with <cod...,1,15,68,Getting Cannot read property 'offsetWidth' of ...,107737,I created a carousel with Bootstrap 3.3 and it...,javascript,jquery,twitter-bootstrap,,,twitter-bootstrap,2015
81934,2013-05-18 01:08:15,14,<p>When I boot up Android Studio and select &q...,1,16,98,Failed to import new Gradle project: failed to...,136844,"When I boot up Android Studio and select ""New ...",java,android,gradle,android-studio,,android,2013
40360,2014-08-04 21:17:26,8,<p>I get so confused about 2D arrays in Swift....,3,37,116,Two-dimensional array in Swift,193770,I get so confused about 2D arrays in Swift. Le...,arrays,swift,,,,arrays,2014
91180,2015-04-01 10:42:37,6,<p>I installed last version of Node.js (12.2 x...,1,3,15,npm hangs on any command,14669,I installed last version of Node.js (12.2 x64 ...,windows,node.js,npm,,,node.js,2015
59750,2017-05-12 23:03:05,5,<p>I am new to Flutter and I was trying do exe...,0,11,69,No Material widget found,50791,I am new to Flutter and I was trying do execut...,flutter,,,,,flutter,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49589,2014-11-19 10:14:59,7,<p>I have used openCV python and encountered a...,2,5,38,Failure to use adaptiveThreshold: CV_8UC1 in f...,77101,I have used openCV python and encountered an e...,python,opencv,,,,python,2014
79511,2013-08-12 14:11:33,9,"<p>I'm building a new website, and I'm looking...",1,11,61,Is there a color code for transparent in HTML?,712605,"I'm building a new website, and I'm looking fo...",html,css,,,,css,2013
53935,2015-04-27 02:28:36,7,<p>I'm getting this error on just one server r...,4,2,17,C# System.Net.WebException: The underlying con...,95615,I'm getting this error on just one server runn...,c#,,,,,c#,2015
63904,2019-06-15 01:59:30,7,<p>While executing <strong>ng build --prod --b...,3,7,32,Angular 8 ng-build throwing MIME error with co...,13051,While executing ng build --prod --base-href ./...,angular,angular-cli,,,,angular,2019


In [60]:
lab_enc=LabelEncoder()

X=test_df[non_word_column+words_columns]
y=test_df['Tag']


In [61]:
y=lab_enc.fit_transform(y.fillna(''))

In [62]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [63]:
#a=tfidf_titre.fit_transform(rsmpl_df.Title)

In [64]:
#a

In [65]:
#a=tsvd2.fit_transform(a)

In [66]:
#print(np.cumsum(np.round(tsvd2.explained_variance_ratio_, decimals=4)*100))
#plt.plot(np.cumsum(np.round(tsvd2.explained_variance_ratio_, decimals=4)*100))

In [67]:
std=StandardScaler()
tsvd=TruncatedSVD(n_components=150, n_iter=20, random_state=2)
tsvd2=TruncatedSVD(n_components=60, n_iter=25, random_state=2)


word_proc=Pipeline([
    ('Tf-Idf',tfidf),
    ('truncatedSVD_word',tsvd),
],verbose=True)
title_proc=Pipeline([
    ('Tf-Idf',tfidf_titre),
    ('truncatedSVD',tsvd2),
],verbose=True)


col_proc=ColumnTransformer([
    ('Scale',std,non_word_column),
    ('Word_processing',word_proc,'Body_text'),
    ('Title_processing',title_proc,'Title'),
],n_jobs=-3,verbose=True)

def pipe(model) :
    pipe=Pipeline([
#        ('Préparation des données',col_proc),
        (str(model)[:5],model),
    ],verbose=True)
    return pipe

In [68]:
def proba_label(model,X,n_class):
    x=pd.DataFrame(model.predict_proba(X))
    x.columns=lab_enc.inverse_transform(x.columns)
    z=pd.DataFrame(index=x.index,columns=[str(n_class)+' most likely tags'])
    z[str(n_class)+' most likely tags']=[
        {k:v for k,v in x.transpose()[i].sort_values(ascending=False)[:n_class].items()} for i in x.index
    ]
    return (x,z[str(n_class)+' most likely tags'])

In [69]:
stratK=StratifiedKFold(n_splits=10,shuffle=True,random_state=2)
kfolds=stratK.split(X_train,y_train)
i=0
train_folds=dict()
test_folds=dict()
train_folds_target=dict()
test_folds_target=dict()
for train,test in kfolds:
    train_folds[i]=X_train.iloc[train,:]
    test_folds[i]=X_train.iloc[test,:] 
    train_folds_target[i]=y_train[train]
    test_folds_target[i]=y_train[test] 
    i+=1
del i

In [74]:
col_proc_folds=dict()
prepared_train_folds=dict()
for i in np.arange(len(train_folds)):
    col_proc_folds[i]=ColumnTransformer([
    ('Scale',std,non_word_column),
    ('Word_processing',word_proc,'Body_text'),
    ('Title_processing',title_proc,'Title'),
],n_jobs=-3,verbose=True)
    prepared_train_folds[i]=col_proc_folds[i].fit_transform(train_folds[i])
    print('fold - %s prepared'%i)

fold - 0 prepared
fold - 1 prepared
fold - 2 prepared
fold - 3 prepared
fold - 4 prepared
fold - 5 prepared
fold - 6 prepared
fold - 7 prepared
fold - 8 prepared
fold - 9 prepared


In [75]:
prepared_test_folds=dict()
for i in np.arange(len(train_folds)):
    prepared_test_folds[i]=col_proc_folds[i].transform(test_folds[i])
    print('fold - %s prepared'%i)

fold - 0 prepared
fold - 1 prepared
fold - 2 prepared
fold - 3 prepared
fold - 4 prepared
fold - 5 prepared
fold - 6 prepared
fold - 7 prepared
fold - 8 prepared
fold - 9 prepared


In [102]:
def k_fold_gridsearch(estimator,kfoldtrain,kfoldtest,target_train,target_test,params,verbose,n_jobs):
    grid=dict()
    score=dict()
    param=dict()
    print('fitting %s folds for % candidates'
          %(len(kfoldtrain.keys()),np.prod([len(params[i].values()) for i in params.keys()]) )
    
    for key, values in params.items:
          
    for i in np.arange(len(params.keys())):
        grid[i]=GridSearchCV(estimator=estimator,param_grid=params,cv=5,n_jobs=n_jobs,verbose=verbose)
    print('grids instanciated')
    
          
    for i in np.arange(len(kfoldtrain)):
            grid[i].fit(kfoldtrain[i],target_train[i])
            print('grid - %s fitted'%i)
            param[i]=grid[i].best_params_
            score=grid[i].score(kfoldtest[i],target_test[i])
            print('score for %s is %s and parameters %s'%(i,score[i],param[i]))
    return grid,score,param

SyntaxError: invalid syntax (<ipython-input-102-b6f8f0a53023>, line 6)

### Modèles.

#### Naive Base

In [86]:
nb=BernoulliNB()
pipebnb=pipe(nb,)

In [91]:
nb.get_params()

{'alpha': 1.0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}

In [93]:
prepared_test_folds

{0: array([[ 5.63695975e-01,  1.15378947e-01, -6.94865931e-02, ...,
         -2.07784532e-02, -7.16709072e-02, -2.33475444e-03],
        [ 3.57121402e-01, -8.17331979e-01,  1.16338294e-01, ...,
         -7.79788884e-02, -5.37879910e-02,  1.73523436e-02],
        [ 1.18341969e+00, -8.17331979e-01, -2.55311480e-01, ...,
         -4.96176254e-02, -4.49105178e-02,  6.46093448e-02],
        ...,
        [-4.69176890e-01, -5.06428337e-01,  1.98451743e-03, ...,
         -2.59603512e-02,  1.01152309e-02,  2.56174730e-03],
        [ 2.83601628e+00, -1.95524695e-01,  3.34683249e+00, ...,
          3.72095795e-03, -1.30700941e-03,  4.82381609e-03],
        [-2.62602317e-01,  1.98080080e+00, -1.98134592e-01, ...,
         -1.75642334e-02, -1.04482483e-01, -5.27544605e-02]]),
 1: array([[-6.72766421e-01, -8.18020627e-01, -2.27009552e-01, ...,
         -5.07778163e-02, -1.18742147e-01,  1.42270743e-01],
        [-2.62798976e-01, -5.06268474e-01, -1.27350281e-01, ...,
         -1.39338425e-01,  7.436

In [195]:
grid_bnb=GridSearchCV(nb,{'Berno__alpha': np.arange(0.1,1,0.1),},cv=np.arange(len(prepared_test_folds.keys())),n_jobs=-3,verbose=3)


In [196]:
#pipebnb.fit(X_train.iloc[:],y_train[:])
grid_bnb.fit(prepared_train_folds,prepared_test_folds)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


TypeError: cannot unpack non-iterable numpy.int32 object

In [50]:
grid_bnb.score(X_test,y_test)

0.41503866503866504

In [51]:
grid_bnb.best_params_

{'Berno__alpha': 0.8}

#### Logistic Regression


In [52]:
logreg=LogisticRegression(solver='saga',n_jobs=-3,verbose=10,max_iter=500,random_state=2)
pipelog=pipe(logreg)

In [53]:
pipelog.get_params()

{'memory': None,
 'steps': [('Préparation des données',
   ColumnTransformer(n_jobs=-3,
                     transformers=[('Scale', StandardScaler(),
                                    ['AnswerCount', 'CommentCount',
                                     'FavoriteCount', 'Score', 'ViewCount',
                                     'year']),
                                   ('Word_processing',
                                    Pipeline(steps=[('Tf-Idf',
                                                     TfidfVectorizer(max_df=0.9,
                                                                     min_df=0.0025,
                                                                     ngram_range=(1,
                                                                                  3),
                                                                     stop_words='english',
                                                                     sublinear_tf=True)),
                       

In [54]:
grid_log=GridSearchCV(pipelog,{ 'Logis__penalty': ['l2','elasticnet'],'Logis__C':np.arange(0.1,1,0.1)
},cv=10,n_jobs=-3,verbose=3)

In [55]:
#x=tfidf.fit_transform(X_train['Body_text'])

In [56]:
X_train

Unnamed: 0,AnswerCount,CommentCount,FavoriteCount,Score,ViewCount,year,Title,Body_text
61753,7,5,4,17,60708,2015,Laravel 5 - Php artisan syntax error,I am currently developing an app with Laravel ...
89669,7,0,4,19,28582,2015,Pandas type error trying to plot,I'm trying to create a basic scatter plot base...
103057,5,0,3,16,60527,2015,Slick Slider Next Arrows not showing,I'm trying to get the next and previous arrows...
60201,6,1,6,45,67905,2014,How to get a complete row or column from 2D ar...,I do not want to use a jagged array and I have...
57840,5,1,23,41,16275,2014,How to plug my Autofac container into ASP. NET...,I have been looking into the new features of t...
...,...,...,...,...,...,...,...,...
52677,8,1,53,195,201902,2016,"How to get rid of ""Unnamed: 0"" column in a pan...",I have a situation wherein sometimes when I re...
55101,18,11,7,19,12136,2017,How to create a method to return 1 or 0 withou...,I was asked a question in an interview to retu...
72405,7,3,9,29,28634,2012,C++ string::find complexity,Why the c++'s implemented string::find() doesn...
42365,6,2,22,45,68740,2015,Override PHP base dependency in composer,I try to install Laravel 5.1 on a host which ...


In [None]:
grid_log.fit(X_train,y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-3)]: Done  20 tasks      | elapsed: 157.9min


In [None]:
#logreg.fit(X_train[non_word_column].iloc[:1000],y_train[:1000])

In [None]:
grid_log.score(X_test,y_test)

In [None]:
grid_log.best_params_

####  XGBoost

In [None]:
xgb=XGBClassifier(n_estimators=800,n_jobs=-3,verbose=30,random_state=2)
pipexgb=pipe(xgb)

In [None]:
pipexgb.get_params()

In [None]:
grid_xgb=GridSearchCV(pipexgb,{ 'XGBCl__max_depth': np.arange(3,20,3),
                               'XGBCl__gamma': np.arange(0.1,1.5,0.1),'XGBCl__learning_rate': np.arange(0.1,5,0.7),
},cv=4,n_jobs=-3,verbose=3)

#x=tfidf.fit_transform(X_train['Body_text'])

In [None]:
grid_xgb.fit(X_train,y_train)

In [None]:
#grid_xgb.predict_proba(X_test)

In [None]:
#xgb.fit(X_train[non_word_column].iloc[:1000],y_train[:1000])

In [None]:
#xgb.predict_proba(X_test[non_word_column]).shape

In [None]:
grid_xgb.score(X_test,y_test)

In [None]:
grid_xgb.best_params_

In [None]:
x,z =proba_label(grid_bnb,X_test,7)

In [None]:
z[7005]

In [None]:
X_test.iloc[705].Title

In [None]:
lab_enc.inverse_transform(pipebnb.predict(X_test.iloc[[380,136]]))

## Modèle retenu

In [None]:
X_train['Title']

In [None]:
word_proc.fit(X_train[words_columns])

In [None]:
col_proc

In [None]:
(tsvd.fit_transform,tsvd.fit_transform)(x,x)

In [None]:
word_proc.fit(X_train[['Body_text']])