## Libraries

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import pandas as pd
import numpy as np

## Load features

In [36]:
# Read the term scores computed with AlvisNLP. The file is tab-separated and
# is read as having no header row: every line is one (id, word, score) record,
# and the column names are supplied here.
df = pd.read_csv(
    "output/tfidf.csv",
    sep="\t",
    encoding="utf8",
    names=["id", "word", "score"],
)
df

Unnamed: 0,id,word,score
0,20180911_bsv_grandes_cultures-26_cle0f15a8,colza,17.843761
1,20180911_bsv_grandes_cultures-26_cle0f15a8,maïs,10.543776
2,20180911_bsv_grandes_cultures-26_cle0f15a8,prairie,5.906345
3,20180911_bsv_grandes_cultures-26_cle0f15a8,maïs fourrage,4.744932
4,20180911_bsv_grandes_cultures-26_cle0f15a8,fourrage annuel,4.339467
...,...,...,...
2537,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,arbuste à baies,2.935575
2538,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,pêcher,2.393557
2539,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,vigne de cuve,1.972343
2540,bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,céréale,1.233387


In [37]:
# Reshape the long (id, word, score) table into a wide document-term matrix:
# one row per document id, one column per word, cell value = score.
# (id, word) pairs that never occur come out of the pivot as NaN and are
# replaced by a score of 0.
matrix = df.pivot(index="id", columns="word", values="score").fillna(0)
matrix

word,Chou cabus blanc,Culture,abricotier,abricotier pays,agrume,ail,amandier,ananas,arboriculture,arboriculture fruitière,...,tomate,tournesol,triticale,trèfle,vigne,vigne de cuve,vigne de table,zone non agricole,échalote,épinard
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20180802_bsvmaraichage_19_cle04c2cf,0.0,0.488958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,16.768792,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
20180911_bsv_grandes_cultures-26_cle0f15a8,0.0,1.548368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,2.825455,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
20180920_bsvmaraichage_cle0649bf,0.0,1.303889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.476309,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
20181002_bsv_grandes_cultures_29_cle0423a6,0.0,0.896423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
20181011_bsvmaraichage_27_cle09c363,0.0,0.977916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.061275,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,0.0,1.140902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,5.650910,0.0,0.0,13.250103,1.972343,0.0,3.358638,0.0,0.0
char_gdes_Cultures_no11_du_30-04-19_cle096f9c,0.0,1.466875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,9.889093,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
char_gdes_Cultures_no14_du_22-05-19_cle0b1586,0.0,1.466875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,7.063638,0.0,0.0,0.000000,0.000000,0.0,6.717276,0.0,0.0
char_gdes_Cultures_no26_du_14-08-19_cle0fb929,0.0,0.488958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0


In [53]:
# Preview every column of the matrix except the last one.
matrix.iloc[:, :-1]

word,Chou cabus blanc,Culture,abricotier,abricotier pays,agrume,ail,amandier,ananas,arboriculture,arboriculture fruitière,...,tabac,tomate,tournesol,triticale,trèfle,vigne,vigne de cuve,vigne de table,zone non agricole,échalote
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20180802_bsvmaraichage_19_cle04c2cf,0.0,0.488958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,16.768792,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
20180911_bsv_grandes_cultures-26_cle0f15a8,0.0,1.548368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,2.825455,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
20180920_bsvmaraichage_cle0649bf,0.0,1.303889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,23.476309,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
20181002_bsv_grandes_cultures_29_cle0423a6,0.0,0.896423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
20181011_bsvmaraichage_27_cle09c363,0.0,0.977916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.061275,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bsv_viti_mp_gaillac_n18_30072019_cle0fe8aa,0.0,1.140902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,5.650910,0.0,0.0,13.250103,1.972343,0.0,3.358638,0.0
char_gdes_Cultures_no11_du_30-04-19_cle096f9c,0.0,1.466875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,9.889093,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
char_gdes_Cultures_no14_du_22-05-19_cle0b1586,0.0,1.466875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,7.063638,0.0,0.0,0.000000,0.000000,0.0,6.717276,0.0
char_gdes_Cultures_no26_du_14-08-19_cle0fb929,0.0,0.488958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0


In [46]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# Positional feature/target split of the document-term matrix: the first four
# word columns become the features and the fifth word column becomes the target.
# NOTE(review): the target selected here is a tf-idf score column, not a class
# label — confirm this is the intended target before fitting a classifier on it.
X = matrix.iloc[:, :4].to_numpy()
y = matrix.iloc[:, 4].to_numpy()

In [None]:
# Train and evaluate a small classification chain on X / y:
# hold-out split -> standardisation -> LDA projection -> random forest.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Keep 20% of the samples aside for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training split only, then apply the same scaling to
# the test split so no test statistics leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Project the scaled features onto a single discriminant component, fitted on
# the training split with its targets.
lda = LDA(n_components=1)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

# Shallow random forest trained on the 1-D LDA projection.
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Report the confusion matrix and the overall accuracy on the held-out split.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# The label and the value are separated so the line reads "Accuracy: 0.8"
# rather than "Accuracy0.8".
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))

KeyError: 0

In [39]:
# Full singular value decomposition of the document-term matrix.
# NumPy returns the right singular vectors already transposed, so each ROW of
# `V` (not each column) is one right singular vector.
U, sigma, V = np.linalg.svd(matrix)
print("V = ")
np.round(V, 2)

V = 


array([[-0.03, -0.05, -0.  , ..., -0.  , -0.03, -0.07],
       [ 0.  , -0.02,  0.  , ..., -0.01,  0.  ,  0.01],
       [-0.04,  0.02,  0.01, ...,  0.  , -0.01, -0.  ],
       ...,
       [ 0.  , -0.  , -0.12, ...,  0.  ,  0.  ,  0.03],
       [ 0.  , -0.  , -0.28, ..., -0.  , -0.  , -0.  ],
       [ 0.  ,  0.  ,  0.04, ...,  0.  ,  0.  , -0.  ]])