In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std

In [120]:
sport_df = pd.read_excel('glossary/deportes.xlsx', names = ['word', 'rank'] , usecols = [0,1])
sport_df['class'] = 0

health_df = pd.read_excel('glossary/salud.xlsx', names = ['word', 'rank'] , usecols = [0,1])
health_df['class'] = 1

politics_df = pd.read_excel('glossary/politica.xlsx', names = ['word', 'rank'] , usecols = [0,1])
politics_df['class'] = 2

In [121]:
print('Sports Glossary Size:', len(sport_df))
print('Health Glossary Size:', len(health_df))
print('Politics Glossary Size:', len(politics_df))

print('Total Glossary Size:', (len(politics_df) + len(health_df) + len(sport_df)))

Sports Glossary Size: 5961
Health Glossary Size: 4779
Politics Glossary Size: 6489
Total Glossary Size: 17229


In [122]:
glossariy_df = pd.concat([sport_df, health_df, politics_df])
glossariy_df = glossariy_df.drop_duplicates(subset='word', keep = False)
glossariy_df.reset_index(drop=True, inplace=True)

In [123]:
print('Sports Glossary Size:', len(glossariy_df[glossariy_df['class'] == 0]))
print('Health Glossary Size:', len(glossariy_df[glossariy_df['class'] == 1]))
print('Politics Glossary Size:', len(glossariy_df[glossariy_df['class'] == 2]))

print('Total Glossary Size:', len(glosariy_df))

Sports Glossary Size: 4621
Health Glossary Size: 3473
Politics Glossary Size: 4906
Total Glossary Size: 13000


In [124]:
df_list = [glossariy_df[glossariy_df['class'] == 0][:100], glossariy_df[glossariy_df['class'] == 1][:100], glossariy_df[glossariy_df['class'] == 2][:100]]

glossariy_df = pd.concat(df_list, ignore_index=True)

In [125]:
glossariy_df

Unnamed: 0,word,rank,class
0,equipos,0.733216,0
1,juegos,0.648853,0
2,arco,0.513120,0
3,tiro,0.476046,0
4,federación,0.452823,0
...,...,...,...
295,militares,0.191268,2
296,responde,0.189518,2
297,calvo,0.188902,2
298,carmen calvo,0.188902,2


In [126]:
myvocabulary = list(glossariy_df['word'])

tfidf_global = TfidfVectorizer(input='filename', vocabulary = myvocabulary, ngram_range=(1,3))

In [127]:
x_train = tfidf_global.fit_transform(['processed-corpus/Deportes_1.txt', 'processed-corpus/Salud_1.txt'])
x_train = x_train.todense()

In [128]:
df_x_train = pd.DataFrame(x_train)
df_x_train.columns = myvocabulary
df_x_train

Unnamed: 0,equipos,juegos,arco,tiro,federación,balón,oro,set,brown,serbia,...,franja,socialista,fernández,ámbito,judicial,militares,responde,calvo,carmen calvo,afirmado
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [129]:
x_test = tfidf_global.transform(['processed-corpus/Deportes_2.txt'])
x_test = x_test.todense()

In [130]:
df_y = pd.DataFrame({'Class':[0, 1]})

In [135]:
model = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo', probability=True)

In [136]:
y_train = df_y.values
x_train = df_x_train.values

In [137]:
cv = RepeatedKFold(n_splits=2, n_repeats=3, random_state=1)
metrics = cross_validate(model, x_train, y_train, scoring=['precision_macro', 'recall_macro'], cv=cv, n_jobs=-1)

print('Precision: ', str(round((mean(metrics["test_precision_macro"])),3)), '| Desviación típica: ', str( round( std(metrics["test_precision_macro"]), 3)))

ValueError: 
All the 6 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\AdrianAD\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\AdrianAD\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py", line 182, in fit
    y = self._validate_targets(y)
  File "C:\Users\AdrianAD\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py", line 739, in _validate_targets
    raise ValueError(
ValueError: The number of classes has to be greater than one; got 1 class


In [140]:
clasificador = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo', probability=True).fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [None]:
predicciones = clasificador.predict_proba(x_test)
predicciones_rounded = [np.round(x,3) for x in predicciones]

df_predicciones = pd.DataFrame(predicciones_rounded)
df_predicciones.columns=['Salud', 'Politica', 'Deportes']
df_predicciones.index.name = 'Documento'
df_predicciones['Clase_real'] = y_test
df_predicciones.to_excel('/content/drive/MyDrive/Ignieria_Linguistica/modelo_SVM_glosario/predicciones.xlsx')