In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score

In [None]:
# Read the decisions table; the first CSV column is used as the index.
df1 = pd.read_csv(
    'decision.csv',
    index_col=0,
)

# Bare expression so the notebook renders the frame.
df1

In [None]:
# Absolute number of reviews per preliminary decision.
df1['preliminary_decision'].value_counts()

In [None]:
# Share of each preliminary decision, in percent rounded to two decimals.
# Computed once and reused for both the printed table and the bar chart.
decision_pct = round(df1.preliminary_decision.value_counts(normalize=True) * 100, 2)
print(decision_pct)

decision_pct.plot(kind='bar')
plt.title('Percentage preliminary decision by book reviews')
# plt.show must be *called*; the bare attribute only echoed the function object.
plt.show()


In [None]:
# Read the lemma table; the first CSV column is used as the index.
df2 = pd.read_csv(
    'lemma.csv',
    index_col=0,
)

# Bare expression so the notebook renders the frame.
df2

In [None]:
# Share of each confidence value, in percent rounded to two decimals.
# Computed once and reused for both the printed table and the bar chart.
confidence_pct = round(df2.confidence.value_counts(normalize=True) * 100, 2)
print(confidence_pct)

confidence_pct.plot(kind='bar')
plt.title('Percentage of confidence')
# plt.show must be *called*; the bare attribute only echoed the function object.
plt.show()

In [None]:
# Share of each evaluation value, in percent rounded to two decimals.
# Computed once and reused for both the printed table and the bar chart.
evaluation_pct = round(df2.evaluation.value_counts(normalize=True) * 100, 2)
print(evaluation_pct)

evaluation_pct.plot(kind='bar')
plt.title('Percentage of evaluation')
# plt.show must be *called*; the bare attribute only echoed the function object.
plt.show()

In [None]:
# Share of each orientation value, in percent rounded to two decimals.
# Computed once and reused for both the printed table and the bar chart.
orientation_pct = round(df2.orientation.value_counts(normalize=True) * 100, 2)
print(orientation_pct)

orientation_pct.plot(kind='bar')
plt.title('Percentage of orientation')
# plt.show must be *called*; the bare attribute only echoed the function object.
plt.show()

In [None]:
# Extract 2 columns: level_0 and Lemma_text

In [None]:
# Keep only the two columns used downstream: the grouping key and the text.
df2_new = df2.loc[:, ['level_0', 'Lemma_text']]

df2_new.head()

In [None]:
# Inspect the first row before concatenation.
df2_new.Lemma_text[0]

In [None]:
# Collapse all rows sharing the same 'level_0' value into a single row whose
# 'Lemma_text' is the space-joined text of the group.
d = {'Lemma_text': ' '.join}
df_new = df2_new.groupby('level_0').agg(d)
df_new.head()

In [None]:
# Inspect the entry labelled 0 to confirm the rows were joined as expected.
df_new.Lemma_text[0]

In [None]:
# Join the decisions table with the concatenated lemma text on their indexes
# (default inner join: only index values present in both frames are kept).
df_merge = df1.merge(
    df_new,
    left_index=True,
    right_index=True,
)
df_merge.head()

In [None]:
# Remove the 'review' column; the result is a new frame, df_merge is untouched.
df_merge_drop = df_merge.drop(columns=['review'])
df_merge_drop.head()

### Model training

In [None]:
from sklearn.model_selection import train_test_split

# Input text is the joined lemma text; the label is the preliminary decision.
independent_var = df_merge_drop['Lemma_text']
target_var = df_merge_drop['preliminary_decision']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    independent_var,
    target_var,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
for label, part in (
    ('X_train: ', X_train),
    ('X_test: ', X_test),
    ('y_train: ', y_train),
    ('y_test: ', y_test),
):
    print(label, len(part))

### Vectorization Feature Engineering (TF-IDF)

In [None]:
# TF-IDF turns each document into a weighted term vector (default settings).
tfvec = TfidfVectorizer()

# Alternative that was tried first:
#   LogisticRegression(solver="liblinear")
#   https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# LinearSVC increased the accuracy, so it is the classifier in use.
# random_state is fixed so repeated runs fit the same model, in line with the
# seeded train/test split.
clf = LinearSVC(random_state=42)

In [None]:
# Chain vectorisation and classification so raw text can be passed straight
# to fit/predict.
model = Pipeline(
    steps=[
        ('vectorizer', tfvec),
        ('classifier', clf),
    ]
)
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out texts.
predictions = model.predict(X_test)

# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Passing them the other way round transposes
# the matrix.
confusion_matrix(y_test, predictions)

### Model evaluation and prediction

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# precision and recall are computed against the wrong reference and weighted
# by the predicted class counts instead of the true ones.
# No `labels=` restriction is applied, so classes that were never predicted
# still count towards the weighted averages.
print("Accuracy: ", accuracy_score(y_test, predictions))
print("Precision: ", precision_score(y_test, predictions, average='weighted'))
print("Recall: ", recall_score(y_test, predictions, average='weighted'))

In [None]:
# Text summary of the per-class metrics for the held-out set.
# (Optionally restrict the classes with labels=np.unique(predictions).)
print(
    classification_report(
        y_true=y_test,
        y_pred=predictions,
    )
)


In [None]:
### Try a new review

# A single unseen review; the pipeline expects an iterable of documents,
# hence the one-element list.
review = [
    'su novela es una historia mágica que revela la esencia de la vida, '
    'el amor y la soledad. Debido a la simplicidad del estilo de la prosa, '
    'las ilustraciones divertidas y divertidas, la viveza de la imaginación, '
    'esta historia simple pero reveladora se considera una de las mejores '
    'obras literarias de todos los tiempos.'
]
result = model.predict(review)

print(result)