#### 1. Importing the libraries

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score,confusion_matrix
import pandas as pd
import numpy as np

#### 2. Load [data](https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews/code) from Kaggle

In [None]:
# Load only the first 2000 reviews. nrows stops parsing after 2000 rows
# instead of reading the whole CSV and slicing it afterwards.
data = pd.read_csv('Reviews.csv', nrows=2000)

#### Splitting data

In [None]:
# Keep only the review text (feature) and its score (label)
data_sa_text = data.loc[:, ['Text', 'Score']]

# Hold out 25% of the reviews for testing. The seed is fixed so the split,
# and therefore every score reported from it, is reproducible across runs
# (the classifiers already pin random_state=42).
# Optional: stratify=data_sa_text['Score'] to preserve class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    data_sa_text['Text'],
    data_sa_text['Score'],
    test_size=0.25,
    random_state=42,
)

In [None]:
# Number of reviews per Score value (shows how balanced the classes are)
data['Score'].value_counts()

#### Sentiment Analysis

##### Entropy | class_weight='balanced'

In [None]:
# Bag-of-words features: accents folded to ASCII, English stop words dropped
vectorizer = CountVectorizer(strip_accents='ascii', stop_words='english')

# Learn the vocabulary from the training text only, then encode the test
# text with that same vocabulary
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

# Decision tree: entropy criterion, random splitter, balanced class weights
dtc = DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    class_weight='balanced',
    random_state=42,
)
dtc.fit(x_train_vectorized, y_train)

# Predicted scores for the held-out reviews
y_test_predict = dtc.predict(x_test_vectorized)

# Mean accuracy on the test set (what dtc.score reports), then the
# confusion matrix as the cell output
print(accuracy_score(y_test, y_test_predict))
confusion_matrix(y_test, y_test_predict)

In [None]:
# TF-IDF features over word bigrams only; accents folded to ASCII and
# English stop words dropped.
# Alternative kept for reference:
#   CountVectorizer(strip_accents='ascii', stop_words='english',
#                   ngram_range=(2, 3), min_df=5)
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    ngram_range=(2, 2),
)

# Fit the TF-IDF vocabulary/weights on the training text, then encode the
# test text with them
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train)
x_test_vectorized = tfidf_vectorizer.transform(x_test)

# Decision tree: gini criterion, best splitter, balanced class weights
dtc = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    random_state=42,
    class_weight='balanced',
)
dtc.fit(x_train_vectorized, y_train)

# Predicted scores for the held-out reviews
y_test_predict = dtc.predict(x_test_vectorized)

# Mean accuracy on the test set (what dtc.score reports), then the
# confusion matrix as the cell output
print(accuracy_score(y_test, y_test_predict))
confusion_matrix(y_test, y_test_predict)

In [None]:
# Draw the structure of the decision tree fitted in the previous cell
plot_tree(dtc)

In [None]:
# TF-IDF features over word bigrams and trigrams; accents folded to ASCII
# and English stop words dropped.
# Alternative kept for reference:
#   CountVectorizer(strip_accents='ascii', stop_words='english',
#                   ngram_range=(2, 3), min_df=5)
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    ngram_range=(2, 3),
)

# Fit the TF-IDF vocabulary/weights on the training text, then encode the
# test text with them
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train)
x_test_vectorized = tfidf_vectorizer.transform(x_test)

# Decision tree: gini criterion, best splitter, balanced class weights
dtc = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    random_state=42,
    class_weight='balanced',
)
dtc.fit(x_train_vectorized, y_train)

# Predicted scores for the held-out reviews
y_test_predict = dtc.predict(x_test_vectorized)

# Mean accuracy on the test set (what dtc.score reports), then the
# confusion matrix as the cell output
print(accuracy_score(y_test, y_test_predict))
confusion_matrix(y_test, y_test_predict)

In [None]:
# Draw the structure of the decision tree fitted in the previous cell
plot_tree(dtc)

#### SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Keep only the review text (feature) and its score (label)
data_sa_text = data.loc[:, ['Text', 'Score']]

# Hold out 25% of the reviews for testing. The seed is fixed so the SVM
# results computed from this split are reproducible across runs.
# Optional: stratify=data_sa_text['Score'] to preserve class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    data_sa_text['Text'],
    data_sa_text['Score'],
    test_size=0.25,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words features for the SVM: English stop words dropped, accents
# folded to ASCII
vecto = CountVectorizer(stop_words='english',strip_accents='ascii')

# Learn the vocabulary from the training text and encode it
x_train_vecto_svm = vecto.fit_transform(x_train)

# Encode the test text with the vocabulary learned above
x_test_vecto_svm = vecto.transform(x_test)

In [None]:
# Support vector classifier with an RBF kernel
svm = SVC(kernel='rbf')

# Train on the count-vectorized training reviews
svm.fit(x_train_vecto_svm,y_train)

In [None]:
# Predict a score for every vectorized test review with the trained SVM
y_predict = svm.predict(x_test_vecto_svm)

In [None]:
# Mean accuracy of the SVM on the held-out reviews. Printed explicitly
# because a notebook cell only displays its last expression.
print(svm.score(x_test_vecto_svm, y_test))

# Confusion matrix of the SVM predictions. y_predict is the SVM output;
# y_test_predict holds the decision-tree predictions from an earlier split
# and must not be compared against this y_test.
confusion_matrix(y_test, y_predict)

In [None]:
# TF-IDF features over the training text, dropping English stop words
tdif_vecto = TfidfVectorizer(stop_words='english')
x_tdif_vecto = tdif_vecto.fit_transform(x_train)

In [None]:
# Largest value stored in the TF-IDF training matrix
np.max(x_tdif_vecto)

In [None]:
# TF-IDF weights of the first training document as a flat dense array
x_tdif_vecto[0].toarray().flatten()

In [None]:
# Vocabulary terms learned by the TF-IDF vectorizer
features = tdif_vecto.get_feature_names_out()


In [None]:
# Pair each vocabulary term with its weight in the first training document.
# NOTE(review): despite the name, this is a one-shot zip iterator, not a
# dict; wrap it in dict(...) if a reusable mapping is wanted.
dict_tfidf= zip(features,x_tdif_vecto[0].toarray().flatten())

In [None]:
# Vocabulary terms, index-aligned with the columns of the TF-IDF matrix
features = tdif_vecto.get_feature_names_out()

# Print the 10 highest-weighted terms for each of the first 10 training
# documents. Rows of x_tdif_vecto follow the order of x_train, so the matrix
# is indexed by position rather than by the Series labels.
n_docs = min(10, x_tdif_vecto.shape[0])
for i in range(n_docs):
    print(f"Documento {i+1}:")

    # Pair every term with its TF-IDF weight in document i
    tfidf_scores = zip(features, x_tdif_vecto[i].toarray().flatten())

    # Sort by weight, highest first, and keep the top 10
    top_words = sorted(tfidf_scores, key=lambda pair: pair[1], reverse=True)[:10]
    for word, score in top_words:
        print(f"{word}: {score:.4f}")
    print()

In [None]:
# Peek at the first 10 training reviews
x_train[:10]