# Natural Language Processing (NLP)

In [None]:
# Mount Google Drive only when running inside Colab. Outside Colab the
# google.colab package is not installed, and an unguarded import would
# abort a Restart & Run All before any analysis cell executes.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available: skipping the Drive mount (local run).")
else:
    drive.mount('/content/drive')

In [None]:
import html
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK data used by this notebook. Recent NLTK releases load the
# 'punkt_tab' resource inside word_tokenize, so it is downloaded in addition
# to the legacy 'punkt' package that older releases use.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

## Carga de datos

In [None]:
# Colab run: the tab-separated dataset is read from the mounted Drive folder.
df = pd.read_csv(
    "/content/drive/My Drive/nlp/new_drug_train.tsv",
    delimiter="\t",
)

# Local run: comment out the call above and use this one instead.
# df = pd.read_csv("new_drug_train.tsv", sep = "\t")

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show the loaded DataFrame as the cell output.
df

## Preprocesamiento de datos

El preprocesamiento de datos es una tarea importante en el procesamiento de lenguaje natural. En este caso vamos a realizar las siguientes tareas:
- Convertir el texto a minúsculas
- Eliminar caracteres especiales
- Tokenizar el texto
- Eliminar las stopwords (palabras comunes que no aportan significado).
- Lematizar el texto (reducir cada palabra a su lema o forma base).

In [None]:
# Negation words are excluded from the stop-word set so that they are
# not filtered out of the reviews during preprocessing.
negation_words = {"no", "not", "never", "none", "n't"}
stop_words = set(stopwords.words('english')).difference(negation_words)
stop_words

In [None]:
# Single WordNet lemmatizer instance, reused by preprocess_text for every token.
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmas.

    Steps: decode HTML entities, lowercase, drop apostrophes, replace every
    other non-letter character with a space, tokenize, remove the words in
    the module-level ``stop_words`` set and lemmatize what remains.

    Args:
        text: Raw review text. Non-string values (for example the float NaN
            that pandas uses for a missing review) are treated as an empty
            review instead of raising.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` for
        non-string input or when no token survives the cleaning.
    """
    # A missing review is not a str and has no .lower(); treat it as empty.
    if not isinstance(text, str):
        return ''
    # Decode entities such as "&#039;" or "&quot;" first, so their letters
    # are not left behind glued to the neighbouring words.
    text = html.unescape(text).lower()
    # Apostrophes are removed without inserting a space, so a contraction
    # like "don't" stays a single token ("dont").
    text = re.sub(r"['’]", '', text)
    # Any other non-letter becomes a space, so words separated only by
    # punctuation or digits ("good.no", "side-effect") do not get merged.
    text = re.sub(r'[^a-z\s]', ' ', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Clean every review and preview the raw text next to its cleaned version.
df['clean_review'] = df.loc[:, 'review'].apply(preprocess_text)
df.loc[:, ['review', 'clean_review']].head(n=5)

In [None]:
# The raw text is no longer needed once 'clean_review' exists.
# errors='ignore' keeps this cell idempotent: running it a second time
# (when 'review' is already gone) is a no-op instead of a KeyError.
df = df.drop(columns=['review'], errors='ignore')

## Split de datos

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is reproducible.
random_state = 17

# The target is the 'rating' column; every other column stays as a feature.
y = df["rating"]
X = df.drop(columns="rating")

# Hold out one third of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 3, random_state=random_state
)

## Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# min_df=0.02 keeps only the tokens that occur in at least 2% of the
# training reviews, which bounds the size of the vocabulary.
# stop_words="english" is deliberately NOT passed: stop words were already
# removed in preprocess_text with a list that keeps negations, and
# scikit-learn's built-in English list would drop "no"/"not"/"never" again.
vectorizer = CountVectorizer(min_df=0.02)

Creamos una matriz con el número de veces que aparece cada palabra en cada review. Utilizamos *fit* para que aprenda el vocabulario de los textos (identificar las palabras únicas) y *transform* para convertir los documentos del corpus en una matriz donde las filas representan las reviews y las columnas las palabras.

In [None]:
# Learn the vocabulary on the training reviews and encode them in one step.
X_train_review_tok_matrix = vectorizer.fit_transform(
    raw_documents=X_train.loc[:, "clean_review"]
)

Para el conjunto de test, solo utilizamos *transform* ya que debemos utilizar el vocabulario aprendido en train. Hacer *fit* implicaría introducir información de test durante el entrenamiento.

In [None]:
# Encode the test reviews with the vocabulary already fitted on train.
X_test_review_tok_matrix = vectorizer.transform(
    raw_documents=X_test.loc[:, "clean_review"]
)

Convertimos las matrices en un DataFrame

In [None]:
# Densify both count matrices into DataFrames (one column per vocabulary term).
X_train_review_tok, X_test_review_tok = (
    pd.DataFrame(count_matrix.toarray())
    for count_matrix in (X_train_review_tok_matrix, X_test_review_tok_matrix)
)

Concatenamos el conjunto de train con los vectores y hacemos lo mismo en test.

In [None]:
# Give all four frames a fresh 0..n-1 index (discarding the old one) so the
# column-wise concatenation that follows lines the rows up by position.
for frame in (X_train, X_test, X_train_review_tok, X_test_review_tok):
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Append the token-count columns to the right of the original feature columns.
X_train_concat = pd.concat((X_train, X_train_review_tok), axis="columns")
X_test_concat = pd.concat((X_test, X_test_review_tok), axis="columns")
X_train_concat

### Modelo de clasificación

Usaremos un árbol de decisión para predecir el rating.

In [None]:
# Show the training frame (original columns followed by the token-count columns).
X_train_concat

#### Predecimos usando solamente los datos de las reviews.

In [None]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=10, criterion='gini', random_state=42)

# The token-count columns were appended after the original X_train columns,
# so they start at position X_train.shape[1]. Deriving the offset replaces a
# hard-coded column count that would silently select the wrong columns if
# the dataset schema changed.
n_meta_cols = X_train.shape[1]
tree.fit(X_train_concat.iloc[:, n_meta_cols:], y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Token-count columns start right after the original X_train / X_test columns;
# computing the offset avoids a hard-coded column count.
n_meta_cols = X_train.shape[1]
y_train_pred = tree.predict(X_train_concat.iloc[:, n_meta_cols:])
y_test_pred = tree.predict(X_test_concat.iloc[:, n_meta_cols:])

# Accuracy on the data the tree was fitted on versus the held-out third.
print("Train accuracy: ", accuracy_score(y_train, y_train_pred))
print("Test accuracy: ", accuracy_score(y_test, y_test_pred))

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=0.02 keeps only the tokens that occur in at least 2% of the
# training reviews.
# stop_words="english" is deliberately NOT passed: stop words were already
# removed in preprocess_text with a list that keeps negations, and
# scikit-learn's built-in English list would drop "no"/"not"/"never" again.
tf_idf_vectorizer = TfidfVectorizer(min_df=0.02)

In [None]:
# Fit the TF-IDF vocabulary and weights on train only, then reuse them for test.
tfidf_X_train_review_tok_matrix = tf_idf_vectorizer.fit_transform(
    raw_documents=X_train.loc[:, "clean_review"]
)
tfidf_X_test_review_tok_matrix = tf_idf_vectorizer.transform(
    raw_documents=X_test.loc[:, "clean_review"]
)

In [None]:
# Densify both TF-IDF matrices into DataFrames (one column per vocabulary term).
tfidf_X_train_review_tok, tfidf_X_test_review_tok = (
    pd.DataFrame(weight_matrix.toarray())
    for weight_matrix in (
        tfidf_X_train_review_tok_matrix,
        tfidf_X_test_review_tok_matrix,
    )
)

In [None]:
# Fresh 0..n-1 index on both TF-IDF frames before the column-wise concat.
for frame in (tfidf_X_train_review_tok, tfidf_X_test_review_tok):
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Append the TF-IDF columns to the right of the original feature columns.
tfidf_X_train_concat = pd.concat((X_train, tfidf_X_train_review_tok), axis="columns")
tfidf_X_test_concat = pd.concat((X_test, tfidf_X_test_review_tok), axis="columns")
tfidf_X_test_concat

### Modelo de clasificación

In [None]:
tree = DecisionTreeClassifier(max_depth=30, criterion='gini', random_state=42)

# The TF-IDF columns were appended after the original X_train columns, so
# they start at position X_train.shape[1]. Deriving the offset replaces a
# hard-coded column count that would silently select the wrong columns if
# the dataset schema changed.
n_meta_cols = X_train.shape[1]
tree.fit(tfidf_X_train_concat.iloc[:, n_meta_cols:], y_train)

In [None]:
# TF-IDF columns start right after the original X_train / X_test columns;
# computing the offset avoids a hard-coded column count.
n_meta_cols = X_train.shape[1]
y_train_pred = tree.predict(tfidf_X_train_concat.iloc[:, n_meta_cols:])
y_test_pred = tree.predict(tfidf_X_test_concat.iloc[:, n_meta_cols:])

# Accuracy on the data the tree was fitted on versus the held-out third.
print("Train accuracy: ", accuracy_score(y_train, y_train_pred))
print("Test accuracy: ", accuracy_score(y_test, y_test_pred))