In [48]:
import json
import os
import numpy as np
import pandas as pd

import nltk
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Fetch the NLTK resources requested by this notebook, in a fixed order.
# nltk.download reports "already up-to-date" for packages that are present.
for nltk_resource in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'punkt_tab',
):
    nltk.download(nltk_resource)

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ignacio.milesi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ignacio.milesi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ignacio.milesi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ignacio.milesi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ignacio.milesi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_da

In [49]:
# Helpers for text processing.

# NOTE: this binds the CountVectorizer *class*, not an instance. It only acts
# as a placeholder; vectorize_text needs this name rebound to a fitted
# vectorizer (see the joblib.load cell) before it is called.
count_vectorizer = CountVectorizer

# English stopwords, kept in a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def vectorize_text(text: str, tfidf_transformer=None):
    """Turn one preprocessed document into a TF-IDF weighted sparse matrix.

    Args:
        text: A single document as one string. It is wrapped in a
            one-element list before being handed to the module-level
            ``count_vectorizer``, so passing a list here is not supported.
        tfidf_transformer: Optional, already-fitted ``TfidfTransformer``.
            When provided, only its ``transform`` is applied, so the IDF
            weights it learned when it was fitted are reused. When omitted,
            a new transformer is fitted on this single document, which is
            the original behaviour of this function.

    Returns:
        The sparse matrix produced by the TF-IDF step for the given document.
    """
    X_vectorized = count_vectorizer.transform([text])

    if tfidf_transformer is not None:
        # Reuse previously learned IDF weights instead of refitting.
        return tfidf_transformer.transform(X_vectorized)

    # NOTE(review): fitting TF-IDF on a single document cannot reproduce IDF
    # weights learned from a training corpus. If the model was trained on
    # corpus-level TF-IDF features, pass the fitted transformer via
    # ``tfidf_transformer`` -- TODO confirm how the model was trained.
    return TfidfTransformer().fit_transform(X_vectorized)

def preprocessing_fn(text):
    """Reduce a raw text to its noun tokens and vectorize the result.

    The text is lowercased and tokenized, English stopwords are dropped, the
    remaining tokens are POS-tagged, and only tokens whose tag starts with
    'NN' are kept. No lemmatization is applied. The surviving tokens are
    joined with single spaces and passed to ``vectorize_text``.
    """
    lowered_tokens = word_tokenize(text.lower())

    # Stopword removal.
    content_tokens = []
    for token in lowered_tokens:
        if token.lower() in stop_words:
            continue
        content_tokens.append(token)

    # Noun extraction based on the POS tag prefix.
    noun_tokens = []
    for token, tag in pos_tag(content_tokens):
        if tag.startswith('NN'):
            noun_tokens.append(token)

    return vectorize_text(' '.join(noun_tokens))


def run_preprocessing_fn(X):
    """Run the preprocessing pipeline over every text in ``X``.

    Each text is passed through ``preprocessing_fn``; the sparse matrix it
    returns is converted to a dense array, and all dense arrays are stacked
    vertically into a single NumPy array, which is returned.
    """
    dense_rows = []
    for document in X:
        dense_rows.append(preprocessing_fn(document).toarray())
    return np.vstack(dense_rows)

In [13]:
def read_json(path: str, file_name: str):
    """Load a JSON file and flatten it into a DataFrame.

    Args:
        path: Directory that contains the file.
        file_name: Name of the JSON file inside ``path``.

    Returns:
        The DataFrame produced by ``pd.json_normalize`` on the parsed JSON
        content (the file is read as UTF-8).
    """
    with open(os.path.join(path, file_name), encoding="utf8") as handle:
        payload = json.load(handle)
    return pd.json_normalize(payload)

# Load the raw (unprocessed) tickets file into a flattened DataFrame.
data_sin_procesar = read_json(
    path="data/data_sin_procesar",
    file_name="tickets_classification_eng.json",
)

In [14]:
# Column holding the free-text complaint, plus sampling settings.
TEXT_COLUMN = "_source.complaint_what_happened"
SAMPLE_SIZE = 20
SAMPLE_SEED = 42

# Keep only the complaint text column (still a DataFrame, not a Series).
data_sin_procesar_corta = data_sin_procesar[[TEXT_COLUMN]]

# Discard tickets whose complaint text is an empty string.
data_con_texto = data_sin_procesar_corta[data_sin_procesar_corta[TEXT_COLUMN] != ""]

# A fixed seed makes the sample (and every downstream prediction) reproducible
# across runs; capping n avoids a ValueError when fewer than SAMPLE_SIZE
# non-empty rows are available.
data_analizar = data_con_texto.sample(
    n=min(SAMPLE_SIZE, len(data_con_texto)),
    random_state=SAMPLE_SEED,
)

data_analizar.head(3)

Unnamed: 0,_source.complaint_what_happened
4654,"On XX/XX/2020, I called Chase Sapphire reserve..."
70771,On XXXX/XXXX/XXXX I was approve for a trial mo...
29982,I am facing foreclosure after I have been tryi...


In [None]:
# Maps the stringified class id predicted by the model to its readable label.
label_mapping = dict(
    zip(
        ("0", "1", "2"),
        (
            "Bank Account Services",
            "Credit Report or Prepaid Card",
            "Mortgage/Loan",
        ),
    )
)

In [51]:
# Locations of the persisted artifacts.
VECTORIZER_PATH = "app/count_vectorizer.pkl"
MODEL_PATH = "model.pkl"

# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts that come from a trusted source.
count_vectorizer = joblib.load(VECTORIZER_PATH)
model = joblib.load(MODEL_PATH)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [38]:
# Vectorize the sampled complaint texts.
data_procesada = run_preprocessing_fn(data_analizar["_source.complaint_what_happened"])

# Predict the class ids, then decode each one into its readable label.
preds = model.predict(data_procesada)
preds_list = [label_mapping[str(pred)] for pred in preds]

print(preds_list)

['Credit Report or Prepaid Card', 'Mortgage/Loan', 'Mortgage/Loan', 'Mortgage/Loan', 'Credit Report or Prepaid Card', 'Credit Report or Prepaid Card', 'Credit Report or Prepaid Card', 'Bank Account Services', 'Mortgage/Loan', 'Bank Account Services', 'Bank Account Services', 'Bank Account Services', 'Credit Report or Prepaid Card', 'Bank Account Services', 'Mortgage/Loan', 'Credit Report or Prepaid Card', 'Credit Report or Prepaid Card', 'Credit Report or Prepaid Card', 'Bank Account Services', 'Credit Report or Prepaid Card']
