# Setup

In [None]:
!pip install --upgrade --no-cache-dir gdown
!pip install imbalanced-learn
!gdown https://drive.google.com/drive/folders/1tTYmeHspNsESUe5-qOBjwLeodyVSxPb9?usp=sharing -O /tmp/ --folder

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Retrieving folder list
Processing file 11lcXmCAhzu89eClIIlD2P42ehuLoDSKP complaints_processed.csv
Processing file 1z_K5OcQsPdediNK_ikli3ea1sjMHvpzo final-dataset-v2.csv
Processing file 1HyF-lkrRwRleSkoxtmBkIXRudgXz1W-i final-dataset.csv
Processing file 17Gm1okAxEznrNzvvEVlUen0QpCdlfLAXvgSsTzn6JBk SADP - Triagem Automatizada
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=11lcXmCAhzu89eClIIlD2P42ehuLoDSKP
T

In [None]:
import pandas as pd

# Read the downloaded CSV and discard its 'Unnamed: 0' column
# (presumably an index column written when the file was saved -- verify).
dataset = pd.read_csv(
    '/tmp/Sistemas de Apoia a Decisão/final-dataset-v2.csv'
).drop(columns='Unnamed: 0')

# Preview the first rows.
dataset.head()

Unnamed: 0,narrative_transformed,y_target_label
0,purchase order day shipping amount receive pro...,0
1,forwarded message date tue subject please inve...,0
2,forwarded message cc sent friday pdt subject f...,1
3,payment history missing credit report speciali...,2
4,payment history missing credit report made mis...,2


# Training

In [None]:
# Words that are stripped from every narrative before training.
black_list = ['told', 'stated', 'said', 'asked', 'time', 'please']


def remove_words(sentence):
    """Return `sentence` without any token listed in `black_list`.

    The sentence is split on whitespace, tokens that exactly match
    (case-sensitively) an entry of `black_list` are dropped, and the
    remaining tokens are re-joined with single spaces.
    """
    return ' '.join(
        token for token in sentence.split() if token not in black_list
    )

# Filter every narrative through remove_words and store the result back
# in the same column, then preview the first cleaned rows.
cleaned_narratives = dataset['narrative_transformed'].apply(remove_words)
dataset['narrative_transformed'] = cleaned_narratives
cleaned_narratives.head()

0    purchase order day shipping amount receive pro...
1    forwarded message date tue subject investigate...
2    forwarded message cc sent friday pdt subject f...
3    payment history missing credit report speciali...
4    payment history missing credit report made mis...
Name: narrative_transformed, dtype: object

### Train-Test split

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from gensim.models import Word2Vec

# Model input is the cleaned narrative text; the target is the label column.
narrative = dataset['narrative_transformed']
y_targets = dataset['y_target_label']

# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# class proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    narrative,
    y_targets,
    test_size=0.2,
    stratify=y_targets,
    random_state=40,
)

### Text embedding (Vectorization)

In [None]:
VECTOR_SIZE = 150  # dimensionality of each learned word vector
W2V_SEED = 40      # same value as the random_state used for the train/test split

# Whitespace-tokenise the training narratives. Only X_train is used, so the
# embeddings are fitted without ever seeing the held-out test text.
words = [sentence.split() for sentence in X_train]

# An explicit seed together with workers=1 makes the embedding training
# repeatable: per the gensim docs, multi-threaded training introduces ordering
# jitter, so the vectors (and the accuracy reported below) would otherwise
# change from run to run. Reproducibility across interpreter launches
# additionally requires a fixed PYTHONHASHSEED.
vectorizer = Word2Vec(
    sentences=words,
    vector_size=VECTOR_SIZE,
    window=5,
    min_count=2,  # ignore words seen fewer than 2 times in the training text
    seed=W2V_SEED,
    workers=1,
)


In [None]:
import numpy as np

def vectorize(sentence):
    """Embed a sentence as the mean of its known word vectors.

    The sentence is split on whitespace and each token present in
    `vectorizer.wv` is looked up. The result is the element-wise mean of
    those vectors, or a zero vector of length VECTOR_SIZE when no token
    is in the vocabulary (including the empty sentence).
    """
    lookup = vectorizer.wv
    known = []
    for token in sentence.split():
        if token in lookup:
            known.append(lookup[token])

    # No in-vocabulary token: fall back to an all-zero embedding.
    if not known:
        return np.zeros(VECTOR_SIZE)

    return np.array(known).mean(axis=0)

# Turn each split into an array with one vectorize() output per narrative.
X_train_vect = np.array(list(map(vectorize, X_train)))
X_test_vect = np.array(list(map(vectorize, X_test)))

### Scale the data and train

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC

# Three-stage pipeline:
#   1. MinMaxScaler rescales each feature using the training data's range
#      (chi2 below does not accept negative inputs).
#   2. SelectKBest keeps the 130 features with the highest chi2 score.
#   3. SVC with default hyper-parameters is the classifier.
model = make_pipeline(
    MinMaxScaler(),
    SelectKBest(score_func=chi2, k=130),
    SVC(),
)

# Fit every stage on the training vectors; the fitted pipeline is displayed.
model.fit(X_train_vect, y_train)

# Model Results

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Accuracy of the fitted pipeline on the held-out test split.
accuracy_score(y_test, model.predict(X_test_vect))

0.8693162577348151