In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score


# Input dataset and the directory that receives the artifacts (CSV, TXT, PNG)
# written by the cells below.
DATA_PATH = '../data/cannabis.csv'
IMG_DIR = "../docs/roteiro1/img"

# Create the output directory up front: to_csv, savefig and open(..., "w") do not
# create missing parent directories, so a fresh checkout would otherwise fail on
# the first write.
os.makedirs(IMG_DIR, exist_ok=True)

## Etapa 1: carregamento e limpeza


In [9]:
# Load the raw CSV and normalise the header names: trimmed, spaces replaced by
# underscores, lower-cased.
df = pd.read_csv(DATA_PATH)
df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]

# Fail fast when a column used further down is missing from the file.
colunas = ["strain", "type", "rating", "effects", "flavor"]
erros = set(colunas) - set(df.columns)
if erros:
    raise ValueError(f"Colunas ausentes no DataFrame: {erros}")

# Standardise the rating: accept a decimal comma ("4,5") as well as a decimal point.
# regex=False makes the replacement a literal one regardless of the pandas version.
df['rating'] = (
    df['rating'].astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

# Turn each comma-separated text field into a list of trimmed, non-empty items;
# a missing value becomes an empty list.
for col in ["strain", "effects", "flavor"]:
    df[col] = df[col].fillna("").str.split(",").apply(lambda x: [s.strip() for s in x if s.strip()])

# Remove rows with missing values. The text columns were filled above, so only the
# target and the rating can still be missing among the columns the model uses.
# Dropping on that subset avoids discarding rows whose only gap is in a column
# the model never reads.
df = df.dropna(subset=["type", "rating"])

## Salvar estatísticas

In [None]:
# Persist the summary statistics of the rating column as a CSV artifact.
rating_stats = df[["rating"]].describe()
rating_stats_path = os.path.join(IMG_DIR, "rating_describe.csv")
rating_stats.to_csv(rating_stats_path)

## Gráficos simples para visualização

In [11]:
# Bar chart: number of rows per value of the target column.
fig_classes, ax_classes = plt.subplots()
class_counts = df["type"].value_counts()
class_counts.plot(kind="bar", ax=ax_classes)
ax_classes.set_title("Class balance (Type)")
ax_classes.set_ylabel("Count")
fig_classes.tight_layout()
fig_classes.savefig(os.path.join(IMG_DIR, "dist_classe.png"))
plt.close(fig_classes)

# Histogram of the ratings (missing values excluded), 20 bins.
fig_rating, ax_rating = plt.subplots()
ratings = df["rating"].dropna()
ratings.plot(kind="hist", bins=20, ax=ax_rating)
ax_rating.set_title("Rating histogram")
ax_rating.set_xlabel("rating")
fig_rating.tight_layout()
fig_rating.savefig(os.path.join(IMG_DIR, "hist_rating.png"))
plt.close(fig_rating)

## Pré-processamento

In [12]:
# Target: the "type" column as plain strings.
y = df["type"].astype(str)


def _as_text(value):
    """Return `value` as plain text; list/tuple items are joined with single spaces."""
    if isinstance(value, (list, tuple)):
        return " ".join(str(item) for item in value)
    return str(value)


# Concatenate the text fields (the description is intentionally left out).
# The fields hold lists of items at this point; joining them yields clean text
# instead of the Python list repr ("['a', 'b']") that astype(str) would produce.
df["text_all"] = (
    df["effects"].map(_as_text) + " " +
    df["flavor"].map(_as_text) + " " +
    df["strain"].map(_as_text)
)

# Feature frame: one numeric column and one free-text column.
X = df[["rating", "text_all"]]

## Divisão dos Dados (treino e teste)

In [16]:
# Hold out 30% of the rows for testing, stratified on the target, with a fixed seed.
splits = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = splits

## Treinamento do Modelo

In [None]:
# Per-column preprocessing:
#   - "rating"   -> scaled without centring (with_mean=False)
#   - "text_all" -> TF-IDF over unigrams and bigrams, at most 10000 features
# Any other column is dropped.
rating_scaler = StandardScaler(with_mean=False)
text_vectorizer = TfidfVectorizer(lowercase=True, max_features=10000, ngram_range=(1, 2))
preproc = ColumnTransformer(
    [
        ("num", rating_scaler, ["rating"]),
        ("txt", text_vectorizer, "text_all"),
    ],
    remainder="drop",
)

# Decision tree with the Gini criterion and a fixed seed.
clf = DecisionTreeClassifier(criterion="gini", random_state=42)

# Chain preprocessing and classifier, then fit on the training split.
pipe = Pipeline(steps=[("prep", preproc), ("clf", clf)])
pipe.fit(X_train, y_train)

## Avaliação do Modelo

In [18]:
# Predict on the held-out split and compute the headline metrics.
y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=3)

# Save the accuracy followed by the per-class report as a text artifact.
report_path = os.path.join(IMG_DIR, "classification_report.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(f"Accuracy: {acc:.4f}\n\n{report}")

# Confusion matrix, with rows/columns in the order of the classes the pipeline learned.
class_labels = pipe.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

fig_cm, ax_cm = plt.subplots()
disp.plot(ax=ax_cm, include_values=True, xticks_rotation=45)
ax_cm.set_title(f"Confusion Matrix (Acc={acc:.3f})")
fig_cm.tight_layout()
fig_cm.savefig(os.path.join(IMG_DIR, "matriz_confusao.png"))
plt.close(fig_cm)

## Plot dos primeiros níveis da árvore (treinando DT puro nas features transformadas)

In [None]:
# Train a stand-alone decision tree on the transformed training features, only to draw it.
# The preprocessor was already fitted by pipe.fit(...), so transform() is enough here;
# calling fit_transform() again would refit a component of the evaluated pipeline.
fitted_prep = pipe.named_steps["prep"]
Xt = fitted_prep.transform(X_train)
clf2 = DecisionTreeClassifier(criterion="gini", random_state=42)
clf2.fit(Xt, y_train)

# Label split nodes with the transformed feature names when the installed
# scikit-learn exposes them; otherwise plot_tree falls back to positional labels.
if hasattr(fitted_prep, "get_feature_names_out"):
    feature_names = list(fitted_prep.get_feature_names_out())
else:
    feature_names = None

plt.figure(figsize=(16, 10))
plot_tree(
    clf2,
    max_depth=3,  # draw only the top levels
    filled=True,
    feature_names=feature_names,
    # Plain list: some scikit-learn releases reject a NumPy array for class_names.
    class_names=list(clf2.classes_),
)
plt.title("Decision Tree (top levels)")
plt.tight_layout()
plt.savefig(os.path.join(IMG_DIR, "arvore.png"))
plt.close()

print(f"OK! Accuracy: {acc:.4f}")
print("Outputs em docs/roteiro1/img/")