# 2. Training
In this section, the clean dataset is split into two sets:

- training set
- testing set

The chosen model is trained on the training set.

In [None]:
import pandas as pd

# Location of the cleaned dataset, relative to the notebook's working directory.
clean_dataset_path = "./data/dataset-clean.csv"

try:
    df = pd.read_csv(clean_dataset_path)
except FileNotFoundError:
    print("[dataset]: file not found")
    # Re-raise: without the dataset `df` is never bound, so continuing would
    # only fail later with a misleading NameError on `df`.
    raise

# Preview the first rows to confirm the dataset loaded as expected.
df.head()

In [None]:
# Cleaned text is the model input (feature); missing values are replaced with
# empty strings.
X = df['clean_text'].fillna('')
# The label column is the target class (output).
y = df['label']

print(f"Shape of features(X): {X.shape}")
# The original message said "labels(X)" although it reports the shape of y.
print(f"Shape of labels(y): {y.shape}")

### 2.1 Train-test split
- The testing set should never be exposed to the model during training.
- `stratify=y` is important for classification tasks to ensure that the proportion of classes is roughly the same in both training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the testing set. The split is stratified on
# the labels and uses a fixed seed so that it is reproducible.
split_parts = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report how many samples ended up in each subset.
print(f"Size of training set: {len(X_train)}")
print(f"Size of testing set: {len(X_test)}")

### 2.2 Model selection
To select an appropriate machine learning algorithm for text classification, a pipeline is built that first transforms the text into numerical features and then classifies those features.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Feature extraction: TF-IDF over unigrams and bigrams, with max_features=5000.
text_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
# Classifier: multinomial naive Bayes with smoothing parameter alpha=1.0.
nb_classifier = MultinomialNB(alpha=1.0)

# NOTE(review): the step name "tfdif" looks like a typo of "tfidf"; it is kept
# unchanged because other cells may refer to the step by this name.
pipeline_mnb = Pipeline(
    steps=[("tfdif", text_vectorizer), ("clf", nb_classifier)]
)

# Fit the vectorizer and the classifier on the training split only.
pipeline_mnb.fit(X_train, y_train)

# Display the fitted pipeline.
pipeline_mnb

In [None]:
# predict() expects an iterable of documents. A bare string is rejected by the
# TF-IDF vectorizer, so the single sample is wrapped in a list.
pipeline_mnb.predict(["what can you do"])