
# German Text Phrases Classification Using Naive Bayes
This notebook demonstrates a basic pipeline for text classification using a Naive Bayes classifier.
The steps include data loading, preprocessing, feature extraction, model training, and evaluation.
<br>
Dataset: short German phrases, each labelled with a class.


In [2]:

import os
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


In [3]:
# Loading dataset
# The CSV location can be overridden with the GERMAN_PHRASES_CSV environment
# variable so the notebook can run on another machine without editing this
# cell; when the variable is not set, the original local path is used.
file_path = os.environ.get('GERMAN_PHRASES_CSV', '/Users/ibk/Downloads/sample_data.csv')
df = pd.read_csv(file_path)

In [4]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,text,label
0,zucker fabrik,ft
1,Lebensmittel kommssionierung,ft
2,geländer biegen,mr
3,gebäudeausrüstung technische,ct
4,kürbiskernöl softgels,ft


In [5]:
# Handling Missing Values
# Drop every row that has a missing value in any column. With inplace=True the
# existing `df` object is modified rather than replaced by a new frame.
df.dropna(inplace=True)

### Data Pre-Processing

In [6]:
import re
import nltk
from nltk.stem.snowball import GermanStemmer


# nltk.word_tokenize relies on the Punkt sentence models. Older NLTK releases
# load them from the "punkt" resource, while NLTK 3.9 and later load them from
# "punkt_tab", so both are fetched to keep the notebook working on either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)


# Transliteration table for German umlauts and the sharp s.
_GERMAN_CHAR_MAP = str.maketrans({"ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss"})

# Cleaning patterns, compiled once instead of being looked up on every call.
_EMAIL_RE = re.compile(r"\S+@\S+")
_DIGITS_RE = re.compile(r"\d+")
_NON_WORD_RE = re.compile(r"[^\w\s]")

# Stemmer shared by all calls; created on first use.
_GERMAN_STEMMER = None


def _get_german_stemmer():
    """Return the shared GermanStemmer instance, creating it on first use."""
    global _GERMAN_STEMMER
    if _GERMAN_STEMMER is None:
        _GERMAN_STEMMER = GermanStemmer()
    return _GERMAN_STEMMER


def preprocess_german_text(text: str) -> str:
    """Normalise a German phrase for the text classifier.

    The phrase is lower-cased, umlauts and the sharp s are transliterated
    (ae, oe, ue, ss), e-mail addresses, digits and punctuation are removed,
    and the remaining words are tokenised and stemmed with NLTK's
    GermanStemmer.

    Args:
        text: The raw phrase. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a DataFrame column) is treated as
            having no text.

    Returns:
        The stemmed tokens joined by single spaces, or an empty string when
        nothing usable remains after cleaning.
    """
    # Missing values would otherwise fail on .lower() with an AttributeError
    if not isinstance(text, str):
        return ""

    # Lower-case first so that upper-case umlauts are transliterated as well
    text = text.lower().translate(_GERMAN_CHAR_MAP)

    # E-mail addresses must go before punctuation is stripped, otherwise the
    # "@" that identifies them would already be gone
    text = _EMAIL_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    text = _NON_WORD_RE.sub("", text)

    # Nothing left to tokenise (e.g. the input was only digits/punctuation)
    if not text.strip():
        return ""

    tokens = nltk.word_tokenize(text, language="german")

    # Reuse one stemmer instead of constructing a new one per phrase
    stemmer = _get_german_stemmer()
    return " ".join(stemmer.stem(token) for token in tokens)

[nltk_data] Downloading package punkt to /Users/ibk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Run every phrase of the text column through the preprocessing function
df['processed_text'] = df['text'].map(preprocess_german_text)

In [8]:
# Preview the first rows again, now including the processed_text column
df.head()

Unnamed: 0,text,label,processed_text
0,zucker fabrik,ft,zuck fabrik
1,Lebensmittel kommssionierung,ft,lebensmittel kommssionier
2,geländer biegen,mr,gelaend bieg
3,gebäudeausrüstung technische,ct,gebaeudeausruest technisch
4,kürbiskernöl softgels,ft,kuerbiskernoel softgel


In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
features = df["processed_text"]
labels = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Chain TF-IDF vectorisation with a multinomial Naive Bayes classifier
tfidf_step = ('tfidf', TfidfVectorizer())
nb_step = ('nb', MultinomialNB())
pipeline = Pipeline(steps=[tfidf_step, nb_step])

In [11]:
# Hyper-parameter candidates, keyed as <pipeline step>__<parameter name>
param_grid = dict(
    tfidf__max_features=[3000, 5000, 10000],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    nb__alpha=[0.1, 1, 10],
)

# Exhaustive search over the grid using 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=2,
)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END nb__alpha=0.1, tfidf__max_features=3000, tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   0.2s
[CV] END nb__alpha=0.1, tfidf__max_features=3000, tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   0.1s
[CV] END nb__alpha=0.1, tfidf__max_features=3000, tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   0.1s
[CV] END nb__alpha=0.1, tfidf__max_features=3000, tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   0.1s
[CV] END nb__alpha=0.1, tfidf__max_features=3000, tfidf__ngram_range=(1, 1), tfidf__use_idf=True; total time=   0.1s
[CV] END nb__alpha=0.1, tfidf__max_features=3000, tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time=   0.1s
[CV] END nb__alpha=0.1, tfidf__max_features=3000, tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time=   0.1s
[CV] END nb__alpha=0.1, tfidf__max_features=3000, tfidf__ngram_range=(1, 1), tfidf__use_idf=False; total time=   0.1s

In [12]:
# Show the winning hyper-parameter combination
print("Best parameters: ", grid_search.best_params_, sep="\n")

# Score the best pipeline found by the grid search on the held-out test set
model = grid_search.best_estimator_
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


Best parameters: 
{'nb__alpha': 0.1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2), 'tfidf__use_idf': False}
              precision    recall  f1-score   support

          ch       0.85      0.84      0.85       706
         cnc       0.86      0.70      0.77       513
          ct       0.94      0.86      0.90      1022
          ft       0.85      0.93      0.89      2281
          mr       0.87      0.81      0.84      1009
         pkg       0.87      0.89      0.88      1908

    accuracy                           0.87      7439
   macro avg       0.87      0.84      0.85      7439
weighted avg       0.87      0.87      0.87      7439



### Manual Testing

In [15]:
new_text = "38383883**()"
processed_text = preprocess_german_text(new_text)

# An empty result means nothing usable was left after preprocessing
if not processed_text:
    print('Given German Text Phrase is Invalid')
else:
    # Classify the cleaned phrase with the trained model
    predicted_label = model.predict([processed_text])[0]
    print(f"The predicted label for the text '{new_text}' is: {predicted_label}")



Given German Text Phrase is Invalid


### Exporting Model using Joblib

In [16]:
from joblib import dump

# Persist the fitted pipeline (vectoriser + classifier) to disk
dump(value=model, filename='German_Text_Classifier_NB_Model.joblib')

['German_Text_Classifier_NB_Model.joblib']