In [167]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [168]:
# Maps each language label to the path of its raw text corpus.
language_dicts = dict(
    english="data/english.txt",
    finnish="data/finnish.txt",
    german="data/german.txt",
    italian="data/italian.txt",
    polish="data/polish.txt",
    spanish="data/spanish.txt",
)

In [169]:
def split_text_to_parts(text, num_parts=1000):
    """Cut ``text`` into at most ``num_parts`` consecutive, equally sized chunks.

    The chunk size is ``len(text) // num_parts``; any characters left over
    after ``num_parts`` chunks have been taken are discarded.

    Args:
        text: The string to split.
        num_parts: Maximum number of chunks to return. Must be >= 1.

    Returns:
        A list of substrings of ``text``. It has exactly ``num_parts`` items
        when ``len(text) >= num_parts``; for shorter texts it has one
        single-character chunk per character (an empty list for ``""``).

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError(f"num_parts must be >= 1, got {num_parts}")

    # Clamp to 1: for texts shorter than num_parts the integer division
    # yields 0, and range() rejects a zero step.
    part_size = max(1, len(text) // num_parts)
    return [text[i:i + part_size] for i in range(0, len(text), part_size)][:num_parts]

In [170]:
# One record per text chunk, labelled with the language of its source file.
data = []

for language, file_path in language_dicts.items():
    # NOTE(review): ISO-8859-1 maps every byte to a character, so decoding
    # never fails; if any corpus is actually stored in another encoding
    # (e.g. UTF-8 or ISO-8859-2 for Polish) its non-ASCII letters would be
    # silently mis-decoded. Confirm the files' real encoding.
    with open(file_path, 'r', encoding='ISO-8859-1') as in_file:
        text = in_file.read()

    parts = split_text_to_parts(text)
    data.extend([{"Text": part, "Language": language} for part in parts])

In [171]:
# Turn the collected chunk records into a frame with "Text" and "Language" columns.
df = pd.DataFrame(data)

In [172]:
# Sanity check
def count_records_by_language():
    """Return how many rows of the global ``df`` carry each 'Language' value."""
    language_column = df['Language']
    return language_column.value_counts()

count_records_by_language()

Language
english    1000
finnish    1000
german     1000
italian    1000
polish     1000
spanish    1000
Name: count, dtype: int64

In [173]:
# Split every language separately (80% train / 20% test) so that both sets
# contain the same proportion of each language.
train_data = []
test_data = []

for language in df['Language'].unique():
    language_data = df[df['Language'] == language]
    language_train, language_test = train_test_split(language_data, test_size=0.2, random_state=42)

    # Shuffle each partition in full. frac=1 keeps every row whatever the
    # group size; the former hard-coded sample(800)/sample(200) raised for
    # groups with fewer than 1000 rows and silently dropped rows for larger
    # ones. For 1000-row groups the sampled rows and their order are the same.
    train_data.append(language_train.sample(frac=1, random_state=42))
    test_data.append(language_test.sample(frac=1, random_state=42))

train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

In [174]:
# Mix the per-language groups together and renumber the rows from 0.
train_df, test_df = (
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (train_df, test_df)
)

print(f"Rozmiar zestawu treningowego: {train_df.shape}")
print(f"Rozmiar zestawu testowego: {test_df.shape}")

Rozmiar zestawu treningowego: (4800, 2)
Rozmiar zestawu testowego: (1200, 2)


In [175]:
from sklearn.feature_extraction.text import CountVectorizer

# Represent each chunk by the counts of its character trigrams.
vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')

# Labels are the language names.
y_train, y_test = train_df['Language'], test_df['Language']

# The vocabulary is fitted on the training text only; the test text is
# transformed with that already-fitted vocabulary.
X_train = vectorizer.fit_transform(train_df['Text'])
X_test = vectorizer.transform(test_df['Text'])

In [176]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Classifiers to compare, keyed by the display name used in the reports.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seeded like the data split so the forest is reproducible on a re-run.
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

In [177]:
# Fit every candidate classifier on the training features.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    print(f"{name} trained.")

Naive Bayes trained.
Logistic Regression trained.
Random Forest trained.


In [178]:
from sklearn.metrics import classification_report

# Print a per-language precision/recall/F1 report for each fitted model.
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(f"Model: {name}\n{report}\n")

Model: Naive Bayes
              precision    recall  f1-score   support

     english       1.00      1.00      1.00       200
     finnish       1.00      1.00      1.00       200
      german       1.00      1.00      1.00       200
     italian       1.00      1.00      1.00       200
      polish       1.00      0.99      1.00       200
     spanish       1.00      1.00      1.00       200

    accuracy                           1.00      1200
   macro avg       1.00      1.00      1.00      1200
weighted avg       1.00      1.00      1.00      1200


Model: Logistic Regression
              precision    recall  f1-score   support

     english       1.00      1.00      1.00       200
     finnish       1.00      1.00      1.00       200
      german       1.00      1.00      1.00       200
     italian       1.00      1.00      1.00       200
      polish       1.00      1.00      1.00       200
     spanish       1.00      1.00      1.00       200

    accuracy                  